[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tool-Tencent-Hunyuan--HunyuanDiT":3,"similar-Tencent-Hunyuan--HunyuanDiT":89},{"id":4,"github_repo":5,"name":6,"description_en":7,"description_zh":8,"ai_summary_zh":8,"readme_en":9,"readme_zh":10,"quickstart_zh":11,"use_case_zh":12,"hero_image_url":13,"owner_login":14,"owner_name":14,"owner_avatar_url":15,"owner_bio":16,"owner_company":17,"owner_location":17,"owner_email":17,"owner_twitter":17,"owner_website":18,"owner_url":19,"languages":20,"stars":33,"forks":34,"last_commit_at":35,"license":36,"difficulty_score":37,"env_os":38,"env_gpu":39,"env_ram":40,"env_deps":41,"category_tags":51,"github_topics":17,"view_count":54,"oss_zip_url":17,"oss_zip_packed_at":17,"status":55,"created_at":56,"updated_at":57,"faqs":58,"releases":88},6662,"Tencent-Hunyuan\u002FHunyuanDiT","HunyuanDiT","Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding","HunyuanDiT 是腾讯开源的一款强大文生图模型，基于多分辨率扩散 Transformer 架构打造。它核心解决了传统 AI 绘画工具在理解复杂中文提示词时不够精准、生成高分辨率图像细节模糊的痛点，能够实现对中文语义的细粒度理解，从而生成更符合用户意图的高质量图像。\n\n这款工具特别适合研究人员探索扩散模型前沿技术，开发者进行二次开发或集成应用，以及设计师和创作者用于实际工作流中快速产出视觉素材。得益于其对中文语境的自然掌握，国内用户无需刻意转换思维即可轻松上手。\n\nHunyuanDiT 的技术亮点在于其独特的多分辨率训练策略，既保证了生成效率又提升了画面细节表现力。此外，它生态完善，不仅原生支持 LoRA 微调和 ControlNet 精确控制，还兼容 ComfyUI、Diffusers 等主流框架，甚至提供了针对低显存显卡的优化方案，让不同配置的用户都能流畅体验先进的 AI 创作能力。","\u003C!-- ## **HunyuanDiT** -->\r\n\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_b33a1a6f0888.png\"  height=100>\r\n\u003C\u002Fp>\r\n\r\n# Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding\r\n\r\n\u003Cdiv align=\"center\">\r\n  \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan-DiT Code&message=Github&color=blue&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n  \u003Ca 
href=\"https:\u002F\u002Fdit.hunyuan.tencent.com\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Project%20Page&message=Github&color=blue&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.08748\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Tech Report&message=Arxiv:HunYuan-DiT&color=red&logo=arxiv\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Paper&message=Arxiv:DialogGen&color=red&logo=arxiv\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan-DiT&message=HuggingFace&color=yellow\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Fyuanbao.tencent.com\u002Fchat\u002FnaQivTmsDa\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan Bot&message=Web&color=green\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\".\u002Fcomfyui\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=ComfyUI Support&message=ComfyUI&color=purple&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n\u003C\u002Fdiv>\r\n\r\n-----\r\n\r\nThis repo contains PyTorch model definitions, pre-trained weights and inference\u002Fsampling code for our paper exploring Hunyuan-DiT. 
You can find more visualizations on our [project page](https:\u002F\u002Fdit.hunyuan.tencent.com\u002F).\r\n\r\n> [**Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.08748) \u003Cbr>\r\n\r\n> [**DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857) \u003Cbr>\r\n\r\n## 🔥🔥🔥 News!!\r\n* Dec 17, 2024: :tada: Optimize Lora training with `refined grad checkpoint` and `low-bit optimizer`. Just use `--lowbit-opt` to get started.\r\n* Sep 13, 2024: 🎉 IPAdapter is officially supported by HunYuanDiT. Document for it: [.\u002Fipadapter](.\u002Fipadapter). And scaled attention is utilized to replace flash attention on V100 GPUs.\r\n* Aug 26, 2024, 🎉 HunYuanDIT Controlnet and LoRA are officially supported by ComfyUI. Document for it: [.\u002Fcomfyui](.\u002Fcomfyui)\r\n* Jul 15, 2024: 🚀 HunYuanDiT and Shakker.Ai have jointly launched a fine-tuning event based on the HunYuanDiT 1.2 model. By publishing a lora or fine-tuned model based on HunYuanDiT, you can earn up to $230 bonus from Shakker.Ai. See [Shakker.Ai](https:\u002F\u002Fwww.shakker.ai\u002Factivitys\u002Fshaker-the-world-hunyuan) for more details.\r\n* Jul 15, 2024: :tada: Update ComfyUI to support standardized workflows and compatibility with weights from t2i module and Lora training for versions 1.1\u002F1.2, as well as those trained by Kohya or the official script. \r\n* Jul 15, 2024: :zap: We offer Docker environments for CUDA 11\u002F12, allowing you to bypass complex installations and play with a single click! See [dockers](#installation-guide-for-linux) for details. \r\n* Jul 08, 2024: :tada: HYDiT-v1.2 version is released. 
Please check [HunyuanDiT-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2) and [Distillation-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation-v1.2) for more details.\r\n* Jul 03, 2024: :tada: Kohya-hydit version now available for v1.1 and v1.2 models, with GUI for inference. Official Kohya version is under review. See [kohya](.\u002Fkohya_ss-hydit) for details.\r\n* Jun 27, 2024: :art: Hunyuan-Captioner is released, providing fine-grained caption for training data. See [mllm](.\u002Fmllm) for details.\r\n* Jun 27, 2024: :tada: Support LoRa and ControlNet in diffusers. See [diffusers](.\u002Fdiffusers) for details.\r\n* Jun 27, 2024: :tada: 6GB GPU VRAM Inference scripts are released. See [lite](.\u002Flite) for details.\r\n* Jun 19, 2024: :tada: ControlNet is released, supporting canny, pose and depth control. See [training\u002Finference codes](#controlnet) for details.\r\n* Jun 13, 2024: :zap: HYDiT-v1.1 version is released, which mitigates the issue of image oversaturation and alleviates the watermark issue. Please check [HunyuanDiT-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.1) and \r\n[Distillation-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation-v1.1) for more details.\r\n* Jun 13, 2024: :truck: The training code is released, offering [full-parameter training](#full-parameter-training) and [LoRA training](#lora).\r\n* Jun 06, 2024: :tada: Hunyuan-DiT is now available in ComfyUI. Please check [ComfyUI](#using-comfyui) for more details.\r\n* Jun 06, 2024: 🚀 We introduce Distillation version for Hunyuan-DiT acceleration, which achieves **50%** acceleration on NVIDIA GPUs. Please check [Distillation](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation) for more details.\r\n* Jun 05, 2024: 🤗 Hunyuan-DiT is now available in 🤗 Diffusers! 
Please check the [example](#using--diffusers) below.\r\n* Jun 04, 2024: :globe_with_meridians: Support Tencent Cloud links to download the pretrained models! Please check the [links](#-download-pretrained-models) below.\r\n* May 22, 2024: 🚀 We introduce TensorRT version for Hunyuan-DiT acceleration, which achieves **47%** acceleration on NVIDIA GPUs. Please check [TensorRT-libs](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FTensorRT-libs) for instructions.\r\n* May 22, 2024: 💬 We support demo running multi-turn text2image generation now. Please check the [script](#using-gradio) below.\r\n\r\n## 🤖 Try it on the web\r\n\r\nWelcome to our web-based [**Tencent Hunyuan Bot**](https:\u002F\u002Fhunyuan.tencent.com\u002Fbot\u002Fchat), where you can explore our innovative products! Just input the suggested prompts below or any other **imaginative prompts containing drawing-related keywords** to activate the Hunyuan text-to-image generation feature.  Unleash your creativity and create any picture you desire, **all for free!**\r\n\r\nYou can use simple prompts similar to natural language text\r\n\r\n> 画一只穿着西装的猪\r\n>\r\n> draw a pig in a suit\r\n>\r\n> 生成一幅画，赛博朋克风，跑车\r\n> \r\n> generate a painting, cyberpunk style, sports car\r\n\r\nor multi-turn language interactions to create the picture. \r\n\r\n> 画一个木制的鸟\r\n>\r\n> draw a wooden bird\r\n>\r\n> 变成玻璃的\r\n>\r\n> turn into glass\r\n\r\n\r\n## 🤗 Community Contribution Leaderboard\r\n1. 
By [@TTPlanetPig](https:\u002F\u002Fgithub.com\u002FTTPlanetPig)\r\n   - HunyuanDIT_v1.2 ControlNet models\r\n     - Inpaint controlnet: https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_inpainting\r\n     - Tile controlnet: https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_tile\r\n     - Lineart controlnet: https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_lineart\r\n   - HunyuanDIT_v1.2 ComfyUI nodes\r\n     - Comfyui_TTP_CN_Preprocessor: https:\u002F\u002Fgithub.com\u002FTTPlanetPig\u002FComfyui_TTP_CN_Preprocessor\r\n     - Comfyui_TTP_Toolset: https:\u002F\u002Fgithub.com\u002FTTPlanetPig\u002FComfyui_TTP_Toolset\r\n\r\n2. By [@sdbds](https:\u002F\u002Fgithub.com\u002Fsdbds) (bilibili up [青龙圣者](https:\u002F\u002Fspace.bilibili.com\u002F219296))\r\n   - Kohya_ss-hydit train tools: https:\u002F\u002Fgithub.com\u002Fzml-ai\u002FHunyuanDIT-PRE\u002Ftree\u002Fmain\u002Fkohya_ss-hydit\r\n\r\n3. By [@CrazyBoyM](https:\u002F\u002Fgithub.com\u002FCrazyBoyM) (bilibili up [飞鸟白菜](https:\u002F\u002Fspace.bilibili.com\u002F291593914))\r\n   - ComfyUI support for HunyuanDIT_v1.2 Controlnet: https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI\u002Fpull\u002F4245\r\n    \r\n4. 
By [@L_A_X](https:\u002F\u002Fhuggingface.co\u002FLaxhar\u002FFreeway_Animation_HunYuan_Demo)\r\n   - HunyuanDIT_v1.2 base model for anime\r\n     - Original hf: https:\u002F\u002Fhuggingface.co\u002FLaxhar\u002FFreeway_Animation_HunYuan_Demo\r\n     - Converted ComfyUI model: https:\u002F\u002Fhuggingface.co\u002Fcomfyanonymous\u002FFreeway_Animation_Hunyuan_Demo_ComfyUI_Converted\r\n    \r\n## 📑 Open-source Plan\r\n\r\n- Hunyuan-DiT (Text-to-Image Model)\r\n  - [x] Inference \r\n  - [x] Checkpoints \r\n  - [x] Distillation Version\r\n  - [x] TensorRT Version\r\n  - [x] Training\r\n  - [x] Lora\r\n  - [x] Controlnet (Pose, Canny, Depth)\r\n  - [x] 6GB GPU VRAM Inference \r\n  - [x] IP-adapter\r\n  - [ ] Hunyuan-DiT-S checkpoints (0.7B model)\r\n- Mllm\r\n  - Hunyuan-Captioner (Re-caption the raw image-text pairs)\r\n    - [x] Inference\r\n  - [Hunyuan-DialogGen](https:\u002F\u002Fgithub.com\u002FCentaurusalpha\u002FDialogGen) (Prompt Enhancement Model)\r\n    - [x] Inference\r\n- [X] Web Demo (Gradio) \r\n- [x] Multi-turn T2I Demo (Gradio)\r\n- [X] Cli Demo \r\n- [X] ComfyUI\r\n- [X] Diffusers\r\n- [X] Kohya\r\n- [ ] WebUI\r\n\r\n\r\n## Contents\r\n- [Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](#hunyuan-dit--a-powerful-multi-resolution-diffusion-transformer-with-fine-grained-chinese-understanding)\r\n  - [🔥🔥🔥 News!!](#-news)\r\n  - [🤖 Try it on the web](#-try-it-on-the-web)\r\n  - [🤗 Community Contribution Leaderboard](#-community-contribution-leaderboard)\r\n  - [📑 Open-source Plan](#-open-source-plan)\r\n  - [Contents](#contents)\r\n  - [Abstract](#abstract)\r\n  - [🎉 Hunyuan-DiT Key Features](#-hunyuan-dit-key-features)\r\n    - [Chinese-English Bilingual DiT Architecture](#chinese-english-bilingual-dit-architecture)\r\n    - [Multi-turn Text2Image Generation](#multi-turn-text2image-generation)\r\n  - [📈 Comparisons](#-comparisons)\r\n  - [🎥 Visualization](#-visualization)\r\n  - [📜 
Requirements](#-requirements)\r\n  - [🛠️ Dependencies and Installation](#️-dependencies-and-installation)\r\n    - [Installation Guide for Linux](#installation-guide-for-linux)\r\n  - [🧱 Download Pretrained Models](#-download-pretrained-models)\r\n        - [1. Using HF-Mirror](#1-using-hf-mirror)\r\n        - [2. Resume Download](#2-resume-download)\r\n  - [:truck: Training](#truck-training)\r\n    - [Data Preparation](#data-preparation)\r\n    - [Full-parameter Training](#full-parameter-training)\r\n    - [LoRA](#lora)\r\n  - [🔑 Inference](#-inference)\r\n    - [6GB GPU VRAM Inference](#6gb-gpu-vram-inference)\r\n    - [Using Gradio](#using-gradio)\r\n    - [Using 🤗 Diffusers](#using--diffusers)\r\n    - [Using Command Line](#using-command-line)\r\n    - [More Configurations](#more-configurations)\r\n    - [Using ComfyUI](#using-comfyui)\r\n    - [Using Kohya](#using-kohya)\r\n    - [Using Previous versions](#using-previous-versions)\r\n  - [:building\\_construction: Adapter](#building_construction-adapter)\r\n    - [ControlNet](#controlnet)\r\n    - [IP-Adapter](#IP-Adapter)\r\n  - [:art: Hunyuan-Captioner](#art-hunyuan-captioner)\r\n    - [Examples](#examples)\r\n    - [Instructions](#instructions)\r\n    - [Inference](#inference)\r\n    - [Gradio](#gradio)\r\n  - [🚀 Acceleration (for Linux)](#-acceleration-for-linux)\r\n  - [🔗 BibTeX](#-bibtex)\r\n  - [Start History](#start-history)\r\n\r\n## **Abstract**\r\n\r\nWe present Hunyuan-DiT, a text-to-image diffusion transformer with fine-grained understanding of both English and Chinese. To construct Hunyuan-DiT, we carefully designed the transformer structure, text encoder, and positional encoding. We also build from scratch a whole data pipeline to update and evaluate data for iterative model optimization. For fine-grained language understanding, we train a Multimodal Large Language Model to refine the captions of the images. 
Finally, Hunyuan-DiT can perform multi-round multi-modal dialogue with users, generating and refining images according to the context.\r\nThrough our carefully designed holistic human evaluation protocol with more than 50 professional human evaluators, Hunyuan-DiT sets a new state-of-the-art in Chinese-to-image generation compared with other open-source models.\r\n\r\n\r\n## 🎉 **Hunyuan-DiT Key Features**\r\n### **Chinese-English Bilingual DiT Architecture**\r\nHunyuan-DiT is a diffusion model in the latent space, as depicted in figure below. Following the Latent Diffusion Model, we use a pre-trained Variational Autoencoder (VAE) to compress the images into low-dimensional latent spaces and train a diffusion model to learn the data distribution with diffusion models. Our diffusion model is parameterized with a transformer. To encode the text prompts, we leverage a combination of pre-trained bilingual (English and Chinese) CLIP and multilingual T5 encoder.\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0bfffe0d2c01.png\"  height=450>\r\n\u003C\u002Fp>\r\n\r\n### Multi-turn Text2Image Generation\r\nUnderstanding natural language instructions and performing multi-turn interaction with users are important for a\r\ntext-to-image system. It can help build a dynamic and iterative creation process that bring the user’s idea into reality\r\nstep by step. In this section, we will detail how we empower Hunyuan-DiT with the ability to perform multi-round\r\nconversations and image generation. 
We train MLLM to understand the multi-round user dialogue\r\nand output the new text prompt for image generation.\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_af6e03945c98.png\"  height=300>\r\n\u003C\u002Fp>\r\n\r\n## 📈 Comparisons\r\nIn order to comprehensively compare the generation capabilities of HunyuanDiT and other models, we constructed a 4-dimensional test set, including Text-Image Consistency, Excluding AI Artifacts, Subject Clarity, Aesthetic. More than 50 professional evaluators performs the evaluation.\r\n\r\n\u003Cp align=\"center\">\r\n\u003Ctable> \r\n\u003Cthead> \r\n\u003Ctr> \r\n    \u003Cth rowspan=\"2\">Model\u003C\u002Fth> \u003Cth rowspan=\"2\">Open Source\u003C\u002Fth> \u003Cth>Text-Image Consistency (%)\u003C\u002Fth> \u003Cth>Excluding AI Artifacts (%)\u003C\u002Fth> \u003Cth>Subject Clarity (%)\u003C\u002Fth> \u003Cth rowspan=\"2\">Aesthetics (%)\u003C\u002Fth> \u003Cth rowspan=\"2\">Overall (%)\u003C\u002Fth> \r\n\u003C\u002Ftr> \r\n\u003C\u002Fthead> \r\n\u003Ctbody> \r\n\u003Ctr> \r\n    \u003Ctd>SDXL\u003C\u002Ftd> \u003Ctd> ✔ \u003C\u002Ftd> \u003Ctd>64.3\u003C\u002Ftd> \u003Ctd>60.6\u003C\u002Ftd> \u003Ctd>91.1\u003C\u002Ftd> \u003Ctd>76.3\u003C\u002Ftd> \u003Ctd>42.7\u003C\u002Ftd> \r\n\u003C\u002Ftr> \r\n\u003Ctr> \r\n    \u003Ctd>PixArt-α\u003C\u002Ftd> \u003Ctd> ✔\u003C\u002Ftd> \u003Ctd>68.3\u003C\u002Ftd> \u003Ctd>60.9\u003C\u002Ftd> \u003Ctd>93.2\u003C\u002Ftd> \u003Ctd>77.5\u003C\u002Ftd> \u003Ctd>45.5\u003C\u002Ftd> \r\n\u003C\u002Ftr> \r\n\u003Ctr> \r\n    \u003Ctd>Playground 2.5\u003C\u002Ftd> \u003Ctd>✔\u003C\u002Ftd> \u003Ctd>71.9\u003C\u002Ftd> \u003Ctd>70.8\u003C\u002Ftd> \u003Ctd>94.9\u003C\u002Ftd> \u003Ctd>83.3\u003C\u002Ftd> \u003Ctd>54.3\u003C\u002Ftd> \r\n\u003C\u002Ftr> \r\n\r\n\u003Ctr> \r\n    \u003Ctd>SD 3\u003C\u002Ftd> \u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>77.1\u003C\u002Ftd> \u003Ctd>69.3\u003C\u002Ftd> 
\u003Ctd>94.6\u003C\u002Ftd> \u003Ctd>82.5\u003C\u002Ftd> \u003Ctd>56.7\u003C\u002Ftd> \r\n    \r\n\u003C\u002Ftr> \r\n\u003Ctr> \r\n    \u003Ctd>MidJourney v6\u003C\u002Ftd>\u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>73.5\u003C\u002Ftd> \u003Ctd>80.2\u003C\u002Ftd> \u003Ctd>93.5\u003C\u002Ftd> \u003Ctd>87.2\u003C\u002Ftd> \u003Ctd>63.3\u003C\u002Ftd> \r\n\u003C\u002Ftr> \r\n\u003Ctr> \r\n    \u003Ctd>DALL-E 3\u003C\u002Ftd>\u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>83.9\u003C\u002Ftd> \u003Ctd>80.3\u003C\u002Ftd> \u003Ctd>96.5\u003C\u002Ftd> \u003Ctd>89.4\u003C\u002Ftd> \u003Ctd>71.0\u003C\u002Ftd> \r\n\u003C\u002Ftr> \r\n\u003Ctr style=\"font-weight: bold; background-color: #f2f2f2;\"> \r\n    \u003Ctd>Hunyuan-DiT\u003C\u002Ftd>\u003Ctd>✔\u003C\u002Ftd> \u003Ctd>74.2\u003C\u002Ftd> \u003Ctd>74.3\u003C\u002Ftd> \u003Ctd>95.4\u003C\u002Ftd> \u003Ctd>86.6\u003C\u002Ftd> \u003Ctd>59.0\u003C\u002Ftd> \r\n\u003C\u002Ftr>\r\n\u003C\u002Ftbody>\r\n\u003C\u002Ftable>\r\n\u003C\u002Fp>\r\n\r\n## 🎥 Visualization\r\n\r\n* **Chinese Elements**\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_e2e83cd27bba.png\"  height=220>\r\n\u003C\u002Fp>\r\n\r\n* **Long Text Input**\r\n\r\n\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_edb58f563a56.png\"  height=310>\r\n\u003C\u002Fp>\r\n\r\n* **Multi-turn Text2Image Generation**\r\n\r\nhttps:\u002F\u002Fgithub.com\u002FTencent\u002Ftencent.github.io\u002Fassets\u002F27557933\u002F94b4dcc3-104d-44e1-8bb2-dc55108763d1\r\n\r\n\r\n\r\n---\r\n\r\n## 📜 Requirements\r\n\r\nThis repo consists of DialogGen (a prompt enhancement model) and Hunyuan-DiT (a text-to-image model).\r\n\r\nThe following table shows the requirements for running the models (batch size = 1):\r\n\r\n|          Model          | --load-4bit (DialogGen) | GPU Peak Memory |       GPU       
|\r\n|:-----------------------:|:-----------------------:|:---------------:|:---------------:|\r\n| DialogGen + Hunyuan-DiT |            ✘            |       32G       |      A100       |\r\n| DialogGen + Hunyuan-DiT |            ✔            |       22G       |      A100       |\r\n|       Hunyuan-DiT       |            -            |       11G       |      A100       |\r\n|       Hunyuan-DiT       |            -            |       14G       | RTX3090\u002FRTX4090 |\r\n\r\n* An NVIDIA GPU with CUDA support is required. \r\n  * We have tested V100 and A100 GPUs.\r\n  * **Minimum**: The minimum GPU memory required is 11GB.\r\n  * **Recommended**: We recommend using a GPU with 32GB of memory for better generation quality.\r\n* Tested operating system: Linux\r\n\r\n## 🛠️ Dependencies and Installation\r\n\r\nBegin by cloning the repository:\r\n```shell\r\ngit clone https:\u002F\u002Fgithub.com\u002Ftencent\u002FHunyuanDiT\r\ncd HunyuanDiT\r\n```\r\n\r\n### Installation Guide for Linux\r\n\r\nWe provide an `environment.yml` file for setting up a Conda environment.\r\nConda's installation instructions are available [here](https:\u002F\u002Fdocs.anaconda.com\u002Ffree\u002Fminiconda\u002Findex.html).\r\n\r\nWe recommend CUDA versions 11.7 and 12.0+.\r\n\r\n```shell\r\n# 1. Prepare conda environment\r\nconda env create -f environment.yml\r\n\r\n# 2. Activate the environment\r\nconda activate HunyuanDiT\r\n\r\n# 3. Install pip dependencies\r\npython -m pip install -r requirements.txt\r\n\r\n# 4. Install flash attention v2 for acceleration (requires CUDA 11.6 or above)\r\npython -m pip install git+https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention.git@v2.1.2.post3\r\n```\r\n\r\nAdditionally, you can also use docker to set up the environment.\r\n```shell\r\n# 1. 
Use the following link to download the docker image tar file.\r\n# For CUDA 12\r\nwget https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fhunyuan_dit_cu12.tar\r\n# For CUDA 11\r\nwget https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fhunyuan_dit_cu11.tar\r\n\r\n# 2. Import the docker tar file and show the image meta information\r\n# For CUDA 12\r\ndocker load -i hunyuan_dit_cu12.tar\r\n# For CUDA 11\r\ndocker load -i hunyuan_dit_cu11.tar  \r\n\r\ndocker image ls\r\n\r\n# 3. Run the container based on the image\r\ndocker run -dit --gpus all --init --net=host --uts=host --ipc=host --name hunyuandit --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --privileged  docker_image_tag\r\n```\r\n\r\n## 🧱 Download Pretrained Models\r\nTo download the model, first install the huggingface-cli. (Detailed instructions are available [here](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fguides\u002Fcli).)\r\n\r\n```shell\r\npython -m pip install \"huggingface_hub[cli]\"\r\n```\r\n\r\nThen download the model using the following commands:\r\n\r\n```shell\r\n# Create a directory named 'ckpts' where the model will be saved, fulfilling the prerequisites for running the demo.\r\nmkdir ckpts\r\n# Use the huggingface-cli tool to download the model.\r\n# The download time may vary from 10 minutes to 1 hour depending on network conditions.\r\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.2 --local-dir .\u002Fckpts\r\n```\r\n\r\n\u003Cdetails>\r\n\u003Csummary>💡Tips for using huggingface-cli (network problem)\u003C\u002Fsummary>\r\n\r\n##### 1. Using HF-Mirror\r\n\r\nIf you encounter slow download speeds in China, you can try a mirror to speed up the download process. For example,\r\n\r\n```shell\r\nHF_ENDPOINT=https:\u002F\u002Fhf-mirror.com huggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.2 --local-dir .\u002Fckpts\r\n```\r\n\r\n##### 2. 
Resume Download\r\n\r\n`huggingface-cli` supports resuming downloads. If the download is interrupted, you can just rerun the download \r\ncommand to resume the download process.\r\n\r\nNote: If an `No such file or directory: 'ckpts\u002F.huggingface\u002F.gitignore.lock'` like error occurs during the download \r\nprocess, you can ignore the error and rerun the download command.\r\n\r\n\u003C\u002Fdetails>\r\n\r\n---\r\n\r\nAll models will be automatically downloaded. For more information about the model, visit the Hugging Face repository [here](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT).\r\n\r\n|       Model       | #Params |                                        Huggingface Download URL                                        |                                   Tencent Cloud Download URL                                   |\r\n|:-----------------:|:-------:|:------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:|\r\n|        mT5        |  1.6B   |               [mT5](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fmt5)               |               [mT5](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmt5.zip)               |\r\n|       CLIP        |  350M   |       [CLIP](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fclip_text_encoder)        |       [CLIP](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fclip_text_encoder.zip)        |\r\n|     Tokenizer     |  -      |         [Tokenizer](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Ftokenizer)         |         [Tokenizer](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Ftokenizer.zip)    
     |\r\n|     DialogGen     |  7.0B   |           [DialogGen](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Fdialoggen)           |         [DialogGen](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdialoggen.zip)         |\r\n| sdxl-vae-fp16-fix |   83M   | [sdxl-vae-fp16-fix](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fsdxl-vae-fp16-fix) | [sdxl-vae-fp16-fix](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fsdxl-vae-fp16-fix.zip) |\r\n| Hunyuan-DiT-v1.0  |  1.5B   |          [Hunyuan-DiT](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)          |       [Hunyuan-DiT-v1.0](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel.zip)        |\r\n| Hunyuan-DiT-v1.1  |  1.5B   |     [Hunyuan-DiT-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.1\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)     |     [Hunyuan-DiT-v1.1](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel-v1_1.zip)     |\r\n| Hunyuan-DiT-v1.2  |  1.5B   |     [Hunyuan-DiT-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)     |     [Hunyuan-DiT-v1.2](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel-v1_2.zip)     |\r\n|     Data demo     |  -      |                                                   -                                                    |         [Data demo](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip)         |\r\n\r\n## :truck: Training\r\n\r\n### Data Preparation\r\n\r\n  Refer to the commands below to prepare the training data. \r\n  \r\n  1. 
Install dependencies\r\n  \r\n      We offer an efficient data management library, named IndexKits, supporting the management of reading hundreds of millions of data during training, see more in [docs](.\u002FIndexKits\u002FREADME.md).\r\n      ```shell\r\n      # 1 Install dependencies\r\n      cd HunyuanDiT\r\n      pip install -e .\u002FIndexKits\r\n     ```\r\n  2. Data download \r\n  \r\n     Feel free to download the [data demo](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip).\r\n     ```shell\r\n     # 2 Data download\r\n     wget -O .\u002Fdataset\u002Fdata_demo.zip https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip\r\n     unzip .\u002Fdataset\u002Fdata_demo.zip -d .\u002Fdataset\r\n     mkdir .\u002Fdataset\u002Fporcelain\u002Farrows .\u002Fdataset\u002Fporcelain\u002Fjsons\r\n     ```\r\n  3. Data conversion \r\n  \r\n     Create a CSV file for training data with the fields listed in the table below.\r\n    \r\n     |    Fields       | Required  |  Description     |   Example   |\r\n     |:---------------:| :------:  |:----------------:|:-----------:|\r\n     |   `image_path`  | Required  |  image path               |     `.\u002Fdataset\u002Fporcelain\u002Fimages\u002F0.png`        | \r\n     |   `text_zh`     | Required  |    text               |  青花瓷风格，一只蓝色的鸟儿站在蓝色的花瓶上，周围点缀着白色花朵，背景是白色 | \r\n     |   `md5`         | Optional  |    image md5 (Message Digest Algorithm 5)  |    `d41d8cd98f00b204e9800998ecf8427e`         | \r\n     |   `width`       | Optional  |    image width    |     `1024 `       | \r\n     |   `height`      | Optional  |    image height   |    ` 1024 `       | \r\n     \r\n     > ⚠️ Optional fields like MD5, width, and height can be omitted. If omitted, the script below will automatically calculate them. 
This process can be time-consuming when dealing with large-scale training data.\r\n  \r\n     We utilize [Arrow](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow) for training data format, offering a standard and efficient in-memory data representation. A conversion script is provided to transform CSV files into Arrow format.\r\n     ```shell  \r\n     # 3 Data conversion \r\n     python .\u002Fhydit\u002Fdata_loader\u002Fcsv2arrow.py .\u002Fdataset\u002Fporcelain\u002Fcsvfile\u002Fimage_text.csv .\u002Fdataset\u002Fporcelain\u002Farrows 1\r\n     ```\r\n  \r\n  4. Data Selection and Configuration File Creation \r\n     \r\n      We configure the training data through YAML files. In these files, you can set up standard data processing strategies for filtering, copying, deduplicating, and more regarding the training data. For more details, see [.\u002FIndexKits](IndexKits\u002Fdocs\u002FMakeDataset.md).\r\n  \r\n      For a sample file, please refer to [file](.\u002Fdataset\u002Fyamls\u002Fporcelain.yaml). For a full parameter configuration file, see [file](.\u002FIndexKits\u002Fdocs\u002FMakeDataset.md).\r\n  \r\n     \r\n  5. 
Create training data index file using YAML file.\r\n    \r\n     ```shell\r\n      # Single Resolution Data Preparation\r\n      idk base -c dataset\u002Fyamls\u002Fporcelain.yaml -t dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\r\n   \r\n      # Multi Resolution Data Preparation     \r\n      idk multireso -c dataset\u002Fyamls\u002Fporcelain_mt.yaml -t dataset\u002Fporcelain\u002Fjsons\u002Fporcelain_mt.json\r\n      ```\r\n   \r\n  The directory structure for `porcelain` dataset is:\r\n\r\n  ```shell\r\n   cd .\u002Fdataset\r\n  \r\n   porcelain\r\n      ├──images\u002F  (image files)\r\n      │  ├──0.png\r\n      │  ├──1.png\r\n      │  ├──......\r\n      ├──csvfile\u002F  (csv files containing text-image pairs)\r\n      │  ├──image_text.csv\r\n      ├──arrows\u002F  (arrow files containing all necessary training data)\r\n      │  ├──00000.arrow\r\n      │  ├──00001.arrow\r\n      │  ├──......\r\n      ├──jsons\u002F  (final training data index files which read data from arrow files during training)\r\n      │  ├──porcelain.json\r\n      │  ├──porcelain_mt.json\r\n   ```\r\n\r\n### Full-parameter Training\r\n  \r\n  **Requirement:** \r\n  1. The minimum requriment is a single GPU with at least 20GB memory, but we recommend to use a GPU with about 30 GB memory to avoid host memory offloading. \r\n  2. Additionally, we encourage users to leverage the multiple GPUs across different nodes to speed up training on large datasets. \r\n  \r\n  **Notice:**\r\n  1. Personal users can also use the light-weight Kohya to finetune the model with about 16 GB memory. Currently, we are trying to further reduce the memory usage of our industry-level framework for personal users. \r\n  2. 
If you have enough GPU memory, please try to remove  `--cpu-offloading` or `--gradient-checkpointing` for less time costs.\r\n\r\n  Specifically for distributed training, you have the flexibility to control **single-node** \u002F **multi-node** training by adjusting parameters such as `--hostfile` and `--master_addr`. For more details, see [link](https:\u002F\u002Fwww.deepspeed.ai\u002Fgetting-started\u002F#resource-configuration-multi-node).\r\n\r\n  ```shell\r\n  # Single Resolution Training\r\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\r\n  \r\n  # Multi Resolution Training\r\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain_mt.json --multireso --reso-step 64\r\n  \r\n  # Training with old version of HunyuanDiT (\u003C= v1.1)\r\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain_v1.1.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\r\n  ```\r\n\r\n  After checkpoints are saved, you can use the following command to evaluate the model.\r\n  ```shell\r\n  # Inference\r\n    #   You should replace the 'log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt' with your actual path.\r\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只可爱的哈士奇\" --no-enhance --dit-weight log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt --load-key module\r\n  \r\n  # Old version of HunyuanDiT (\u003C= v1.1)\r\n  #   You should replace the 'log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt' with your actual path.\r\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只可爱的哈士奇\" --model-root .\u002FHunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03 --no-enhance --dit-weight log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt --load-key module\r\n  ```\r\n\r\n### LoRA\r\n\r\n\r\n\r\nWe provide training and inference scripts for LoRA, detailed in the [.\u002Flora](.\u002Flora\u002FREADME.md). 
\r\n\r\n  ```shell\r\n  # Training for porcelain LoRA.\r\n  PYTHONPATH=.\u002F sh lora\u002Ftrain_lora_with_fa.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\r\n  \r\n  # Inference using trained LORA weights.\r\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只小狗\"  --no-enhance --lora-ckpt log_EXP\u002F001-lora_porcelain_ema_rank64\u002Fcheckpoints\u002F0001000.pt\r\n  ```\r\n If you can't install flash_attn, use code:\r\n  ```shell\r\n  # Training for porcelain LoRA.\r\n  PYTHONPATH=.\u002F sh lora\u002Ftrain_lora.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\r\n  \r\n  # Inference using trained LORA weights.\r\n  python sample_t2i.py --infer-mode torch --prompt \"青花瓷风格，一只小狗\"  --no-enhance --lora-ckpt log_EXP\u002F001-lora_porcelain_ema_rank64\u002Fcheckpoints\u002F0001000.pt\r\n  ```\r\n We offer two types of trained LoRA weights for `porcelain` and `jade`, see details at [links](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-LoRA)\r\n  ```shell\r\n  cd HunyuanDiT\r\n  # Use the huggingface-cli tool to download the model.\r\n  huggingface-cli download Tencent-Hunyuan\u002FHYDiT-LoRA --local-dir .\u002Fckpts\u002Ft2i\u002Flora\r\n  \r\n  # Quick start\r\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只猫在追蝴蝶\"  --no-enhance --load-key ema --lora-ckpt .\u002Fckpts\u002Ft2i\u002Flora\u002Fporcelain\r\n  ```\r\n \u003Ctable>\r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"4\" align=\"center\">Examples of training data\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_92a234afe07f.png\" alt=\"Image 0\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ecaf0c0a9113.png\" alt=\"Image 1\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    
\u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d0e812758c7d.png\" alt=\"Image 2\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_eed1c5f301b3.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">青花瓷风格，一只蓝色的鸟儿站在蓝色的花瓶上，周围点缀着白色花朵，背景是白色 （Porcelain style, a blue bird stands on a blue vase, surrounded by white flowers, with a white background.\r\n）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，这是一幅蓝白相间的陶瓷盘子，上面描绘着一只狐狸和它的幼崽在森林中漫步，背景是白色 （Porcelain style, this is a blue and white ceramic plate depicting a fox and its cubs strolling in the forest, with a white background.）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，在黑色背景上，一只蓝色的狼站在蓝白相间的盘子上，周围是树木和月亮 （Porcelain style, on a black background, a blue wolf stands on a blue and white plate, surrounded by trees and the moon.）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，在蓝色背景上，一只蓝色蝴蝶和白色花朵被放置在中央 （Porcelain style, on a blue background, a blue butterfly and white flowers are placed in the center.）\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"4\" align=\"center\">Examples of inference results\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ffb073621f69.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_7f5a35ac2d56.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_02ece4cb2b95.png\" alt=\"Image 6\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_29334ca04fa9.png\" alt=\"Image 7\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">青花瓷风格，苏州园林 （Porcelain style,  Suzhou Gardens.）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，一朵荷花 （Porcelain style,  a lotus flower.）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，一只羊（Porcelain style, a sheep.）\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">青花瓷风格，一个女孩在雨中跳舞（Porcelain style, a girl dancing in the rain.）\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \r\n\u003C\u002Ftable>\r\n\r\n\r\n## 🔑 Inference\r\n\r\n### 6GB GPU VRAM Inference\r\nRunning HunyuanDiT in under 6GB GPU VRAM is available now based on [diffusers](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fdiffusers\u002Fmain\u002Fen\u002Fapi\u002Fpipelines\u002Fhunyuandit). 
Here we provide instructions and demo for your quick start.\r\n\r\n> The 6GB version supports Nvidia Ampere architecture series graphics cards such as RTX 3070\u002F3080\u002F4080\u002F4090, A100, and so on.\r\n\r\nThe only thing you need do is to install the following library:\r\n\r\n```bash\r\npip install -U bitsandbytes\r\npip install git+https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fdiffusers\r\npip install \"torch>=2.7.1\"\r\n```\r\n\r\nThen you can enjoy your HunyuanDiT text-to-image journey under 6GB GPU VRAM directly!\r\n\r\nHere is a demo for you.\r\n\r\n```bash\r\ncd HunyuanDiT\r\n\r\n# Quick start\r\nmodel_id=Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled\r\nprompt=一个宇航员在骑马\r\ninfer_steps=50\r\nguidance_scale=6\r\npython3 lite\u002Finference.py ${model_id} ${prompt} ${infer_steps} ${guidance_scale}\r\n```\r\n\r\nMore details can be found in [.\u002Flite](lite\u002FREADME.md).\r\n\r\n\r\n### Using Gradio\r\n\r\nMake sure the conda environment is activated before running the following command.\r\n\r\n```shell\r\n# By default, we start a Chinese UI. Using Flash Attention for acceleration.\r\npython app\u002Fhydit_app.py --infer-mode fa\r\n\r\n# Using special port and host\r\npython app\u002Fhydit_app.py --infer-mode fa --server_name 0.0.0.0 --server_port 443 --load-key distill\r\n\r\n# You can disable the enhancement model if the GPU memory is insufficient.\r\n# The enhancement will be unavailable until you restart the app without the `--no-enhance` flag. \r\npython app\u002Fhydit_app.py --no-enhance --infer-mode fa\r\n\r\n# Start with English UI\r\npython app\u002Fhydit_app.py --lang en --infer-mode fa\r\n\r\n# Start a multi-turn T2I generation UI. \r\n# If your GPU memory is less than 32GB, use '--load-4bit' to enable 4-bit quantization, which requires at least 22GB of memory.\r\npython app\u002FmultiTurnT2I_app.py --infer-mode fa\r\n```\r\nThen the demo can be accessed through http:\u002F\u002F0.0.0.0:443. 
It should be noted that the 0.0.0.0 here needs to be X.X.X.X with your server IP.\r\n\r\n### Using 🤗 Diffusers\r\n\r\nPlease install PyTorch version 2.0 or higher in advance to satisfy the requirements of the specified version of the diffusers library.  \r\n\r\nInstall 🤗 diffusers, ensuring that the version is at least 0.28.1:\r\n\r\n```shell\r\npip install git+https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fdiffusers.git\r\n```\r\nor\r\n```shell\r\npip install diffusers\r\n```\r\n\r\nYou can generate images with both Chinese and English prompts using the following Python script:\r\n```py\r\nimport torch\r\nfrom diffusers import HunyuanDiTPipeline\r\n\r\npipe = HunyuanDiTPipeline.from_pretrained(\"Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers\", torch_dtype=torch.float16)\r\npipe.to(\"cuda\")\r\n\r\n# You may also use English prompt as HunyuanDiT supports both English and Chinese\r\n# prompt = \"An astronaut riding a horse\"\r\nprompt = \"一个宇航员在骑马\"\r\nimage = pipe(prompt).images[0]\r\n```\r\nYou can use our distilled model to generate images even faster:\r\n\r\n```py\r\nimport torch\r\nfrom diffusers import HunyuanDiTPipeline\r\n\r\npipe = HunyuanDiTPipeline.from_pretrained(\"Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled\", torch_dtype=torch.float16)\r\npipe.to(\"cuda\")\r\n\r\n# You may also use English prompt as HunyuanDiT supports both English and Chinese\r\n# prompt = \"An astronaut riding a horse\"\r\nprompt = \"一个宇航员在骑马\"\r\nimage = pipe(prompt, num_inference_steps=25).images[0]\r\n```\r\nMore details can be found in [HunyuanDiT-v1.2-Diffusers-Distilled](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled)\r\n\r\n**More functions:** For other functions like LoRA and ControlNet, please have a look at the README of [.\u002Fdiffusers](diffusers).\r\n\r\n### Using Command Line\r\n\r\nWe provide several commands to quick start: \r\n\r\n```shell\r\n# Only Text-to-Image. 
Flash Attention mode\r\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --no-enhance\r\n\r\n# Generate an image with other image sizes.\r\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --image-size 1280 768\r\n\r\n# Prompt Enhancement + Text-to-Image. DialogGen loads with 4-bit quantization, but it may lose performance.\r\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\"  --load-4bit\r\n\r\n```\r\n\r\nMore example prompts can be found in [example_prompts.txt](example_prompts.txt)\r\n\r\n### More Configurations\r\n\r\nWe list some more useful configurations for easy usage:\r\n\r\n|    Argument     |  Default  |                     Description                     |\r\n|:---------------:|:---------:|:---------------------------------------------------:|\r\n|   `--prompt`    |   None    |        The text prompt for image generation         |\r\n| `--image-size`  | 1024 1024 |           The size of the generated image           |\r\n|    `--seed`     |    42     |        The random seed for generating images        |\r\n| `--infer-steps` |    100    |          The number of steps for sampling           |\r\n|  `--negative`   |     -     |      The negative prompt for image generation       |\r\n| `--infer-mode`  |   torch   |       The inference mode (torch, fa, or trt)        |\r\n|   `--sampler`   |   ddpm    |    The diffusion sampler (ddpm, ddim, or dpmms)     |\r\n| `--no-enhance`  |   False   |        Disable the prompt enhancement model         |\r\n| `--model-root`  |   ckpts   |     The root directory of the model checkpoints     |\r\n|  `--load-key`   |    ema    | Load the student model or EMA model (ema or module) |\r\n|  `--load-4bit`  |   False   |     Load DialogGen model with 4bit quantization     |\r\n\r\n### Using ComfyUI\r\n\r\n- Support two workflows: Standard ComfyUI and Diffusers Wrapper, with the former being recommended.\r\n- Support HunyuanDiT-v1.1 and v1.2.\r\n- Support module, lora and clip lora models trained by 
Kohya.\r\n- Support module, lora models trained by HunyuanDiT official training scripts.\r\n- ControlNet support.\r\n\r\nMore details can be found in [.\u002Fcomfyui](comfyui\u002FREADME.md)\r\n\r\n### Using Kohya\r\n\r\nWe support custom codes for kohya_ss GUI, and sd-scripts training codes for HunyuanDiT.\r\n![dreambooth](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_a9d3c7b9d192.png)\r\nMore details can be found in [.\u002Fkohya_ss-hydit](kohya_ss-hydit\u002FREADME.md)\r\n\r\n### Using Previous versions\r\n\r\n* **Hunyuan-DiT \u003C= v1.1**\r\n\r\n```shell\r\n# ============================== v1.1 ==============================\r\n# Download the model\r\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.1 --local-dir .\u002FHunyuanDiT-v1.1\r\n# Inference with the model\r\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --model-root .\u002FHunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03\r\n\r\n# ============================== v1.0 ==============================\r\n# Download the model\r\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT --local-dir .\u002FHunyuanDiT-v1.0\r\n# Inference with the model\r\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --model-root .\u002FHunyuanDiT-v1.0 --use-style-cond --size-cond 1024 1024 --beta-end 0.03\r\n```\r\n\r\n## :building_construction: Adapter\r\n\r\n### ControlNet\r\n\r\nWe provide training scripts for ControlNet, detailed in the [.\u002Fcontrolnet](.\u002Fcontrolnet\u002FREADME.md). 
\r\n\r\n  ```shell\r\n  # Training for canny ControlNet.\r\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain_controlnet.sh\r\n  ```\r\n We offer three types of trained ControlNet weights for `canny` `depth` and `pose`, see details at [links](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-ControlNet)\r\n  ```shell\r\n  cd HunyuanDiT\r\n  # Use the huggingface-cli tool to download the model.\r\n  # We recommend using distilled weights as the base model for ControlNet inference, as our provided pretrained weights are trained on them.\r\n  huggingface-cli download Tencent-Hunyuan\u002FHYDiT-ControlNet-v1.2 --local-dir .\u002Fckpts\u002Ft2i\u002Fcontrolnet\r\n  huggingface-cli download Tencent-Hunyuan\u002FDistillation-v1.2 .\u002Fpytorch_model_distill.pt --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\r\n  \r\n  # Quick start\r\n  python3 sample_controlnet.py --infer-mode fa --no-enhance --load-key distill --infer-steps 50 --control-type canny --prompt \"在夜晚的酒店门前，一座古老的中国风格的狮子雕像矗立着，它的眼睛闪烁着光芒，仿佛在守护着这座建筑。背景是夜晚的酒店前，构图方式是特写，平视，居中构图。这张照片呈现了真实摄影风格，蕴含了中国雕塑文化，同时展现了神秘氛围\" --condition-image-path https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ebbf8f65c4e8.jpg --control-weight 1.0\r\n  \r\n  ```\r\n \r\n \u003Ctable>\r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"3\" align=\"center\">Condition Input\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \r\n   \u003Ctr>\r\n    \u003Ctd align=\"center\">Canny ControlNet \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">Depth ControlNet \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">Pose ControlNet \u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">在夜晚的酒店门前，一座古老的中国风格的狮子雕像矗立着，它的眼睛闪烁着光芒，仿佛在守护着这座建筑。背景是夜晚的酒店前，构图方式是特写，平视，居中构图。这张照片呈现了真实摄影风格，蕴含了中国雕塑文化，同时展现了神秘氛围\u003Cbr>（At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. 
The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">在茂密的森林中，一只黑白相间的熊猫静静地坐在绿树红花中，周围是山川和海洋。背景是白天的森林，光线充足。照片采用特写、平视和居中构图的方式，呈现出写实的效果\u003Cbr>（In the dense forest, a black and white panda sits quietly among the green trees and red flowers, surrounded by mountains and oceans. The background is a daytime forest with ample light. The photo uses a close-up, eye-level, and centered composition to create a realistic effect.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">在白天的森林中，一位穿着绿色上衣的亚洲女性站在大象旁边。照片采用了中景、平视和居中构图的方式，呈现出写实的效果。这张照片蕴含了人物摄影文化，并展现了宁静的氛围\u003Cbr>（In the daytime forest, an Asian woman wearing a green shirt stands beside an elephant. The photo uses a medium shot, eye-level, and centered composition to create a realistic effect. This picture embodies the character photography culture and conveys a serene atmosphere.） \u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ebbf8f65c4e8.jpg\" alt=\"Image 0\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_cd9e5d78d9f4.jpg\" alt=\"Image 1\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_2dd46be733e2.jpg\" alt=\"Image 2\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \r\n  \u003C\u002Ftr>\r\n  \r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"3\" align=\"center\">ControlNet Output\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d8b27e57a49a.jpg\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_5e84c71ed676.jpg\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_76a1004a5d0a.jpg\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n \r\n\u003C\u002Ftable>\r\n\r\n### IP-Adapter\r\n  We provide training scripts for IP-Adapter, detailed in the [.\u002Fipadapter](.\u002Fipadapter\u002FREADME.md). \r\n  ```shell\r\n  # Training for IP-Adapter.\r\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain_ipadapter.sh\r\n  ```\r\n   We offer  trained IP-Adapter weights, see details at [links](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-IP-Adapter)\r\n  ```shell\r\n  cd HunyuanDiT\r\n  # Use the huggingface-cli tool to download the model.\r\n  # We recommend using module weights as the base model for IP-Adapter inference, as our provided pretrained weights are trained on them.\r\n  huggingface-cli download Tencent-Hunyuan\u002FIP-Adapter ipa.pt --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\r\n  huggingface-cli download Tencent-Hunyuan\u002FIP-Adapter clip_img_encoder.pt  --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\u002Fclip_img_encoder\r\n  \r\n  # Quick start\r\n  python3 sample_ipadapter.py  --infer-mode fa --ref-image-path https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8af468f74f94.png --i-scale 1.0 --prompt 一只老虎在海洋中游泳，背景是海洋。构图方式是居中构图，呈现了动漫风格和文化，营造了平静的氛围。 --infer-steps 100 --is-ipa True --load-key distill\r\n  ```\r\n\r\nExamples of ref input and IP-Adapter results are as follows:\r\n\u003Ctable>\r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"3\" 
align=\"center\">Ref Input\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \r\n\r\n\r\n  \r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8af468f74f94.png\" alt=\"Image 0\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_b0e648a02174.png\" alt=\"Image 1\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_c6b7fc3f322d.png\" alt=\"Image 2\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \r\n  \u003C\u002Ftr>\r\n  \r\n  \u003Ctr>\r\n    \u003Ctd colspan=\"3\" align=\"center\">IP-Adapter Output\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">一只老虎在奔跑。\u003Cbr>（A tiger running.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">一个卡通美女，抱着一只小猪。\u003Cbr>（A cartoon beauty holding a little pig.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">一片紫色薰衣草地。\u003Cbr>（A purple lavender field.） \u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_97841782c229.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0b6805eddd6d.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_dc835c21880c.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">一只老虎在看书。\u003Cbr>（A tiger is reading a book.） \u003C\u002Ftd>\r\n    
\u003Ctd align=\"center\">一个卡通美女，穿着绿色衣服。\u003Cbr>（A cartoon beauty wearing green clothes.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">一片紫色薰衣草地，有一只可爱的小狗。\u003Cbr>（A purple lavender field with a cute puppy.） \u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8492f559a6c0.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_4b89c5cbc055.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0c71929ea3f9.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">一只老虎在咆哮。\u003Cbr>（A tiger is roaring.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">一个卡通美女，戴着墨镜。\u003Cbr>（A cartoon beauty wearing sunglasses.） \u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">水墨风格,一片紫色薰衣草地。\u003Cbr>（Ink style. 
A purple lavender field.） \u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n  \u003Ctr>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_6d8a7bd12771.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d3a9e0270c23.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_a55eaf3e0033.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\r\n  \u003C\u002Ftr>\r\n \r\n  \r\n\u003C\u002Ftable>\r\n\r\n  \r\n## :art: Hunyuan-Captioner\r\nHunyuan-Captioner meets the need of text-to-image techniques by maintaining a high degree of image-text consistency. It can generate high-quality image descriptions from a variety of angles, including object description, objects relationships, background information, image style, etc. Our code is based on [LLaVA](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA) implementation.\r\n\r\n### Examples\r\n\r\n\u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0fa959ae7e39.jpg\" alt=\"Image 3\" width=\"1200\"\u002F>\u003C\u002Ftd>\r\n\r\n### Instructions\r\na. Install dependencies\r\n     \r\nThe dependencies and installation are basically the same as the [**base model**](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2).\r\n\r\nb. 
Model download\r\n```shell\r\n# Use the huggingface-cli tool to download the model.\r\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanCaptioner --local-dir .\u002Fckpts\u002Fcaptioner\r\n```\r\n\r\n### Inference\r\n\r\nOur model supports three different modes including: **directly generating Chinese caption**, **generating Chinese caption based on specific knowledge**, and **directly generating English caption**. The injected information can be either accurate cues or noisy labels (e.g., raw descriptions crawled from the internet). The model is capable of generating reliable and accurate descriptions based on both the inserted information and the image content.\r\n\r\n|Mode           | Prompt Template                           |Description                           | \r\n| ---           | ---                                       | ---                                  |\r\n|caption_zh     | 描述这张图片                               |Caption in Chinese                    | \r\n|insert_content | 根据提示词“{}”,描述这张图片                 |Caption with inserted knowledge| \r\n|caption_en     | Please describe the content of this image |Caption in English                    |\r\n|               |                                           |                                      |\r\n \r\n\r\na. Single picture inference in Chinese\r\n\r\n```bash\r\npython mllm\u002Fcaption_demo.py --mode \"caption_zh\" --image_file \"mllm\u002Fimages\u002Fdemo1.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\r\n```\r\n\r\nb. Insert specific knowledge into caption\r\n\r\n```bash\r\npython mllm\u002Fcaption_demo.py --mode \"insert_content\" --content \"宫保鸡丁\" --image_file \"mllm\u002Fimages\u002Fdemo2.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\r\n```\r\n\r\nc. Single picture inference in English\r\n\r\n```bash\r\npython mllm\u002Fcaption_demo.py --mode \"caption_en\" --image_file \"mllm\u002Fimages\u002Fdemo3.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\r\n```\r\n\r\nd. 
Multiple pictures inference in Chinese\r\n\r\n```bash\r\n### Convert multiple pictures to csv file. \r\npython mllm\u002Fmake_csv.py --img_dir \"mllm\u002Fimages\" --input_file \"mllm\u002Fimages\u002Fdemo.csv\"\r\n\r\n### Multiple pictures inference\r\npython mllm\u002Fcaption_demo.py --mode \"caption_zh\" --input_file \"mllm\u002Fimages\u002Fdemo.csv\" --output_file \"mllm\u002Fimages\u002Fdemo_res.csv\" --model_path \".\u002Fckpts\u002Fcaptioner\"\r\n```\r\n\r\n(Optional) To convert the output csv file to Arrow format, please refer to [Data Preparation #3](#data-preparation) for detailed instructions. \r\n\r\n\r\n### Gradio \r\nTo launch a Gradio demo locally, please run the following commands one by one. For more detailed instructions, please refer to [LLaVA](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA). \r\n```bash\r\ncd mllm\r\npython -m llava.serve.controller --host 0.0.0.0 --port 10000\r\n\r\npython -m llava.serve.gradio_web_server --controller http:\u002F\u002F0.0.0.0:10000 --model-list-mode reload --port 443\r\n\r\npython -m llava.serve.model_worker --host 0.0.0.0 --controller http:\u002F\u002F0.0.0.0:10000 --port 40000 --worker http:\u002F\u002F0.0.0.0:40000 --model-path \"..\u002Fckpts\u002Fcaptioner\" --model-name LlavaMistral\r\n```\r\nThen the demo can be accessed through http:\u002F\u002F0.0.0.0:443. 
It should be noted that the 0.0.0.0 here needs to be X.X.X.X with your server IP.\r\n\r\n## 🚀 Acceleration (for Linux)\r\n\r\n- We provide TensorRT version of HunyuanDiT for inference acceleration (faster than flash attention).\r\nSee [Tencent-Hunyuan\u002FTensorRT-libs](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FTensorRT-libs) for more details.\r\n\r\n- We provide Distillation version of HunyuanDiT for inference acceleration.\r\nSee [Tencent-Hunyuan\u002FDistillation](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation) for more details.\r\n\r\n## 🔗 BibTeX\r\nIf you find [Hunyuan-DiT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.08748) or [DialogGen](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857) useful for your research and applications, please cite using this BibTeX:\r\n\r\n```BibTeX\r\n@misc{li2024hunyuandit,\r\n      title={Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding}, \r\n      author={Zhimin Li and Jianwei Zhang and Qin Lin and Jiangfeng Xiong and Yanxin Long and Xinchi Deng and Yingfang Zhang and Xingchao Liu and Minbin Huang and Zedong Xiao and Dayou Chen and Jiajun He and Jiahao Li and Wenyue Li and Chen Zhang and Rongwei Quan and Jianxiang Lu and Jiabin Huang and Xiaoyan Yuan and Xiaoxiao Zheng and Yixuan Li and Jihong Zhang and Chao Zhang and Meng Chen and Jie Liu and Zheng Fang and Weiyan Wang and Jinbao Xue and Yangyu Tao and Jianchen Zhu and Kai Liu and Sihuan Lin and Yifu Sun and Yun Li and Dongdong Wang and Mingtao Chen and Zhichao Hu and Xiao Xiao and Yan Chen and Yuhong Liu and Wei Liu and Di Wang and Yong Yang and Jie Jiang and Qinglin Lu},\r\n      year={2024},\r\n      eprint={2405.08748},\r\n      archivePrefix={arXiv},\r\n      primaryClass={cs.CV}\r\n}\r\n\r\n@article{huang2024dialoggen,\r\n  title={DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation},\r\n  author={Huang, Minbin and Long, Yanxin and 
Deng, Xinchi and Chu, Ruihang and Xiong, Jiangfeng and Liang, Xiaodan and Cheng, Hong and Lu, Qinglin and Liu, Wei},\r\n  journal={arXiv preprint arXiv:2403.08857},\r\n  year={2024}\r\n}\r\n```\r\n\r\n## Star History\r\n\r\n\u003Ca href=\"https:\u002F\u002Fstar-history.com\u002F#Tencent\u002FHunyuanDiT&Date\">\r\n \u003Cpicture>\r\n   \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png&theme=dark\" \u002F>\r\n   \u003Csource media=\"(prefers-color-scheme: light)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png\" \u002F>\r\n   \u003Cimg alt=\"Star History Chart\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png\" \u002F>\r\n \u003C\u002Fpicture>\r\n\u003C\u002Fa>\r\n\r\n","\u003C!-- ## **HunyuanDiT** -->\r\n\r\n\u003Cp align=\"center\">\r\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_b33a1a6f0888.png\"  height=100>\r\n\u003C\u002Fp>\r\n\r\n# Hunyuan-DiT：一款具备细粒度中文理解能力的强大多分辨率扩散Transformer\r\n\r\n\u003Cdiv align=\"center\">\r\n  \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan-DiT代码&message=Github&color=blue&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Fdit.hunyuan.tencent.com\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=项目页面&message=Github&color=blue&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.08748\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=技术报告&message=Arxiv:HunYuan-DiT&color=red&logo=arxiv\">\u003C\u002Fa> &ensp;\r\n  \u003Ca 
href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=论文&message=Arxiv:DialogGen&color=red&logo=arxiv\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan-DiT&message=HuggingFace&color=yellow\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\"https:\u002F\u002Fyuanbao.tencent.com\u002Fchat\u002FnaQivTmsDa\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=Hunyuan机器人&message=网页&color=green\">\u003C\u002Fa> &ensp;\r\n  \u003Ca href=\".\u002Fcomfyui\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fstatic\u002Fv1?label=ComfyUI支持&message=ComfyUI&color=purple&logo=github-pages\">\u003C\u002Fa> &ensp;\r\n\u003C\u002Fdiv>\r\n\r\n-----\r\n\r\n本仓库包含我们探索Hunyuan-DiT的论文所用的PyTorch模型定义、预训练权重以及推理\u002F采样代码。您可以在我们的[项目页面](https:\u002F\u002Fdit.hunyuan.tencent.com\u002F)上找到更多可视化内容。\n\n> [**Hunyuan-DiT：一款具备细粒度中文理解能力的强大多分辨率扩散Transformer**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.08748) \u003Cbr>\n\n> [**DialogGen：用于多轮文生图的多模态交互式对话系统**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857) \u003Cbr>\n\n## 🔥🔥🔥 最新消息！！\n* 2024年12月17日：:tada: 使用“精炼梯度检查点”和“低比特优化器”优化LoRA训练。只需使用`--lowbit-opt`即可开始。\n* 2024年9月13日：🎉 IPAdapter已正式被HunYuanDiT支持。相关文档请参见[.\u002Fipadapter](.\u002Fipadapter)。同时，在V100 GPU上，已采用缩放注意力机制替代Flash Attention。\n* 2024年8月26日，🎉 HunYuanDIT Controlnet和LoRA已正式被ComfyUI支持。相关文档请参见[.\u002Fcomfyui](.\u002Fcomfyui)。\n* 2024年7月15日：🚀 HunYuanDiT与Shakker.Ai联合推出了基于HunYuanDiT 1.2模型的微调活动。通过发布基于HunYuanDiT的LoRA或微调模型，您最高可获得来自Shakker.Ai的230美元奖励。详情请参见[Shakker.Ai](https:\u002F\u002Fwww.shakker.ai\u002Factivitys\u002Fshaker-the-world-hunyuan)。\n* 2024年7月15日：:tada: 更新ComfyUI以支持标准化工作流，并兼容t2i模块权重及1.1\u002F1.2版本的LoRA训练权重，还包括由Kohya或官方脚本训练的权重。\n* 2024年7月15日：:zap: 我们提供适用于CUDA 
11\u002F12的Docker环境，让您无需复杂安装，一键即可体验！详情请参见[dockers](#installation-guide-for-linux)。\n* 2024年7月8日：:tada: HYDiT-v1.2版本发布。请查看[HunyuanDiT-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2)和[Distillation-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation-v1.2)以获取更多详细信息。\n* 2024年7月3日：:tada: Kohya-hydit版本现已适用于v1.1和v1.2模型，并提供GUI进行推理。官方Kohya版本正在审核中。详情请参见[kohya](.\u002Fkohya_ss-hydit)。\n* 2024年6月27日：:art: Hunyuan-Captioner发布，为训练数据提供细粒度描述。详情请参见[mllm](.\u002Fmllm)。\n* 2024年6月27日：:tada: 在diffusers中支持LoRa和ControlNet。详情请参见[diffusers](.\u002Fdiffusers)。\n* 2024年6月27日：:tada: 发布了适用于6GB显存GPU的推理脚本。详情请参见[lite](.\u002Flite)。\n* 2024年6月19日：:tada: ControlNet发布，支持Canny、姿态和深度控制。详情请参见[训练\u002F推理代码](#controlnet)。\n* 2024年6月13日：:zap: HYDiT-v1.1版本发布，该版本缓解了图像过度饱和的问题，并减轻了水印问题。请查看[HunyuanDiT-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.1)和[Distillation-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation-v1.1)以获取更多信息。\n* 2024年6月13日：:truck: 训练代码发布，提供[全参数训练](#full-parameter-training)和[LoRA训练](#lora)。\n* 2024年6月6日：:tada: Hunyuan-DiT现已可在ComfyUI中使用。详情请参见[ComfyUI](#using-comfyui)。\n* 2024年6月6日：🚀 我们推出了Hunyuan-DiT加速版——Distillation，可在NVIDIA GPU上实现**50%**的加速效果。详情请参见[Distillation](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation)。\n* 2024年6月5日：🤗 Hunyuan-DiT现已在🤗 Diffusers中可用！请参见下方的[示例](#using--diffusers)。\n* 2024年6月4日：:globe_with_meridians: 支持通过腾讯云链接下载预训练模型！详情请参见下方的[链接](#-download-pretrained-models)。\n* 2024年5月22日：🚀 我们推出了Hunyuan-DiT的TensorRT版本，可在NVIDIA GPU上实现**47%**的加速效果。请参见[TensorRT-libs](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FTensorRT-libs)获取使用说明。\n* 2024年5月22日：💬 现在支持演示多轮文本生成图像的功能。请参见下方的[脚本](#using-gradio)。\n\n## 🤖 在网页上试用\n\n欢迎来到我们的基于网页的[**腾讯Hunyuan机器人**](https:\u002F\u002Fhunyuan.tencent.com\u002Fbot\u002Fchat)，在这里您可以探索我们的创新产品！只需输入以下建议提示，或任何其他**包含绘画相关关键词的创意提示**，即可激活Hunyuan文生图功能。尽情发挥您的创造力，绘制任何您想要的画面，**全部免费！**\n\n您可以使用类似自然语言的简单提示：\n\n> 画一只穿着西装的猪\n>\n> 
draw a pig in a suit\n>\n> 生成一幅画，赛博朋克风，跑车\n> \n> generate a painting, cyberpunk style, sports car\n\n或者通过多轮语言交互来创作图片：\n\n> 画一个木制的鸟\n>\n> draw a wooden bird\n>\n> 变成玻璃的\n>\n> turn into glass\n\n## 🤗 社区贡献排行榜\n1. 由 [@TTPlanetPig](https:\u002F\u002Fgithub.com\u002FTTPlanetPig) 贡献\n   - HunyuanDIT_v1.2 ControlNet 模型\n     - 填充 ControlNet：https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_inpainting\n     - 瓷砖 ControlNet：https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_tile\n     - 线稿 ControlNet：https:\u002F\u002Fhuggingface.co\u002FTTPlanet\u002FHunyuanDiT_Controlnet_lineart\n   - HunyuanDIT_v1.2 ComfyUI 节点\n     - Comfyui_TTP_CN_Preprocessor：https:\u002F\u002Fgithub.com\u002FTTPlanetPig\u002FComfyui_TTP_CN_Preprocessor\n     - Comfyui_TTP_Toolset：https:\u002F\u002Fgithub.com\u002FTTPlanetPig\u002FComfyui_TTP_Toolset\n\n2. 由 [@sdbds](https:\u002F\u002Fgithub.com\u002Fsdbds)（B站UP主 [青龙圣者](https:\u002F\u002Fspace.bilibili.com\u002F219296)）贡献\n   - Kohya_ss-hydit 训练工具：https:\u002F\u002Fgithub.com\u002Fzml-ai\u002FHunyuanDIT-PRE\u002Ftree\u002Fmain\u002Fkohya_ss-hydit\n\n3. 由 [@CrazyBoyM](https:\u002F\u002Fgithub.com\u002FCrazyBoyM)（B站UP主 [飞鸟白菜](https:\u002F\u002Fspace.bilibili.com\u002F291593914)）贡献\n   - HunyuanDIT_v1.2 ControlNet 的 ComfyUI 支持：https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI\u002Fpull\u002F4245\n\n4. 
由 [@L_A_X](https:\u002F\u002Fhuggingface.co\u002FLaxhar\u002FFreeway_Animation_HunYuan_Demo) 贡献\n   - 面向动漫的 HunyuanDIT_v1.2 基础模型\n     - 原始 Hugging Face 模型：https:\u002F\u002Fhuggingface.co\u002FLaxhar\u002FFreeway_Animation_HunYuan_Demo\n     - 转换后的 ComfyUI 模型：https:\u002F\u002Fhuggingface.co\u002Fcomfyanonymous\u002FFreeway_Animation_Hunyuan_Demo_ComfyUI_Converted\n\n## 📑 开源计划\n\n- Hunyuan-DiT（文本到图像模型）\n  - [x] 推理\n  - [x] 检查点\n  - [x] 蒸馏版\n  - [x] TensorRT 版\n  - [x] 训练\n  - [x] LoRA\n  - [x] Controlnet（姿态、Canny、深度）\n  - [x] 6GB 显存 GPU 推理\n  - [x] IP-adapter\n  - [ ] Hunyuan-DiT-S 检查点（0.7B 模型）\n- MLLM\n  - Hunyuan-Captioner（对原始图像-文本对进行重新标注）\n    - [x] 推理\n  - [Hunyuan-DialogGen](https:\u002F\u002Fgithub.com\u002FCentaurusalpha\u002FDialogGen)（提示增强模型）\n    - [x] 推理\n- [X] Web Demo（Gradio）\n- [x] 多轮 T2I Demo（Gradio）\n- [X] CLI Demo\n- [X] ComfyUI\n- [X] Diffusers\n- [X] Kohya\n- [ ] WebUI\n\n## 目录\n- [Hunyuan-DiT：一款具有细粒度中文理解能力的强大多分辨率扩散Transformer](#hunyuan-dit--a-powerful-multi-resolution-diffusion-transformer-with-fine-grained-chinese-understanding)\n  - [🔥🔥🔥 新闻！！](#-news)\n  - [🤖 在线试用](#-try-it-on-the-web)\n  - [🤗 社区贡献排行榜](#-community-contribution-leaderboard)\n  - [📑 开源计划](#-open-source-plan)\n  - [目录](#contents)\n  - [摘要](#abstract)\n  - [🎉 Hunyuan-DiT 主要特性](#-hunyuan-dit-key-features)\n    - [中英双语 DiT 架构](#chinese-english-bilingual-dit-architecture)\n    - [多轮文本到图像生成](#multi-turn-text2image-generation)\n  - [📈 对比](#-comparisons)\n  - [🎥 可视化](#-visualization)\n  - [📜 要求](#-requirements)\n  - [🛠️ 依赖与安装](#️-dependencies-and-installation)\n    - [Linux 安装指南](#installation-guide-for-linux)\n  - [🧱 下载预训练模型](#-download-pretrained-models)\n        - [1. 使用 HF-Mirror](#1-using-hf-mirror)\n        - [2. 
断点续传](#2-resume-download)\n  - [[:truck: 训练]](#truck-training)\n    - [数据准备](#data-preparation)\n    - [全参数训练](#full-parameter-training)\n    - [LoRA](#lora)\n  - [🔑 推理](#-inference)\n    - [6GB 显存 GPU 推理](#6gb-gpu-vram-inference)\n    - [使用 Gradio](#using-gradio)\n    - [使用 🤗 Diffusers](#using--diffusers)\n    - [使用命令行](#using-command-line)\n    - [更多配置](#more-configurations)\n    - [使用 ComfyUI](#using-comfyui)\n    - [使用 Kohya](#using-kohya)\n    - [使用旧版本](#using-previous-versions)\n  - [[:building\\_construction: Adapter]](#building_construction-adapter)\n    - [ControlNet](#controlnet)\n    - [IP-Adapter](#IP-Adapter)\n  - [[:art: Hunyuan-Captioner]](#art-hunyuan-captioner)\n    - [示例](#examples)\n    - [说明](#instructions)\n    - [推理](#inference)\n    - [Gradio](#gradio)\n  - [🚀 加速（适用于 Linux）](#-acceleration-for-linux)\n  - [🔗 BibTeX](#-bibtex)\n  - [历史沿革](#start-history)\n\n## **摘要**\n\n我们提出了 Hunyuan-DiT，这是一种能够对英语和中文进行细粒度理解的文本到图像扩散Transformer。为了构建 Hunyuan-DiT，我们精心设计了 Transformer 结构、文本编码器和位置编码。此外，我们从零开始搭建了一整套数据流水线，用于更新和评估数据，以实现模型的迭代优化。为了实现细粒度的语言理解，我们训练了一个多模态大语言模型来优化图像的标题描述。最终，Hunyuan-DiT 能够与用户进行多轮多模态对话，根据上下文生成并不断优化图像。\n通过我们精心设计的、由超过 50 名专业人类评估员参与的整体评估协议，Hunyuan-DiT 在中文到图像生成方面相较于其他开源模型树立了新的最先进水平。\n\n\n## 🎉 **Hunyuan-DiT 主要特性**\n### **中英双语 DiT 架构**\nHunyuan-DiT 是一种在潜在空间中的扩散模型，如图所示。遵循潜在扩散模型的思路，我们使用一个预训练的变分自编码器（VAE）将图像压缩到低维潜在空间，并训练一个扩散模型来学习数据分布。我们的扩散模型采用 Transformer 参数化。为了编码文本提示，我们结合使用了预训练的中英双语 CLIP 和多语言 T5 编码器。\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0bfffe0d2c01.png\"  height=450>\n\u003C\u002Fp>\n\n### 多轮文本到图像生成\n理解自然语言指令并进行多轮交互对于文本到图像系统至关重要。这有助于构建一个动态且迭代的创作过程，逐步将用户的想法变为现实。在这一部分，我们将详细介绍我们如何赋予 Hunyuan-DiT 多轮对话和图像生成的能力。我们训练 MLLM 来理解用户的多轮对话，并输出用于图像生成的新文本提示。\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_af6e03945c98.png\"  height=300>\n\u003C\u002Fp>\n\n## 📈 比较\n\n为了全面比较 HunyuanDiT 
与其他模型的生成能力，我们构建了一个包含文本与图像一致性、是否排除 AI 伪影、主体清晰度和美学四个维度的测试集。共有50多位专业评估者参与了此次评估。\n\n\u003Cp align=\"center\">\n\u003Ctable> \n\u003Cthead> \n\u003Ctr> \n    \u003Cth rowspan=\"2\">模型\u003C\u002Fth> \u003Cth rowspan=\"2\">开源\u003C\u002Fth> \u003Cth>文本与图像一致性 (%)\u003C\u002Fth> \u003Cth>是否排除 AI 伪影 (%)\u003C\u002Fth> \u003Cth>主体清晰度 (%)\u003C\u002Fth> \u003Cth rowspan=\"2\">美学 (%)\u003C\u002Fth> \u003Cth rowspan=\"2\">综合 (%)\u003C\u002Fth> \n\u003C\u002Ftr> \n\u003C\u002Fthead> \n\u003Ctbody> \n\u003Ctr> \n    \u003Ctd>SDXL\u003C\u002Ftd> \u003Ctd> ✔ \u003C\u002Ftd> \u003Ctd>64.3\u003C\u002Ftd> \u003Ctd>60.6\u003C\u002Ftd> \u003Ctd>91.1\u003C\u002Ftd> \u003Ctd>76.3\u003C\u002Ftd> \u003Ctd>42.7\u003C\u002Ftd> \n\u003C\u002Ftr> \n\u003Ctr> \n    \u003Ctd>PixArt-α\u003C\u002Ftd> \u003Ctd> ✔\u003C\u002Ftd> \u003Ctd>68.3\u003C\u002Ftd> \u003Ctd>60.9\u003C\u002Ftd> \u003Ctd>93.2\u003C\u002Ftd> \u003Ctd>77.5\u003C\u002Ftd> \u003Ctd>45.5\u003C\u002Ftd> \n\u003C\u002Ftr> \n\u003Ctr> \n    \u003Ctd>Playground 2.5\u003C\u002Ftd> \u003Ctd>✔\u003C\u002Ftd> \u003Ctd>71.9\u003C\u002Ftd> \u003Ctd>70.8\u003C\u002Ftd> \u003Ctd>94.9\u003C\u002Ftd> \u003Ctd>83.3\u003C\u002Ftd> \u003Ctd>54.3\u003C\u002Ftd> \n\u003C\u002Ftr> \n\n\u003Ctr> \n    \u003Ctd>SD 3\u003C\u002Ftd> \u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>77.1\u003C\u002Ftd> \u003Ctd>69.3\u003C\u002Ftd> \u003Ctd>94.6\u003C\u002Ftd> \u003Ctd>82.5\u003C\u002Ftd> \u003Ctd>56.7\u003C\u002Ftd> \n    \n\u003C\u002Ftr> \n\u003Ctr> \n    \u003Ctd>MidJourney v6\u003C\u002Ftd>\u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>73.5\u003C\u002Ftd> \u003Ctd>80.2\u003C\u002Ftd> \u003Ctd>93.5\u003C\u002Ftd> \u003Ctd>87.2\u003C\u002Ftd> \u003Ctd>63.3\u003C\u002Ftd> \n\u003C\u002Ftr> \n\u003Ctr> \n    \u003Ctd>DALL-E 3\u003C\u002Ftd>\u003Ctd>&#10008\u003C\u002Ftd> \u003Ctd>83.9\u003C\u002Ftd> \u003Ctd>80.3\u003C\u002Ftd> \u003Ctd>96.5\u003C\u002Ftd> \u003Ctd>89.4\u003C\u002Ftd> \u003Ctd>71.0\u003C\u002Ftd> \n\u003C\u002Ftr> \n\u003Ctr style=\"font-weight: bold; 
background-color: #f2f2f2;\"> \n    \u003Ctd>Hunyuan-DiT\u003C\u002Ftd>\u003Ctd>✔\u003C\u002Ftd> \u003Ctd>74.2\u003C\u002Ftd> \u003Ctd>74.3\u003C\u002Ftd> \u003Ctd>95.4\u003C\u002Ftd> \u003Ctd>86.6\u003C\u002Ftd> \u003Ctd>59.0\u003C\u002Ftd> \n\u003C\u002Ftr>\n\u003C\u002Ftbody>\n\u003C\u002Ftable>\n\u003C\u002Fp>\n\n## 🎥 可视化\n\n* **中国元素**\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_e2e83cd27bba.png\"  height=220>\n\u003C\u002Fp>\n\n* **长文本输入**\n\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_edb58f563a56.png\"  height=310>\n\u003C\u002Fp>\n\n* **多轮 Text2Image 生成**\n\nhttps:\u002F\u002Fgithub.com\u002FTencent\u002Ftencent.github.io\u002Fassets\u002F27557933\u002F94b4dcc3-104d-44e1-8bb2-dc55108763d1\n\n\n\n---\n\n## 📜 系统要求\n\n本仓库包含 DialogGen（提示增强模型）和 Hunyuan-DiT（文本到图像模型）。\n\n下表展示了运行这些模型所需的硬件配置（批量大小 = 1）：\n\n|          模型          | --load-4bit (DialogGen) | GPU 峰值内存 |       GPU       |\n|:-----------------------:|:-----------------------:|:---------------:|:---------------:|\n| DialogGen + Hunyuan-DiT |            ✘            |       32G       |      A100       |\n| DialogGen + Hunyuan-DiT |            ✔            |       22G       |      A100       |\n|       Hunyuan-DiT       |            -            |       11G       |      A100       |\n|       Hunyuan-DiT       |            -            |       14G       | RTX3090\u002FRTX4090 |\n\n* 需要支持 CUDA 的 NVIDIA 显卡。\n  * 我们已测试过 V100 和 A100 显卡。\n  * **最低要求**：显存至少需要 11GB。\n  * **推荐**：为获得更好的生成效果，建议使用 32GB 显存的显卡。\n* 测试过的操作系统：Linux\n\n## 🛠️ 依赖与安装\n\n首先克隆仓库：\n```shell\ngit clone https:\u002F\u002Fgithub.com\u002Ftencent\u002FHunyuanDiT\ncd HunyuanDiT\n```\n\n### Linux 安装指南\n\n我们提供了一个 `environment.yml` 文件来设置 Conda 环境。\nConda 的安装说明请参见 [这里](https:\u002F\u002Fdocs.anaconda.com\u002Ffree\u002Fminiconda\u002Findex.html)。\n\n推荐使用 CUDA 11.7 和 12.0 
及以上版本。\n\n```shell\n# 1. 准备 Conda 环境\nconda env create -f environment.yml\n\n# 2. 激活环境\nconda activate HunyuanDiT\n\n# 3. 安装 pip 依赖\npython -m pip install -r requirements.txt\n\n# 4. 安装 flash attention v2 以加速（需 CUDA 11.6 或以上）\npython -m pip install git+https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention.git@v2.1.2.post3\n```\n\n此外，您也可以使用 Docker 来设置环境。\n```shell\n# 1. 使用以下链接下载 Docker 镜像 tar 文件。\n# 对于 CUDA 12\nwget https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fhunyuan_dit_cu12.tar\n# 对于 CUDA 11\nwget https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fhunyuan_dit_cu11.tar\n\n# 2. 导入 Docker tar 文件并查看镜像元信息\n# 对于 CUDA 12\ndocker load -i hunyuan_dit_cu12.tar\n# 对于 CUDA 11\ndocker load -i hunyuan_dit_cu11.tar  \n\ndocker image ls\n\n# 3. 根据镜像运行容器\ndocker run -dit --gpus all --init --net=host --uts=host --ipc=host --name hunyuandit --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --privileged  docker_image_tag\n```\n\n## 🧱 下载预训练模型\n要下载模型，首先需要安装 huggingface-cli。（详细说明请参见 [这里](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fguides\u002Fcli)。）\n\n```shell\npython -m pip install \"huggingface_hub[cli]\"\n```\n\n然后使用以下命令下载模型：\n\n```shell\n# 创建一个名为 'ckpts' 的目录，用于保存模型，这是运行演示的前提条件。\nmkdir ckpts\n# 使用 huggingface-cli 工具下载模型。\n\n# 下载时间可能因网络状况而异，从10分钟到1小时不等。\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.2 --local-dir .\u002Fckpts\n```\n\n\u003Cdetails>\n\u003Csummary>💡使用 huggingface-cli 的提示（网络问题）\u003C\u002Fsummary>\n\n##### 1. 使用 HF 镜像\n\n如果在中国境内遇到下载速度较慢的情况，可以尝试使用镜像来加快下载速度。例如：\n\n```shell\nHF_ENDPOINT=https:\u002F\u002Fhf-mirror.com huggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.2 --local-dir .\u002Fckpts\n```\n\n##### 2. 
断点续传\n\n`huggingface-cli` 支持断点续传功能。如果下载过程中断，只需重新运行下载命令即可继续完成下载。\n\n注意：如果在下载过程中出现类似 `No such file or directory: 'ckpts\u002F.huggingface\u002F.gitignore.lock'` 的错误，可以忽略该错误并重新运行下载命令。\n\n\u003C\u002Fdetails>\n\n---\n\n所有模型都会自动下载。有关该模型的更多信息，请访问 Hugging Face 仓库 [这里](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT)。\n\n|       模型       | 参数量 |                                        Hugging Face 下载地址                                        |                                   腾讯云 下载地址                                   |\n|:-----------------:|:-------:|:------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:|\n|        mT5        |  1.6B   |               [mT5](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fmt5)               |               [mT5](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmt5.zip)               |\n|       CLIP        |  350M   |       [CLIP](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fclip_text_encoder)        |       [CLIP](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fclip_text_encoder.zip)        |\n|     Tokenizer     |  -      |         [Tokenizer](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Ftokenizer)         |         [Tokenizer](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Ftokenizer.zip)         |\n|     DialogGen     |  7.0B   |           [DialogGen](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Fdialoggen)           |         [DialogGen](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdialoggen.zip)         |\n| sdxl-vae-fp16-fix |   83M   
| [sdxl-vae-fp16-fix](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fsdxl-vae-fp16-fix) | [sdxl-vae-fp16-fix](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fsdxl-vae-fp16-fix.zip) |\n| Hunyuan-DiT-v1.0  |  1.5B   |          [Hunyuan-DiT](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)          |       [Hunyuan-DiT-v1.0](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel.zip)        |\n| Hunyuan-DiT-v1.1  |  1.5B   |     [Hunyuan-DiT-v1.1](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.1\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)     |     [Hunyuan-DiT-v1.1](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel-v1_1.zip)     |\n| Hunyuan-DiT-v1.2  |  1.5B   |     [Hunyuan-DiT-v1.2](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2\u002Ftree\u002Fmain\u002Ft2i\u002Fmodel)     |     [Hunyuan-DiT-v1.2](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fmodel-v1_2.zip)     |\n|     Data demo     |  -      |                                                   -                                                    |         [Data demo](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip)         |\n\n## :truck: 训练\n\n### 数据准备\n\n请参考以下命令准备训练数据。\n\n1. 安装依赖\n\n我们提供了一个高效的数据管理库 IndexKits，支持在训练过程中管理数亿条数据，更多信息请参见 [文档](.\u002FIndexKits\u002FREADME.md)。\n```shell\n# 1 安装依赖\ncd HunyuanDiT\npip install -e .\u002FIndexKits\n```\n\n2. 
数据下载\n\n您可以自由下载 [数据示例](https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip)。\n```shell\n# 2 数据下载\nwget -O .\u002Fdataset\u002Fdata_demo.zip https:\u002F\u002Fdit.hunyuan.tencent.com\u002Fdownload\u002FHunyuanDiT\u002Fdata_demo.zip\nunzip .\u002Fdataset\u002Fdata_demo.zip -d .\u002Fdataset\nmkdir .\u002Fdataset\u002Fporcelain\u002Farrows .\u002Fdataset\u002Fporcelain\u002Fjsons\n```\n\n3. 数据转换\n\n为训练数据创建一个 CSV 文件，字段如下表所示。\n\n|    字段       | 必需  |  描述     |   示例   |\n|:---------------:| :------:  |:----------------:|:-----------:|\n|   `image_path`  | 必需  |  图像路径               |     `.\u002Fdataset\u002Fporcelain\u002Fimages\u002F0.png`        | \n|   `text_zh`     | 必需  |    文本               |  青花瓷风格，一只蓝色的鸟儿站在蓝色的花瓶上，周围点缀着白色花朵，背景是白色 | \n|   `md5`         | 可选  |    图像 MD5（消息摘要算法 5）  |    `d41d8cd98f00b204e9800998ecf8427e`         | \n|   `width`       | 可选  |    图像宽度    |     `1024 `       | \n|   `height`      | 可选  |    图像高度   |    ` 1024 `       | \n\n> ⚠️ 可选字段如 MD5、宽度和高度可以省略。如果省略，下面的脚本会自动计算它们。但在处理大规模训练数据时，这一过程可能会比较耗时。\n\n我们使用 [Arrow](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow) 作为训练数据格式，它提供了一种标准且高效的内存中数据表示方式。我们提供了一个转换脚本，用于将 CSV 文件转换为 Arrow 格式。\n```shell  \n# 3 数据转换 \npython .\u002Fhydit\u002Fdata_loader\u002Fcsv2arrow.py .\u002Fdataset\u002Fporcelain\u002Fcsvfile\u002Fimage_text.csv .\u002Fdataset\u002Fporcelain\u002Farrows 1\n```\n\n4. 数据选择与配置文件创建\n\n我们通过 YAML 文件来配置训练数据。在这些文件中，您可以设置标准的数据处理策略，例如过滤、复制、去重等操作。更多详情请参见 [.\u002FIndexKits](IndexKits\u002Fdocs\u002FMakeDataset.md)。\n\n有关示例文件，请参阅 [文件](.\u002Fdataset\u002Fyamls\u002Fporcelain.yaml)。完整的参数配置文件请参见 [文件](.\u002FIndexKits\u002Fdocs\u002FMakeDataset.md)。\n\n5. 
使用 YAML 文件创建训练数据索引文件。\n\n```shell\n# 单分辨率数据准备\nidk base -c dataset\u002Fyamls\u002Fporcelain.yaml -t dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\n\n# 多分辨率数据准备     \nidk multireso -c dataset\u002Fyamls\u002Fporcelain_mt.yaml -t dataset\u002Fporcelain\u002Fjsons\u002Fporcelain_mt.json\n```\n\n`porcelain` 数据集的目录结构如下：\n\n```shell\ncd .\u002Fdataset\n\nporcelain\n   ├──images\u002F  (图像文件)\n   │  ├──0.png\n   │  ├──1.png\n   │  ├──......\n   ├──csvfile\u002F  (包含文本-图像对的 CSV 文件)\n   │  ├──image_text.csv\n   ├──arrows\u002F  (包含所有必要训练数据的 Arrow 文件)\n   │  ├──00000.arrow\n   │  ├──00001.arrow\n   │  ├──......\n   ├──jsons\u002F  (最终的训练数据索引文件，在训练时从 Arrow 文件中读取数据)\n   │  ├──porcelain.json\n   │  ├──porcelain_mt.json\n```\n\n### 全参数训练\n\n**要求：**\n\n1. 最低要求是一张至少有 20GB 显存的 GPU，但我们建议使用约 30GB 显存的 GPU，以避免主机内存换出。\n2. 此外，我们鼓励用户利用多节点上的多张 GPU 来加速大规模数据集的训练。\n\n**注意：**\n\n1. 个人用户也可以使用轻量级的 Kohya 模型，在约 16GB 显存的情况下对模型进行微调。目前，我们正努力进一步降低面向个人用户的工业级框架的显存占用。\n2. 如果您的 GPU 显存充足，请尝试移除 `--cpu-offloading` 或 `--gradient-checkpointing`，以减少时间开销。\n\n对于分布式训练，您可以通过调整 `--hostfile` 和 `--master_addr` 等参数灵活控制 **单节点** 或 **多节点** 训练。更多详情请参见 [链接](https:\u002F\u002Fwww.deepspeed.ai\u002Fgetting-started\u002F#resource-configuration-multi-node)。\n\n```shell\n# 单分辨率训练\nPYTHONPATH=.\u002F sh hydit\u002Ftrain.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\n\n# 多分辨率训练\nPYTHONPATH=.\u002F sh hydit\u002Ftrain.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain_mt.json --multireso --reso-step 64\n\n# 使用旧版本 HunyuanDiT（\u003C= v1.1）进行训练\nPYTHONPATH=.\u002F sh hydit\u002Ftrain_v1.1.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\n```\n\n保存检查点后，您可以使用以下命令评估模型。\n```shell\n# 推理\n    # 请将 'log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt' 替换为您实际的路径。\npython sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只可爱的哈士奇\" --no-enhance --dit-weight log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt --load-key module\n\n# 旧版本 HunyuanDiT（\u003C= v1.1）\n    # 请将 
'log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt' 替换为您实际的路径。\npython sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只可爱的哈士奇\" --model-root .\u002FHunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03 --no-enhance --dit-weight log_EXP\u002Fxxx\u002Fcheckpoints\u002Ffinal.pt --load-key module\n```\n\n### LoRA\n\n\n\n我们提供了LoRA的训练和推理脚本，详细说明请参阅[.\u002Flora](.\u002Flora\u002FREADME.md)。\n\n  ```shell\n  # 训练瓷器风格的LoRA。\n  PYTHONPATH=.\u002F sh lora\u002Ftrain_lora_with_fa.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\n  \n  # 使用训练好的LoRA权重进行推理。\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只小狗\"  --no-enhance --lora-ckpt log_EXP\u002F001-lora_porcelain_ema_rank64\u002Fcheckpoints\u002F0001000.pt\n  ```  \n如果无法安装flash_attn，可以使用以下代码：\n\n  ```shell\n  # 训练瓷器风格的LoRA。\n  PYTHONPATH=.\u002F sh lora\u002Ftrain_lora.sh --index-file dataset\u002Fporcelain\u002Fjsons\u002Fporcelain.json\n  \n  # 使用训练好的LoRA权重进行推理。\n  python sample_t2i.py --infer-mode torch --prompt \"青花瓷风格，一只小狗\"  --no-enhance --lora-ckpt log_EXP\u002F001-lora_porcelain_ema_rank64\u002Fcheckpoints\u002F0001000.pt\n  ```  \n\n我们提供了两种训练好的LoRA权重，分别适用于“瓷器”和“玉石”风格，详情请见[Hugging Face链接](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-LoRA)。\n\n  ```shell\n  cd HunyuanDiT\n  # 使用huggingface-cli工具下载模型。\n  huggingface-cli download Tencent-Hunyuan\u002FHYDiT-LoRA --local-dir .\u002Fckpts\u002Ft2i\u002Flora\n  \n  # 快速入门\n  python sample_t2i.py --infer-mode fa --prompt \"青花瓷风格，一只猫在追蝴蝶\"  --no-enhance --load-key ema --lora-ckpt .\u002Fckpts\u002Ft2i\u002Flora\u002Fporcelain\n  ```  \n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd colspan=\"4\" align=\"center\">训练数据示例\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_92a234afe07f.png\" alt=\"图片0\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ecaf0c0a9113.png\" alt=\"图片1\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d0e812758c7d.png\" alt=\"图片2\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_eed1c5f301b3.png\" alt=\"图片3\" width=\"200\"\u002F>\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">青花瓷风格，一只蓝色的鸟儿站在蓝色的花瓶上，周围点缀着白色花朵，背景是白色 （Porcelain style, a blue bird stands on a blue vase, surrounded by white flowers, with a white background.\n）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，这是一幅蓝白相间的陶瓷盘子，上面描绘着一只狐狸和它的幼崽在森林中漫步，背景是白色 （Porcelain style, this is a blue and white ceramic plate depicting a fox and its cubs strolling in the forest, with a white background.）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，在黑色背景上，一只蓝色的狼站在蓝白相间的盘子上，周围是树木和月亮 （Porcelain style, on a black background, a blue wolf stands on a blue and white plate, surrounded by trees and the moon.）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，在蓝色背景上，一只蓝色蝴蝶和白色花朵被放置在中央 （Porcelain style, on a blue background, a blue butterfly and white flowers are placed in the center.）\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd colspan=\"4\" align=\"center\">推理结果示例\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ffb073621f69.png\" alt=\"图片4\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_7f5a35ac2d56.png\" alt=\"图片5\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_02ece4cb2b95.png\" alt=\"图片6\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_29334ca04fa9.png\" alt=\"图片7\" width=\"200\"\u002F>\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">青花瓷风格，苏州园林 （Porcelain style,  Suzhou Gardens.）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，一朵荷花 （Porcelain style,  a lotus flower.）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，一只羊（Porcelain style, a sheep.）\u003C\u002Ftd>\n    \u003Ctd align=\"center\">青花瓷风格，一个女孩在雨中跳舞（Porcelain style, a girl dancing in the rain.）\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \n\u003C\u002Ftable>\n\n\n## 🔑 推理\n\n### 6GB显存下的推理\n基于[huggingface\u002Fdiffusers](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fdiffusers\u002Fmain\u002Fen\u002Fapi\u002Fpipelines\u002Fhunyuandit)，现在可以在6GB以下显存的GPU上运行HunyuanDiT。我们在此提供快速入门的说明和演示。\n\n> 6GB版本支持Nvidia Ampere架构系列显卡，如RTX 3070\u002F3080\u002F4080\u002F4090、A100等。\n\n您只需安装以下库即可：\n\n```bash\npip install -U bitsandbytes\npip install git+https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fdiffusers\npip install \"torch>=2.7.1\"\n```\n\n随后，您就可以直接在6GB以下显存的GPU上享受HunyuanDiT文生图之旅了！\n\n以下是一个演示：\n\n```bash\ncd HunyuanDiT\n\n# 快速入门\nmodel_id=Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled\nprompt=一个宇航员在骑马\ninfer_steps=50\nguidance_scale=6\npython3 lite\u002Finference.py ${model_id} ${prompt} ${infer_steps} ${guidance_scale}\n```  \n\n更多细节请参阅[.\u002Flite](lite\u002FREADME.md)。\n\n\n### 使用Gradio\n\n在运行以下命令前，请确保已激活conda环境。\n\n```shell\n# 默认启动中文界面，并使用Flash Attention加速。\npython app\u002Fhydit_app.py --infer-mode fa\n\n# 使用特定端口和主机\npython app\u002Fhydit_app.py --infer-mode fa --server_name 0.0.0.0 --server_port 443 --load-key distill\n\n# 如果显存不足，可以关闭增强模型。\n# 增强功能将在您不带`--no-enhance`标志重启应用后恢复可用。\npython app\u002Fhydit_app.py 
--no-enhance --infer-mode fa\n\n# 启动英文界面\npython app\u002Fhydit_app.py --lang en --infer-mode fa\n\n# 启动多轮文生图界面。\n# 如果您的显存低于32GB，请使用`--load-4bit`启用4位量化，这至少需要22GB显存。\npython app\u002FmultiTurnT2I_app.py --infer-mode fa\n```  \n\n随后，您可以通过http:\u002F\u002F0.0.0.0:443访问演示页面。需要注意的是，这里的0.0.0.0应替换为您的服务器IP地址X.X.X.X。\n\n### 使用🤗 Diffusers\n\n请提前安装PyTorch 2.0或更高版本，以满足diffusers库指定版本的要求。\n\n安装🤗 diffusers，确保版本不低于0.28.1：\n\n```shell\npip install git+https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fdiffusers.git\n```  \n或者  \n```shell\npip install diffusers\n```  \n\n您可以使用以下Python脚本生成中英文提示下的图像：\n```py\nimport torch\nfrom diffusers import HunyuanDiTPipeline\n\npipe = HunyuanDiTPipeline.from_pretrained(\"Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers\", torch_dtype=torch.float16)\npipe.to(\"cuda\")\n\n# 您也可以使用英文提示，因为HunyuanDiT同时支持中英文\n# prompt = \"An astronaut riding a horse\"\nprompt = \"一个宇航员在骑马\"\nimage = pipe(prompt).images[0]\n```  \n\n您还可以使用我们的蒸馏模型更快地生成图像：\n\n```py\nimport torch\nfrom diffusers import HunyuanDiTPipeline\n\npipe = HunyuanDiTPipeline.from_pretrained(\"Tencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled\", torch_dtype=torch.float16)\npipe.to(\"cuda\")\n\n# 您也可以使用英文提示，因为HunyuanDiT同时支持中英文\n\n# prompt = \"一个宇航员在骑马\"\nprompt = \"一个宇航员在骑马\"\nimage = pipe(prompt, num_inference_steps=25).images[0]\n```\n更多详情请参见 [HunyuanDiT-v1.2-Diffusers-Distilled](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2-Diffusers-Distilled)\n\n**更多功能：** 关于 LoRA 和 ControlNet 等其他功能，请查看 [.\u002Fdiffusers](diffusers) 的 README 文件。\n\n### 命令行使用\n\n我们提供了几个快速入门的命令：\n\n```shell\n# 仅文本到图像。Flash Attention 模式\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --no-enhance\n\n# 生成其他尺寸的图像。\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --image-size 1280 768\n\n# 提示增强 + 文本到图像。DialogGen 使用 4 位量化加载，但可能会损失性能。\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\"  --load-4bit\n\n```\n\n更多示例提示可在 [example_prompts.txt](example_prompts.txt) 中找到。\n\n### 
更多配置\n\n我们列出了一些更实用的配置，方便使用：\n\n|    参数     |  默认值  |                     说明                     |\n|:---------------:|:---------:|:---------------------------------------------------:|\n|   `--prompt`    |   无    |        图像生成的文本提示         |\n| `--image-size`  | 1024 1024 |           生成图像的尺寸           |\n|    `--seed`     |    42     |        图像生成的随机种子        |\n| `--infer-steps` |    100    |          采样步骤数           |\n|  `--negative`   |     -     |      图像生成的负面提示       |\n| `--infer-mode`  |   torch   |       推理模式（torch、fa 或 trt）        |\n|   `--sampler`   |   ddpm    |    扩散采样器（ddpm、ddim 或 dpmms）     |\n| `--no-enhance`  |   False   |        禁用提示增强模型         |\n| `--model-root`  |   ckpts   |     模型检查点的根目录     |\n|  `--load-key`   |    ema    | 加载学生模型或 EMA 模型（ema 或 module） |\n|  `--load-4bit`  |   False   |     使用 4 位量化加载 DialogGen 模型     |\n\n### 使用 ComfyUI\n\n- 支持两种工作流：标准 ComfyUI 和 Diffusers 包装器，推荐使用前者。\n- 支持 HunyuanDiT-v1.1 和 v1.2。\n- 支持由 Kohya 训练的 module、lora 和 clip lora 模型。\n- 支持由 HunyuanDiT 官方训练脚本训练的 module、lora 模型。\n- 支持 ControlNet。\n\n更多详情请参见 [.\u002Fcomfyui](comfyui\u002FREADME.md)\n\n### 使用 Kohya\n\n我们支持 kohya_ss GUI 的自定义代码，以及用于 HunyuanDiT 的 sd-scripts 训练代码。\n![dreambooth](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_a9d3c7b9d192.png)\n更多详情请参见 [.\u002Fkohya_ss-hydit](kohya_ss-hydit\u002FREADME.md)\n\n### 使用旧版本\n\n* **Hunyuan-DiT \u003C= v1.1**\n\n```shell\n# ============================== v1.1 ==============================\n# 下载模型\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.1 --local-dir .\u002FHunyuanDiT-v1.1\n# 使用模型进行推理\npython sample_t2i.py --infer-mode fa --prompt \"渔舟唱晚\" --model-root .\u002FHunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03\n\n# ============================== v1.0 ==============================\n# 下载模型\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT --local-dir .\u002FHunyuanDiT-v1.0\n# 使用模型进行推理\npython sample_t2i.py --infer-mode fa 
--prompt \"渔舟唱晚\" --model-root .\u002FHunyuanDiT-v1.0 --use-style-cond --size-cond 1024 1024 --beta-end 0.03\n```\n\n## :building_construction: 适配器\n\n### ControlNet\n\n我们提供了 ControlNet 的训练脚本，详细信息请参阅 [.\u002Fcontrolnet](.\u002Fcontrolnet\u002FREADME.md)。\n\n  ```shell\n  # 训练 Canny ControlNet。\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain_controlnet.sh\n  ```\n\n我们提供了三种类型的已训练 ControlNet 权重，分别对应 `canny`、`depth` 和 `pose` 模式，详情请见 [链接](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-ControlNet)：\n\n  ```shell\n  cd HunyuanDiT\n  # 使用 huggingface-cli 工具下载模型。\n  # 我们建议在 ControlNet 推理时使用蒸馏权重作为基础模型，因为我们提供的预训练权重正是基于这些权重训练的。\n  huggingface-cli download Tencent-Hunyuan\u002FHYDiT-ControlNet-v1.2 --local-dir .\u002Fckpts\u002Ft2i\u002Fcontrolnet\n  huggingface-cli download Tencent-Hunyuan\u002FDistillation-v1.2 .\u002Fpytorch_model_distill.pt --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\n  \n  # 快速入门\n  python3 sample_controlnet.py --infer-mode fa --no-enhance --load-key distill --infer-steps 50 --control-type canny --prompt \"在夜晚的酒店门前，一座古老的中国风格的狮子雕像矗立着，它的眼睛闪烁着光芒，仿佛在守护着这座建筑。背景是夜晚的酒店前，构图方式是特写，平视，居中构图。这张照片呈现了真实摄影风格，蕴含了中国雕塑文化，同时展现了神秘氛围\" --condition-image-path https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ebbf8f65c4e8.jpg --control-weight 1.0\n  ```\n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd colspan=\"3\" align=\"center\">条件输入\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \n   \u003Ctr>\n    \u003Ctd align=\"center\">Canny ControlNet \u003C\u002Ftd>\n    \u003Ctd align=\"center\">Depth ControlNet \u003C\u002Ftd>\n    \u003Ctd align=\"center\">Pose ControlNet \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">在夜晚的酒店门前，一座古老的中国风格的狮子雕像矗立着，它的眼睛闪烁着光芒，仿佛在守护着这座建筑。背景是夜晚的酒店前，构图方式是特写，平视，居中构图。这张照片呈现了真实摄影风格，蕴含了中国雕塑文化，同时展现了神秘氛围\u003Cbr>（At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. 
The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">在茂密的森林中，一只黑白相间的熊猫静静地坐在绿树红花中，周围是山川和海洋。背景是白天的森林，光线充足。照片采用特写、平视和居中构图的方式，呈现出写实的效果\u003Cbr>（In the dense forest, a black and white panda sits quietly among the green trees and red flowers, surrounded by mountains and oceans. The background is a daytime forest with ample light. The photo uses a close-up, eye-level, and centered composition to create a realistic effect.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">在白天的森林中，一位穿着绿色上衣的亚洲女性站在大象旁边。照片采用了中景、平视和居中构图的方式，呈现出写实的效果。这张照片蕴含了人物摄影文化，并展现了宁静的氛围\u003Cbr>（In the daytime forest, an Asian woman wearing a green shirt stands beside an elephant. The photo uses a medium shot, eye-level, and centered composition to create a realistic effect. This picture embodies the character photography culture and conveys a serene atmosphere.） \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_ebbf8f65c4e8.jpg\" alt=\"Image 0\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_cd9e5d78d9f4.jpg\" alt=\"Image 1\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_2dd46be733e2.jpg\" alt=\"Image 2\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \n  \u003C\u002Ftr>\n  \n  \u003Ctr>\n    \u003Ctd colspan=\"3\" align=\"center\">ControlNet 输出\u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d8b27e57a49a.jpg\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_5e84c71ed676.jpg\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_76a1004a5d0a.jpg\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\n  \u003C\u002Ftr>\n \n\u003C\u002Ftable>\n\n### IP-Adapter\n我们提供了 IP-Adapter 的训练脚本，详细信息请参阅 [.\u002Fipadapter](.\u002Fipadapter\u002FREADME.md)。\n\n  ```shell\n  # 训练 IP-Adapter。\n  PYTHONPATH=.\u002F sh hydit\u002Ftrain_ipadapter.sh\n  ```\n\n我们提供了训练好的 IP-Adapter 权重，详情请见 [链接](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHYDiT-IP-Adapter)：\n\n  ```shell\n  cd HunyuanDiT\n  # 使用 huggingface-cli 工具下载模型。\n  # 我们建议在 IP-Adapter 推理时使用模块权重作为基础模型，因为我们提供的预训练权重正是基于这些权重训练的。\n  huggingface-cli download Tencent-Hunyuan\u002FIP-Adapter ipa.pt --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\n  huggingface-cli download Tencent-Hunyuan\u002FIP-Adapter clip_img_encoder.pt  --local-dir .\u002Fckpts\u002Ft2i\u002Fmodel\u002Fclip_img_encoder\n  \n  # 快速入门\n  python3 sample_ipadapter.py  --infer-mode fa --ref-image-path https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8af468f74f94.png --i-scale 1.0 --prompt 一只老虎在海洋中游泳，背景是海洋。构图方式是居中构图，呈现了动漫风格和文化，营造了平静的氛围。 --infer-steps 100 --is-ipa True --load-key distill\n  ```\n\n参考输入及 IP-Adapter 结果示例如下：\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd colspan=\"3\" align=\"center\">参考输入\u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \n\n\n  \n\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8af468f74f94.png\" alt=\"Image 0\" 
width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_b0e648a02174.png\" alt=\"Image 1\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_c6b7fc3f322d.png\" alt=\"Image 2\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \n  \u003C\u002Ftr>\n  \n  \u003Ctr>\n    \u003Ctd colspan=\"3\" align=\"center\">IP-Adapter 输出\u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">一只老虎在奔跑。\u003Cbr>（A tiger running.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">一个卡通美女，抱着一只小猪。\u003Cbr>（A cartoon beauty holding a little pig.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">一片紫色薰衣草地。\u003Cbr>（A purple lavender field.） \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_97841782c229.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0b6805eddd6d.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_dc835c21880c.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\n  \u003C\u002Ftr>\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">一只老虎在看书。\u003Cbr>（A tiger is reading a book.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">一个卡通美女，穿着绿色衣服。\u003Cbr>（A cartoon beauty wearing green clothes.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">一片紫色薰衣草地，有一只可爱的小狗。\u003Cbr>（A purple lavender field with a cute puppy.） \u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_8492f559a6c0.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_4b89c5cbc055.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0c71929ea3f9.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd>\n  \u003C\u002Ftr.\n\n  \u003Ctr>\n    \u003Ctd align=\"center\">一只老虎在咆哮。\u003Cbr>（A tiger is roaring.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">一个卡通美女，戴着墨镜。\u003Cbr>（A cartoon beauty wearing sunglasses.） \u003C\u002Ftd>\n    \u003Ctd align=\"center\">水墨风格,一片紫色薰衣草地。\u003Cbr>（Ink style. A purple lavender field.） \u003C\u002Ftd>\n  \u003C\u002Ftr\n  \u003Ctr>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_6d8a7bd12771.png\" alt=\"Image 3\" width=\"200\"\u002F>\u003C\u002Ftd>\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_d3a9e0270c23.png\" alt=\"Image 4\" width=\"200\"\u002F>\u003C\u002Ftd\n    \u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_a55eaf3e0033.png\" alt=\"Image 5\" width=\"200\"\u002F>\u003C\u002Ftd\n  \u003C\u002Ftr.\n \n\u003C\u002Ftable>\n\n## :art: Hunyuan-Captioner\nHunyuan-Captioner 通过保持高度的图文一致性，满足文本到图像技术的需求。它可以从多个角度生成高质量的图像描述，包括物体描述、物体之间的关系、背景信息、图像风格等。我们的代码基于 [LLaVA](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA) 的实现。\n\n### 示例\n\n\u003Ctd align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_0fa959ae7e39.jpg\" alt=\"Image 3\" 
width=\"1200\"\u002F>\u003C\u002Ftd>\n\n### 使用说明\na. 安装依赖\n\n依赖和安装步骤与 [**基础模型**](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2) 基本相同。\n\nb. 模型下载\n```shell\n# 使用 huggingface-cli 工具下载模型。\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanCaptioner --local-dir .\u002Fckpts\u002Fcaptioner\n```\n\n### 推理\n\n我们的模型支持三种不同的模式，包括：**直接生成中文描述**、**基于特定知识生成中文描述**以及 **直接生成英文描述**。注入的信息可以是准确的提示词，也可以是噪声标签（例如从互联网上爬取的原始描述）。无论输入的是何种信息，模型都能根据插入的信息和图像内容生成可靠且准确的描述。\n\n|模式           | 提示模板                           | 描述                           | \n| ---           | ---                                       | ---                                  |\n|caption_zh     | 描述这张图片                               | 中文描述                    | \n|insert_content | 根据提示词“{}”,描述这张图片                 | 带有插入知识的描述| \n|caption_en     | Please describe the content of this image | 英文描述                    |\n|               |                                           |                                      |\n\na. 单张图片中文推理\n\n```bash\npython mllm\u002Fcaption_demo.py --mode \"caption_zh\" --image_file \"mllm\u002Fimages\u002Fdemo1.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\n```\n\nb. 在描述中插入特定知识\n\n```bash\npython mllm\u002Fcaption_demo.py --mode \"insert_content\" --content \"宫保鸡丁\" --image_file \"mllm\u002Fimages\u002Fdemo2.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\n```\n\nc. 单张图片英文推理\n\n```bash\npython mllm\u002Fcaption_demo.py --mode \"caption_en\" --image_file \"mllm\u002Fimages\u002Fdemo3.png\" --model_path \".\u002Fckpts\u002Fcaptioner\"\n```\n\nd. 
多张图片中文推理\n\n```bash\n### 将多张图片转换为 csv 文件。\npython mllm\u002Fmake_csv.py --img_dir \"mllm\u002Fimages\" --input_file \"mllm\u002Fimages\u002Fdemo.csv\"\n\n### 多张图片推理\npython mllm\u002Fcaption_demo.py --mode \"caption_zh\" --input_file \"mllm\u002Fimages\u002Fdemo.csv\" --output_file \"mllm\u002Fimages\u002Fdemo_res.csv\" --model_path \".\u002Fckpts\u002Fcaptioner\"\n```\n\n（可选）若需将输出的 csv 文件转换为 Arrow 格式，请参阅 [数据准备 #3](#data-preparation) 获取详细说明。\n\n\n### Gradio \n要在本地启动 Gradio 演示，请依次运行以下命令。更多详细说明请参考 [LLaVA](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA)。\n```bash\ncd mllm\npython -m llava.serve.controller --host 0.0.0.0 --port 10000\n\npython -m llava.serve.gradio_web_server --controller http:\u002F\u002F0.0.0.0:10000 --model-list-mode reload --port 443\n\npython -m llava.serve.model_worker --host 0.0.0.0 --controller http:\u002F\u002F0.0.0.0:10000 --port 40000 --worker http:\u002F\u002F0.0.0.0:40000 --model-path \"..\u002Fckpts\u002Fcaptioner\" --model-name LlavaMistral\n```\n随后可以通过 http:\u002F\u002F0.0.0.0:443 访问演示。需要注意的是，这里的 0.0.0.0 需要替换为您的服务器 IP 地址 X.X.X.X。\n\n## 🚀 加速（适用于 Linux）\n\n- 我们提供了 HunyuanDiT 的 TensorRT 版本，用于加速推理（比 flash attention 更快）。\n详情请参见 [Tencent-Hunyuan\u002FTensorRT-libs](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FTensorRT-libs)。\n\n- 我们提供了 HunyuanDiT 的蒸馏版本，用于加速推理。\n详情请参见 [Tencent-Hunyuan\u002FDistillation](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FDistillation)。\n\n## 🔗 BibTeX\n如果您在研究和应用中发现 [Hunyuan-DiT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.08748) 或 [DialogGen](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.08857) 有所帮助，请使用以下 BibTeX 引用：\n\n```BibTeX\n@misc{li2024hunyuandit,\n      title={Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding}, \n      author={Zhimin Li and Jianwei Zhang and Qin Lin and Jiangfeng Xiong and Yanxin Long and Xinchi Deng and Yingfang Zhang and Xingchao Liu and Minbin Huang and Zedong Xiao and Dayou Chen and Jiajun He and 
Jiahao Li and Wenyue Li and Chen Zhang and Rongwei Quan and Jianxiang Lu and Jiabin Huang and Xiaoyan Yuan and Xiaoxiao Zheng and Yixuan Li and Jihong Zhang and Chao Zhang and Meng Chen and Jie Liu and Zheng Fang and Weiyan Wang and Jinbao Xue and Yangyu Tao and Jianchen Zhu and Kai Liu and Sihuan Lin and Yifu Sun and Yun Li and Dongdong Wang and Mingtao Chen and Zhichao Hu and Xiao Xiao and Yan Chen and Yuhong Liu and Wei Liu and Di Wang and Yong Yang and Jie Jiang and Qinglin Lu},\n      year={2024},\n      eprint={2405.08748},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV}\n}\n\n@article{huang2024dialoggen,\n  title={DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation},\n  author={Huang, Minbin and Long, Yanxin and Deng, Xinchi and Chu, Ruihang and Xiong, Jiangfeng and Liang, Xiaodan and Cheng, Hong and Lu, Qinglin and Liu, Wei},\n  journal={arXiv preprint arXiv:2403.08857},\n  year={2024}\n}\n``` \n\n## 星级历史\n\n\u003Ca href=\"https:\u002F\u002Fstar-history.com\u002F#Tencent\u002FHunyuanDiT&Date\">\n \u003Cpicture>\n   \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png&theme=dark\" \u002F>\n   \u003Csource media=\"(prefers-color-scheme: light)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png\" \u002F>\n   \u003Cimg alt=\"星级历史图表\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_readme_28e0c7e62f38.png\" \u002F>\n \u003C\u002Fpicture>\n\u003C\u002Fa>","# HunyuanDiT 快速上手指南\n\nHunyuanDiT 是腾讯开源的一款强大的多分辨率扩散 Transformer 模型，具备精细的中英文双语理解能力，支持文生图、多轮对话生成、ControlNet 控制及 LoRA 微调等功能。\n\n## 环境准备\n\n### 系统要求\n- **操作系统**: Linux (推荐 Ubuntu 18.04+) 或 Windows (需配置相应 CUDA 环境)\n- **GPU**: NVIDIA 显卡，显存建议 **8GB** 以上（最低支持 6GB 显存推理，需使用 lite 脚本）\n- **CUDA**: 11.x 或 12.x\n- **Python**: 3.8 - 3.10\n\n### 
前置依赖\n确保已安装以下基础工具：\n- Git\n- Conda 或 Miniconda (推荐用于环境管理)\n- CUDA Toolkit 和 cuDNN\n\n## 安装步骤\n\n### 1. 克隆代码库\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT.git\ncd HunyuanDiT\n```\n\n### 2. 创建并激活虚拟环境\n```bash\nconda create -n hunyuan-dit python=3.9\nconda activate hunyuan-dit\n```\n\n### 3. 安装依赖\n项目提供了 `requirements.txt`，直接安装即可。国内用户建议使用清华或阿里镜像源加速下载：\n\n```bash\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n> **提示**：如果你希望避免复杂的环境配置，项目官方提供了 Docker 镜像（支持 CUDA 11\u002F12），可直接拉取使用。\n\n### 4. 下载预训练模型\n模型权重托管在 Hugging Face，国内访问较慢，推荐使用 **HF-Mirror** 或 **腾讯云链接**（如果可用）。\n\n**方式一：使用 huggingface-cli (配合镜像)**\n```bash\nexport HF_ENDPOINT=https:\u002F\u002Fhf-mirror.com\nhuggingface-cli download Tencent-Hunyuan\u002FHunyuanDiT-v1.2 --local-dir .\u002Fckpts\n```\n\n**方式二：手动下载**\n访问 [HuggingFace 仓库](https:\u002F\u002Fhuggingface.co\u002FTencent-Hunyuan\u002FHunyuanDiT-v1.2) 下载权重文件，并放置于 `.\u002Fckpts` 目录下。\n\n主要文件包括：\n- `hunyuan-dit-v1.2.pt` (主模型权重)\n- `clip_text_encoder.pt` \u002F `mt5_text_encoder.pt` (文本编码器)\n- `vae.pt` (VAE 权重)\n\n## 基本使用\n\n### 方法一：命令行推理 (CLI)\n这是最基础的用法，适合快速测试。\n\n```bash\npython sample.py \\\n    --prompt \"一只穿着西装的猪在赛博朋克城市中\" \\\n    --model_path .\u002Fckpts\u002Fhunyuan-dit-v1.2.pt \\\n    --image_size 1024 1024 \\\n    --seed 42\n```\n生成的图片默认保存在 `samples\u002F` 目录。\n\n### 方法二：本地 Web 界面 (Gradio)\n启动交互式网页界面，支持多轮对话修改图片。\n\n```bash\npython gradio_server.py \\\n    --model_path .\u002Fckpts\u002Fhunyuan-dit-v1.2.pt\n```\n启动后在浏览器访问显示的地址（通常为 `http:\u002F\u002F127.0.0.1:7860`）。\n\n**使用示例**：\n1. 输入提示词：`画一个木制的鸟`\n2. 点击生成。\n3. 在多轮对话框中输入：`变成玻璃材质的`，模型将基于上下文重新生成。\n\n### 方法三：低显存推理 (6GB VRAM)\n如果你的显存仅为 6GB，请使用 `lite` 目录下的专用脚本：\n\n```bash\npython lite\u002Fsample_lite.py \\\n    --prompt \"风景画，夕阳，山脉\" \\\n    --model_path .\u002Fckpts\u002Fhunyuan-dit-v1.2.pt\n```\n\n### 方法四：集成到 ComfyUI\nHunyuanDiT 已原生支持 ComfyUI。\n1. 将模型文件放入 ComfyUI 的 `models\u002Fcheckpoints` 目录。\n2. 
启动 ComfyUI，加载官方提供的 HunyuanDiT 工作流 JSON 文件即可使用。\n3. 支持 ControlNet 和 LoRA 节点。\n\n---\n*更多高级功能（如 ControlNet、LoRA 训练、TensorRT 加速等）请参考项目仓库中的详细文档。*","某电商公司的设计团队需要为“双十一”大促快速生成大量包含特定中文成语、古诗词或复杂文化元素的商品宣传海报，以迎合国内消费者的审美偏好。\n\n### 没有 HunyuanDiT 时\n- **中文语义理解偏差**：使用国外主流模型时，输入“龙腾虎跃”或“水墨山水”等具有深厚文化底蕴的提示词，生成的图像往往只体现字面意思（如真的画一条龙和老虎在跳），缺乏意境和神韵。\n- **文字渲染乱码**：试图让模型直接在图中生成中文标语时，输出结果多为无法识别的伪汉字或乱码，设计师必须后期手动 PS 添加文字，效率极低。\n- **多分辨率适配困难**：为了适配手机竖屏、PC 横屏及户外大屏等不同渠道，需要针对不同分辨率反复调整参数或裁剪图片，导致主体构图经常崩坏或细节丢失。\n- **细粒度控制缺失**：难以精准控制画面中多个中文元素的相对位置和交互关系，修改一次需求往往需要重新生成数十张图才能碰巧得到一张可用的。\n\n### 使用 HunyuanDiT 后\n- **深度中文语境还原**：HunyuanDiT 凭借细粒度的中文理解能力，能准确捕捉“留白”、“气韵生动”等抽象概念，生成的画面完美契合中国传统美学风格。\n- **原生中文文本支持**：直接输入中文标语即可在图中生成清晰、正确的汉字，大幅减少了后期排版和修图的工作量，实现了从提示词到成图的一站式产出。\n- **灵活的多分辨率生成**：利用其强大的多分辨率 Diffusion Transformer 架构，同一套提示词可直接生成任意比例的高质量图像，且主体结构和细节始终保持一致。\n- **精准的细节操控**：通过自然语言即可精确描述物体间的空间关系（如“灯笼挂在屋檐左下角”），显著提升了首图可用率，将单张海报的平均制作时间从 30 分钟缩短至 5 分钟。\n\nHunyuanDiT 通过突破性的中文语义理解与多分辨率生成技术，彻底解决了本土化创意内容生产中“懂中文难、出图慢、控图弱”的核心痛点。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FTencent-Hunyuan_HunyuanDiT_e2e83cd2.png","Tencent-Hunyuan","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FTencent-Hunyuan_c6e5ecd4.png","",null,"https:\u002F\u002Fhunyuan.tencent.com\u002F","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan",[21,25,29],{"name":22,"color":23,"percentage":24},"Jupyter Notebook","#DA5B0B",68.2,{"name":26,"color":27,"percentage":28},"Python","#3572A5",31.2,{"name":30,"color":31,"percentage":32},"Shell","#89e051",0.6,4293,361,"2026-04-11T08:05:48","NOASSERTION",3,"Linux","需要 NVIDIA GPU。标准推理未明确最低显存，但提供 6GB 显存优化脚本；训练及高性能推理建议更高显存。支持 CUDA 11 和 CUDA 12（官方提供对应 Docker 环境）。V100 显卡需使用 scaled attention 替代 flash attention。","未说明",{"notes":42,"python":40,"dependencies":43},"官方推荐使用 Docker 环境（支持 CUDA 11\u002F12）以简化安装。项目支持多种运行方式：原生 PyTorch 脚本、Hugging Face Diffusers、ComfyUI 节点、Kohya_ss 训练工具以及 Gradio Web 演示。针对低显存用户（6GB VRAM）提供了专门的轻量级推理脚本。支持 LoRA 微调、ControlNet 控制生成及 IP-Adapter 功能。模型权重可通过 HuggingFace 
或腾讯云链接下载。",[44,45,46,47,48,49,50],"PyTorch","transformers","diffusers","accelerate","gradio","ComfyUI","TensorRT (可选加速)",[52,53],"图像","其他",2,"ready","2026-03-27T02:49:30.150509","2026-04-11T23:23:56.587553",[59,64,69,74,79,84],{"id":60,"question_zh":61,"answer_zh":62,"source_url":63},30089,"在哪里可以试用多轮对话生成图片的功能？","您可以通过以下两种方式体验：\n1. **在线体验**：访问 [腾讯混元助手](https:\u002F\u002Fhunyuan.tencent.com\u002Fbot\u002Fchat)，通过连续的语言交互来修改和生成图片。\n2. **本地部署**：运行官方提供的 Demo 脚本。如果您的显存小于 32GB，建议添加 `--load-4bit` 参数启用 4 比特量化（至少需要 22GB 显存）。\n命令如下：\n```bash\npython app\u002FmultiTurnT2I_app.py\n# 显存不足时使用：\npython app\u002FmultiTurnT2I_app.py --load-4bit\n```","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Fissues\u002F3",{"id":65,"question_zh":66,"answer_zh":67,"source_url":68},30090,"运行时出现 'Cannot copy out of meta tensor; no data!' 错误如何解决？","该错误通常与文本增强（Text Enhancement）模块有关。解决方法是关闭文本增强选项。\n请在启动命令中添加 `--no-enhance` 参数，例如：\n```bash\npython app\u002Fhydit_app.py --lang en --no-enhance\n```\n这样即可绕过该错误正常运行。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Fissues\u002F8",{"id":70,"question_zh":71,"answer_zh":72,"source_url":73},30091,"如何在 ComfyUI 中使用 ControlNet 功能？","官方提供了包含 ControlNet 和 LoRA 支持的工作流文件。您可以下载并导入以下 JSON 文件到 ComfyUI 中使用：\n[workflow_lora_controlnet.json](https:\u002F\u002Fgithub.com\u002FTencent\u002FHunyuanDiT\u002Fblob\u002Fmain\u002Fcomfyui-hydit\u002Fworkflow\u002Fworkflow_lora_controlnet.json)\n注意：如果导入后无法运行，请检查是否缺少相关自定义节点，并确保工作流中的模型路径参数配置正确。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Fissues\u002F113",{"id":75,"question_zh":76,"answer_zh":77,"source_url":78},30092,"使用 v1.2 版本权重推理时生成的图片全黑怎么办？","如果您使用 v1.2 版本的权重（特别是 distill 权重）运行 `sample_t2i.py` 得到全黑图片，而使用 main 分支权重正常，这通常是因为采样参数配置不匹配。\n尝试调整 `--beta-end` 参数。在复现案例中，移除或调整该参数可能解决问题。此外，请确保您使用的代码版本与模型权重版本严格对应，建议优先使用 HuggingFace 
上提供的最新权重或主分支代码进行推理。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Fissues\u002F160",{"id":80,"question_zh":81,"answer_zh":82,"source_url":83},30093,"启动 multiTurnT2I_app.py 时在 Gradio 界面提交提示词报错如何处理？","此类错误通常由环境依赖缺失或模型加载失败引起。虽然具体报错堆栈涉及 Gradio 内部调用，但常见的解决思路包括：\n1. 确保已正确安装所有依赖包（参考 requirements.txt）。\n2. 检查模型权重文件是否完整下载到指定目录。\n3. 尝试以纯文本模式运行，排除 UI 渲染问题。\n如果问题依旧，建议查看是否有针对特定 GPU 显存不足的报错，必要时使用 `--load-4bit` 降低显存需求。","https:\u002F\u002Fgithub.com\u002FTencent-Hunyuan\u002FHunyuanDiT\u002Fissues\u002F54",{"id":85,"question_zh":86,"answer_zh":87,"source_url":63},30094,"如何降低 HunyuanDiT 运行时的显存占用？","对于显存有限的用户（如显存小于 32GB），官方推荐启用 4-bit 量化加载模式。\n在运行多轮对话 Demo 或其他推理脚本时，添加 `--load-4bit` 参数。这将把模型量化为 4 比特精度，最低仅需约 22GB 显存即可运行。\n示例命令：\n```bash\npython app\u002FmultiTurnT2I_app.py --load-4bit\n```",[],[90,101,109,116,125,134],{"id":91,"name":92,"github_repo":93,"description_zh":94,"stars":95,"difficulty_score":37,"last_commit_at":96,"category_tags":97,"status":55},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,"2026-04-06T06:32:30",[98,99,52,100],"Agent","开发框架","数据工具",{"id":102,"name":103,"github_repo":104,"description_zh":105,"stars":106,"difficulty_score":37,"last_commit_at":107,"category_tags":108,"status":55},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 
绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[99,52,98],{"id":110,"name":49,"github_repo":111,"description_zh":112,"stars":113,"difficulty_score":54,"last_commit_at":114,"category_tags":115,"status":55},2271,"Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[99,52,98],{"id":117,"name":118,"github_repo":119,"description_zh":120,"stars":121,"difficulty_score":54,"last_commit_at":122,"category_tags":123,"status":55},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[124,98,52,99],"插件",{"id":126,"name":127,"github_repo":128,"description_zh":129,"stars":130,"difficulty_score":37,"last_commit_at":131,"category_tags":132,"status":55},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[133,52,98,99],"语言模型",{"id":135,"name":136,"github_repo":137,"description_zh":138,"stars":139,"difficulty_score":37,"last_commit_at":140,"category_tags":141,"status":55},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[99,52,98,142],"视频"]