[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-jdh-algo--JoyHallo":3,"tool-jdh-algo--JoyHallo":64},[4,18,26,35,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,2,"2026-04-06T11:32:50",[14,15,13],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[43,15,13,14],"语言模型",{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth 
Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[14,15,13,52],"视频",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85013,"2026-04-06T11:09:19",[15,16,52,61,13,62,43,14,63],"插件","其他","音频",{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":75,"owner_avatar_url":76,"owner_bio":77,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":77,"owner_url":78,"languages":79,"stars":88,"forks":89,"last_commit_at":90,"license":91,"difficulty_score":10,"env_os":92,"env_gpu":93,"env_ram":94,"env_deps":95,"category_tags":101,"github_topics":102,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":106,"updated_at":107,"faqs":108,"releases":139},5308,"jdh-algo\u002FJoyHallo","JoyHallo","JoyHallo: Digital human model for Mandarin","JoyHallo 是一款专为中文语境打造的数字人生成模型，能够根据音频驱动生成口型自然、表情生动的真人视频。针对以往音频驱动视频技术在中文场景下面临的难题——如高质量中文数据集稀缺、中文发音对应的唇部运动更为复杂等，JoyHallo 通过构建包含 29 小时多样化中文演讲视频的 jdh-Hallo 数据集，并引入中文 wav2vec2 模型进行音频特征嵌入，显著提升了中文视频生成的准确度与流畅度。\n\n该模型采用半解耦架构，有效捕捉唇形、表情与姿态之间的关联，在提升信息利用效率的同时，将推理速度加快了 14.3%。值得一提的是，JoyHallo 不仅擅长中文视频生成，还保留了出色的英文跨语言生成能力，适用场景更加广泛。\n\nJoyHallo 非常适合 AI 
开发者、多媒体研究人员以及数字内容创作者使用，尤其适用于需要制作中文虚拟主播、在线教育视频、医疗科普内容等场景。其开源特性与清晰的部署文档，也让具备一定技术基础的用户能够快速上手实验与二次开发。目前模型权重已托管于 Hugging Face，支持社区自由下载与体验。","\u003Ch1 align='center'>JoyHallo: Digital human model for Mandarin\u003C\u002Fh1>\n\n\u003Cdiv align='center'>\n    \u003Ca href='https:\u002F\u002Fgithub.com\u002FDBDXSS' target='_blank'>Sheng Shi\u003C\u002Fa> \n    \u003Ca href='https:\u002F\u002Fgithub.com\u002Fxuyangcao' target='_blank'>Xuyang Cao\u003C\u002Fa> \n    \u003Ca href='https:\u002F\u002Fgithub.com\u002Fzhaojun060708' target='_blank'>Jun Zhao\u003C\u002Fa> \n    Guoxin Wang\n\u003C\u002Fdiv>\n\u003Cdiv align='center'>\n    JD Health International Inc.\n\u003C\u002Fdiv>\n\n\u003Cbr>\n\u003Cdiv align='center'>\n    \u003Ca href='https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjdh-algo\u002FJoyHallo?style=social'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fjdh-algo.github.io\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FProject-HomePage-Green'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.13268'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-Arxiv-red'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F%F0%9F%A4%97%20HuggingFace-Model-yellow'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fjdh-algo\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F%F0%9F%A4%97%20HuggingFace-Demo-yellow'>\u003C\u002Fa>\n\u003C\u002Fdiv>\n\u003Cbr>\n\n## 📖 Introduction\n\nIn audio-driven video generation, creating Mandarin videos presents significant challenges. Collecting comprehensive Mandarin datasets is difficult, and the complex lip movements in Mandarin further complicate model training compared to English. 
In this study, we collected 29 hours of Mandarin speech video from JD Health International Inc. employees, resulting in the jdh-Hallo dataset. This dataset includes a diverse range of ages and speaking styles, encompassing both conversational and specialized medical topics. To adapt the JoyHallo model for Mandarin, we employed the Chinese wav2vec2 model for audio feature embedding. A semi-decoupled structure is proposed to capture inter-feature relationships among lip, expression, and pose features. This integration not only improves information utilization efficiency but also accelerates inference speed by 14.3%. Notably, JoyHallo maintains its strong ability to generate English videos, demonstrating excellent cross-language generation capabilities.\n\n## 📰 News\n\n- 2024\u002F11\u002F19: 🎉🎉🎉 We are proud to introduce our brand new model JoyVASA. It is lighter and faster than other generative models. Here is the [HomePage](https:\u002F\u002Fjdh-algo.github.io\u002FJoyVASA\u002F).\n\n## 🎬 Videos-Mandarin-Woman\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F389e053f-e0c4-433c-8c60-80f9181d3f9c\n\n## 🎬 Videos-Mandarin-Man\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F1694efd9-2577-4bb5-ada4-7aa711d016a6\n\n## 🎬 Videos-English\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002Fd6b2efea-be76-442e-a8aa-ea0eef8b5f12\n\n## 🧳 Framework\n\n![Network](assets\u002Fnetwork.png \"Network\")\n\n## ⚙️ Installation\n\nSystem requirements:\n\n- Tested on Ubuntu 20.04, Cuda 11.3\n- Tested GPUs: A100\n\nCreate environment:\n\n```bash\n# 1. Create base environment\nconda create -n joyhallo python=3.10 -y\nconda activate joyhallo\n\n# 2. Install requirements\npip install -r requirements.txt\n\n# 3. Install ffmpeg\nsudo apt-get update  \nsudo apt-get install ffmpeg -y\n```\n\n## 🎒 Prepare model checkpoints\n\n### 1. 
Download base checkpoints\n\nUse the following command to download the base weights:\n\n```shell\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Ffudan-generative-ai\u002Fhallo pretrained_models\n```\n\n### 2. Download chinese-wav2vec2-base model\n\nUse the following command to download the `chinese-wav2vec2-base` model:\n\n```shell\ncd pretrained_models\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002FTencentGameMate\u002Fchinese-wav2vec2-base \n```\n\n### 3. Download JoyHallo model\n\n```bash\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1 pretrained_models\u002Fjoyhallo\n```\n\nFor convenience, we have uploaded the model weights to **Hugging Face**.\n\n|  Model  |  Dataset  |                     Hugging Face                     |\n| :------: | :-------: | :--------------------------------------------------: |\n| JoyHallo | jdh-Hallo | [JoyHallo](https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1) |\n\n### 4. 
pretrained_models contents\n\nThe final `pretrained_models` directory should look like this:\n\n```text\n.\u002Fpretrained_models\u002F\n|-- audio_separator\u002F\n|   |-- download_checks.json\n|   |-- mdx_model_data.json\n|   |-- vr_model_data.json\n|   `-- Kim_Vocal_2.onnx\n|-- face_analysis\u002F\n|   `-- models\u002F\n|       |-- face_landmarker_v2_with_blendshapes.task\n|       |-- 1k3d68.onnx\n|       |-- 2d106det.onnx\n|       |-- genderage.onnx\n|       |-- glintr100.onnx\n|       `-- scrfd_10g_bnkps.onnx\n|-- hallo\u002F\n|   `-- net.pth\n|-- joyhallo\u002F\n|   `-- net.pth\n|-- motion_module\u002F\n|   `-- mm_sd_v15_v2.ckpt\n|-- sd-vae-ft-mse\u002F\n|   |-- config.json\n|   `-- diffusion_pytorch_model.safetensors\n|-- stable-diffusion-v1-5\u002F\n|   `-- unet\u002F\n|       |-- config.json\n|       `-- diffusion_pytorch_model.safetensors\n|-- wav2vec\u002F\n|   `-- wav2vec2-base-960h\u002F\n|       |-- config.json\n|       |-- feature_extractor_config.json\n|       |-- model.safetensors\n|       |-- preprocessor_config.json\n|       |-- special_tokens_map.json\n|       |-- tokenizer_config.json\n|       `-- vocab.json\n`-- chinese-wav2vec2-base\u002F\n    |-- chinese-wav2vec2-base-fairseq-ckpt.pt\n    |-- config.json\n    |-- preprocessor_config.json\n    `-- pytorch_model.bin\n```\n\n## 🚧 Data requirements\n\n**Image**:\n\n- Cropped to square shape.\n- Face should be facing forward and occupy 50%-70% of the image.\n\n**Audio**:\n\n- Use `wav` format.\n- Mandarin, English or mixed, with clear audio and suitable background music.\n\n> [!IMPORTANT]\n> These requirements apply to **both training and inference processes**.\n\n## 🚀 Inference\n\n### 1. Inference with command line\n\nUse the following command to perform inference:\n\n```bash\nsh joyhallo-infer.sh\n```\n\n> [!TIP]\n> If you want to improve the inference speed, you can change the `inference_steps` from **40** to **15** in `configs\u002Finference\u002Finference.yaml`. 
That will enhance the efficiency immediately. You can decrease that even more, but you may get a worse result. You can try changing `cfg_scale` together.\n\nModify the parameters in `configs\u002Finference\u002Finference.yaml` to specify the audio and image files you want to use, as well as switch between models. The inference results will be saved in `opts\u002Fjoyhallo`. The parameters in `inference.yaml` are explained as follows:\n\n* audio_ckpt_dir: Path to the model weights.\n* ref_img_path: Path to the reference images.\n* audio_path: Path to the reference audios.\n* output_dir: Output directory.\n* exp_name: Output file folder name.\n\n### 2. Inference with web demo\n\nUse the following command to start web demo:\n\n```bash\nsh joyhallo-app.sh\n```\n\nThe demo will be created at [http:\u002F\u002F127.0.0.1:7860](http:\u002F\u002F127.0.0.1:7860).\n\n## ⚓️ Train or fine-tune JoyHallo\n\nYou have two options when training or fine-tuning the model: start from **Stage 1** or only train **Stage 2**.\n\n### 1. Use the following command to start training from Stage 1\n\n```\nsh joyhallo-alltrain.sh\n```\n\nThis will automatically start training both stages (including Stage 1 and Stage 2), and you can adjust the training parameters by referring to `configs\u002Ftrain\u002Fstage1_alltrain.yaml` and `configs\u002Ftrain\u002Fstage2_alltrain.yaml`.\n\n### 2. Use the following command to train only Stage 2\n\n```\nsh joyhallo-train.sh\n```\n\nThis will start training from **Stage 2**, and you can adjust the training parameters by referring to `configs\u002Ftrain\u002Fstage2.yaml`.\n\n## 🎓 Prepare training data\n\n### 1. Prepare the data in the following directory structure, ensuring that the data meets the requirements mentioned earlier\n\n```text\njdh-Hallo\u002F\n|-- videos\u002F\n|   |-- 0001.mp4\n|   |-- 0002.mp4\n|   |-- 0003.mp4\n|   `-- 0004.mp4\n```\n\n### 2. Use the following command to process the dataset\n\n```bash\n# 1. 
Extract features from videos.\npython -m scripts.data_preprocess --input_dir jdh-Hallo\u002Fvideos --step 1 -p 1 -r 0\npython -m scripts.data_preprocess --input_dir jdh-Hallo\u002Fvideos --step 2 -p 1 -r 0\n\n# 2. Get jdh-Hallo dataset.\npython scripts\u002Fextract_meta_info_stage1.py -r jdh-Hallo -n jdh-Hallo\npython scripts\u002Fextract_meta_info_stage2.py -r jdh-Hallo -n jdh-Hallo\n```\n\n> [!NOTE]\n> Execute steps 1 and 2 sequentially as they perform different tasks. Step 1 converts videos into frames, extracts audio from each video, and generates the necessary masks. Step 2 generates face embeddings using InsightFace and audio embeddings using Chinese wav2vec2, and requires a GPU. For parallel processing, use the `-p` and `-r` arguments. The `-p` argument specifies the total number of instances to launch, dividing the data into `p` parts. The `-r` argument specifies which part the current process should handle. You need to manually launch multiple instances with different values for `-r`.\n\n## 💻 Comparison\n\n### 1. 
Accuracy comparison in Mandarin\n\n|  Model  | IQA $\\uparrow$ | VQA $\\uparrow$ | Sync-C $\\uparrow$ | Sync-D $\\downarrow$ | Smooth $\\uparrow$ | Subject $\\uparrow$ | Background $\\uparrow$ |\n| :------: | :--------------: | :--------------: | :----------------: | :------------------: | :----------------: | :-----------------: | :--------------------: |\n|  Hallo  | **0.7865** |      0.8563      |       5.7420       |  **13.8140**  |       0.9924       |       0.9855       |    **0.9651**    |\n| JoyHallo |      0.7781      | **0.8566** |  **6.1596**  |       14.2053       |  **0.9925**  |  **0.9864**  |         0.9627         |\n\nNotes: The evaluation metrics used here are from the following repositories, and the results are for reference purposes only:\n\n- IQA and VQA: [Q-Align](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Align)\n- Sync-C and Sync-D: [Syncnet](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fsyncnet_python)\n- Smooth, Subject, and Background: [VBench](https:\u002F\u002Fgithub.com\u002FVchitect\u002FVBench)\n\n### 2. Accuracy comparison in English\n\n|  Model  | IQA $\\uparrow$ | VQA $\\uparrow$ | Sync-C $\\uparrow$ | Sync-D $\\downarrow$ | Smooth $\\uparrow$ | Subject $\\uparrow$ | Background $\\uparrow$ |\n| :------: | :--------------: | :--------------: | :----------------: | :------------------: | :----------------: | :-----------------: | :--------------------: |\n|  Hallo  | **0.7779** |      0.8471      |       4.4093       |  **13.2340**  |       0.9921       |       0.9814       |    **0.9649**    |\n| JoyHallo | **0.7779** | **0.8537** |  **4.7658**  |       13.3617       |  **0.9922**  |  **0.9838**  |         0.9622         |\n\n### 3. 
Inference efficiency comparison\n\n|                              | JoyHallo | Hallo |   Improvement   |\n| :---------------------------: | :------: | :----: | :-------------: |\n| GPU Memory (512*512, step 40) |  19049m  | 19547m | **2.5%** |\n|  Inference Speed (16 frames)  |   24s   |  28s  | **14.3%** |\n\n## 📝 Citations\n\nIf you find our work helpful, please consider citing us:\n\n```\n@misc{shi2024joyhallo,\n  title={JoyHallo: Digital human model for Mandarin}, \n  author={Sheng Shi and Xuyang Cao and Jun Zhao and Guoxin Wang},\n  year={2024},\n  eprint={2409.13268},\n  archivePrefix={arXiv},\n  primaryClass={cs.CV},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.13268}, \n}\n```\n\n## 🤝 Acknowledgments\n\nWe would like to thank the contributors to the [Hallo](https:\u002F\u002Fgithub.com\u002Ffudan-generative-vision\u002Fhallo), [wav2vec 2.0](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffairseq\u002Ftree\u002Fmain\u002Fexamples\u002Fwav2vec), [Chinese-wav2vec2](https:\u002F\u002Fgithub.com\u002FTencentGameMate\u002Fchinese_speech_pretrain), [Q-Align](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Align), [Syncnet](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fsyncnet_python), [VBench](https:\u002F\u002Fgithub.com\u002FVchitect\u002FVBench), and [Moore-AnimateAnyone](https:\u002F\u002Fgithub.com\u002FMooreThreads\u002FMoore-AnimateAnyone) repositories, for their open research and extraordinary work.\n","\u003Ch1 align='center'>JoyHallo：中文数字人模型\u003C\u002Fh1>\n\n\u003Cdiv align='center'>\n    \u003Ca href='https:\u002F\u002Fgithub.com\u002FDBDXSS' target='_blank'>Sheng Shi\u003C\u002Fa> \n    \u003Ca href='https:\u002F\u002Fgithub.com\u002Fxuyangcao' target='_blank'>Xuyang Cao\u003C\u002Fa> \n    \u003Ca href='https:\u002F\u002Fgithub.com\u002Fzhaojun060708' target='_blank'>Jun Zhao\u003C\u002Fa> \n    Guoxin Wang\n\u003C\u002Fdiv>\n\u003Cdiv align='center'>\n    京东健康国际公司\n\u003C\u002Fdiv>\n\n\u003Cbr>\n\u003Cdiv align='center'>\n    
\u003Ca href='https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjdh-algo\u002FJoyHallo?style=social'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fjdh-algo.github.io\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FProject-HomePage-Green'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.13268'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPaper-Arxiv-red'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F%F0%9F%A4%97%20HuggingFace-Model-yellow'>\u003C\u002Fa>\n    \u003Ca href='https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fjdh-algo\u002FJoyHallo'>\u003Cimg src='https:\u002F\u002Fimg.shields.io\u002Fbadge\u002F%F0%9F%A4%97%20HuggingFace-Demo-yellow'>\u003C\u002Fa>\n\u003C\u002Fdiv>\n\u003Cbr>\n\n## 📖 引言\n\n在音频驱动的视频生成任务中，制作中文视频面临着诸多挑战。一方面，高质量的中文数据集收集难度较大；另一方面，中文复杂的唇部动作使得模型训练相较于英文更为复杂。为此，我们从京东健康国际公司的员工处收集了29小时的中文语音视频数据，构建了jdh-Hallo数据集。该数据集涵盖了不同年龄段和多种说话风格，内容既包括日常对话也涉及专业医学领域。为了使JoyHallo模型适应中文场景，我们引入了Chinese wav2vec2模型进行音频特征提取，并提出了一种半解耦结构来捕捉唇部、表情和姿态特征之间的相互关系。这一改进不仅提升了信息利用效率，还使推理速度提升了14.3%。值得注意的是，JoyHallo依然保持了其强大的英文视频生成能力，展现出出色的跨语言生成性能。\n\n## 📰 新闻\n\n- 2024年11月19日：🎉🎉🎉 我们自豪地推出全新模型JoyVASA。它比其他生成模型更轻量、更快。点击此处查看[主页](https:\u002F\u002Fjdh-algo.github.io\u002FJoyVASA\u002F)。\n\n## 🎬 视频—中文—女性\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F389e053f-e0c4-433c-8c60-80f9181d3f9c\n\n## 🎬 视频—中文—男性\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F1694efd9-2577-4bb5-ada4-7aa711d016a6\n\n## 🎬 视频—英文\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002Fd6b2efea-be76-442e-a8aa-ea0eef8b5f12\n\n## 🧳 框架\n\n![网络架构](assets\u002Fnetwork.png \"网络架构\")\n\n## ⚙️ 安装\n\n系统要求：\n\n- 经测试支持Ubuntu 20.04及CUDA 11.3\n- 测试GPU：A100\n\n创建环境：\n\n```bash\n# 1. 
创建基础环境\nconda create -n joyhallo python=3.10 -y\nconda activate joyhallo\n\n# 2. 安装依赖\npip install -r requirements.txt\n\n# 3. 安装ffmpeg\nsudo apt-get update  \nsudo apt-get install ffmpeg -y\n```\n\n## 🎒 准备模型检查点\n\n### 1. 下载基础检查点\n\n使用以下命令下载基础权重：\n\n```shell\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Ffudan-generative-ai\u002Fhallo pretrained_models\n```\n\n### 2. 下载chinese-wav2vec2-base模型\n\n使用以下命令下载`chinese-wav2vec2-base`模型：\n\n```bash\ncd pretrained_models\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002FTencentGameMate\u002Fchinese-wav2vec2-base \n```\n\n### 3. 下载JoyHallo模型\n\n```bash\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1 pretrained_models\u002Fjoyhallo\n```\n\n为方便起见，我们已将模型权重上传至**Hugging Face**平台。\n\n| 模型     | 数据集       | Hugging Face 链接                     |\n| :-------: | :----------: | :-----------------------------------: |\n| JoyHallo  | jdh-Hallo    | [JoyHallo](https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1) |\n\n### 4. 
pretrained_models 目录结构\n\n最终的`pretrained_models`目录应如下所示：\n\n```text\n.\u002Fpretrained_models\u002F\n|-- audio_separator\u002F\n|   |-- download_checks.json\n|   |-- mdx_model_data.json\n|   |-- vr_model_data.json\n|   `-- Kim_Vocal_2.onnx\n|-- face_analysis\u002F\n|   `-- models\u002F\n|       |-- face_landmarker_v2_with_blendshapes.task\n|       |-- 1k3d68.onnx\n|       |-- 2d106det.onnx\n|       |-- genderage.onnx\n|       |-- glintr100.onnx\n|       `-- scrfd_10g_bnkps.onnx\n|-- hallo\u002F\n|   `-- net.pth\n|-- joyhallo\u002F\n|   `-- net.pth\n|-- motion_module\u002F\n|   `-- mm_sd_v15_v2.ckpt\n|-- sd-vae-ft-mse\u002F\n|   |-- config.json\n|   `-- diffusion_pytorch_model.safetensors\n|-- stable-diffusion-v1-5\u002F\n|   `-- unet\u002F\n|       |-- config.json\n|       `-- diffusion_pytorch_model.safetensors\n|-- wav2vec\u002F\n|   `-- wav2vec2-base-960h\u002F\n|       |-- config.json\n|       |-- feature_extractor_config.json\n|       |-- model.safetensors\n|       |-- preprocessor_config.json\n|       |-- special_tokens_map.json\n|       |-- tokenizer_config.json\n|       `-- vocab.json\n`-- chinese-wav2vec2-base\u002F\n    |-- chinese-wav2vec2-base-fairseq-ckpt.pt\n    |-- config.json\n    |-- preprocessor_config.json\n    `-- pytorch_model.bin\n```\n\n## 🚧 数据要求\n\n**图像**：\n\n- 裁剪为正方形。\n- 人脸需正面朝向，占据图像的50%–70%。\n\n**音频**：\n\n- 格式为`wav`。\n- 支持中文、英文或混合语种，要求音频清晰且背景音乐适宜。\n\n> [!IMPORTANT]\n> 以上要求适用于**训练与推理过程**。\n\n## 🚀 推理\n\n### 1. 命令行推理\n\n使用以下命令执行推理：\n\n```bash\nsh joyhallo-infer.sh\n```\n\n> [!TIP]\n> 若希望提升推理速度，可将`configs\u002Finference\u002Finference.yaml`中的`inference_steps`由**40**调整为**15**，这将立即提高效率。您还可以进一步降低该值，但可能会导致生成效果变差。同时可以尝试调整`cfg_scale`参数。\n\n通过修改`configs\u002Finference\u002Finference.yaml`中的参数，您可以指定使用的音频和图像文件，并切换不同的模型。推理结果将保存在`opts\u002Fjoyhallo`目录下。该配置文件中的参数说明如下：\n\n* audio_ckpt_dir：模型权重路径。\n* ref_img_path：参考图像路径。\n* audio_path：参考音频路径。\n* output_dir：输出目录。\n* exp_name：输出文件夹名称。\n\n### 2. 
Web演示推理\n\n使用以下命令启动Web演示：\n\n```bash\nsh joyhallo-app.sh\n```\n\n演示将在[http:\u002F\u002F127.0.0.1:7860](http:\u002F\u002F127.0.0.1:7860)上运行。\n\n## ⚓️ 训练或微调JoyHallo\n\n在训练或微调模型时，您有两个选择：可以从**阶段1**开始，或者仅训练**阶段2**。\n\n### 1. 使用以下命令从阶段1开始训练\n\n```\nsh joyhallo-alltrain.sh\n```\n\n这将自动启动两个阶段的训练（包括阶段1和阶段2），你可以参考 `configs\u002Ftrain\u002Fstage1_alltrain.yaml` 和 `configs\u002Ftrain\u002Fstage2_alltrain.yaml` 来调整训练参数。\n\n### 2. 使用以下命令仅训练阶段2\n\n```\nsh joyhallo-train.sh\n```\n\n这将从 **阶段2** 开始训练，你可以参考 `configs\u002Ftrain\u002Fstage2.yaml` 来调整训练参数。\n\n## 🎓 准备训练数据\n\n### 1. 按照以下目录结构准备数据，确保数据符合前面提到的要求\n\n```text\njdh-Hallo\u002F\n|-- videos\u002F\n|   |-- 0001.mp4\n|   |-- 0002.mp4\n|   |-- 0003.mp4\n|   `-- 0004.mp4\n```\n\n### 2. 使用以下命令处理数据集\n\n```bash\n# 1. 从视频中提取特征。\npython -m scripts.data_preprocess --input_dir jdh-Hallo\u002Fvideos --step 1 -p 1 -r 0\npython -m scripts.data_preprocess --input_dir jdh-Hallo\u002Fvideos --step 2 -p 1 -r 0\n\n# 2. 获取jdh-Hallo数据集。\npython scripts\u002Fextract_meta_info_stage1.py -r jdh-Hallo -n jdh-Hallo\npython scripts\u002Fextract_meta_info_stage2.py -r jdh-Hallo -n jdh-Hallo\n```\n\n> [!NOTE]\n> 请按顺序执行步骤1和步骤2，因为它们执行不同的任务。步骤1会将视频转换为帧、从每个视频中提取音频，并生成所需的掩码。步骤2则使用InsightFace生成人脸嵌入，使用Chinese wav2vec2生成音频嵌入，且需要GPU支持。若需并行处理，可使用 `-p` 和 `-r` 参数。其中 `-p` 参数指定要启动的实例总数，将数据分成 `p` 份；`-r` 参数指定当前进程应处理的那部分数据。你需要手动启动多个实例，并为每个实例设置不同的 `-r` 值。\n\n## 💻 对比\n\n### 1. 
普通话下的准确率对比\n\n|  模型  | IQA $\\uparrow$ | VQA $\\uparrow$ | Sync-C $\\uparrow$ | Sync-D $\\downarrow$ | Smooth $\\uparrow$ | Subject $\\uparrow$ | Background $\\uparrow$ |\n| :------: | :--------------: | :--------------: | :----------------: | :------------------: | :----------------: | :-----------------: | :--------------------: |\n|  Hallo  | **0.7865** |      0.8563      |       5.7420       |  **13.8140**  |       0.9924       |       0.9855       |    **0.9651**    |\n| JoyHallo |      0.7781      | **0.8566** |  **6.1596**  |       14.2053       |  **0.9925**  |  **0.9864**  |         0.9627         |\n\n注：此处使用的评估指标来自以下仓库，结果仅供参考：\n\n- IQA和VQA：[Q-Align](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Align)\n- Sync-C和Sync-D：[Syncnet](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fsyncnet_python)\n- Smooth、Subject和Background：[VBench](https:\u002F\u002Fgithub.com\u002FVchitect\u002FVBench)\n\n### 2. 英语下的准确率对比\n\n|  模型  | IQA $\\uparrow$ | VQA $\\uparrow$ | Sync-C $\\uparrow$ | Sync-D $\\downarrow$ | Smooth $\\uparrow$ | Subject $\\uparrow$ | Background $\\uparrow$ |\n| :------: | :--------------: | :--------------: | :----------------: | :------------------: | :----------------: | :-----------------: | :--------------------: |\n|  Hallo  | **0.7779** |      0.8471      |       4.4093       |  **13.2340**  |       0.9921       |       0.9814       |    **0.9649**    |\n| JoyHallo | **0.7779** | **0.8537** |  **4.7658**  |       13.3617       |  **0.9922**  |  **0.9838**  |         0.9622         |\n\n### 3. 
推理效率对比\n\n|                              | JoyHallo | Hallo |   提升   |\n| :---------------------------: | :------: | :----: | :-------------: |\n| GPU内存（512*512，步数40） |  19049m  | 19547m | **2.5%** |\n|  推理速度（16帧）  |   24s   |  28s  | **14.3%** |\n\n## 📝 引用\n\n如果你觉得我们的工作对你有帮助，请考虑引用我们：\n\n```\n@misc{shi2024joyhallo,\n  title={JoyHallo: Digital human model for Mandarin}, \n  author={Sheng Shi and Xuyang Cao and Jun Zhao and Guoxin Wang},\n  year={2024},\n  eprint={2409.13268},\n  archivePrefix={arXiv},\n  primaryClass={cs.CV},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.13268}, \n}\n```\n\n## 🤝 致谢\n\n我们衷心感谢 [Hallo](https:\u002F\u002Fgithub.com\u002Ffudan-generative-vision\u002Fhallo)、[wav2vec 2.0](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffairseq\u002Ftree\u002Fmain\u002Fexamples\u002Fwav2vec)、[Chinese-wav2vec2](https:\u002F\u002Fgithub.com\u002FTencentGameMate\u002Fchinese_speech_pretrain)、[Q-Align](https:\u002F\u002Fgithub.com\u002FQ-Future\u002FQ-Align)、[Syncnet](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fsyncnet_python)、[VBench](https:\u002F\u002Fgithub.com\u002FVchitect\u002FVBench) 以及 [Moore-AnimateAnyone](https:\u002F\u002Fgithub.com\u002FMooreThreads\u002FMoore-AnimateAnyone) 等项目的贡献者们，感谢他们开放的研究精神和卓越的工作成果。","# JoyHallo 快速上手指南\n\nJoyHallo 是一个专为普通话设计的数字人视频生成模型，由京东健康团队开源。它支持中、英及混合语言驱动，能够生成口型自然、表情丰富的数字人视频。\n\n## 1. 环境准备\n\n在开始之前，请确保您的系统满足以下要求：\n\n*   **操作系统**: Ubuntu 20.04 (推荐)\n*   **GPU**: 推荐使用 NVIDIA A100 或其他高性能显卡\n*   **CUDA 版本**: 11.3\n*   **Python 版本**: 3.10\n*   **其他依赖**: `ffmpeg`, `git`, `git-lfs`\n\n> **提示**：国内用户建议配置 pip 国内镜像源（如清华源或阿里源）以加速依赖下载。\n\n## 2. 安装步骤\n\n### 2.1 创建虚拟环境并安装依赖\n\n```bash\n# 1. 创建基础环境\nconda create -n joyhallo python=3.10 -y\nconda activate joyhallo\n\n# 2. 安装 Python 依赖\npip install -r requirements.txt\n\n# 3. 
安装 ffmpeg (Ubuntu\u002FDebian)\nsudo apt-get update  \nsudo apt-get install ffmpeg -y\n```\n\n### 2.2 下载模型权重\n\n项目依赖多个预训练模型，请按顺序执行以下命令下载。所有模型将存放在 `pretrained_models` 目录中。\n\n```bash\n# 初始化 git lfs\ngit lfs install\n\n# 1. 下载 Hallo 基础权重\ngit clone https:\u002F\u002Fhuggingface.co\u002Ffudan-generative-ai\u002Fhallo pretrained_models\n\n# 2. 下载中文语音特征提取模型 (chinese-wav2vec2-base)\ncd pretrained_models\ngit clone https:\u002F\u002Fhuggingface.co\u002FTencentGameMate\u002Fchinese-wav2vec2-base \ncd ..\n\n# 3. 下载 JoyHallo 主模型\ngit clone https:\u002F\u002Fhuggingface.co\u002Fjdh-algo\u002FJoyHallo-v1 pretrained_models\u002Fjoyhallo\n```\n\n> **注意**：如果 Hugging Face 下载速度慢，国内用户可尝试使用镜像站（如 `hf-mirror.com`）或在命令前设置环境变量：\n> `export HF_ENDPOINT=https:\u002F\u002Fhf-mirror.com`\n\n下载完成后，`pretrained_models` 目录结构应包含 `hallo`, `joyhallo`, `chinese-wav2vec2-base`, `wav2vec`, `face_analysis` 等文件夹。\n\n## 3. 基本使用\n\n### 3.1 数据准备\n\n推理需要一张参考图片和一段音频文件：\n*   **图片 (`ref_img`)**: 正方形裁剪，人脸正对镜头，占比约 50%-70%。\n*   **音频 (`audio`)**: `.wav` 格式，支持普通话、英语或混合发音，声音清晰。\n\n### 3.2 命令行推理\n\n修改配置文件 `configs\u002Finference\u002Finference.yaml`，指定您的图片路径、音频路径及输出目录。主要参数说明：\n*   `ref_img_path`: 参考图片路径\n*   `audio_path`: 驱动音频路径\n*   `output_dir`: 结果保存目录\n*   `exp_name`: 输出文件夹名称\n\n执行推理脚本：\n\n```bash\nsh joyhallo-infer.sh\n```\n\n生成的视频将保存在 `opts\u002Fjoyhallo` 目录下。\n\n> **加速技巧**：若需提升推理速度，可编辑 `configs\u002Finference\u002Finference.yaml`，将 `inference_steps` 从 **40** 改为 **15**。这会显著减少耗时，但可能会轻微影响画质。\n\n### 3.3 Web Demo 体验\n\n如果您更喜欢图形化界面，可以启动本地 Web Demo：\n\n```bash\nsh joyhallo-app.sh\n```\n\n启动后，在浏览器访问 `http:\u002F\u002F127.0.0.1:7860` 即可上传素材并生成视频。","某互联网医疗平台的内容运营团队急需将最新的普通话健康科普文章快速转化为真人讲解视频，以触达更多中老年用户。\n\n### 没有 JoyHallo 时\n- **发音生硬不自然**：通用的数字人模型多基于英语训练，强行用于中文时唇形与声调严重不同步，导致“口型对不上”，观众观感极差。\n- **数据采集成本高昂**：为了获得自然的中文口型，团队需花费数周时间录制特定医生的高清视频素材，且难以覆盖多样化的年龄和语速风格。\n- **跨语言支持缺失**：若需同时制作中英文双语版本，必须分别寻找两套不同的生成方案或模型，工作流割裂且维护成本高。\n- **推理速度缓慢**：现有开源方案在处理复杂的中文唇部运动特征时计算冗余大，生成一段 1 分钟视频往往需要数小时，无法满足每日更新需求。\n\n### 使用 JoyHallo 后\n- 
**中文唇形精准同步**：JoyHallo 专为普通话优化，内置中文 wav2vec2 音频嵌入，能完美捕捉中文特有的复杂唇部动作，生成的视频口型自然流畅。\n- **零样本快速启动**：依托其训练的 jdh-Hallo 数据集（涵盖不同年龄与医疗话题），团队无需重新采集数据，直接输入文本和音频即可生成专业风格的医生讲解视频。\n- **中英双语无缝切换**：JoyHallo 具备强大的跨语言生成能力，同一套模型即可高质量输出中文和英文视频，大幅简化了多语言内容的生产流程。\n- **推理效率显著提升**：得益于半解耦结构对特征关系的高效捕捉，JoyHallo 将推理速度提升了 14.3%，让批量生成科普视频成为实时可行的操作。\n\nJoyHallo 通过专攻中文语音难点与架构优化，让高质量的普通话数字人视频生成变得低成本、高效率且自然逼真。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjdh-algo_JoyHallo_dbb0b6e8.png","jdh-algo","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fjdh-algo_cecb645d.png",null,"https:\u002F\u002Fgithub.com\u002Fjdh-algo",[80,84],{"name":81,"color":82,"percentage":83},"Python","#3572A5",99.8,{"name":85,"color":86,"percentage":87},"Shell","#89e051",0.2,521,51,"2026-03-31T08:52:52","MIT","Linux","需要 NVIDIA GPU，测试型号为 A100，CUDA 11.3","未说明",{"notes":96,"python":97,"dependencies":98},"仅在 Ubuntu 20.04 上测试通过；需安装 ffmpeg；模型权重需从 Hugging Face 下载（包含基础模型、中文 wav2vec2 及 JoyHallo 专用权重）；推理时若将步数从 40 减至 15 可提升速度但可能降低质量。","3.10",[99,100],"requirements.txt 中定义的依赖","ffmpeg",[63,15],[103,104,105],"audio-driven-talking-face","generative-ai","mandarin-chinese","2026-03-27T02:49:30.150509","2026-04-08T10:17:35.009995",[109,114,119,124,129,134],{"id":110,"question_zh":111,"answer_zh":112,"source_url":113},24076,"Gradio 推理后合成音频到视频时报错 TypeError: must be real number, not NoneType 怎么办？","该问题通常由 moviepy 或 decorator 包的版本冲突引起。请运行以下命令卸载并重新安装 moviepy 来解决：\npip uninstall moviepy decorator\npip install moviepy\n如果问题依旧，可能是环境变量或镜像源导致，建议更换环境重试。","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F7",{"id":115,"question_zh":116,"answer_zh":117,"source_url":118},24077,"WebUI 生成时报错 AttributeError: __pydantic_core_schema__ 如何解决？","这是由于 fastapi 版本过高导致的兼容性问题。请将 fastapi 降级到 0.112.4 版本即可解决：\npip install fastapi==0.112.4","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F12",{"id":120,"question_zh":121,"answer_zh":122,"source_url":123},24078,"单卡显存不足（如 
16GB）无法运行推理，支持双卡推理吗？","目前不支持使用两张显卡进行单次推理。对于 16GB 显存的显卡，建议移除代码中的 `accelerate` 启动方式，直接使用 python 脚本启动推理。使用 accelerate 会占用更多显存，直接运行可以显著降低显存占用（约 15.8GB），从而在单卡上运行成功。","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F23",{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},24079,"生成的视频中人物脸部变形、出现奇怪皱纹或嘴巴不对劲是什么原因？","这可能与输入数据集的质量或录制时的打光条件有关。建议检查输入素材的光照是否均匀。此外，项目已更新设置以提升推理效果和速度，请参考官方文档中的命令行推理部分（Kindly remind 章节）调整参数，推理速度可提升 60% 以上。","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F9",{"id":130,"question_zh":131,"answer_zh":132,"source_url":133},24080,"推理时提示 'The passed generator was created on cpu' 导致生成速度极慢怎么办？","该问题已在最新版本中修复。请更新仓库代码以获取最新结果。修复后通过修改 scripts\u002Finference.py 中的生成器设备设置，解决了 CPU 与 CUDA 设备不匹配的问题，推理速度大约提升 5%。如果仍然较慢，可能是模型本身对显存和计算资源要求较高，后续版本有望进一步优化。","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F11",{"id":135,"question_zh":136,"answer_zh":137,"source_url":138},24081,"代码中提到的半解耦结构（semi-decoupled structure）或 motion_scale 在哪里实现？","半解耦结构和相关聚合模块实现在 joyhallo\u002Fmodels\u002Fattention.py 文件中。您可以查看该文件找到具体的 aggregation module 和 motion_scale 相关代码逻辑。","https:\u002F\u002Fgithub.com\u002Fjdh-algo\u002FJoyHallo\u002Fissues\u002F13",[]]