[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-lyuchenyang--Macaw-LLM":3,"tool-lyuchenyang--Macaw-LLM":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",143909,2,"2026-04-07T11:33:18",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":77,"owner_email":78,"owner_twitter":78,"owner_website":78,"owner_url":79,"languages":80,"stars":89,"forks":90,"last_commit_at":91,"license":92,"difficulty_score":93,"env_os":94,"env_gpu":95,"env_ram":96,"env_deps":97,"category_tags":104,"github_topics":105,"view_count":32,"oss_zip_url":78,"oss_zip_packed_at":78,"status":17,"created_at":112,"updated_at":113,"faqs":114,"releases":152},5097,"lyuchenyang\u002FMacaw-LLM","Macaw-LLM","Macaw-LLM: Multi-Modal Language Modeling with Image, Video, Audio, and Text Integration","Macaw-LLM 是一款前沿的开源多模态大语言模型，旨在打破文本、图像、视频和音频之间的界限，实现真正的“全能”理解。它巧妙地将 CLIP（视觉）、Whisper（听觉）与 LLaMA（语言）三大顶尖模型融为一体，让 AI 能够像人类一样同时处理看、听、读多种信息流，解决了传统模型难以高效整合异构数据、跨模态交互能力不足的痛点。\n\n对于希望探索多模态融合技术的研究人员和开发者而言，Macaw-LLM 提供了极佳的实验平台。其核心亮点在于采用了“简单快速的对齐策略”，能将不同模态的数据高效映射到大语言模型的嵌入空间，并通过“单阶段指令微调”简化了复杂的训练流程。这种设计不仅大幅降低了多模态模型的开发门槛，还显著提升了推理效率。无论是需要构建智能客服、多媒体内容分析系统，还是致力于下一代通用人工智能研究的团队，都能利用 Macaw-LLM 快速验证想法，构建出能看懂图表、听懂语音并流畅对话的智能应用。","\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_d366abbfa29a.png\" alt=\"Logo\" width=\"200\">\n\u003C\u002Fdiv>\n\n\u003Cdiv align=\"center\"> \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e09299781bec.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_db4000a516d7.png\" alt=\"Logo\" width=\"70\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f1de0551500f.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e1ffabe6866f.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_b9898e5b6da3.png\" alt=\"Logo\" width=\"50\"> \u003C\u002Fdiv>\n\n# Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration\n\n\n\u003Cdiv align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FVersion-1.0.0-blue.svg\" alt=\"Version\"> \n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-CC%20BY%204.0-green.svg\" alt=\"License\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flyuchenyang\u002FMacaw-LLM?color=yellow\" alt=\"Stars\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues\u002Flyuchenyang\u002FMacaw-LLM?color=red\" alt=\"Issues\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpython-3.8-purple.svg\" alt=\"Python\">\n\n  \n\u003C!-- **Authors:** -->\n\n**_¹ ² [Chenyang Lyu](https:\u002F\u002Flyuchenyang.github.io), ³ [Minghao Wu](https:\u002F\u002Fminghao-wu.github.io\u002F), ¹ \u003Csup>*\u003C\u002Fsup> [Longyue Wang](http:\u002F\u002Fwww.longyuewang.com\u002F), ¹ [Xinting Huang](https:\u002F\u002Ftimhuang1.github.io\u002F),_**\n\n**_¹ [Bingshuai Liu](bingshuailiu.github.io), ¹ [Zefeng Du](https:\u002F\u002Fseeledu.github.io\u002Findex-en.html), ¹ [Shuming Shi](https:\u002F\u002Fshumingshi.github.io\u002F), ¹ [Zhaopeng Tu](http:\u002F\u002Fwww.zptu.net\u002F)_**\n\n\n\u003C!-- **Affiliations:** -->\n\n_¹ Tencent AI Lab, ² Dublin City University, ³ Monash University_\n\n_\u003Csup>*\u003C\u002Fsup>Longyue Wang is the corresponding author: [vinnlywang@tencent.com](mailto:{vinnlywang@tencent.com)_\n\u003C\u002Fdiv>\n\n\nMacaw-LLM is an exploratory endeavor that pioneers multi-modal language modeling by seamlessly combining image🖼️, video📹, audio🎵, and text📝 data, built upon the foundations of CLIP, Whisper, and LLaMA.\n\n#### 📰 \u003Ca href=\"https:\u002F\u002Ftinyurl.com\u002F4rsexudv\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">Paper\u003C\u002Fa>     :building_construction: \u003Ca href=\"https:\u002F\u002Ftinyurl.com\u002Fyem9m4nf\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">Model (via dropbox)\u003C\u002Fa>    :building_construction: \u003Ca href=\"https:\u002F\u002Fshare.weiyun.com\u002F27EFG2pq\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">Model (via weiyun)\u003C\u002Fa>    :card_file_box: \u003Ca href=\"\u002Fdata\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">Dataset\u003C\u002Fa>    :bricks: [Code](#usage)    :monocle_face: Video    :technologist: Demo    \n\n## Table of Contents 📚\n\n- [Introduction](#introduction)\n- [Key Features](#key-features)\n- [Architecture](#architecture)\n- [Alignment Strategy](#alignment-strategy)\n- [Installation](#installation)\n- [Usage](#usage)\n- [Future Work and Contributions](#future-work-and-contributions)\n\n## Introduction \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_b9898e5b6da3.png\" alt=\"Logo\" width=\"40\">\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_85cdda6f5f9b.png\" alt=\"Figure Description or Alt Text\" width=\"80%\">\n\u003C\u002Fdiv>\n\n\u003C!-- ![Figure Description or Alt Text](alignment.png) -->\n\nIn recent years, the field of language modeling has witnessed remarkable advancements. However, the integration of multiple modalities, such as images, videos, audios, and text, has remained a challenging task. Macaw-LLM is a model of its kind, bringing together state-of-the-art models for processing visual, auditory, and textual information, namely CLIP, Whisper, and LLaMA.\n\n## Key Features 🔑\n\nMacaw-LLM boasts the following unique features:\n\n1. **Simple & Fast Alignment**: Macaw-LLM enables seamless integration of multi-modal data through simple and fast alignment to LLM embeddings. This efficient process ensures quick adaptation of diverse data types.\n2. **One-Stage Instruction Fine-Tuning**: Our model streamlines the adaptation process through one-stage instruction fine-tuning, promoting a more efficient learning experience.\n3. **New Multi-modal Instruction Dataset**: We create a new multi-modal instruction dataset that covers diverse instructional tasks leveraging image and video modalities, which facilitates future work on multi-modal LLMs.\n\n\n## Architecture \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e09299781bec.png\" alt=\"Logo\" width=\"40\">\n\nMacaw-LLM is composed of three main components:\n\n1. **CLIP**: Responsible for encoding images and video frames.\n2. **Whisper**: Responsible for encoding audio data.\n3. **LLM** (LLaMA\u002FVicuna\u002FBloom): The language model that encodes instructions and generates responses.\n\nThe integration of these models allows Macaw-LLM to process and analyze multi-modal data effectively.\n\n## Alignment Strategy \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e1ffabe6866f.png\" alt=\"Logo\" width=\"40\">\n\nOur novel alignment strategy enables faster adaptation by efficiently bridging multi-modal features to textual features. The process involves:\n\n1. Encoding multi-modal features with CLIP and Whisper.\n2. Feeding the encoded features into an attention function, wherein the multi-modal features serve as the query and the embedding matrix of LLaMA as the key and value.\n3. Injecting the outputs into the input sequence (before instruction tokens) of LLaMA, allowing for a streamlined alignment process with minimal additional parameters.\n\n## New Multi-modal Instruction Dataset 🆕\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_5455e4872b1b.png\" alt=\"Figure Description or Alt Text\" width=\"80%\">\n\u003C\u002Fdiv>\nIn this project, we generate a dataset using GPT-3.5-Turbo by providing image or video captions as prompts. To create this dataset, we use captions from the MS COCO dataset for images and the Charades and AVSD datasets for videos. Our dataset consists of approximately 69K examples based on COCO image captions and 50K examples based on Charades and AVSD video captions. We currently focus on single-turn dialogues but plan to expand into multi-turn dialogues and diverse multi-modal content in the future. This will enrich the dataset and improve fine-tuning for language learning models (LLMs).\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_00abf1cda658.png\" alt=\"Figure Description or Alt Text\" width=\"70%\">\n\u003C\u002Fdiv>\n\n## Installation \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_db4000a516d7.png\" alt=\"Logo\" width=\"60\">\n\nTo install Macaw-LLM, follow these steps:\n\n```bash\n# Clone the repository\ngit clone https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM.git\n\n# Change to the Macaw-LLM directory\ncd Macaw-LLM\n\n# Install required packages\npip install -r requirements.txt\n\n# Install ffmpeg\nyum install ffmpeg -y\n\n# Install apex\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fapex.git\ncd apex\npython setup.py install\ncd ..\n```\n\n## Usage 🚀\n\n1. **Downloading dataset:** \n   - Text data: [stanford_alpaca\u002Falpaca_data.json](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\u002Fblob\u002Fmain\u002Falpaca_data.json) \n   - Image data: [COCO Dataset](https:\u002F\u002Fcocodataset.org\u002F#home) [VQA Dataset](https:\u002F\u002Fvisualqa.org\u002Fdownload.html)\n   - Video data: [Charades](https:\u002F\u002Fallenai.org\u002Fplato\u002Fcharades\u002F) and [Video Dialog](https:\u002F\u002Fvideo-dialog.com\u002F)\n   - Image instruction data: [Macaw-LLM image instruction dataset](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_coco.json)\n   - Video instruction data: [Macaw-LLM video instruction dataset](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_avsd.json)\n\n2. **Dataset preprocessing:** \n   - Place the data in three modalities to specific folders - `data\u002Ftext\u002F`, `data\u002Fimage\u002F`, `data\u002Fvideo\u002F`\n   - Extract frames and audio from videos: \n     ```\n     python preprocess_data.py\n     ```\n   - Transform supervised data to dataset: \n     ```\n     python preprocess_data_supervised.py\n     ```\n   - Transform unsupervised data to dataset: \n     ```\n     python preprocess_data_unsupervised.py\n     ```\n\n3. **Training:** \n   - Execute the training script (you can specify the training parameters inside):\n     ```\n     .\u002Ftrain.sh\n     ```\n\n4. **Inference:** \n   - Execute the inference script (you can give any customized inputs inside):\n     ```\n     .\u002Finference.sh\n     ```\n\n## Examples \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f1de0551500f.png\" alt=\"Logo\" width=\"40\">\nWe present several examples that highlight the proficiency of our Macaw-LLM in understanding and following multi-modal instructions.\nThese examples showcase our system's multi-modal ability to understand and generate responses based on images and videos. These examples demonstrate how our system comprehends visual content and produces high-quality, fluent responses in natural language conversations. Our system generates contextually relevant and informative answers to various questions about the image, demonstrating its capability to communicate about visual content naturally and fluently.\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f0cef3b45cac.png\" alt=\"Figure Description or Alt Text\" width=\"96%\">\n\u003C\u002Fdiv>\n\n## Future Work and Contributions 🚀\n\nWhile our model is still in its early stages, we believe that Macaw-LLM paves the way for future research in the realm of multi-modal language modeling. The integration of diverse data modalities holds immense potential for pushing the boundaries of artificial intelligence and enhancing our understanding of complex real-world scenarios. By introducing Macaw-LLM, we hope to inspire further exploration and innovation in this exciting area of study.\n\nWe welcome contributions from the community to improve and expand Macaw-LLM's capabilities. 🤝\n\n## ToDo 👨‍💻\n- [ ] **Evaluation:** We show some examples showcasing the multi-modal ability of our Macaw-LLM. However, we acknowledge that these efforts may not be fully adequate for accurately and comprehensively demonstrate model capabilities. We aim to conduct extensive evaluation on our systems to evaluate its capability.\n    \n- [ ] **More Language Models:** We aim to extend Macaw-LLM by incorporating additional language models like Dolly, BLOOM, T-5, etc. This will enable more robust and versatile processing and understanding of multi-modal data.\n\n- [ ] **Multilingual Support:** Our next step is to support multiple languages, moving towards true multi-modal and multilingual language models. We believe this will significantly broaden Macaw-LLM's applicability and enhance its understanding of diverse, global contexts.\n\n## Acknowledgements 🙏\n\nWe would like to express our gratitude to the following open-source projects for their valuable contributions to Macaw-LLM:\n\n- [Stanford Alpaca](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\u002F) for providing the Alpaca dataset, which we used in our experiments.\n- [Parrot](https:\u002F\u002Fgithub.com\u002Fwxjiao\u002Fparrot) for providing a helpful implementation of the training of LLaMA.\n- [CLIP](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP) for providing a strong image and video encoding model.\n- [Whisper](https:\u002F\u002Fgithub.com\u002Fs3prl\u002Fs3prl) for providing a strong audio encoding model.\n- [LLaMA](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fllama) for providing a powerful LLM.\n\nWe would also like to thank the developers and maintainers of these projects for their dedication and hard work in making their projects open-source and accessible to the community.\n\n\n## Citation\n\n```bibtex\n@article{lyu2023macaw,\n  title={Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration},\n  author={Lyu, Chenyang and Wu, Minghao and Wang, Longyue and Huang, Xinting and Liu, Bingshuai and Du, Zefeng and Shi, Shuming and Tu, Zhaopeng},\n  journal={arXiv preprint arXiv:2306.09093},\n  year={2023}\n}\n```\n","\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_d366abbfa29a.png\" alt=\"Logo\" width=\"200\">\n\u003C\u002Fdiv>\n\n\u003Cdiv align=\"center\"> \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e09299781bec.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_db4000a516d7.png\" alt=\"Logo\" width=\"70\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f1de0551500f.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e1ffabe6866f.png\" alt=\"Logo\" width=\"50\"> &nbsp; &nbsp; &nbsp; &nbsp; \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_b9898e5b6da3.png\" alt=\"Logo\" width=\"50\"> \u003C\u002Fdiv>\n\n# Macaw-LLM：融合图像、音频、视频与文本的多模态语言建模\n\n\u003Cdiv align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FVersion-1.0.0-blue.svg\" alt=\"版本\"> \n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-CC%20BY%204.0-green.svg\" alt=\"许可证\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flyuchenyang\u002FMacaw-LLM?color=yellow\" alt=\"星标\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fissues\u002Flyuchenyang\u002FMacaw-LLM?color=red\" alt=\"问题\">\n\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpython-3.8-purple.svg\" alt=\"Python\">\n\n  \n\u003C!-- **作者:** -->\n\n**_¹ ² [吕晨阳](https:\u002F\u002Flyuchenyang.github.io), ³ [吴明浩](https:\u002F\u002Fminghao-wu.github.io\u002F), ¹ \u003Csup>*\u003C\u002Fsup> [王龙跃](http:\u002F\u002Fwww.longyuewang.com\u002F), ¹ [黄欣婷](https:\u002F\u002Ftimhuang1.github.io\u002F),_**\n\n**_¹ [刘炳帅](bingshuailiu.github.io), ¹ [杜泽峰](https:\u002F\u002Fseeledu.github.io\u002Findex-en.html), ¹ [史书铭](https:\u002F\u002Fshumingshi.github.io\u002F), ¹ [涂兆鹏](http:\u002F\u002Fwww.zptu.net\u002F)_**\n\n\n\u003C!-- **所属机构:** -->\n\n_¹ 腾讯AI实验室, ² 都柏林城市大学, ³ 莫纳什大学_\n\n_\u003Csup>*\u003C\u002Fsup>王龙跃为通讯作者：[vinnlywang@tencent.com](mailto:{vinnlywang@tencent.com)_\n\u003C\u002Fdiv>\n\n\nMacaw-LLM是一项开创性的探索性工作，它基于CLIP、Whisper和LLaMA的基础，通过无缝整合图像🖼️、视频📹、音频🎵和文本📝数据，率先实现了多模态语言建模。\n\n#### 📰 \u003Ca href=\"https:\u002F\u002Ftinyurl.com\u002F4rsexudv\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">论文\u003C\u002Fa>     :building_construction: \u003Ca href=\"https:\u002F\u002Ftinyurl.com\u002Fyem9m4nf\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">模型（通过Dropbox）\u003C\u002Fa>    :building_construction: \u003Ca href=\"https:\u002F\u002Fshare.weiyun.com\u002F27EFG2pq\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">模型（通过微云）\u003C\u002Fa>    :card_file_box: \u003Ca href=\"\u002Fdata\" style=\"color: black; text-decoration: underline;text-decoration-style: dotted;\">数据集\u003C\u002Fa>    :bricks: [代码](#usage)    :monocle_face: 视频    :technologist: 演示    \n\n## 目录 📚\n\n- [简介](#introduction)\n- [关键特性](#key-features)\n- [架构](#architecture)\n- [对齐策略](#alignment-strategy)\n- [安装](#installation)\n- [使用](#usage)\n- [未来工作与贡献](#future-work-and-contributions)\n\n## 简介 \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_b9898e5b6da3.png\" alt=\"Logo\" width=\"40\">\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_85cdda6f5f9b.png\" alt=\"图示说明或替代文本\" width=\"80%\">\n\u003C\u002Fdiv>\n\n\u003C!-- ![图示说明或替代文本](alignment.png) -->\n\n近年来，语言建模领域取得了显著进展。然而，如何有效整合图像、视频、音频和文本等多种模态信息，仍然是一个极具挑战性的课题。Macaw-LLM正是这样一款模型，它将用于处理视觉、听觉和文本信息的最先进模型——CLIP、Whisper和LLaMA——有机地结合在一起。\n\n## 关键特性 🔑\n\nMacaw-LLM具有以下独特优势：\n\n1. **简单快速的对齐**：Macaw-LLM通过简单高效的对齐方式，将多模态数据无缝融入LLM的嵌入空间。这一高效流程确保了不同模态数据能够迅速适配。\n2. **单阶段指令微调**：我们的模型采用单阶段指令微调机制，简化了适配流程，提升了学习效率。\n3. **全新多模态指令数据集**：我们构建了一个涵盖多种指令任务的新多模态指令数据集，充分利用了图像和视频模态，为后续多模态LLM的研究奠定了基础。\n\n\n## 架构 \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e09299781bec.png\" alt=\"Logo\" width=\"40\">\n\nMacaw-LLM由三个主要组件构成：\n\n1. **CLIP**：负责编码图像和视频帧。\n2. **Whisper**：负责编码音频数据。\n3. **LLM**（LLaMA\u002FVicuna\u002FBloom）：用于编码指令并生成响应的语言模型。\n\n通过这些模型的协同工作，Macaw-LLM能够高效地处理和分析多模态数据。\n\n## 对齐策略 \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_e1ffabe6866f.png\" alt=\"Logo\" width=\"40\">\n\n我们提出的创新对齐策略，能够更快速地实现多模态特征与文本特征之间的映射。具体步骤如下：\n\n1. 使用CLIP和Whisper分别对多模态特征进行编码。\n2. 将编码后的特征输入到注意力机制中，其中多模态特征作为查询，LLaMA的嵌入矩阵作为键和值。\n3. 将注意力机制的输出注入到LLaMA的输入序列中（位于指令标记之前），从而以最少的额外参数完成对齐过程。\n\n## 全新多模态指令数据集 🆕\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_5455e4872b1b.png\" alt=\"图示说明或替代文本\" width=\"80%\">\n\u003C\u002Fdiv>\n在本项目中，我们利用GPT-3.5-Turbo，以图像或视频字幕作为提示，生成了一套数据集。数据集的来源包括MS COCO数据集中的图像字幕，以及Charades和AVSD数据集中的视频字幕。最终，我们共收集了约69,000个基于COCO图像字幕的样本，以及50,000个基于Charades和AVSD视频字幕的样本。目前我们专注于单轮对话，但未来计划扩展到多轮对话及更多样化的多模态内容，以进一步丰富数据集，并提升语言模型（LLMs）的微调效果。\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_00abf1cda658.png\" alt=\"图示说明或替代文本\" width=\"70%\">\n\u003C\u002Fdiv>\n\n## 安装 \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_db4000a516d7.png\" alt=\"Logo\" width=\"60\">\n\n安装Macaw-LLM，请按照以下步骤操作：\n\n```bash\n# 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM.git\n\n# 进入Macaw-LLM目录\ncd Macaw-LLM\n\n# 安装所需依赖\npip install -r requirements.txt\n\n# 安装ffmpeg\nyum install ffmpeg -y\n\n# 安装apex\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fapex.git\ncd apex\npython setup.py install\ncd ..\n```\n\n## 使用方法 🚀\n\n1. **下载数据集：**\n   - 文本数据：[stanford_alpaca\u002Falpaca_data.json](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\u002Fblob\u002Fmain\u002Falpaca_data.json)\n   - 图像数据：[COCO 数据集](https:\u002F\u002Fcocodataset.org\u002F#home) [VQA 数据集](https:\u002F\u002Fvisualqa.org\u002Fdownload.html)\n   - 视频数据：[Charades](https:\u002F\u002Fallenai.org\u002Fplato\u002Fcharades\u002F) 和 [Video Dialog](https:\u002F\u002Fvideo-dialog.com\u002F)\n   - 图像指令数据：[Macaw-LLM 图像指令数据集](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_coco.json)\n   - 视频指令数据：[Macaw-LLM 视频指令数据集](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_avsd.json)\n\n2. **数据集预处理：**\n   - 将三种模态的数据分别放入指定文件夹中：`data\u002Ftext\u002F`、`data\u002Fimage\u002F`、`data\u002Fvideo\u002F`\n   - 从视频中提取帧和音频：\n     ```\n     python preprocess_data.py\n     ```\n   - 将监督数据转换为数据集：\n     ```\n     python preprocess_data_supervised.py\n     ```\n   - 将无监督数据转换为数据集：\n     ```\n     python preprocess_data_unsupervised.py\n     ```\n\n3. **训练：**\n   - 执行训练脚本（可在脚本内指定训练参数）：\n     ```\n     .\u002Ftrain.sh\n     ```\n\n4. **推理：**\n   - 执行推理脚本（可在脚本内提供自定义输入）：\n     ```\n     .\u002Finference.sh\n     ```\n\n## 示例 \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f1de0551500f.png\" alt=\"Logo\" width=\"40\">\n我们展示了几个示例，突显了我们的 Macaw-LLM 在理解和执行多模态指令方面的出色能力。\n这些示例展示了我们的系统在理解图像和视频并据此生成响应方面的多模态能力。它们表明，我们的系统能够理解视觉内容，并在自然语言对话中生成高质量、流畅的回应。对于关于图像的各种问题，我们的系统都能给出上下文相关且信息丰富的答案，这充分证明了它能够自然流畅地交流视觉内容。\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_readme_f0cef3b45cac.png\" alt=\"图示说明或替代文本\" width=\"96%\">\n\u003C\u002Fdiv>\n\n## 未来工作与贡献 🚀\n\n尽管我们的模型仍处于早期阶段，但我们相信，Macaw-LLM 为多模态语言建模领域的未来研究开辟了道路。整合多种数据模态具有巨大的潜力，可以推动人工智能的边界，并加深我们对复杂现实场景的理解。通过推出 Macaw-LLM，我们希望激发这一激动人心的研究领域中的进一步探索和创新。\n\n我们欢迎社区的贡献，以改进和扩展 Macaw-LLM 的能力。🤝\n\n## 待办事项 👨‍💻\n- [ ] **评估：** 我们展示了一些示例，说明了 Macaw-LLM 的多模态能力。然而，我们也意识到，这些努力可能不足以准确全面地展示模型的能力。我们计划对系统进行广泛的评估，以检验其性能。\n    \n- [ ] **更多语言模型：** 我们计划通过引入其他语言模型，如 Dolly、BLOOM、T-5 等，来扩展 Macaw-LLM。这将使多模态数据的处理和理解更加 robust 和 versatile。\n\n- [ ] **多语言支持：** 我们的下一步是支持多种语言，朝着真正的多模态、多语言语言模型迈进。我们相信，这将显著拓宽 Macaw-LLM 的应用范围，并增强其对多样化全球语境的理解。\n\n## 致谢 🙏\n\n我们衷心感谢以下开源项目对 Macaw-LLM 的宝贵贡献：\n\n- [Stanford Alpaca](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\u002F) 提供了我们在实验中使用的 Alpaca 数据集。\n- [Parrot](https:\u002F\u002Fgithub.com\u002Fwxjiao\u002Fparrot) 提供了 LLaMA 训练的实用实现。\n- [CLIP](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP) 提供了强大的图像和视频编码模型。\n- [Whisper](https:\u002F\u002Fgithub.com\u002Fs3prl\u002Fs3prl) 提供了强大的音频编码模型。\n- [LLaMA](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fllama) 提供了功能强大的 LLM。\n\n我们还要感谢这些项目的开发者和维护者，感谢他们致力于将项目开源并使其向社区开放。\n\n\n## 引用\n\n```bibtex\n@article{lyu2023macaw,\n  title={Macaw-LLM: 多模态语言建模——融合图像、音频、视频与文本},\n  author={Lyu, Chenyang and Wu, Minghao and Wang, Longyue and Huang, Xinting and Liu, Bingshuai and Du, Zefeng and Shi, Shuming and Tu, Zhaopeng},\n  journal={arXiv 预印本 arXiv:2306.09093},\n  year={2023}\n}\n```","# Macaw-LLM 快速上手指南\n\nMacaw-LLM 是一个探索性的多模态语言模型，基于 CLIP、Whisper 和 LLaMA 构建，能够无缝整合图像、视频、音频和文本数据。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐)\n*   **Python**: 3.8 或更高版本\n*   **GPU**: 支持 CUDA 的 NVIDIA 显卡（用于加速训练和推理）\n*   **前置依赖**:\n    *   `git`: 用于克隆代码库\n    *   `ffmpeg`: 用于处理视频和音频数据\n    *   `pip`: Python 包管理工具\n\n## 安装步骤\n\n请依次执行以下命令来完成环境搭建：\n\n```bash\n# 1. 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM.git\n\n# 2. 进入项目目录\ncd Macaw-LLM\n\n# 3. 安装 Python 依赖包\npip install -r requirements.txt\n\n# 4. 安装 ffmpeg (以 CentOS\u002FYum 为例，Ubuntu 请使用 apt-get install ffmpeg)\nyum install ffmpeg -y\n\n# 5. 安装 Apex (用于混合精度训练)\ngit clone https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fapex.git\ncd apex\npython setup.py install\ncd ..\n```\n\n> **注意**：如果在国内网络环境下安装 `requirements.txt` 或克隆仓库较慢，建议配置 pip 国内镜像源（如清华源、阿里源）或使用 Git 代理加速。\n\n## 基本使用\n\n### 1. 数据准备\n在使用模型前，需下载并预处理数据集。将不同模态的数据放置于指定文件夹：\n*   文本数据 -> `data\u002Ftext\u002F`\n*   图像数据 -> `data\u002Fimage\u002F`\n*   视频数据 -> `data\u002Fvideo\u002F`\n\n执行以下脚本进行数据预处理（提取视频帧、音频及格式转换）：\n\n```bash\n# 提取视频帧和音频\npython preprocess_data.py\n\n# 转换监督学习数据\npython preprocess_data_supervised.py\n\n# 转换无监督数据\npython preprocess_data_unsupervised.py\n```\n\n*数据集下载参考*：\n*   图像指令数据：[Macaw-LLM image instruction dataset](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_coco.json)\n*   视频指令数据：[Macaw-LLM video instruction dataset](https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fblob\u002Fmain\u002Fdata\u002Fgenerated_examples_avsd.json)\n\n### 2. 模型训练\n配置好 `train.sh` 中的参数后，运行以下命令启动训练：\n\n```bash\n.\u002Ftrain.sh\n```\n\n### 3. 模型推理\n训练完成后，您可以修改 `inference.sh` 中的输入内容，然后运行以下命令进行测试：\n\n```bash\n.\u002Finference.sh\n```\n\n该模型支持针对图像、视频内容的自然语言问答，能够生成流畅且符合上下文的回复。","某新媒体运营团队需要每日处理大量包含视频采访、背景音效和图文素材的原始资料，以快速生成多平台分发内容。\n\n### 没有 Macaw-LLM 时\n- **流程割裂效率低**：团队成员需分别使用转录工具提取音频、CV 模型分析画面、再人工将文本输入大模型，跨工具切换耗时极长。\n- **多模态语境丢失**：单独处理音频或视频时，模型无法结合画面中的表情动作或背景噪音来理解说话人的真实情绪与意图。\n- **对齐成本高昂**：将不同来源的文本、图像特征强行拼凑给大模型时，常出现“图文不符”或逻辑断层，需人工反复校对修正。\n- **响应速度滞后**：从原始素材到最终文案的完整链路往往需要数小时，难以应对突发热点事件的即时报道需求。\n\n### 使用 Macaw-LLM 后\n- **一站式多模态输入**：直接上传包含视频、音频和参考图的原始包，Macaw-LLM 基于 CLIP、Whisper 和 LLaMA 的架构自动同步解析所有模态数据。\n- **深度语义融合**：模型能同时“看”懂视频画面、“听”清背景音与对话，精准捕捉讽刺语气或紧急情境，生成的文案情感色彩更丰富准确。\n- **原生对齐免调试**：凭借简单的快速对齐策略，Macaw-LLM 内部自动完成多模态特征与大语言模型的嵌入对齐，输出内容逻辑连贯，无需人工二次拼接。\n- **实时内容产出**：单阶段指令微调让推理速度大幅提升，几分钟内即可从复杂素材中提炼出高质量的新闻稿或短视频脚本。\n\nMacaw-LLM 通过原生整合图、文、音、视频能力，将繁琐的多模态数据处理流水线简化为一次高效的智能交互，彻底重塑了内容创作的生产力。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flyuchenyang_Macaw-LLM_d366abbf.png","lyuchenyang","Chenyang Lyu","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Flyuchenyang_5d975494.jpg","PhD student at ML-Labs, Dublin City University, Ireland.\r\n\r\nCurrently interested in Machine Learning and Natural Language Processing especially LLMs.","Dublin City University","Dublin, Ireland",null,"https:\u002F\u002Fgithub.com\u002Flyuchenyang",[81,85],{"name":82,"color":83,"percentage":84},"Python","#3572A5",98.4,{"name":86,"color":87,"percentage":88},"Shell","#89e051",1.6,1592,131,"2026-03-23T12:18:40","Apache-2.0",4,"Linux","需要 NVIDIA GPU (需安装 apex 和 CUDA)，具体型号和显存未说明","未说明",{"notes":98,"python":99,"dependencies":100},"1. 安装步骤明确包含 'yum install ffmpeg'，表明主要支持基于 RPM 的 Linux 发行版（如 CentOS\u002FRHEL）。2. 需要手动克隆并编译安装 NVIDIA apex 库。3. 模型基于 CLIP、Whisper 和 LLaMA\u002FVicuna\u002FBloom 构建。4. 运行前需自行下载并预处理多模态数据集（文本、图像、视频）。","3.8+",[101,102,103],"requirements.txt 中定义的依赖","ffmpeg","apex",[14,35],[106,107,108,109,110,111],"language-model","multi-modal-learning","natural-language-processing","deep-learning","machine-learning","neural-networks","2026-03-27T02:49:30.150509","2026-04-07T22:51:02.594271",[115,120,125,130,134,138,142,147],{"id":116,"question_zh":117,"answer_zh":118,"source_url":119},23169,"加载预训练模型进行推理时遇到错误或结果不符，需要注意什么？","在构造输入数据（data_item）时，需要注意从 \"input_ids\" 中移除 pad_token_id 和 eos_token_id，否则可能导致模型输入格式错误。确保输入数据的结构与模型预期一致，特别是当 instruction、input 和 output 组合时。","https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fissues\u002F22",{"id":121,"question_zh":122,"answer_zh":123,"source_url":124},23170,"项目缺少 LICENSE 文件，目前的使用许可是什么？","项目标签中引用了 CC By 4.0 许可证，意味着允许在注明出处的情况下使用和分发。维护者已表示会尽快添加正式的 LICENSE 文件以明确授权细节。","https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fissues\u002F11",{"id":126,"question_zh":127,"answer_zh":128,"source_url":129},23163,"项目需要下载哪些具体的数据集文件？","主要需要以下文件：\n1. Stage 1: COCO 2014 训练集图片、Macaw 生成的示例数据 (coco.json, avsd.json)、Charades 视频数据集。\n2. Stage 2: VQA 2017 的训练和验证标注及问题文件、AVSD 数据集的 4 个文件。\n具体目录结构需按照 README 或预处理脚本要求创建，例如将视频移入 '.\u002Favsd\u002Fvideos\u002F'，图片移入 '.\u002Fcoco\u002Fimages\u002F'。","https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fissues\u002F13",{"id":131,"question_zh":132,"answer_zh":133,"source_url":129},23164,"为什么 AVSD 的预处理脚本中没有基于输入文本长度的过滤步骤？","这是因为如果指令（instruction）长度超过最大限制，序列中将不包含任何响应（response），导致模型无法学习如何生成回答。因此过滤逻辑与其他数据集略有不同，重点在于保证序列中包含有效的响应部分。",{"id":135,"question_zh":136,"answer_zh":137,"source_url":129},23165,"预处理过程中提取的音频和视频帧是否被直接存储在张量数据集中？","通常不会直接将图像像素或音频波形存储在张量数据集中，因为这会消耗巨大的内存。项目中保留的是图像或视频的索引（index），在训练过程中动态加载对应的数据。",{"id":139,"question_zh":140,"answer_zh":141,"source_url":129},23166,"脚本中的一些函数（如 preprocess_vqa2_to_val_dataset）未被调用，是废弃了吗？","这些函数并未废弃。前两个函数用于处理验证集以进行评估和推理，最后一个函数（resize_image）用于调整图像和视频帧的大小。它们可能在特定的评估脚本或独立流程中被调用，而未在主训练脚本中直接体现。",{"id":143,"question_zh":144,"answer_zh":145,"source_url":146},23167,"运行该项目至少需要多少 GPU 显存？两张 3090 显卡可以运行吗？","可以使用两张 3090 GPU（共 48GB 显存）运行。维护者确认，在使用 FP16 精度时，模型可以适配两张 24GB 显存的 3090 显卡。","https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fissues\u002F9",{"id":148,"question_zh":149,"answer_zh":150,"source_url":151},23168,"运行推理时提示缺少 'data\u002Fall_visual_names.json' 文件，如何获取？","该文件通常由数据预处理步骤生成。如果缺失，请检查是否已完整运行数据准备脚本。此外，部分由项目方自行生成的数据文件（如 avsd_train.json）若暂未公开，可先使用提供的监督数据集（supervised dataset）进行替代或等待官方后续发布。","https:\u002F\u002Fgithub.com\u002Flyuchenyang\u002FMacaw-LLM\u002Fissues\u002F3",[]]