[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-yaodongC--awesome-instruction-dataset":3,"tool-yaodongC--awesome-instruction-dataset":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",148568,2,"2026-04-09T23:34:24",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108111,"2026-04-08T11:23:26",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 
都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":77,"owner_email":78,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":78,"stars":81,"forks":82,"last_commit_at":83,"license":78,"difficulty_score":84,"env_os":85,"env_gpu":86,"env_ram":86,"env_deps":87,"category_tags":90,"github_topics":91,"view_count":32,"oss_zip_url":78,"oss_zip_packed_at":78,"status":17,"created_at":100,"updated_at":101,"faqs":102,"releases":103},6059,"yaodongC\u002Fawesome-instruction-dataset","awesome-instruction-dataset","A collection of open-source datasets to train instruction-following LLMs (ChatGPT, LLaMA, Alpaca)","awesome-instruction-dataset 是一个专为训练指令跟随型大语言模型（如 ChatGPT、LLaMA、Alpaca）而整理的开源数据集合集。它致力于解决开发者在微调模型时面临的数据分散、格式不一及获取困难等痛点，提供了一站式的资源索引。\n\n该资源库主要面向 AI 研究人员和大模型开发者，帮助他们快速找到适合特定任务的高质量数据。其核心亮点在于分类清晰且覆盖全面：不仅包含纯文本指令数据，还收录了视觉 - 语言多模态指令数据（如图像问答），以及用于红队测试和人类反馈强化学习（RLHF）的关键数据集。此外，每个数据集都详细标注了语言类型（支持中、英及多语言）、任务范围（通用或多任务）、数据来源（人工生成、自指令生成或混合数据）以及样本规模。\n\n无论是希望构建多模态对话系统，还是专注于提升模型在特定领域的指令理解能力，用户都能在此找到经过筛选的优质资源。通过整合来自 Alpaca、LLaVA、GPT-4-LLM 等多个知名项目的数据集，awesome-instruction-dataset 极大地降低了大模型微调的门槛，加速了从研究到应用的转化过程。","# awesome-text\u002Fvisual-instruction-tuning-dataset\nA collection of open-source instruction tuning datasets to train (text and multi-modal) chat-based LLMs (GPT-4, ChatGPT, LLaMA, Alpaca). \nWe currently include three types of datasets:\n 1. visual-instruction-tuning (e.g. image-instruction-answer)\n 2. text-instruction-tuning datasets.\n 3. red-teaming | Reinforcement Learning from Human Feedback (RLHF) Datasets\n\nInstruction Tuning \u002F Reinforcement Learning from Human Feedback (RLHF) Datasets are a key component of instruction-following LLMs such as ChatGPT. 
This repo is dedicated to providing a comprehensive list of datasets used for instruction tuning in various LLMs, making it easier for researchers and developers to access and utilize these resources.\n\nLists of codebases to train your LLMs: \n - [nichtdax\u002Fawesome-totally-open-chatgpt](https:\u002F\u002Fgithub.com\u002Fnichtdax\u002Fawesome-totally-open-chatgpt): A codebase of totally open alternatives to ChatGPT\n\nSize: The number of instruction tuning pairs\n\nLingual-Tags:\n-   EN: Instruction datasets in English\n-   CN: Instruction datasets in Chinese\n-   ML: [Multi-lingual] Instruction datasets in multiple languages\n\nTask-Tags:\n-  MT: [Multi-task] Datasets containing multiple tasks\n-  TS: [Task-specific] Datasets tailored for specific tasks\n\nGeneration-method:\n- HG: [Human Generated Dataset] Datasets created by humans\n- SI: [Self-Instruct] Datasets generated using self-instruct methods\n- MIX: [Mixed Dataset] Dataset contains both human and machine generated data\n- COL: [Collection of Dataset] Dataset made from a collection of other datasets\n\n# Table of Contents\n1. [The template](#the-template)\n2. [The Multi-modal Instruction Datasets](#the-multi-modal-instruction-datasets)\n   - [(Vision-CAIR\u002FMiniGPT-4)|5K|EN|MT|MIX](https:\u002F\u002Fminigpt-4.github.io\u002F)\n   - [(haotian-liu\u002FLLaVA)|150K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fliuhaotian\u002FLLaVA-Instruct-150K)\n3. [The Instruction-following Datasets](#the-instruction-following-datasets)\n   - [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n   - [(gururise\u002FCleaned Alpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fgururise\u002FAlpacaDataCleaned)\n   - [(XueFuzhao\u002FInstructionWild)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FXueFuzhao\u002FInstructionWild)\n   - [(JosephusCheung\u002FGuanacoDataset)|534K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FJosephusCheung\u002FGuanacoDataset)\n   - [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n   - [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n   - [(allenai\u002Fprosocial-dialog)|58K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fprosocial-dialog)\n   - [(allenai\u002Fnatural-instructions)|1.6K|ML|MT|HG](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fnatural-instructions)\n   - [(bigscience\u002FxP3)|N\u002FA|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbigscience\u002FxP3)\n   - [(nomic-ai\u002Fgpt4all)|437k|EN|MT|COL](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)\n   - [(PhoebusSi\u002FAlpaca-CoT)|500k|ML|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQingyiSi\u002FAlpaca-CoT)\n   - [(google-research\u002FFLAN)|N\u002FA|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FFLAN\u002Ftree\u002Fmain\u002Fflan\u002Fv2)\n   - [(thunlp\u002FUltraChat)|280k|EN|TS|MIX](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FUltraChat)\n   - [(cascip\u002FChatAlpaca)|10k|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fcascip\u002FChatAlpaca)\n   - [(YeungNLP\u002Ffirefly-train-1.1M)|1100k|CN|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M)\n   - [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n   - 
[(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n   - [(databrickslabs\u002Fdolly)|15K|EN|MT|HG](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly\u002Ftree\u002Fmaster\u002Fdata)\n   - [(OpenAssistant\u002Foasst1)|161K|ML|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenAssistant\u002Foasst1)\n   - [(RyokoAI\u002FShareGPT52K)|90K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FRyokoAI\u002FShareGPT52K)\n   - [(zjunlp\u002FMol-Instructions)|2043K|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fzjunlp\u002FMol-Instructions)\n4. [Reinforcement Learning from Human Feedback (RLHF) Datasets](#reinforcement-learning-from-human-feedback-(rlhf)-datasets)\n   - [(Anthropic\u002Fhh-rlhf)|22k|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf)\n   - [(thu-coai\u002FSafety-Prompts)|100k|CN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)\n   - [(HuggingFaceH4\u002Fstack-exchange-preferences)|10741k|EN|TS|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHuggingFaceH4\u002Fstack-exchange-preferences)\n   - [(stanfordnlp\u002FSHP)|385k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fstanfordnlp\u002FSHP)\n   - [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n5. [License that Allows Commercial Use](#license-that-allows-commercial-use)\n \n\n# The template\n\nAppend the new project at the end of the file\n\n```markdown\n## [({owner}\u002F{project-name})|{Tags}](https:\u002F\u002Fgithub.com\u002Flink\u002Fto\u002Fproject)\n\n- summary:\n- Data generation model:\n- paper:\n- License:\n- Related: (if applicable)\n```\n# The Multi-modal Instruction Datasets\n\n ## [(Vision-CAIR\u002FMiniGPT-4)|5K|EN|MT|MIX](https:\u002F\u002Fminigpt-4.github.io\u002F)\n  \n - Summary: A high-quality, well-aligned (e.g. more detailed image description) image-text dataset created using conversation between two bots, similar to [ChatCaptioner](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FChatCaptioner). This image-text dataset can then be used with some predefined instruction template for image-instruction-answer finetuning.\n - Modality: Text, Image\n - Data generation model: N\u002FA\n - paper: [MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FMiniGPT-4\u002Fblob\u002Fmain\u002FMiniGPT_4.pdf)\n - License: [`BSD 3-Clause`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fbsd-3-clause\u002F)\n - Related: \n     - [Interactive ChatCaptioner for image and video](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FChatCaptioner)\n     \n ## [(haotian-liu\u002FLLaVA)|150K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fliuhaotian\u002FLLaVA-Instruct-150K)\n  \n - Summary: LLaVA Visual Instruct 150K is a set of GPT-generated multimodal instruction-following data. 
It is constructed for visual instruction tuning and for building large multimodal models towards GPT-4 vision\u002Flanguage capability.\n - Modality: Text, Image\n - Data generation model: `GPT-4-0314` \n - paper: [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n ## [(sunrainyg\u002FInstructCV)|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FAlaaLab\u002FInstructCV)\n\n- summary: Instruction-Tuned Text-To-Image Diffusion Models As Vision Generalists\n- Modality: Text, Image\n- paper: [InstructCV](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.00390.pdf)\n- License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n# The Instruction-following Datasets\n\n ## [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n\n - Summary: `52K` data generated from a modified `self-instruct` pipeline with `175` human-written seed tasks.\n - Data generation model: `text-davinci-003`\n - paper: [alpaca-blog](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html)\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n ## [(gururise\u002FCleaned Alpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fgururise\u002FAlpacaDataCleaned)\n\n - Summary: A project that manually cleaned the Alpaca 52K Dataset\n - Data generation model: `text-davinci-003`\n - paper: N\u002FA\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n \n ## [(XueFuzhao\u002FInstructionWild)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FXueFuzhao\u002FInstructionWild)\n\n - Summary: `52K` data generated from a modified `self-instruct` pipeline with `429` human-written seed tasks.\n - Data generation model: `text-davinci-003`\n - paper: N\u002FA\n - License: InstructWild dataset is intended for non-commercial research purpose only.\n \n ## [(JosephusCheung\u002FGuanacoDataset)|534K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FJosephusCheung\u002FGuanacoDataset)\n\n - Summary: `52K` instruction data generated from a modified `self-instruct` pipeline with `429` human-written seed tasks.\n - Data generation model: `text-davinci-003`\n - License: [`GPL-3.0`](https:\u002F\u002Fwww.gnu.org\u002Flicenses\u002Fgpl-3.0.en.html)\n\n ## [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n\n - Summary: The first human-ChatGPT comparison corpus (English Version), named HC3 dataset\n - Data generation model: `gpt-3.5`, `human generated`\n - paper: [How Close is ChatGPT to Human Experts? Comparison Corpus, Evaluation, and Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07597)\n - License: [`CC BY-SA 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n\n ## [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3-Chinese)\n\n - Summary: The first human-ChatGPT comparison corpus (Chinese Version), named HC3 dataset\n - Data generation model: `gpt-3.5`, `human generated`\n - paper: [How Close is ChatGPT to Human Experts? 
Comparison Corpus, Evaluation, and Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07597)\n - License: [`CC BY-SA 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n\n ## [(allenai\u002Fprosocial-dialog)|58K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fprosocial-dialog)\n\n - Summary: ProsocialDialog is the first large-scale multi-turn English dialogue dataset to teach conversational agents to respond to problematic content following social norms.\n - Data generation model: `gpt-3.5`, `human generated`\n - paper: [ProsocialDialog: A Prosocial Backbone for Conversational Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12688)\n - License: [`CC BY 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby\u002F4.0\u002F)\n\n ## [(allenai\u002Fnatural-instructions)|1.6K|ML|MT|HG](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fnatural-instructions)\n\n - Summary: A community effort to create a large collection of `1,616 diverse NLP tasks` and their natural language definitions\u002Finstructions.\n - Data generation model: `Human generated`\n - paper: [Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n \n ## [(bigscience\u002FxP3)|N\u002FA|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbigscience\u002FxP3)\n\n - Summary: [Prompt-resource] xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 languages & 16 NLP tasks.\n - Data generation model: N\u002FA\n - paper: [Crosslingual Generalization through Multitask Finetuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01786)\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n ## [(PhoebusSi\u002FAlpaca-CoT)|500k|ML|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQingyiSi\u002FAlpaca-CoT)\n\n - Summary: A dataset for Chain-of-Thought reasoning based on LLaMA and Alpaca. Note: Their repository will continuously collect and combine various instruction tuning datasets. [Github Repo](https:\u002F\u002Fgithub.com\u002FPhoebusSi\u002FAlpaca-CoT)\n - paper: N\u002FA\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n \n ## [(nomic-ai\u002Fgpt4all)|437k|EN|MT|COL](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)\n\n - Summary: gpt4all leverages three publicly available datasets: 1. [laion\u002FOIG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flaion\u002FOIG), 2. [pacovaldez\u002Fstackoverflow-questions](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fpacovaldez\u002Fstackoverflow-questions), 3. 
subset of [bigscience\u002Fbloomz-p3](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-p3)\n - Data generation model: N\u002FA\n - paper: [GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo](https:\u002F\u002Fs3.amazonaws.com\u002Fstatic.nomic.ai\u002Fgpt4all\u002F2023_GPT4All_Technical_Report.pdf)\n - License: [`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n \n ## [(teknium1\u002FGPTeacher)|20k+|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fteknium1\u002FGPTeacher)\n\n - Summary: A collection of modular datasets generated by GPT-4: General-Instruct, Roleplay-Instruct, Code-Instruct, and Toolformer\n - Data generation model: `GPT-4`\n - paper: N\u002FA\n - License: [`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n \n ## [(google-research\u002FFLAN)|N\u002FA|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FFLAN\u002Ftree\u002Fmain\u002Fflan\u002Fv2)\n\n - Summary: The Flan Collection compiles datasets from Flan 2021, P3, and Super-Natural Instructions, along with dozens more datasets, into one place, and formats them into a mix of zero-shot, few-shot, and chain-of-thought templates\n - Data generation model: N\u002FA\n - paper: [The Flan Collection: Designing Data and Methods for Effective Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13688)\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n ## [(thunlp\u002FUltraChat)|280k|EN|TS|MIX](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FUltraChat)\n\n - Summary: UltraChat aims to construct an open-source, large-scale, multi-round dialogue dataset. The first part of UltraChat (i.e., the Questions about the World sector) is released, which contains 280k diverse and informative dialogues. More dialogues about writing and creation, as well as assistance on existing materials, are to come.\n - Data generation model: `GPT-3.5-turbo`\n - paper: N\u002FA\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n \n ## [(cascip\u002FChatAlpaca)|10k|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fcascip\u002FChatAlpaca)\n\n - Summary: Based on the Stanford Alpaca data, ChatAlpaca extends the data to multi-turn instructions and their corresponding responses. More data (20k) and the Chinese translated version are to come.\n - Data generation model: `GPT-3.5-turbo`\n - paper: N\u002FA\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n - Related: [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n  \n ## [(YeungNLP\u002Ffirefly-train-1.1M)|1100k|CN|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M)\n \n - Summary: A Chinese dataset of 23 tasks combined with human-written instruction templates. \n - Data generation model: N\u002FA\n - paper: N\u002FA\n - License: N\u002FA\n \n ## [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n  \n - Summary: 64K examples by prompting a language model with three seed examples of instructions and eliciting a fourth. 
Then the set is expanded to 240K by prompting the model to rephrase each instruction.\n - Data generation model: `text-davinci-002`\n - paper: [Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689)\n - License: [`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n \n ## [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n \n - Summary: 52K instruction-following data generated by GPT-4 with the original Alpaca prompts & Alpaca prompts translated into Chinese by ChatGPT + 9K instruction-following data generated by GPT-4 with prompts in Unnatural Instructions.\n - Data generation model: `GPT-4`\n - paper: [Instruction Tuning with GPT-4](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n - Related: \n     - [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n     - [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n \n ## [(databrickslabs\u002Fdolly)|15K|EN|MT|HG](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly\u002Ftree\u002Fmaster\u002Fdata)\n \n - Summary: This dataset was generated by thousands of Databricks employees in several of the behavioral categories outlined in the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.\n - Data generation model: N\u002FA\n - paper: [Free Dolly](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F04\u002F12\u002Fdolly-first-open-commercially-viable-instruction-tuned-llm)\n - License: [`CC BY-SA 3.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F3.0\u002F)\n \n ## [(OpenAssistant\u002Foasst1)|161K|ML|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenAssistant\u002Foasst1)\n \n - Summary: OpenAssistant Conversations (OASST1), a human-generated, human-annotated assistant-style conversation corpus consisting of 161,443 messages distributed across 66,497 conversation trees, in 35 different languages, annotated with 461,292 quality ratings. \n - Data generation model: N\u002FA\n - paper: [OpenAssistant Conversations - Democratizing Large Language Model Alignment](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX\u002Fview)\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n \n ## [(RyokoAI\u002FShareGPT52K)|90K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FRyokoAI\u002FShareGPT52K)\n \n - Summary: 90,000 conversations scraped via the ShareGPT API before it was shut down. 
These conversations include both user prompts and responses from OpenAI's ChatGPT.\n - Data generation model: `GPT-4`,`GPT-3.5`\n - paper: N\u002FA\n - License: [`CC0 1.0 Universal`](https:\u002F\u002Fcreativecommons.org\u002Fpublicdomain\u002Fzero\u002F1.0\u002F)\n\n ## [(zjunlp\u002FMol-Instructions)|2043K|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fzjunlp\u002FMol-Instructions)\n \n - Summary: An open, large-scale biomolecular instruction dataset consisting of 148.4K molecule-oriented, 505K protein-oriented, and 53K biomolecular text instructions.\n - Data generation model: `GPT-3.5`\n - paper: [Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for Large Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.08018.pdf)\n - License: [`CC BY 4.0`](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FMol-Instructions\u002Fblob\u002Fmain\u002FDATA_LICENSE)\n\n \n# Reinforcement Learning from Human Feedback (RLHF) | Red-Teaming Datasets\n\n  ## [(Anthropic\u002Fhh-rlhf)|22k|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf)\n\n - Summary: This RLHF dataset is an iterated 'online' dataset that includes data from 52B language models. It contains 22k helpfulness comparisons and no red-teaming data. \n - Data generation model: `Anthropic RL-CAI 52B`\n - paper: [Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05862)\n - License: [`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n - Related: \n     - [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n     - [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3-Chinese)\n\n## [(thu-coai\u002FSafety-Prompts)|100k|CN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)\n\n - Summary: Chinese safety prompts for evaluating and improving the safety of LLMs. This repository includes 100k Chinese safety scenario prompts and ChatGPT responses, covering various safety scenarios and instruction attacks. 
It can be used for comprehensive evaluation and improvement of model safety, for enhancing the model's knowledge of safety, and for aligning model output with human values.\n - Data generation model: `GPT-3.5`\n - paper: [Safety Assessment of Chinese Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10436)\n - License: [`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n  ## [(HuggingFaceH4\u002Fstack-exchange-preferences)|10741k|EN|TS|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHuggingFaceH4\u002Fstack-exchange-preferences)\n\n - Summary: This dataset contains questions and answers from the Stack Overflow Data Dump for the purpose of preference model training.\n - Data generation model: N\u002FA\n - paper: [A General Language Assistant as a Laboratory for Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.00861)\n - License: [`CC BY-SA 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n - Related:\n     - [stack-exchange-paired](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flvwerra\u002Fstack-exchange-paired)\n\n \n  ## [(stanfordnlp\u002FSHP)|385k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fstanfordnlp\u002FSHP)\n\n - Summary: Each example is a Reddit post with a question\u002Finstruction and a pair of top-level comments for that post, where one comment is more preferred by Reddit users (collectively).\n - Data generation model: N\u002FA\n - paper: N\u002FA\n - License: N\u002FA\n \n  ## [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n \n - Summary: Ranked responses (Note: Data is evaluated by the `GPT-4` model, NOT humans) of Alpaca prompts from three models (GPT-4, GPT-3.5 and OPT-IML), produced by asking GPT-4 to rate the quality. The author believes \"GPT-4 is capable of identifying and fixing its own mistakes, and accurately judging the quality of responses\" \n - Data generation model: `GPT-4`\n - paper: [Instruction Tuning with GPT-4](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n - License: [`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n - Related: \n     - [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n      \n  ## [(Reddit\u002Feli5)|500k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Feli5)\n\n- Summary: This dataset contains questions and answers from the subreddits [r\u002Fexplainlikeimfive](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fexplainlikeimfive\u002F), [r\u002Faskhistorians](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FAskHistorians\u002F) and [r\u002Faskscience](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FAskScience\u002F).\n- Data generation model: N\u002FA\n- paper: N\u002FA\n- License: N\u002FA\n- Related: [eli5 dataset](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fvincentmin\u002Feli5_rlhf), a transformation of the [eli5](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Feli5) dataset in a format similar to [stack-exchange-paired](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flvwerra\u002Fstack-exchange-paired).\n\n# License that Allows Commercial Use\n\nNote: While these licenses permit commercial use, they may have different requirements for attribution, distribution, or modification. 
Be sure to review the specific terms of each license before using it in a commercial project.\n\nCommercial use licenses:\n\n1. `Apache License 2.0`\n2. `MIT License`\n3. `BSD 3-Clause License`\n4. `BSD 2-Clause License`\n5. `GNU Lesser General Public License v3.0 (LGPLv3)`\n6. `GNU Affero General Public License v3.0 (AGPLv3)`\n7. `Mozilla Public License 2.0 (MPL-2.0)`\n8. `Eclipse Public License 2.0 (EPL-2.0)`\n9. `Microsoft Public License (Ms-PL)`\n10. `Creative Commons Attribution 4.0 International (CC BY 4.0)`\n11. `Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)`\n12. `zlib License`\n13. `Boost Software License 1.0`\n\n","# 令人惊叹的文本\u002F视觉指令微调数据集\n一个用于训练基于聊天的（文本和多模态）大型语言模型（如GPT-4、ChatGPT、LLaMA、Alpaca）的开源指令微调数据集合集。\n我们目前包含三种类型的数据集：\n1. 视觉指令微调数据集（例如，图像-指令-答案）\n2. 文本指令微调数据集。\n3. 红队测试 | 人类反馈强化学习（RLHF）数据集\n\n指令微调\u002F人类反馈强化学习（RLHF）数据集是像ChatGPT这样的遵循指令的大型语言模型的关键组成部分。这个仓库致力于提供用于各种大型语言模型中指令微调的全面数据集列表，使研究人员和开发者更容易获取和使用这些资源。\n\n用于训练你的大型语言模型的代码库列表：\n- [nichtdax\u002Fawesome-totally-open-chatgpt](https:\u002F\u002Fgithub.com\u002Fnichtdax\u002Fawesome-totally-open-chatgpt)：一个完全开放的ChatGPT替代方案代码库\n\n规模：指令微调对的数量\n\n语言标签：\n- EN：英文指令数据集\n- CN：中文指令数据集\n- ML：[多语言] 多种语言的指令数据集\n\n任务标签：\n- MT：[多任务] 包含多个任务的数据集\n- TS：[特定任务] 针对特定任务定制的数据集\n\n生成方式：\n- HG：[人工生成数据集] 由人类创建的数据集\n- SI：[自我指导] 使用自我指导方法生成的数据集\n- MIX：[混合数据集] 包含人类和机器生成的数据\n- COL：[集合数据集] 由其他数据集集合而成的数据集\n\n# 目录\n1. [模板](#the-template)\n2. [多模态指令数据集](#the-multi-modal-instruction-datasets)\n   - [(Vision-CAIR\u002FMiniGPT-4)|5K|EN|MT|MIX](https:\u002F\u002Fminigpt-4.github.io\u002F)\n   - [(haotian-liu\u002FLLaVA)|150K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fliuhaotian\u002FLLaVA-Instruct-150K)\n3. [指令遵循数据集](#the-instruction-following-datasets)\n   - [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n   - [(gururise\u002FCleaned Alpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fgururise\u002FAlpacaDataCleaned)\n   - [(XueFuzhao\u002FInstructionWild)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FXueFuzhao\u002FInstructionWild)\n   - [(JosephusCheung\u002FGuanacoDataset)|534K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FJosephusCheung\u002FGuanacoDataset)\n   - [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n   - [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n   - [(allenai\u002Fprosocial-dialog)|58K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fprosocial-dialog)\n   - [(allenai\u002Fnatural-instructions)|1.6K|ML|MT|HG](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fnatural-instructions)\n   - [(bigscience\u002FxP3)|N\u002FA|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbigscience\u002FxP3)\n   - [(nomic-ai\u002Fgpt4all)|437k|EN|MT|COL](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)\n   - [(PhoebusSi\u002FAlpaca-CoT)|500k|ML|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQingyiSi\u002FAlpaca-CoT)\n   - [(google-research\u002FFLAN)|N\u002FA|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FFLAN\u002Ftree\u002Fmain\u002Fflan\u002Fv2)\n   - [(thunlp\u002FUltraChat)|280k|EN|TS|MIX](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FUltraChat)\n   - [(cascip\u002FChatAlpaca)|10k|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fcascip\u002FChatAlpaca)\n   - 
[(YeungNLP\u002Ffirefly-train-1.1M)|1100k|CN|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M)\n   - [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n   - [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n   - [(databrickslabs\u002Fdolly)|15K|EN|MT|HG](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly\u002Ftree\u002Fmaster\u002Fdata)\n   - [(OpenAssistant\u002Foasst1)|161K|ML|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenAssistant\u002Foasst1)\n   - [(RyokoAI\u002FShareGPT52K)|90K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FRyokoAI\u002FShareGPT52K)\n   - [(zjunlp\u002FMol-Instructions)|2043K|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fzjunlp\u002FMol-Instructions)\n4. [人类反馈强化学习（RLHF）数据集](#reinforcement-learning-from-human-feedback-(rlhf)-datasets)\n   - [(Anthropic\u002Fhh-rlhf)|22k|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf)\n   - [(thu-coai\u002FSafety-Prompts)|100k|CN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)\n   - [(HuggingFaceH4\u002Fstack-exchange-preferences)|10741k|EN|TS|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHuggingFaceH4\u002Fstack-exchange-preferences)\n   - [(stanfordnlp\u002FSHP)|385k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fstanfordnlp\u002FSHP)\n   - [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n5. [允许商业使用的许可](#license-that-allows-commercial-use)\n\n# 模板\n\n将新项目添加到文件末尾\n\n```markdown\n## [({owner}\u002F{project-name})|{标签}](https:\u002F\u002Fgithub.com\u002Flink\u002Fto\u002Fproject)\n\n- 摘要：\n- 数据生成模型：\n- 论文：\n- 许可证：\n- 相关：（如果适用）\n```\n\n# 多模态指令数据集\n\n ## [(Vision-CAIR\u002FMiniGPT-4)|5K|EN|MT|MIX](https:\u002F\u002Fminigpt-4.github.io\u002F)\n\n - 摘要：这是一个高质量、对齐良好的（例如，更详细的图像描述）图文数据集，由两个机器人之间的对话生成，类似于[ChatCaptioner](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FChatCaptioner)。该图文数据集随后可以与一些预定义的指令模板一起用于图像-指令-答案的微调。\n - 模态：文本、图像\n - 数据生成模型：无\n - 论文：[MiniGPT-4：利用先进大型语言模型增强视觉-语言理解能力](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FMiniGPT-4\u002Fblob\u002Fmain\u002FMiniGPT_4.pdf)\n - 许可证：[`BSD 3-Clause`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fbsd-3-clause\u002F)\n - 相关：\n     - [用于图像和视频的交互式ChatCaptioner](https:\u002F\u002Fgithub.com\u002FVision-CAIR\u002FChatCaptioner)\n\n ## [(haotian-liu\u002FLLaVA)|150K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fliuhaotian\u002FLLaVA-Instruct-150K)\n\n - 摘要：LLaVA Visual Instruct 150K是一组由GPT生成的多模态指令遵循数据。它专为视觉指令微调以及构建面向GPT-4视觉\u002F语言能力的大规模多模态模型而设计。\n - 模态：文本、图像\n - 数据生成模型：`GPT-4-0314`\n - 论文：[视觉指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n - 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n ## [(sunrainyg\u002FInstructCV)|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FAlaaLab\u002FInstructCV)\n\n - 摘要：指令微调的文本到图像扩散模型作为视觉通用模型\n - 模态：文本、图像\n - 论文：[InstructCV](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.00390.pdf)\n - 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n# 指令遵循数据集\n\n ## 
[(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n\n- 摘要：基于修改后的 `self-instruct` 流程，结合人工编写的 `175 个种子任务` 生成的 `52K` 数据。\n- 数据生成模型：`text-davinci-003`\n- 论文：[alpaca-blog](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html)\n- 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n## [(gururise\u002FCleaned Alpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fgururise\u002FAlpacaDataCleaned)\n\n- 摘要：一个对 Alpaca 52K 数据集进行手动清洗的项目。\n- 数据生成模型：`text-davinci-003`\n- 论文：无\n- 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n## [(XueFuzhao\u002FInstructionWild)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FXueFuzhao\u002FInstructionWild)\n\n- 摘要：基于修改后的 `self-instruct` 流程，结合人工编写的 `429 个种子任务` 生成的 `52K` 数据。\n- 数据生成模型：`text-davinci-003`\n- 论文：无\n- 许可证：InstructWild 数据集仅用于非商业研究目的。\n\n## [(JosephusCheung\u002FGuanacoDataset)|534K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FJosephusCheung\u002FGuanacoDataset)\n\n- 摘要：基于修改后的 `self-instruct` 流程，结合人工编写的 `429 个种子任务` 生成的 `52K` 指令数据。\n- 数据生成模型：`text-davinci-003`\n- 许可证：[`GPL-3.0`](https:\u002F\u002Fwww.gnu.org\u002Flicenses\u002Fgpl-3.0.en.html)\n\n## [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n\n- 摘要：首个人类与 ChatGPT 对比语料库（英文版），名为 HC3 数据集。\n- 数据生成模型：`gpt-3.5`、人工生成\n- 论文：[ChatGPT 与人类专家有多接近？对比语料库、评估与检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07597)\n- 许可证：[`CC BY-SA 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n\n## [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3-Chinese)\n\n- 摘要：首个中文版的人类与 ChatGPT 对比语料库，名为 HC3 数据集。\n- 数据生成模型：`gpt-3.5`、人工生成\n- 论文：[ChatGPT 与人类专家有多接近？对比语料库、评估与检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07597)\n- 许可证：[`CC BY-SA 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n\n## [(allenai\u002Fprosocial-dialog)|58K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fallenai\u002Fprosocial-dialog)\n\n- 摘要：ProsocialDialog 是首个大规模多轮英语对话数据集，旨在教导对话式智能体如何按照社会规范回应不良内容。\n- 数据生成模型：`gpt-3.5`、人工生成\n- 论文：[ProsocialDialog：对话式智能体的社会友好型基础](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.12688)\n- 许可证：[`CC BY 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby\u002F4.0\u002F)\n\n## [(allenai\u002Fnatural-instructions)|1.6K|ML|MT|HG](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fnatural-instructions)\n\n- 摘要：一项社区协作项目，旨在创建包含 `1,616 种多样化 NLP 任务` 及其自然语言定义\u002F指令的大规模集合。\n- 数据生成模型：人工生成\n- 论文：[Super-NaturalInstructions：通过 1600 多种 NLP 任务的声明式指令实现泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n## [(bigscience\u002FxP3)|N\u002FA|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fbigscience\u002FxP3)\n\n- 摘要：[提示资源] xP3（跨语言公共提示池）是一个涵盖 46 种语言和 16 种 NLP 任务的提示与数据集集合。\n- 数据生成模型：无\n- 论文：[通过多任务微调实现跨语言泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01786)\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n## [(PhoebusSi\u002FAlpaca-CoT)|500k|ML|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FQingyiSi\u002FAlpaca-CoT)\n\n- 摘要：一个基于 LLaMA 和 Alpaca 的思维链推理数据集。注意：该仓库将持续收集并整合各种指令调优数据集。[GitHub 
仓库](https:\u002F\u002Fgithub.com\u002FPhoebusSi\u002FAlpaca-CoT)\n- 论文：无\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n## [(nomic-ai\u002Fgpt4all)|437k|EN|MT|COL](https:\u002F\u002Fgithub.com\u002Fnomic-ai\u002Fgpt4all)\n\n- 摘要：gpt4all 利用了三个公开可用的数据集：1.[laion\u002FOIG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flaion\u002FOIG)，2.[pacovaldez\u002Fstackoverflow-questions](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fpacovaldez\u002Fstackoverflow-questions)，3. [bigscience\u002Fbloomz-p3](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloomz-p3) 的子集。\n- 数据生成模型：无\n- 论文：[GPT4All：使用 GPT-3.5-Turbo 的大规模数据蒸馏训练助理型聊天机器人](https:\u002F\u002Fs3.amazonaws.com\u002Fstatic.nomic.ai\u002Fgpt4all\u002F2023_GPT4All_Technical_Report.pdf)\n- 许可证：[`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n\n## [(teknium1\u002FGPTeacher)|20k+|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Fteknium1\u002FGPTeacher)\n\n- 摘要：由 GPT-4 生成的一系列模块化数据集，包括 General-Instruct、Roleplay-Instruct、Code-Instruct 和 Toolformer。\n- 数据生成模型：`GPT-4`\n- 论文：无\n- 许可证：[`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n\n## [(google-research\u002FFLAN)|N\u002FA|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FFLAN\u002Ftree\u002Fmain\u002Fflan\u002Fv2)\n\n- 摘要：Flan Collection 将 Flan 2021、P3、Super-Natural Instructions 等数十个数据集汇集到一处，并将其格式化为零样本、少样本和思维链模板的混合形式。\n- 数据生成模型：无\n- 论文：[Flan Collection：设计用于有效指令调优的数据与方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.13688)\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n## [(thunlp\u002FUltraChat)|280k|EN|TS|MIX](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FUltraChat)\n\n- 摘要：UltraChat 旨在构建一个开源、大规模且多轮的对话数据集。UltraChat 的第一部分（即“关于世界的问题”领域）已发布，包含 28 万条多样且信息丰富的对话。未来还将推出更多关于写作与创作、现有资料辅助等方面的对话。\n- 数据生成模型：`GPT-3.5-turbo`\n- 论文：无\n- 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n\n## [(cascip\u002FChatAlpaca)|10k|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fcascip\u002FChatAlpaca)\n\n- 摘要：基于斯坦福Alpaca数据集，ChatAlpaca将数据扩展至多轮指令及其对应响应。更多数据（2万条）及中文翻译版本即将发布。\n- 数据生成模型：`GPT-3.5-turbo`\n- 论文：无\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n- 相关：[(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n\n## [(YeungNLP\u002Ffirefly-train-1.1M)|1100k|CN|MT|COL](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M)\n\n- 摘要：包含23个任务的中文数据集，并结合人工编写的指令模板。\n- 数据生成模型：无\n- 论文：无\n- 许可证：无\n\n## [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n\n- 摘要：通过向语言模型提供三个种子指令示例并诱导其生成第四个示例，生成6.4万条样本；随后通过让模型改写每条指令，将数据集扩展至24万条。\n- 数据生成模型：`text-davinci-002`\n- 论文：[不自然指令：几乎无需人工即可微调语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689)\n- 许可证：[`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n\n## [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|CN|MT|SI](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n\n- 摘要：由GPT-4生成的5.2万条指令遵循数据，使用原始Alpaca指令及由ChatGPT翻译成中文的Alpaca指令；此外，还有9千条使用“不自然指令”生成的指令遵循数据。\n- 数据生成模型：`GPT-4`\n- 论文：[使用GPT-4进行指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n- 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n- 相关：\n    - 
[(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n    - [(orhonovich\u002Funnatural-instructions)|240K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)\n\n## [(databrickslabs\u002Fdolly)|15K|EN|MT|HG](https:\u002F\u002Fgithub.com\u002Fdatabrickslabs\u002Fdolly\u002Ftree\u002Fmaster\u002Fdata)\n\n- 摘要：该数据集由数千名Databricks员工在InstructGPT论文中列出的几类行为场景下生成，包括头脑风暴、分类、封闭式问答、生成、信息抽取、开放式问答和摘要等。\n- 数据生成模型：无\n- 论文：[免费Dolly](https:\u002F\u002Fwww.databricks.com\u002Fblog\u002F2023\u002F04\u002F12\u002Fdolly-first-open-commercially-viable-instruction-tuned-llm)\n- 许可证：[`CC BY-SA 3.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F3.0\u002F)\n\n## [(OpenAssistant\u002Foasst1)|161K|ML|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FOpenAssistant\u002Foasst1)\n\n- 摘要：OpenAssistant对话数据集（OASST1），是一个由人类生成并标注的助手风格对话语料库，包含161,443条消息，分布在66,497个对话树中，涵盖35种语言，并附有461,292条质量评分。\n- 数据生成模型：无\n- 论文：[OpenAssistant对话——推动大型语言模型对齐的民主化](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX\u002Fview)\n- 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n## [(RyokoAI\u002FShareGPT52K)|90K|ML|MT|SI](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FRyokoAI\u002FShareGPT52K)\n\n- 摘要：通过ShareGPT API抓取的9万条对话，这些对话在API关闭前被收集，包括用户提问以及来自OpenAI ChatGPT的回复。\n- 数据生成模型：`GPT-4`、`GPT-3.5`\n- 论文：无\n- 许可证：[`CC0 1.0 Universal`](https:\u002F\u002Fcreativecommons.org\u002Fpublicdomain\u002Fzero\u002F1.0\u002F)\n\n## [(zjunlp\u002FMol-Instructions)|2043K|ML|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fzjunlp\u002FMol-Instructions)\n\n- 摘要：一个开放的大规模生物分子指令数据集，包含14.84万条以分子为导向的指令、50.5万条以蛋白质为导向的指令，以及5.3万条生物分子文本指令。\n- 数据生成模型：`GPT-3.5`\n- 论文：[Mol-Instructions：面向大型语言模型的大规模生物分子指令数据集](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.08018.pdf)\n- 许可证：[`CC BY 4.0`](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FMol-Instructions\u002Fblob\u002Fmain\u002FDATA_LICENSE)\n\n# 基于人类反馈的强化学习（RLHF）| 红队测试数据集\n\n## [(Anthropic\u002Fhh-rlhf)|22k|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf)\n\n- 摘要：该RLHF数据集是一个迭代的“在线”数据集，包含了来自52B参数语言模型的数据。其中包含2.2万条有益性比较，但没有红队测试数据。\n- 数据生成模型：`Anthropic RL-CAI 52B`\n- 论文：[通过基于人类反馈的强化学习训练出既有益又无害的助手](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05862)\n- 许可证：[`MIT License`](https:\u002F\u002Fopensource.org\u002Flicense\u002Fmit\u002F)\n- 相关：\n    - [(Hello-SimpleAI\u002FHC3)|24K|EN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3)\n    - [(Hello-SimpleAI\u002FHC3-Chinese)|13K|CN|MT|MIX](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHello-SimpleAI\u002FHC3-Chinese)\n\n## [(thu-coai\u002FSafety-Prompts)|100k|CN|MT|MIX](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)\n\n - 摘要：用于评估和提升大语言模型安全性的中文安全提示。该仓库包含10万个中文安全场景提示及ChatGPT的响应，覆盖多种安全场景和指令攻击。可用于全面评估和改进模型安全性，同时增强模型的安全知识，使模型输出更符合人类价值观。\n - 数据生成模型：`GPT-3.5`\n - 论文：[中文大型语言模型的安全性评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10436)\n - 许可证：[`Apache License 2.0`](https:\u002F\u002Fwww.apache.org\u002Flicenses\u002FLICENSE-2.0)\n\n  ## [(HuggingFaceH4\u002Fstack-exchange-preferences)|10741k|EN|TS|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FHuggingFaceH4\u002Fstack-exchange-preferences)\n\n - 摘要：该数据集包含来自Stack Overflow数据转储的问题与答案，用于偏好模型训练。\n - 数据生成模型：无\n - 论文：[通用语言助手作为对齐研究的实验室](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.00861)\n - 许可证：[`CC BY-SA 
4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)\n - 相关：\n     - [stack-exchange-paired](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flvwerra\u002Fstack-exchange-paired)\n\n  ## [(stanfordnlp\u002FSHP)|385k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fstanfordnlp\u002FSHP)\n\n - 摘要：每个样本是一个Reddit帖子，包含一个问题或指令以及该帖子中的两个顶级评论，其中一个评论更受Reddit用户（总体）青睐。\n - 数据生成模型：无\n - 论文：无\n - 许可证：无\n \n  ## [(Instruction-Tuning-with-GPT-4\u002FGPT-4-LLM)|52K|EN|MT|MIX](https:\u002F\u002Fgithub.com\u002FInstruction-Tuning-with-GPT-4\u002FGPT-4-LLM)\n \n - 摘要：对Alpaca提示的三种模型（GPT-4、GPT-3.5和OPT-IML）的响应进行排名（注：数据由`GPT-4`模型而非人类评估），通过让GPT-4评分来判断质量。作者认为“GPT-4能够识别并修正自身的错误，并准确判断响应的质量”。\n - 数据生成模型：`GPT-4`\n - 论文：[使用GPT-4进行指令微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.03277)\n - 许可证：[`CC BY-NC 4.0`](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-nc\u002F4.0\u002Fdeed.en_GB)\n - 相关：\n     - [(tatsu-lab\u002FAlpaca)|52K|EN|MT|SI](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)\n      \n  ## [(Reddit\u002Feli5)|500k|EN|MT|HG](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Feli5)\n\n- 摘要：该数据集包含来自子版块[r\u002Fexplainlikeimfive](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fexplainlikeimfive\u002F)、[r\u002Faskhistorians](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FAskHistorians\u002F)和[r\u002Faskscience](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FAskScience\u002F)的问题与答案。\n- 数据生成模型：无\n- 论文：无\n- 许可证：无\n- 相关：[eli5数据集](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fvincentmin\u002Feli5_rlhf)，它是[eli5](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Feli5)数据集的一种转换，格式类似于[stack-exchange-paired](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Flvwerra\u002Fstack-exchange-paired)。\n\n# 允许商业使用的许可证\n\n注意：虽然这些许可证允许商业使用，但它们在署名、分发或修改方面可能有不同的要求。在将任何许可证用于商业项目之前，请务必仔细阅读其具体条款。\n\n允许商业使用的许可证：\n\n1. `Apache License 2.0`\n2. `MIT License`\n3. `BSD 3-Clause License`\n4. `BSD 2-Clause License`\n5. `GNU Lesser General Public License v3.0 (LGPLv3)`\n6. `GNU Affero General Public License v3.0 (AGPLv3)`\n7. `Mozilla Public License 2.0 (MPL-2.0)`\n8. `Eclipse Public License 2.0 (EPL-2.0)`\n9. `Microsoft Public License (Ms-PL)`\n10. `Creative Commons Attribution 4.0 International (CC BY 4.0)`\n11. `Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)`\n12. `zlib License`\n13. `Boost Software License 1.0`\n\n","# awesome-instruction-dataset 快速上手指南\n\n`awesome-instruction-dataset` 并非一个可直接安装的软件库或 Python 包，而是一个**开源指令微调数据集的精选合集**。它旨在为研究人员和开发者提供训练大语言模型（LLM）及多模态模型所需的高质量数据资源列表。\n\n本指南将指导你如何浏览该列表、选择适合的数据集，并快速开始数据下载与使用。\n\n## 1. 环境准备\n\n由于本项目本质是数据索引，无需安装特定的运行时环境。你需要准备以下基础开发环境以进行后续的数据下载和处理：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+), macOS, 或 Windows (WSL2)\n*   **Python**: 3.8 或更高版本\n*   **依赖库**:\n    *   `git`: 用于克隆仓库查看列表\n    *   `huggingface_hub`: 用于从 Hugging Face 下载数据集（大多数数据集托管于此）\n    *   `datasets`: Hugging Face 提供的数据处理库\n\n**前置依赖安装命令：**\n\n```bash\n# 安装 Hugging Face 相关库\npip install huggingface_hub datasets\n\n# (可选) 配置国内镜像加速 (针对 Hugging Face 下载慢的问题)\nexport HF_ENDPOINT=https:\u002F\u002Fhf-mirror.com\n```\n\n## 2. 获取数据集列表\n\n首先，克隆该仓库以获取最新的数据集清单和详细信息。\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FyaodongC\u002Fawesome-instruction-dataset.git\ncd awesome-instruction-dataset\n```\n\n> **提示**: 如果 GitHub 访问缓慢，可使用国内镜像源：\n> `git clone https:\u002F\u002Fgitee.com\u002Fmirrors\u002Fawesome-instruction-dataset.git` (若存在镜像) 或通过代理访问。\n\n在仓库根目录查看 `README.md` 文件，根据以下标签筛选适合你的数据集（列表较长时，也可用下方脚本按标签粗筛）：\n*   **语言标签**: `CN` (中文), `EN` (英文), `ML` (多语言)\n*   **任务标签**: `MT` (多任务), `TS` (特定任务)\n*   **生成方式**: `HG` (人工生成), `SI` (Self-Instruct), `MIX` (混合), `COL` (数据集合集)
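\n\n下面是一个按标签粗筛的最小脚本示例（假设条目遵循 README 中 `[(所有者\u002F名称)|规模|标签…](链接)` 的书写约定；函数名 `filter_datasets` 为本示例自拟，仅供参考）：\n\n```python\nimport re\n\n# 按标签组合粗筛 README 中的数据集条目（标签含义见上方图例）\n# 同一条目可能同时出现在目录与正文中，这里用 set 去重\ndef filter_datasets(readme_path, required_tags):\n    pattern = re.compile(r\"\\[\\((.+?)\\)\\|(.+?)\\]\\((https?:\u002F\u002F\\S+?)\\)\")\n    results = set()\n    with open(readme_path, encoding=\"utf-8\") as f:\n        for line in f:\n            for name, tag_str, url in pattern.findall(line):\n                tags = tag_str.split(\"|\")\n                if all(t in tags for t in required_tags):\n                    results.add((name, tag_str, url))\n    return sorted(results)\n\n# 用法示例：筛选中文 (CN)、多任务 (MT) 的数据集\nfor name, tags, url in filter_datasets(\"README.md\", [\"CN\", \"MT\"]):\n    print(name, tags, url)\n```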
\n\n## 3. 基本使用示例\n\n以下演示如何选取列表中热门的 **Alpaca (英文)** 和 **Firefly (中文)** 数据集进行下载和预览。\n\n### 场景 A：下载英文指令数据集 (Stanford Alpaca)\n\nAlpaca 是基于 Self-Instruct 方法生成的经典指令数据集。\n\n```python\nfrom datasets import load_dataset\n\n# 加载 Stanford Alpaca 数据集 (52K 条)\n# 对应列表中的: (tatsu-lab\u002FAlpaca)|52K|EN|MT|SI\ndataset = load_dataset(\"tatsu-lab\u002Falpaca\")\n\n# 查看数据结构\nprint(dataset[\"train\"][0])\n\n# 输出示例:\n# {'instruction': 'Give three tips for staying healthy.', \n#  'input': '', \n#  'output': '1.Eat a balanced diet...'}\n```\n\n### 场景 B：下载中文指令数据集 (Firefly)\n\nFirefly 是专为中文大模型训练收集的高质量数据集。\n\n```python\nfrom datasets import load_dataset\n\n# 确保已设置国内镜像加速 (见环境准备部分)，否则下载可能超时\n# 加载 Firefly 中文数据集 (1.1M 条)\n# 对应列表中的: (YeungNLP\u002Ffirefly-train-1.1M)|1100k|CN|MT|COL\ndataset = load_dataset(\"YeungNLP\u002Ffirefly-train-1.1M\")\n\n# 预览前两条数据（注意：该数据集的字段为 kind\u002Finput\u002Ftarget）\nfor i in range(2):\n    print(f\"--- Sample {i+1} ---\")\n    print(f\"Kind: {dataset['train'][i]['kind']}\")\n    print(f\"Input: {dataset['train'][i]['input']}\")\n    print(f\"Target: {dataset['train'][i]['target']}\\n\")\n```\n\n### 场景 C：处理多模态数据集 (LLaVA)\n\n对于视觉 - 语言指令微调（如 LLaVA），数据通常包含图像路径和文本指令。\n\n```python\nfrom datasets import load_dataset\n\n# 加载 LLaVA Visual Instruct 150K\n# 对应列表中的: (haotian-liu\u002FLLaVA)|150K|EN|MT|MIX\n# 注：该仓库以原始 JSON 文件发布，这里显式指定 data_files 加载\ndataset = load_dataset(\"json\", data_files=\"hf:\u002F\u002Fdatasets\u002Fliuhaotian\u002FLLaVA-Instruct-150K\u002Fllava_instruct_150k.json\")\n\n# 查看单条数据 (包含图像文件和对话内容)\nsample = dataset[\"train\"][0]\nprint(f\"Image source: {sample['image']}\")\nprint(f\"Conversation: {sample['conversations']}\")\n```\n\n## 4. 下一步建议\n\n1.  **数据清洗**: 下载后的数据可能需要根据你的具体任务进行格式统一或清洗（参考列表中的 `Cleaned Alpaca` 项目思路）。\n2.  **格式转换**: 将数据转换为训练框架（如 LLaMA-Factory, DeepSpeed, Megatron-LM）所需的 JSONL 格式（见本节末尾的转换示例）。\n3.  **许可证检查**: 在使用前，务必在列表中确认数据集的 `License` 字段。\n    *   `CC BY-NC`: 仅限非商业用途。\n    *   `Apache 2.0` \u002F `MIT`: 通常允许商业使用（需遵循具体条款）。\n    *   列表第 5 章节专门整理了**[允许商业使用的许可证](#license-that-allows-commercial-use)**，商用前请优先查阅该部分。
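\n\n针对第 2 步的格式转换，下面给出一个最小示例（假设目标训练框架接受每行一个 JSON 对象的 JSONL 文件；输出文件名 `alpaca_train.jsonl` 为本示例自拟）：\n\n```python\nimport json\n\nfrom datasets import load_dataset\n\n# 以 Alpaca 为例，将 instruction\u002Finput\u002Foutput 字段逐行导出为 JSONL\n# 其他数据集请按其实际字段名调整\ndataset = load_dataset(\"tatsu-lab\u002Falpaca\", split=\"train\")\n\nwith open(\"alpaca_train.jsonl\", \"w\", encoding=\"utf-8\") as f:\n    for sample in dataset:\n        record = {\n            \"instruction\": sample[\"instruction\"],\n            \"input\": sample[\"input\"],\n            \"output\": sample[\"output\"],\n        }\n        f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n```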
","某初创团队希望基于 LLaMA 架构构建一个支持中英双语的垂直领域医疗咨询助手，但面临高质量训练数据匮乏的难题。\n\n### 没有 awesome-instruction-dataset 时\n- **数据搜集耗时极长**：团队成员需手动在 GitHub、Hugging Face 等平台分散搜索，难以区分哪些数据集包含中文或特定医疗任务，效率低下。\n- **数据质量参差不齐**：缺乏统一标准，容易混入未经清洗的噪声数据或未标注来源的机器生成内容，导致模型出现“幻觉”或回答不专业。\n- **多模态能力缺失**：若想增加“看片诊断”功能，很难快速找到像 LLaVA 或 MiniGPT-4 那样成熟的图文指令对数据集，被迫放弃多模态研发。\n- **合规与安全风险高**：缺少专门的 RLHF（人类反馈强化学习）和红队测试数据集，模型可能输出有害建议，无法满足医疗行业的严谨性要求。\n\n### 使用 awesome-instruction-dataset 后\n- **一站式精准获取**：直接通过标签筛选出“CN（中文）”、“MT（多任务）”及“HG（人工生成）”的数据集（如 Firefly 或 HC3-Chinese），半天内即可凑齐千万级高质量语料。\n- **结构化分类清晰**：利用其明确的生成方法标签（SI\u002FMIX\u002FHG），团队能快速组合出自指令数据与人工校验数据，显著提升了模型在复杂问诊中的逻辑稳定性。\n- **多模态快速集成**：直接调用列表中集成的视觉指令数据集，顺利将图像识别能力融入模型，实现了上传检查单即可解读的功能。\n- **安全对齐有保障**：引入专门的 prosocial-dialog 和红队测试数据集进行微调，有效抑制了模型的错误医疗建议，大幅降低了上线风险。\n\nawesome-instruction-dataset 通过聚合全球优质的指令微调资源，将原本数周的数据工程压缩至数天，让开发者能专注于模型核心能力的打磨而非数据搬运。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FyaodongC_awesome-instruction-dataset_cb323abe.png","yaodongC","Yaodong Cui","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FyaodongC_8eaf6c93.png","Co-founder of LoopX.ai | Ph.D | NEXT AI 23","@loopx-ai","Waterloo, Canada",null,"loopx.ai","https:\u002F\u002Fgithub.com\u002FyaodongC",1146,56,"2026-03-24T02:43:32",1,"","未说明",{"notes":88,"python":86,"dependencies":89},"该项目是一个指令微调数据集的集合列表（Awesome List），本身不包含可执行的代码库或训练脚本，因此没有特定的运行环境、GPU、内存或依赖库要求。用户需根据列表中引用的具体子项目（如 MiniGPT-4, LLaVA, Alpaca 等）的独立文档来配置相应的运行环境。",[],[35,16,14],[92,93,94,95,96,97,98,99],"llama","awsome-lists","datasets","gpt-3","gpt-4","instruction-following","instruction-tuning","language-model","2026-03-27T02:49:30.150509","2026-04-10T10:30:15.650720",[],[]]