[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-PKU-Alignment--safe-rlhf":3,"tool-PKU-Alignment--safe-rlhf":62},[4,18,28,37,45,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":24,"last_commit_at":25,"category_tags":26,"status":17},9989,"n8n","n8n-io\u002Fn8n","n8n 是一款面向技术团队的公平代码（fair-code）工作流自动化平台，旨在让用户在享受低代码快速构建便利的同时，保留编写自定义代码的灵活性。它主要解决了传统自动化工具要么过于封闭难以扩展、要么完全依赖手写代码效率低下的痛点，帮助用户轻松连接 400 多种应用与服务，实现复杂业务流程的自动化。\n\nn8n 特别适合开发者、工程师以及具备一定技术背景的业务人员使用。其核心亮点在于“按需编码”：既可以通过直观的可视化界面拖拽节点搭建流程，也能随时插入 JavaScript 或 Python 代码、调用 npm 包来处理复杂逻辑。此外，n8n 原生集成了基于 LangChain 的 AI 能力，支持用户利用自有数据和模型构建智能体工作流。在部署方面，n8n 提供极高的自由度，支持完全自托管以保障数据隐私和控制权，也提供云端服务选项。凭借活跃的社区生态和数百个现成模板，n8n 让构建强大且可控的自动化系统变得简单高效。",184740,2,"2026-04-19T23:22:26",[16,14,13,15,27],"插件",{"id":29,"name":30,"github_repo":31,"description_zh":32,"stars":33,"difficulty_score":10,"last_commit_at":34,"category_tags":35,"status":17},10095,"AutoGPT","Significant-Gravitas\u002FAutoGPT","AutoGPT 是一个旨在让每个人都能轻松使用和构建 AI 的强大平台，核心功能是帮助用户创建、部署和管理能够自动执行复杂任务的连续型 AI 智能体。它解决了传统 AI 应用中需要频繁人工干预、难以自动化长流程工作的痛点，让用户只需设定目标，AI 即可自主规划步骤、调用工具并持续运行直至完成任务。\n\n无论是开发者、研究人员，还是希望提升工作效率的普通用户，都能从 AutoGPT 中受益。开发者可利用其低代码界面快速定制专属智能体；研究人员能基于开源架构探索多智能体协作机制；而非技术背景用户也可直接选用预置的智能体模板，立即投入实际工作场景。\n\nAutoGPT 的技术亮点在于其模块化“积木式”工作流设计——用户通过连接功能块即可构建复杂逻辑，每个块负责单一动作，灵活且易于调试。同时，平台支持本地自托管与云端部署两种模式，兼顾数据隐私与使用便捷性。配合完善的文档和一键安装脚本，即使是初次接触的用户也能在几分钟内启动自己的第一个 AI 智能体。AutoGPT 正致力于降低 AI 应用门槛，让人人都能成为 AI 的创造者与受益者。",183572,"2026-04-20T04:47:55",[13,36,27,14,15],"语言模型",{"id":38,"name":39,"github_repo":40,"description_zh":41,"stars":42,"difficulty_score":10,"last_commit_at":43,"category_tags":44,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":46,"name":47,"github_repo":48,"description_zh":49,"stars":50,"difficulty_score":24,"last_commit_at":51,"category_tags":52,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 
工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",161692,"2026-04-20T11:33:57",[14,13,36],{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":59,"last_commit_at":60,"category_tags":61,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,27],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":68,"readme_en":69,"readme_zh":70,"quickstart_zh":71,"use_case_zh":72,"hero_image_url":73,"owner_login":74,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":78,"owner_twitter":77,"owner_website":77,"owner_url":79,"languages":80,"stars":97,"forks":98,"last_commit_at":99,"license":100,"difficulty_score":10,"env_os":101,"env_gpu":102,"env_ram":102,"env_deps":103,"category_tags":107,"github_topics":108,"view_count":24,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":128,"updated_at":129,"faqs":130,"releases":146},10186,"PKU-Alignment\u002Fsafe-rlhf","safe-rlhf","Safe RLHF: Constrained Value Alignment via Safe Reinforcement Learning from Human Feedback","safe-rlhf 是由北京大学团队开发的一款高度模块化的开源框架，旨在通过“安全强化学习人类反馈”（Safe RLHF）技术，解决大语言模型在价值观对齐过程中难以兼顾“有用性”与“安全性”的难题。传统方法往往侧重提升模型能力而忽视潜在危害，safe-rlhf 则引入约束机制，确保模型在遵循人类偏好的同时，严格规避有害输出。\n\n该工具支持对 LLaMA、OPT、Baichuan 等主流预训练模型进行监督微调（SFT）、常规 RLHF 及安全 RLHF 训练。其核心亮点在于提供了规模高达百万级的人工标注数据集，涵盖帮助性与无害性偏好，并配套了完整的奖励模型与成本模型训练流程及预训练权重。此外，它还集成了 BIG-bench、GPT-4 评估等多尺度指标，方便开发者量化验证模型的安全约束效果。\n\nsafe-rlhf 特别适合从事大模型对齐研究的研究人员、需要定制安全策略的算法工程师以及希望复现前沿安全对齐成果的开发者使用。无论是探索受限价值对齐的理论边界，还是构建实际落地的安全对话系统，safe-rlhf 都提供了一套可复现、可扩展且数据丰富的完整解决方案，助力社区共同推动可信 AI 的发","safe-rlhf 是由北京大学团队开发的一款高度模块化的开源框架，旨在通过“安全强化学习人类反馈”（Safe RLHF）技术，解决大语言模型在价值观对齐过程中难以兼顾“有用性”与“安全性”的难题。传统方法往往侧重提升模型能力而忽视潜在危害，safe-rlhf 则引入约束机制，确保模型在遵循人类偏好的同时，严格规避有害输出。\n\n该工具支持对 LLaMA、OPT、Baichuan 等主流预训练模型进行监督微调（SFT）、常规 RLHF 及安全 RLHF 训练。其核心亮点在于提供了规模高达百万级的人工标注数据集，涵盖帮助性与无害性偏好，并配套了完整的奖励模型与成本模型训练流程及预训练权重。此外，它还集成了 BIG-bench、GPT-4 评估等多尺度指标，方便开发者量化验证模型的安全约束效果。\n\nsafe-rlhf 特别适合从事大模型对齐研究的研究人员、需要定制安全策略的算法工程师以及希望复现前沿安全对齐成果的开发者使用。无论是探索受限价值对齐的理论边界，还是构建实际落地的安全对话系统，safe-rlhf 都提供了一套可复现、可扩展且数据丰富的完整解决方案，助力社区共同推动可信 AI 的发展。","\u003C!-- markdownlint-disable first-line-h1 -->\n\u003C!-- markdownlint-disable html -->\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002FPKU-Beaver-logo-wide.svg\" width=\"80%\"\u002F>\n\u003C\u002Fdiv>\n\n\u003Ch1 align=\"center\">Constrained Value-Aligned LLM via Safe RLHF\u003C\u002Fh1>\n\nBeaver is a highly modular open-source RLHF framework developed by the PKU-Alignment team at Peking University.\nIt aims to provide training data and a reproducible code pipeline for alignment research, especially constrained alignment LLM research via Safe RLHF methods.\n\nThe key features of Beaver are:\n\n- Support **SFT**, **RLHF** and **Safe RLHF** training for popular pre-trained models: [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai), 
[OPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068), [Baichuan](https:\u002F\u002Fhuggingface.co\u002Fbaichuan-inc\u002FBaichuan-7B), etc.\n- Provide a large human-labeled dataset **(up to 1M pairs)** including both helpful and harmless preferences to support reproducible RLHF research.\n- Support training for Reward Model & Cost Model, and provide pre-trained checkpoints.\n- Support customized parameters and datasets for SFT and RLHF.\n- Provide multi-scale metrics for safety constraints verification, e.g., BIG-bench, GPT-4 Evaluation.\n\n## **🦫 What's New?**  \u003C!-- omit in toc -->\n\n- **🎉 `2024\u002F06\u002F13`:** We are pleased to announce the open-sourcing of our PKU-SafeRLHF dataset version 1.0. This release advances over the initial beta version by incorporating human-AI joint annotations, expanding the scope of harm categories, and introducing detailed severity level labels. For further details and access, please visit our dataset page on 🤗 Hugging Face: [PKU-Alignment\u002FPKU-SafeRLHF](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF).\n- **🎉 `2024\u002F01\u002F16`:** Our method [**Safe RLHF**](https:\u002F\u002Fopenreview.net\u002Fforum?id=TyFrPOKYXw) has been accepted by ICLR 2024 Spotlight.\n- **📄 `2023\u002F10\u002F19`:** We've released our [**Safe RLHF paper**](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12773) on arXiv, detailing our new safe alignment algorithm and its implementation.\n- **🚀 `2023\u002F07\u002F10`:** We're delighted to announce the open-sourcing of **Beaver-7B** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0) models as the first milestone of the Safe RLHF training series, complemented by the corresponding **Reward Models** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0-reward) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0-reward) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0-reward) \u002F [unified](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-unified-reward) and **Cost Models** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0-cost) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0-cost) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0-cost) \u002F [unified](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-unified-cost) checkpoints on 🤗 Hugging Face.\n- **🔥 `2023\u002F07\u002F10`:** We extend the open-source safety preference dataset, [**PKU-Alignment\u002FPKU-SafeRLHF**](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF), which now contains over 300k examples. (See also section [PKU-SafeRLHF-Dataset](#pku-saferlhf-dataset))\n- **⚙ `2023\u002F07\u002F05`:** We enhanced our support for Chinese pre-training models and incorporated additional open-source Chinese datasets. 
(See also sections [Chinese Support (中文支持)](#chinese-support-中文支持) and [Custom Datasets (自定义数据集)](#custom-datasets))\n- **⭐️ `2023\u002F05\u002F15`:** First release of the Safe RLHF pipeline, evaluation results, and training code.\n\n### Table of Contents  \u003C!-- omit in toc -->\n\n- [Constrained Value Alignment via Safe RLHF](#constrained-value-alignment-via-safe-rlhf)\n- [Comparison with Other RLHF Libraries](#comparison-with-other-rlhf-libraries)\n- [PKU-SafeRLHF-Dataset](#pku-saferlhf-dataset)\n  - [PKU-SafeRLHF-10K](#pku-saferlhf-10k)\n  - [PKU-SafeRLHF-1M](#pku-saferlhf-1m)\n- [Why \"Beaver\"](#why-beaver)\n- [Beaver vs. Alpaca](#beaver-vs-alpaca)\n- [Installation](#installation)\n- [Training](#training)\n- [Custom Datasets](#custom-datasets)\n- [Inference](#inference)\n  - [Interactive CLI Demo](#interactive-cli-demo)\n  - [Interactive Arena](#interactive-arena)\n- [Chinese Support (中文支持)](#chinese-support-中文支持)\n- [Benchmark and Evaluation](#benchmark-and-evaluation)\n  - [Arena via Reward and Cost Models](#arena-via-reward-and-cost-models)\n  - [BIG-bench](#big-bench)\n  - [GPT-4 Evaluation](#gpt-4-evaluation)\n- [Future Plans](#future-plans)\n- [Citation](#citation)\n- [PKU-Alignment Team](#pku-alignment-team)\n- [License](#license)\n\n## Constrained Value Alignment via Safe RLHF\n\nReinforcement Learning from Human Feedback: reward maximization via preference learning\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Frl-formulation.svg\" width=\"40%\"\u002F>\n\u003C\u002Fdiv>\n\nSafe Reinforcement Learning from Human Feedback: constrained reward maximization via preference learning\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Fsafe-rl-formulation.svg\" width=\"55%\"\u002F>\n\u003C\u002Fdiv>\n\nwhere $R (\\cdot)$ and $C (\\cdot)$ are reward and cost functions respectively. 
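In symbols (a sketch in the spirit of the Safe RLHF paper's formulation; the exact constraint threshold convention is an assumption here), the two objectives rendered in the images above are:\n\n$$\\text{RLHF:}\\quad \\max_{\\theta} \\; \\mathcal{J}_R(\\theta) \\triangleq \\mathbb{E}_{x \\sim \\mathcal{D},\\, y \\sim \\pi_{\\theta}(\\cdot \\mid x)} \\left[ R(y, x) \\right]$$\n\n$$\\text{Safe RLHF:}\\quad \\max_{\\theta} \\; \\mathcal{J}_R(\\theta) \\quad \\text{s.t.} \\quad \\mathcal{J}_C(\\theta) \\triangleq \\mathbb{E}_{x \\sim \\mathcal{D},\\, y \\sim \\pi_{\\theta}(\\cdot \\mid x)} \\left[ C(y, x) \\right] \\le 0$$\n\nThe `scripts\u002Fppo-lag.sh` stage below solves the constrained problem through a Lagrangian relaxation, $\\min_{\\lambda \\ge 0} \\max_{\\theta} \\; \\mathcal{J}_R(\\theta) - \\lambda \\, \\mathcal{J}_C(\\theta)$, hence the `-lag` suffix.\n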
Both $R (\\cdot)$ and $C (\\cdot)$ are neural networks, known as human proxies, trained on human preferences.\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Fpreference-learning.svg\" width=\"90%\"\u002F>\n\u003C\u002Fdiv>\n\nThe ultimate goal is to find a model $\\pi_{\\theta}$ that is both helpful (high reward) and harmless (low cost).\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_7fe2d4c33dbd.png\" width=\"90%\"\u002F>\n\u003C\u002Fdiv>\n\n## Comparison with Other RLHF Libraries\n\nCompared with other frameworks supporting RLHF, `safe-rlhf` is the first framework to support all stages from SFT through RLHF to evaluation.\nIn addition, `safe-rlhf` is the first framework that takes safety preferences into consideration during the RLHF stage.\nIt provides a stronger theoretical guarantee for constrained parameter search in the policy space.\n\n| | SFT | Preference Model\u003Csup>[1](#perference-model)\u003C\u002Fsup> Training | RLHF | Safe RLHF | PTX Loss | Evaluation | Backend |\n| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\n| [Beaver](https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf)\u003C\u002Fbr>(Safe-RLHF) | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | DeepSpeed |\n| [trlX](https:\u002F\u002Fgithub.com\u002FCarperAI\u002Ftrlx) | ✔️ | ❌\u003Csup>[2](#trlx-rm-example)\u003C\u002Fsup> | ✔️ | ❌ | ❌ | ❌ | Accelerate \u002F NeMo |\n| [DeepSpeed-Chat](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeedExamples\u002Ftree\u002FHEAD\u002Fapplications\u002FDeepSpeed-Chat) | ✔️ | ✔️ | ✔️ | ❌ | ✔️ | ❌ | DeepSpeed |\n| [Colossal-AI](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI) | ✔️ | ✔️ | ✔️ | ❌ | ✔️ | ❌ | ColossalAI |\n| [AlpacaFarm](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm) | ❌\u003Csup>[3](#alpaca-sft)\u003C\u002Fsup> | ✔️ | ✔️ | ❌ | ❌ | ✔️ | Accelerate |\n\n\u003Csup>\n  \u003Ca name=\"perference-model\">1.\u003C\u002Fa> In the context of RLHF, the \"Preference Model\" is the \"Reward Model\"; in Safe RLHF, the \"Preference Model\" refers to both the \"Reward Model\" and the \"Cost Model\".\n\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\n  \u003Ca name=\"trlx-rm-example\">2.\u003C\u002Fa> There is an example of reward model training in the examples directory of the trlX repository. However, it is not officially supported and is not integrated into the trlX library.\n\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\n  \u003Ca name=\"alpaca-sft\">3.\u003C\u002Fa> The supervised fine-tuning support for Alpaca is provided in the \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\">tatsu-lab\u002Fstanford_alpaca\u003C\u002Fa> repository.\n\u003C\u002Fsup>\n\n## PKU-SafeRLHF-Dataset\n\nThe `PKU-SafeRLHF` dataset is a human-labeled dataset containing both performance and safety preferences.\nIt includes constraints in over ten dimensions, such as insults, immorality, crime, emotional harm, and privacy, among others.\nThese constraints are designed for fine-grained value alignment in RLHF technology.\n\nTo facilitate multi-round fine-tuning, we will release the initial parameter weights, required datasets, and training parameters for each round.\nThis ensures reproducibility in scientific and academic research.\nThe dataset will be released gradually through rolling updates.\n\nThe dataset is available on Hugging Face: [PKU-Alignment\u002FPKU-SafeRLHF](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF).\n\n### PKU-SafeRLHF-10K\n\n`PKU-SafeRLHF-10K` is a subset of `PKU-SafeRLHF` that contains the first round of Safe RLHF training data with 10K instances, including safety preferences.\nYou can find it on Hugging Face: [PKU-Alignment\u002FPKU-SafeRLHF-10K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF-10K).\n\n### PKU-SafeRLHF-1M\n\nWe will gradually release the full Safe-RLHF datasets, which include **1M _human-labeled_ pairs** for both helpful and harmless preferences.\n\n## Why \"Beaver\"\n\nBeaver is a large language model based on [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai), trained using `safe-rlhf`.\nIt is developed upon the foundation of the [Alpaca](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html) model by collecting human preference data related to helpfulness and harmlessness and employing the Safe RLHF technique for training.\nWhile maintaining the helpful performance of Alpaca, Beaver significantly improves its harmlessness.\n\n> Beavers are known as the \"natural dam engineers\" as they are adept at using branches, shrubs, rocks, and soil to build dams and small wooden houses, creating wetland environments suitable for other creatures to inhabit, making them an indispensable part of the ecosystem. To ensure the safety and reliability of Large Language Models (LLMs) while accommodating a wide range of values across different populations, the Peking University team has named their open-source model \"Beaver\" and aims to build a dam for LLMs through the Constrained Value Alignment (CVA) technology. This technology enables fine-grained labeling of information and, combined with secure reinforcement learning methods, significantly reduces model bias and discrimination, thereby enhancing the model's safety. Analogous to the role of beavers in the ecosystem, the Beaver model will provide crucial support for the development of large language models and make positive contributions to the sustainable development of artificial intelligence technology.\n\n## Beaver vs. Alpaca\n\nFollowing the evaluation methodology of the [Vicuna](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna) model, we utilized GPT-4 to evaluate Beaver.
The results indicate that, compared to Alpaca, Beaver exhibits significant improvements in multiple dimensions related to safety.\n\n![Arena-Demo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a92a2e8ec8b6.gif)\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_e880806ce697.png\" width=\"60%\"\u002F>\n\u003C\u002Fdiv>\n\nSignificant distribution shift for safety preferences after utilizing the Safe RLHF pipeline on the Alpaca-7B model.\n\n\u003Ctable width=\"100%\" cellspacing=\"0\" cellpadding=\"0\">\n  \u003Ctr align=\"center\" valign=\"middle\">\n    \u003Ctd width=\"50%\">\n      \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_4d076cac560c.png\" width=\"100%\"\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd width=\"50%\">\n      \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_6fc8c5148e0c.png\" width=\"100%\"\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## Installation\n\nClone the source code from GitHub:\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf.git\ncd safe-rlhf\n```\n\n**Native Runner:** Set up a conda environment using [`conda`](https:\u002F\u002Fgithub.com\u002Fconda\u002Fconda) \u002F [`mamba`](https:\u002F\u002Fgithub.com\u002Fmamba-org\u002Fmamba):\n\n```bash\nconda env create --file conda-recipe.yaml  # or `mamba env create --file conda-recipe.yaml`\n```\n\nThis will automatically set up all dependencies.\n\n**Containerized Runner:** As an alternative to running on the native machine with conda isolation, you can also use Docker images to configure the environment.\n\nFirst, please follow the [NVIDIA Container Toolkit: Installation Guide](https:\u002F\u002Fdocs.nvidia.com\u002Fdatacenter\u002Fcloud-native\u002Fcontainer-toolkit\u002Finstall-guide.html) and [NVIDIA Docker: Installation Guide](https:\u002F\u002Fdocs.nvidia.com\u002Fdatacenter\u002Fcloud-native\u002Fcontainer-toolkit\u002Finstall-guide.html#docker) to set up `nvidia-docker`.\nThen you can run:\n\n```bash\nmake docker-run\n```\n\nThis command will build and start a Docker container with the proper dependencies installed.\nThe host path `\u002F` will be mapped to `\u002Fhost` and the current working directory will be mapped to `\u002Fworkspace` inside the container.\n\n## Training\n\n`safe-rlhf` supports a complete pipeline from Supervised Fine-Tuning (SFT) to preference model training to RLHF alignment training.\n\n0. Follow the instructions in section [Installation](#installation) to set up the training environment properly.\n\n```bash\nconda activate safe-rlhf\nexport WANDB_API_KEY=\"...\"  # your W&B API key here\n```\n\nor\n\n```bash\nmake docker-run\nexport WANDB_API_KEY=\"...\"  # your W&B API key here\n```\n\n1. Supervised Fine-Tuning (SFT)\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --model_name_or_path \u003Cyour-model-name-or-checkpoint-path> \\\n    --output_dir output\u002Fsft\n```\n\nNOTE: You may need to update some of the parameters in the script according to your machine setup, such as the number of GPUs for training, the training batch size, etc.\n\n
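Before training the value models in step 2 below, it may help to see the objective they fit. Here is a minimal, self-contained sketch of the Bradley-Terry-style pairwise preference loss that reward and cost models typically optimize; it is illustrative only (`preference_loss` is not a function from this repository, and the actual implementation differs in detail):\n\n```python\nimport torch\nimport torch.nn.functional as F\n\n\ndef preference_loss(scores_better: torch.Tensor, scores_worse: torch.Tensor) -> torch.Tensor:\n    # Bradley-Terry negative log-likelihood: push the scalar score of the\n    # preferred answer above the score of the rejected answer.\n    return -F.logsigmoid(scores_better - scores_worse).mean()\n\n\n# Toy usage: in practice the scores come from a scalar head on top of the SFT model.\nscores_better = torch.randn(4, requires_grad=True)\nscores_worse = torch.randn(4)\nloss = preference_loss(scores_better, scores_worse)\nloss.backward()\n```\n\nThe cost model is fit analogously on the `safer` comparisons, with the `is_safe` \u002F `is_other_safe` labels (see the `RawSample` fields in [Custom Datasets](#custom-datasets)) additionally anchoring the sign of the predicted cost.\n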
2. Value Models (reward model & cost model)\n\n```bash\nbash scripts\u002Freward-model.sh \\\n    --model_name_or_path output\u002Fsft \\\n    --output_dir output\u002Frm\n```\n\n```bash\nbash scripts\u002Fcost-model.sh \\\n    --model_name_or_path output\u002Fsft \\\n    --output_dir output\u002Fcm\n```\n\n3. RLHF (Optional)\n\n```bash\nbash scripts\u002Fppo.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --output_dir output\u002Fppo\n```\n\n4. Safe-RLHF\n\n```bash\nbash scripts\u002Fppo-lag.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Fppo-lag\n```\n\nAn example of commands to run the whole pipeline with [LLaMA-7B](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai):\n\n```bash\nconda activate safe-rlhf\nbash scripts\u002Fsft.sh --model_name_or_path ~\u002Fmodels\u002Fllama-7b --output_dir output\u002Fsft\nbash scripts\u002Freward-model.sh --model_name_or_path output\u002Fsft --output_dir output\u002Frm\nbash scripts\u002Fcost-model.sh --model_name_or_path output\u002Fsft --output_dir output\u002Fcm\nbash scripts\u002Fppo-lag.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Fppo-lag\n```\n\n#### Computational Requirements  \u003C!-- omit in toc -->\n\nAll training processes listed above were tested with [LLaMA-7B](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai) on a cloud server with 8 x NVIDIA A800-80GB GPUs.\n\nUsers who do not have enough GPU memory can enable [DeepSpeed ZeRO-Offload](https:\u002F\u002Fwww.deepspeed.ai\u002Ftutorials\u002Fzero-offload) to alleviate the peak GPU memory usage.\n\nAll training scripts accept an extra `--offload` option (defaults to `none`, i.e., ZeRO-Offload disabled) to offload the tensors (parameters and\u002For optimizer states) to CPU. For example:\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --model_name_or_path ~\u002Fmodels\u002Fllama-7b \\\n    --output_dir output\u002Fsft \\\n    --offload all  # or `parameter` or `optimizer`\n```\n\nFor multi-node settings, users can refer to the [DeepSpeed: Resource Configuration (multi-node)](https:\u002F\u002Fwww.deepspeed.ai\u002Fgetting-started\u002F#resource-configuration-multi-node) documentation for more details.
Here is an example to start the training process on 4 nodes (each has 8 GPUs):\n\n```text\n# myhostfile\nworker-1 slots=8\nworker-2 slots=8\nworker-3 slots=8\nworker-4 slots=8\n```\n\nThen launch the training scripts with:\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --hostfile myhostfile \\\n    --model_name_or_path ~\u002Fmodels\u002Fllama-7b \\\n    --output_dir output\u002Fsft\n```\n\n## Custom Datasets\n\n`safe-rlhf` provides an abstraction to create datasets for all of the Supervised Fine-Tuning, preference model training, and RL training stages.\n\n```python\nclass RawSample(TypedDict, total=False):\n    \"\"\"Raw sample type.\n\n    For SupervisedDataset, should provide (input, answer) or (dialogue).\n    For PreferenceDataset, should provide (input, answer, other_answer, better).\n    For SafetyPreferenceDataset, should provide (input, answer, other_answer, safer, is_safe, is_other_safe).\n    For PromptOnlyDataset, should provide (input).\n    \"\"\"\n\n    # Texts\n    input: NotRequired[str]  # either `input` or `dialogue` should be provided\n    \"\"\"User input text.\"\"\"\n    answer: NotRequired[str]\n    \"\"\"Assistant answer text.\"\"\"\n    other_answer: NotRequired[str]\n    \"\"\"Other assistant answer text via resampling.\"\"\"\n    dialogue: NotRequired[list[str]]  # either `input` or `dialogue` should be provided\n    \"\"\"Dialogue history.\"\"\"\n\n    # Flags\n    better: NotRequired[bool]\n    \"\"\"Whether ``answer`` is better than ``other_answer``.\"\"\"\n    safer: NotRequired[bool]\n    \"\"\"Whether ``answer`` is safer than ``other_answer``.\"\"\"\n    is_safe: NotRequired[bool]\n    \"\"\"Whether ``answer`` is safe.\"\"\"\n    is_other_safe: NotRequired[bool]\n    \"\"\"Whether ``other_answer`` is safe.\"\"\"\n```\n\nHere is an example to implement a custom dataset (see [safe_rlhf\u002Fdatasets\u002Fraw](safe_rlhf\u002Fdatasets\u002Fraw) for more examples):\n\n```python\nimport argparse\nfrom datasets import load_dataset\nfrom safe_rlhf.datasets import RawDataset, RawSample, parse_dataset\n\n\nclass MyRawDataset(RawDataset):\n    NAME = 'my-dataset-name'\n\n    def __init__(self, path=None) -> None:\n        # Load a dataset from Hugging Face\n        self.data = load_dataset(path or 'my-organization\u002Fmy-dataset')['train']\n\n    def __getitem__(self, index: int) -> RawSample:\n        data = self.data[index]\n        # Construct a `RawSample` dictionary from your custom dataset item\n        return RawSample(\n            input=data['col1'],\n            answer=data['col2'],\n            other_answer=data['col3'],\n            better=float(data['col4']) > float(data['col5']),\n            ...\n        )\n\n    def __len__(self) -> int:\n        return len(self.data)  # dataset size\n\n\ndef parse_arguments():\n    parser = argparse.ArgumentParser(...)\n    parser.add_argument(\n        '--datasets',\n        type=parse_dataset,\n        nargs='+',\n        metavar='DATASET[:PROPORTION[:PATH]]',\n    )\n    ...\n    return parser.parse_args()\n\n\ndef main():\n    args = parse_arguments()\n    ...\n\n\nif __name__ == '__main__':\n    main()\n```\n\nThen you can pass this dataset to the training scripts as:\n\n```bash\npython3 train.py --datasets my-dataset-name\n```\n\nYou may also pass multiple datasets with optionally additional dataset proportions (separated by a colon `:`). 
For example:\n\n```bash\npython3 train.py --datasets alpaca:0.75 my-dataset-name:0.5\n```\n\nThis will randomly sample 75% of the Stanford Alpaca dataset and 50% of your custom dataset.\n\nThe dataset argument can also be followed by a local path (separated by a colon `:`) if you have already cloned the dataset repository from Hugging Face.\n\n```bash\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmy-organization\u002Fmy-dataset ~\u002Fpath\u002Fto\u002Fmy-dataset\u002Frepository\npython3 train.py --datasets alpaca:0.75 my-dataset-name:0.5:~\u002Fpath\u002Fto\u002Fmy-dataset\u002Frepository\n```\n\nNOTE: The dataset class must be imported before the training script begins to parse the command line arguments.\n\n## Inference\n\n### Interactive CLI Demo\n\n```bash\npython3 -m safe_rlhf.serve.cli --model_name_or_path output\u002Fsft  # or output\u002Fppo-lag\n```\n\n### Interactive Arena\n\n```bash\npython3 -m safe_rlhf.serve.arena --red_corner_model_name_or_path output\u002Fsft --blue_corner_model_name_or_path output\u002Fppo-lag\n```\n\n![Arena-Demo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a92a2e8ec8b6.gif)\n\n## Chinese Support (中文支持)\n\nThe Safe-RLHF pipeline supports not only the LLaMA model family but also other pre-trained models, such as [Baichuan](https:\u002F\u002Fhuggingface.co\u002Fbaichuan-inc\u002FBaichuan-7B) and [InternLM](https:\u002F\u002Fhuggingface.co\u002Finternlm\u002Finternlm-7b), that offer better support for Chinese. You just need to update the path to the pre-trained model in the training and inference code.\n\n> Safe-RLHF 管道不仅仅支持 LLaMA 系列模型，它也支持其他一些对中文支持更好的预训练模型，例如 [Baichuan](https:\u002F\u002Fhuggingface.co\u002Fbaichuan-inc\u002FBaichuan-7B) 和 [InternLM](https:\u002F\u002Fhuggingface.co\u002Finternlm\u002Finternlm-7b) 等。你只需要在训练和推理的代码中更新预训练模型的路径即可。\n\n```bash\n# SFT training\nbash scripts\u002Fsft.sh --model_name_or_path baichuan-inc\u002FBaichuan-7B --output_dir output\u002Fbaichuan-sft\n\n# Inference\npython3 -m safe_rlhf.serve.cli --model_name_or_path output\u002Fbaichuan-sft\n```\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a0fe7cd146fa.png\" width=\"100%\"\u002F>\n\u003C\u002Fdiv>\n\nIn the meantime, we've added support for Chinese datasets such as [Firefly](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M) and the [MOSS series](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ffnlp\u002Fmoss-003-sft-data) to our [raw-datasets](safe_rlhf\u002Fdatasets\u002Fraw).
You only need to change the dataset path in the training code to use the corresponding dataset for fine-tuning the Chinese pre-trained model:\n\n> 同时，我们也在 [raw-datasets](safe_rlhf\u002Fdatasets\u002Fraw) 中增加了对一些中文数据集的支持，例如 [Firefly](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M) 和 [MOSS 系列](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ffnlp\u002Fmoss-003-sft-data) 等。在训练代码中更改数据集路径，你就可以使用相应的数据集来微调中文预训练模型：\n\n```diff\n# scripts\u002Fsft.sh\n-\t--train_datasets alpaca \\\n+\t--train_datasets firefly \\\n```\n\nFor instructions on how to add custom datasets, please refer to section [Custom Datasets](#custom-datasets).\n\n> 关于如何添加自定义数据集的方法，请参阅章节 [Custom Datasets (自定义数据集)](#custom-datasets)。\n\n## Benchmark and Evaluation\n\n### Arena via Reward and Cost Models\n\n```bash\nscripts\u002Farena-evaluation.sh \\\n    --red_corner_model_name_or_path output\u002Fsft \\\n    --blue_corner_model_name_or_path output\u002Fppo-lag \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Farena-evaluation\n```\n\n### BIG-bench\n\n```bash\n# Install BIG-bench\ngit clone https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench.git\n(\n    cd BIG-bench\n    python3 setup.py sdist\n    python3 -m pip install -e .\n)\n\n# BIG-bench evaluation\npython3 -m safe_rlhf.evaluate.bigbench \\\n    --model_name_or_path output\u002Fppo-lag \\\n    --task_name \u003CBIG-bench-task-name>\n```\n\n### GPT-4 Evaluation\n\n```bash\n# Install OpenAI Python API\npip3 install openai\nexport OPENAI_API_KEY=\"...\"  # your OpenAI API key here\n\n# GPT-4 evaluation\npython3 -m safe_rlhf.evaluate.gpt4 \\\n    --red_corner_model_name_or_path output\u002Fsft \\\n    --blue_corner_model_name_or_path output\u002Fppo-lag\n```\n\n## Future Plans\n\n- [X] Beaver-7B checkpoint is released on Hugging Face.\n- [X] Release Safe RLHF paper preprint.\n- [ ] We will gradually release the full Safe-RLHF datasets.\n- [ ] Train larger LLMs with Safe-RLHF.\n- [ ] Support memory-efficient training, such as LoRA, PEFT, etc.\n\n## Citation\n\nIf you find Safe-RLHF useful or use Safe-RLHF (model, code, dataset, etc.) in your research, please consider citing the following work in your publications.\n\n```bibtex\n@inproceedings{safe-rlhf,\n  title={Safe RLHF: Safe Reinforcement Learning from Human Feedback},\n  author={Josef Dai and Xuehai Pan and Ruiyang Sun and Jiaming Ji and Xinbo Xu and Mickel Liu and Yizhou Wang and Yaodong Yang},\n  booktitle={The Twelfth International Conference on Learning Representations},\n  year={2024},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=TyFrPOKYXw}\n}\n@inproceedings{beavertails,\n  title={BeaverTails: Towards Improved Safety Alignment of {LLM} via a Human-Preference Dataset},\n  author={Jiaming Ji and Mickel Liu and Juntao Dai and Xuehai Pan and Chi Zhang and Ce Bian and Boyuan Chen and Ruiyang Sun and Yizhou Wang and Yaodong Yang},\n  booktitle={Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year={2023},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=g0QovXbFw3}\n}\n```\n\n## PKU-Alignment Team\n\nAll students below contributed equally, and the order is determined alphabetically:\n\n- [Juntao Dai](https:\u002F\u002Fgithub.com\u002Fcalico-1226)\n- [Jiaming Ji](https:\u002F\u002Fgithub.com\u002Fzmsn-2077)\n- [Xuehai Pan](https:\u002F\u002Fgithub.com\u002FXuehaiPan)\n- [Ruiyang Sun](https:\u002F\u002Fgithub.com\u002Frockmagma02)\n\nAll are advised by [Yizhou Wang](https:\u002F\u002Fcfcs.pku.edu.cn\u002Fenglish\u002Fpeople\u002Ffaculty\u002Fyizhouwang\u002Findex.htm) and [Yaodong Yang](https:\u002F\u002Fwww.yangyaodong.com\u002F).\nAcknowledgment: We appreciate [Ms. Yi Qu](https:\u002F\u002Fwww.xiaohongshu.com\u002Fuser\u002Fprofile\u002F58ee23c96a6a695050dcf276) for designing the Beaver logo.\n\n### Acknowledgment  \u003C!-- omit in toc -->\n\nThis repository benefits from [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai), [Stanford Alpaca](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca), [DeepSpeed](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed), and [DeepSpeed-Chat](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeedExamples\u002Ftree\u002FHEAD\u002Fapplications\u002FDeepSpeed-Chat).\nThanks for their wonderful work and their efforts in democratizing LLM research.\nSafe-RLHF and its related assets are built and open-sourced with love 🤗❤️.\n\nThis work is supported and funded by Peking University.\n\n\u003Ctable width=\"100%\" cellspacing=\"0\" cellpadding=\"0\">\n  \u003Ctr align=\"center\" valign=\"middle\">\n    \u003Ctd width=\"40%\">\n      \u003Ca href=\"https:\u002F\u002Fwww.ai.pku.edu.cn\u002F\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_5a028ebad8db.png\" width=\"100%\"\u002F>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n    \u003Ctd width=\"60%\">\n      \u003Ca href=\"https:\u002F\u002Fcfcs.pku.edu.cn\u002F\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_36cf835427b2.png\" width=\"100%\"\u002F>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## License\n\nSafe-RLHF is released under Apache License 2.0.\n","\u003C!-- markdownlint-disable first-line-h1 -->\n\u003C!-- markdownlint-disable html -->\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002FPKU-Beaver-logo-wide.svg\" width=\"80%\"\u002F>\n\u003C\u002Fdiv>\n\n\u003Ch1 
align=\"center\">基于安全RLHF的约束型价值对齐大模型\u003C\u002Fh1>\n\nBeaver是由北京大学PKU-Alignment团队开发的一个高度模块化的开源RLHF框架。其目标是为对齐研究，尤其是通过安全RLHF方法进行的约束型对齐大模型研究，提供训练数据和可复现的代码流水线。\n\nBeaver的主要特点包括：\n\n- 支持主流预训练模型的**SFT**、**RLHF**和**安全RLHF**训练：[LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai)、[OPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068)、[百川](https:\u002F\u002Fhuggingface.co\u002Fbaichuan-inc\u002FBaichuan-7B)等。\n- 提供大规模人工标注数据集**（多达100万条）**，包含有益和无害偏好，以支持可复现的RLHF研究。\n- 支持奖励模型和成本模型的训练，并提供预训练检查点。\n- 支持针对SFT和RLHF的自定义参数和数据集。\n- 提供多尺度的安全约束验证指标，例如BIG-bench、GPT-4评估等。\n\n## **🦫 最新动态？**  \u003C!-- omit in toc -->\n\n- **🎉 `2024\u002F06\u002F13`:** 我们很高兴地宣布开源我们的PKU-SafeRLHF数据集1.0版本。该版本在初始测试版的基础上进行了改进，加入了人机联合标注，扩展了危害类别范围，并引入了详细的严重程度标签。更多详情及访问方式，请前往Hugging Face上的数据集页面：[PKU-Alignment\u002FPKU-SafeRLHF](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF)。\n- **🎉 `2024\u002F01\u002F16`:** 我们的**安全RLHF**方法已被ICLR 2024 Spotlight接收。\n- **📄 `2023\u002F10\u002F19`:** 我们已在arXiv上发布了**安全RLHF论文**，详细介绍了我们的新型安全对齐算法及其实现。\n- **🚀 `2023\u002F07\u002F10`:** 我们非常高兴地宣布开源**Beaver-7B** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0)模型，作为安全RLHF训练系列的第一个里程碑，并配套提供了相应的**奖励模型** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0-reward) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0-reward) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0-reward) \u002F [统一版](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-unified-reward)以及**成本模型** [v1](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v1.0-cost) \u002F [v2](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v2.0-cost) \u002F [v3](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-v3.0-cost) \u002F [统一版](https:\u002F\u002Fhuggingface.co\u002FPKU-Alignment\u002Fbeaver-7b-unified-cost)检查点，均已发布在Hugging Face平台上。\n- **🔥 `2023\u002F07\u002F10`:** 我们扩展了开源的安全偏好数据集，即[**PKU-Alignment\u002FPKU-SafeRLHF**](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF)，目前包含超过30万条样本。（详见[PKEU-SafeRLHF数据集](#pku-saferlhf-dataset)部分）\n- **⚙ `2023\u002F07\u002F05`:** 我们增强了对中国预训练模型的支持，并整合了更多开源中文数据集。（详见[中文支持](#chinese-support-中文支持)和[自定义数据集](#custom-datasets)部分）\n- **⭐️ `2023\u002F05\u002F15`:** 首次发布安全RLHF流水线、评估结果及训练代码。\n\n### 目录  \u003C!-- omit in toc -->\n\n- [基于安全RLHF的约束型价值对齐](#constrained-value-alignment-via-safe-rlhf)\n- [与其他RLHF库的比较](#comparison-with-other-rlhf-libraries)\n- [PKU-SafeRLHF数据集](#pku-saferlhf-dataset)\n  - [PKU-SafeRLHF-10K](#pku-saferlhf-10k)\n  - [PKU-SafeRLHF-1M](#pku-saferlhf-1m)\n- [为什么叫“Beaver”？](#why-beaver)\n- [Beaver与Alpaca对比](#beaver-vs-alpaca)\n- [安装](#installation)\n- [训练](#training)\n- [自定义数据集](#custom-datasets)\n- [推理](#inference)\n  - [交互式CLI演示](#interactive-cli-demo)\n  - [交互式竞技场](#interactive-arena)\n- [中文支持](#chinese-support-中文支持)\n- [基准测试与评估](#benchmark-and-evaluation)\n  - [基于奖励和成本模型的竞技场](#arena-via-reward-and-cost-models)\n  - [BIG-bench](#big-bench)\n  - [GPT-4评估](#gpt-4-evaluation)\n- [未来计划](#future-plans)\n- [引用](#citation)\n- [PKU-Alignment团队](#pku-alignment-team)\n- [许可证](#license)\n\n## 基于安全RLHF的约束型价值对齐\n\n从人类反馈中学习强化学习：通过偏好学习实现奖励最大化\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Frl-formulation.svg\" 
width=\"40%\"\u002F>\n\u003C\u002Fdiv>\n\n从人类反馈中学习安全强化学习：通过偏好学习实现受约束的奖励最大化\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Fsafe-rl-formulation.svg\" width=\"55%\"\u002F>\n\u003C\u002Fdiv>\n\n其中$R (\\cdot)$和$C (\\cdot)$分别为奖励函数和成本函数。它们是基于人类偏好训练的人类代理神经网络。\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"images\u002Fpreference-learning.svg\" width=\"90%\"\u002F>\n\u003C\u002Fdiv>\n\n最终目标是找到一个既有益（高奖励）又无害（低成本）的模型$\\pi_{\\theta}$。\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_7fe2d4c33dbd.png\" width=\"90%\"\u002F>\n\u003C\u002Fdiv>\n\n## 与其他 RLHF 库的比较\n\n与支持 RLHF 的其他框架相比，`safe-rlhf` 是首个支持从 SFT 到 RLHF 再到评估全流程的框架。此外，`safe-rlhf` 还是第一个在 RLHF 阶段就将安全偏好纳入考量的框架，它在策略空间中的约束参数搜索方面具有更强的理论保证。\n\n|                                                                                                        |             SFT              | 偏好模型\u003Csup>[1](#perference-model)\u003C\u002Fsup>训练 | RLHF  | 安全 RLHF | PTX 损失 | 评估 |      后端      |\n| :----------------------------------------------------------------------------------------------------: | :--------------------------: | :--------------------------------------------------------: | :---: | :-------: | :------: | :--------: | :---------------: |\n|                  [Beaver](https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf)\u003C\u002Fbr>(Safe-RLHF)                  |              ✔️               |                             ✔️                              |   ✔️   |     ✔️     |    ✔️     |     ✔️      |     DeepSpeed     |\n|                                [trlX](https:\u002F\u002Fgithub.com\u002FCarperAI\u002Ftrlx)                                |              ✔️               |             ❌\u003Csup>[2](#trlx-rm-example)\u003C\u002Fsup>              |   ✔️   |     ❌     |    ❌     |     ❌      | Accelerate \u002F NeMo |\n| [DeepSpeed-Chat](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeedExamples\u002Ftree\u002FHEAD\u002Fapplications\u002FDeepSpeed-Chat) |              ✔️               |                             ✔️                              |   ✔️   |     ❌     |    ✔️     |     ❌      |     DeepSpeed     |\n|                         [Colossal-AI](https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FColossalAI)                         |              ✔️               |                             ✔️                              |   ✔️   |     ❌     |    ✔️     |     ❌      |    ColossalAI     |\n|                         [AlpacaFarm](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm)                         | ❌\u003Csup>[3](#alpaca-sft)\u003C\u002Fsup> |                             ✔️                              |   ✔️   |     ❌     |    ❌     |     ✔️      |    Accelerate     |\n\n\u003Csup>\n  \u003Ca name=\"perference-model\">1.\u003C\u002Fa> 在 RLHF 的语境中，“偏好模型”即为“奖励模型”。而在 Safe RLHF 中，“偏好模型”则同时指代“奖励模型”和“成本模型”。\n\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\n  \u003Ca name=\"trlx-rm-example\">2.\u003C\u002Fa> trlX 仓库的 examples 目录中确实有一个用于奖励模型训练的示例，但该示例并未得到官方支持，也未集成到 trlX 库中。\n\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\n  \u003Ca name=\"alpaca-sft\">3.\u003C\u002Fa> Alpaca 的监督微调支持是在 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca\">tatsu-lab\u002Fstanford_alpaca\u003C\u002Fa> 仓库中提供的。\n\u003C\u002Fsup>\n\n## PKU-SafeRLHF 数据集\n\n`PKU-SafeRLHF` 数据集是一个由人类标注的、同时包含性能偏好和安全偏好的数据集。它涵盖了十多个维度的约束条件，例如侮辱性内容、不道德行为、犯罪、情感伤害以及隐私等，旨在实现 RLHF 
技术中的细粒度价值观对齐。\n\n为了便于多轮微调，我们将逐步发布每一轮所需的初始参数权重、所需数据集以及训练参数，以确保科学和学术研究的可重复性。该数据集将通过滚动更新的方式逐步发布。\n\n数据集已在 Hugging Face 上开放：[PKU-Alignment\u002FPKU-SafeRLHF](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF)。\n\n### PKU-SafeRLHF-10K\n\n`PKU-SafeRLHF-10K` 是 `PKU-SafeRLHF` 的一个子集，包含了第一轮安全 RLHF 训练的 1 万条数据，其中包括安全偏好信息。你可以在 Hugging Face 上找到它：[PKU-Alignment\u002FPKU-SafeRLHF-10K](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FPKU-Alignment\u002FPKU-SafeRLHF-10K)。\n\n### PKU-SafeRLHF-1M\n\n我们将逐步发布完整的安全 RLHF 数据集，其中包含 **100 万对由人类标注的** 有益性和无害性偏好数据。\n\n## 为什么叫“Beaver”\n\nBeaver 是一款基于 [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai) 的大型语言模型，使用 `safe-rlhf` 技术进行训练。它建立在 [Alpaca](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F03\u002F13\u002Falpaca.html) 模型的基础上，通过收集与有益性和无害性相关的用户偏好数据，并采用安全 RLHF 技术进行训练。在保持 Alpaca 有益性能的同时，Beaver 显著提升了其无害性。\n\n> 海狸素有“天然筑坝工程师”的美誉，它们善于利用树枝、灌木、岩石和泥土建造水坝及小型木屋，从而创造出适合其他生物栖息的湿地环境，成为生态系统中不可或缺的一部分。为了确保大型语言模型（LLMs）的安全可靠，同时满足不同人群的多样化价值取向，北京大学团队将其开源模型命名为“Beaver”，并计划通过约束性价值观对齐（CVA）技术为 LLMs 筑起一道“大坝”。这项技术能够对信息进行细粒度标注，并结合安全强化学习方法，显著降低模型的偏见和歧视，从而提升模型的安全性。正如海狸在生态系统中所扮演的角色一样，Beaver 模型将为大型语言模型的发展提供重要支持，并为人工智能技术的可持续发展作出积极贡献。\n\n## Beaver 与 Alpaca 对比\n\n参照 [Vicuna](https:\u002F\u002Flmsys.org\u002Fblog\u002F2023-03-30-vicuna) 模型的评估方法，我们使用 GPT-4 对 Beaver 进行了评估。结果显示，与 Alpaca 相比，Beaver 在多项安全相关指标上均有显著提升。\n\n![Arena-Demo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a92a2e8ec8b6.gif)\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_e880806ce697.png\" width=\"60%\"\u002F>\n\u003C\u002Fdiv>\n\n在对 Alpaca-7B 模型应用安全 RLHF 流程后，安全偏好分布发生了显著变化。\n\n\u003Ctable width=\"100%\" cellspacing=\"0\" cellpadding=\"0\">\n  \u003Ctr align=\"center\" valign=\"middle\">\n    \u003Ctd width=\"50%\">\n      \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_4d076cac560c.png\" width=\"100%\"\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd width=\"50%\">\n      \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_6fc8c5148e0c.png\" width=\"100%\"\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## 安装\n\n从 GitHub 克隆源代码：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf.git\ncd safe-rlhf\n```\n\n**原生运行环境：** 使用 [`conda`](https:\u002F\u002Fgithub.com\u002Fconda\u002Fconda) 或 [`mamba`](https:\u002F\u002Fgithub.com\u002Fmamba-org\u002Fmamba) 设置 Conda 环境：\n\n```bash\nconda env create --file conda-recipe.yaml  # 或 `mamba env create --file conda-recipe.yaml`\n```\n\n这将自动安装所有依赖项。\n\n**容器化运行环境：** 除了使用带有 Conda 隔离的本地机器外，您也可以选择使用 Docker 镜像来配置环境。\n\n首先，请按照 [NVIDIA Container Toolkit：安装指南](https:\u002F\u002Fdocs.nvidia.com\u002Fdatacenter\u002Fcloud-native\u002Fcontainer-toolkit\u002Finstall-guide.html) 和 [NVIDIA Docker：安装指南](https:\u002F\u002Fdocs.nvidia.com\u002Fdatacenter\u002Fcloud-native\u002Fcontainer-toolkit\u002Finstall-guide.html#docker) 设置 `nvidia-docker`。然后您可以运行：\n\n```bash\nmake docker-run\n```\n\n该命令将构建并启动一个安装了所需依赖项的 Docker 容器。宿主机的根目录 `\u002F` 将被映射到容器内的 `\u002Fhost`，而当前工作目录则会被映射到容器内的 `\u002Fworkspace`。\n\n## 训练\n\n`safe-rlhf` 支持从监督微调（SFT）到偏好模型训练，再到 RLHF 对齐训练的完整流程。\n\n0. 
按照[安装](#installation)部分的说明正确设置训练环境。\n\n```bash\nconda activate safe-rlhf\nexport WANDB_API_KEY=\"...\"  # 在此处填写您的 W&B API 密钥\n```\n\n或者\n\n```bash\nmake docker-run\nexport WANDB_API_KEY=\"...\"  # 在此处填写您的 W&B API 密钥\n```\n\n1. 监督微调（SFT）\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --model_name_or_path \u003C您的模型名称或检查点路径> \\\n    --output_dir output\u002Fsft\n```\n\n注意：您可能需要根据自己的机器配置调整脚本中的一些参数，例如用于训练的 GPU 数量、训练批次大小等。\n\n2. 价值模型（奖励模型和成本模型）\n\n```bash\nbash scripts\u002Freward-model.sh \\\n    --model_name_or_path output\u002Fsft \\\n    --output_dir output\u002Frm\n```\n\n```bash\nbash scripts\u002Fcost-model.sh \\\n    --model_name_or_path output\u002Fsft \\\n    --output_dir output\u002Fcm\n```\n\n3. RLHF（可选）\n\n```bash\nbash scripts\u002Fppo.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --output_dir output\u002Fppo\n```\n\n4. 安全 RLHF\n\n```bash\nbash scripts\u002Fppo-lag.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Fppo-lag\n```\n\n以下是一个使用 [LLaMA-7B](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai) 运行整个流程的示例命令：\n\n```bash\nconda activate safe-rlhf\nbash scripts\u002Fsft.sh --model_name_or_path ~\u002Fmodels\u002Fllama-7b --output_dir output\u002Fsft\nbash scripts\u002Freward-model.sh --model_name_or_path output\u002Fsft --output_dir output\u002Frm\nbash scripts\u002Fcost-model.sh --model_name_or_path output\u002Fsft --output_dir output\u002Fcm\nbash scripts\u002Fppo-lag.sh \\\n    --actor_model_name_or_path output\u002Fsft \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Fppo-lag\n```\n\n#### 计算资源需求  \u003C!-- omit in toc -->\n\n上述所有训练过程均已在配备 8 块 NVIDIA A800-80GB GPU 的云服务器上，使用 [LLaMA-7B](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai) 进行测试。\n\n对于 GPU 显存不足的用户，可以启用 [DeepSpeed ZeRO-Offload](https:\u002F\u002Fwww.deepspeed.ai\u002Ftutorials\u002Fzero-offload) 来缓解峰值显存占用。\n\n所有训练脚本都支持通过额外的 `--offload` 参数（默认为 `none`，即禁用 ZeRO-Offload）将张量（参数和\u002F或优化器状态）卸载到 CPU 上。例如：\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --model_name_or_path ~\u002Fmodels\u002Fllama-7b \\\n    --output_dir output\u002Fsft \\\n    --offload all  # 或 `parameter` 或 `optimizer`\n```\n\n在多节点环境下，用户可以参考 [DeepSpeed：资源配置（多节点）](https:\u002F\u002Fwww.deepspeed.ai\u002Fgetting-started\u002F#resource-configuration-multi-node) 文档以获取更多详细信息。以下是一个在 4 个节点（每个节点有 8 个 GPU）上启动训练过程的示例：\n\n```text\n# myhostfile\nworker-1 slots=8\nworker-2 slots=8\nworker-3 slots=8\nworker-4 slots=8\n```\n\n然后使用以下命令启动训练脚本：\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --hostfile myhostfile \\\n    --model_name_or_path ~\u002Fmodels\u002Fllama-7b \\\n    --output_dir output\u002Fsft\n```\n\n## 自定义数据集\n\n`safe-rlhf` 提供了一个抽象层，用于为监督微调、偏好模型训练和强化学习训练的所有阶段创建数据集。\n\n```python\nclass RawSample(TypedDict, total=False):\n    \"\"\"原始样本类型。\n\n    对于监督数据集，应提供 (input, answer) 或 (dialogue)。\n    对于偏好数据集，应提供 (input, answer, other_answer, better)。\n    对于安全偏好数据集，应提供 (input, answer, other_answer, safer, is_safe, is_other_safe)。\n    对于仅提示数据集，应提供 (input)。\n    \"\"\"\n\n    # 文本\n    input: NotRequired[str]  # either `input` or `dialogue` should be provided\n    \"\"\"用户输入文本。\"\"\"\n    answer: NotRequired[str]\n    \"\"\"助手回答文本。\"\"\"\n    other_answer: NotRequired[str]\n    \"\"\"通过重采样得到的另一条助手回答文本。\"\"\"\n    dialogue: 
NotRequired[list[str]]  # either `input` or `dialogue` should be provided\n    \"\"\"对话历史。\"\"\"\n\n    # 标记\n    better: NotRequired[bool]\n    \"\"\"是否 ``answer`` 比 ``other_answer`` 更好。\"\"\"\n    safer: NotRequired[bool]\n    \"\"\"是否 ``answer`` 比 ``other_answer`` 更安全。\"\"\"\n    is_safe: NotRequired[bool]\n    \"\"\"是否 ``answer`` 是安全的。\"\"\"\n    is_other_safe: NotRequired[bool]\n    \"\"\"是否 ``other_answer`` 是安全的。\"\"\"\n```\n\n以下是一个实现自定义数据集的示例（更多示例请参见 [safe_rlhf\u002Fdatasets\u002Fraw](safe_rlhf\u002Fdatasets\u002Fraw)）：\n\n```python\nimport argparse\nfrom datasets import load_dataset\nfrom safe_rlhf.datasets import RawDataset, RawSample, parse_dataset\n\n\nclass MyRawDataset(RawDataset):\n    NAME = 'my-dataset-name'\n\n    def __init__(self, path=None) -> None:\n        # 从 Hugging Face 加载数据集\n        self.data = load_dataset(path or 'my-organization\u002Fmy-dataset')['train']\n\n    def __getitem__(self, index: int) -> RawSample:\n        data = self.data[index]\n        # 从自定义数据集条目中构造一个 `RawSample` 字典\n        return RawSample(\n            input=data['col1'],\n            answer=data['col2'],\n            other_answer=data['col3'],\n            better=float(data['col4']) > float(data['col5']),\n            ...\n        )\n\n    def __len__(self) -> int:\n        return len(self.data)  # 数据集大小\n\n\ndef parse_arguments():\n    parser = argparse.ArgumentParser(...)\n    parser.add_argument(\n        '--datasets',\n        type=parse_dataset,\n        nargs='+',\n        metavar='DATASET[:PROPORTION[:PATH]]',\n    )\n    ...\n    return parser.parse_args()\n\n\ndef main():\n    args = parse_arguments()\n    ...\n\n\nif __name__ == '__main__':\n    main()\n```\n\n然后你可以将这个数据集传递给训练脚本，如下所示：\n\n```bash\npython3 train.py --datasets my-dataset-name\n```\n\n你也可以传递多个数据集，并可选地指定每个数据集的比例（用冒号 `:` 分隔）。例如：\n\n```bash\npython3 train.py --datasets alpaca:0.75 my-dataset-name:0.5\n```\n\n这将会随机选取斯坦福 Alpaca 数据集的 75% 和你自定义数据集的 50% 进行使用。\n\n此外，如果已经从 Hugging Face 克隆了数据集仓库，数据集参数后面还可以跟本地路径（同样用冒号 `:` 分隔）。\n\n```bash\ngit lfs install\ngit clone https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmy-organization\u002Fmy-dataset ~\u002Fpath\u002Fto\u002Fmy-dataset\u002Frepository\npython3 train.py --datasets alpaca:0.75 my-dataset-name:0.5:~\u002Fpath\u002Fto\u002Fmy-dataset\u002Frepository\n```\n\n注意：在训练脚本开始解析命令行参数之前，必须先导入数据集类。\n\n## 推理\n\n### 交互式 CLI 演示\n\n```bash\npython3 -m safe_rlhf.serve.cli --model_name_or_path output\u002Fsft  # 或 output\u002Fppo-lag\n```\n\n### 交互式竞技场\n\n```bash\npython3 -m safe_rlhf.serve.arena --red_corner_model_name_or_path output\u002Fsft --blue_corner_model_name_or_path output\u002Fppo-lag\n```\n\n![Arena-Demo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a92a2e8ec8b6.gif)\n\n## 中文支持\n\nSafe-RLHF 流程不仅支持 LLaMA 系列模型，还支持其他一些对中文支持更好的预训练模型，例如 [Baichuan](https:\u002F\u002Fhuggingface.co\u002Fbaichuan-inc\u002FBaichuan-7B) 和 [InternLM](https:\u002F\u002Fhuggingface.co\u002Finternlm\u002Finternlm-7b) 等。你只需要在训练和推理代码中更新预训练模型的路径即可。\n\n```bash\n# SFT 训练\nbash scripts\u002Fsft.sh --model_name_or_path baichuan-inc\u002FBaichuan-7B --output_dir output\u002Fbaichuan-sft\n\n# 推理\npython3 -m safe_rlhf.serve.cli --model_name_or_path output\u002Fbaichuan-sft\n```\n\n\u003Cdiv align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_a0fe7cd146fa.png\" width=\"100%\"\u002F>\n\u003C\u002Fdiv>\n\n同时，我们也在 [raw-datasets](safe_rlhf\u002Fdatasets\u002Fraw) 中增加了支持一些中文数据集，例如 
[Firefly](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FYeungNLP\u002Ffirefly-train-1.1M) 和 [MOSS 系列](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ffnlp\u002Fmoss-003-sft-data)等。在训练代码中更改数据集路径，你就可以使用相应的数据集来微调中文预训练模型：\n\n```diff\n# scripts\u002Fsft.sh\n-\t--train_datasets alpaca \\\n+\t--train_datasets firefly \\\n```\n\n关于如何添加自定义数据集的方法，请参阅章节 [Custom Datasets (自定义数据集)](#custom-datasets)。\n\n## 基准测试与评估\n\n### 通过奖励和成本模型进行竞技场评估\n\n```bash\nscripts\u002Farena-evaluation.sh \\\n    --red_corner_model_name_or_path output\u002Fsft \\\n    --blue_corner_model_name_or_path output\u002Fppo-lag \\\n    --reward_model_name_or_path output\u002Frm \\\n    --cost_model_name_or_path output\u002Fcm \\\n    --output_dir output\u002Farena-evaluation\n```\n\n### BIG-bench\n\n```bash\n# 安装 BIG-bench\ngit clone https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench.git\n(\n    cd BIG-bench\n    python3 setup.py sdist\n    python3 -m pip install -e .\n)\n\n# BIG-bench 评估\npython3 -m safe_rlhf.evaluate.bigbench \\\n    --model_name_or_path output\u002Fppo-lag \\\n    --task_name \u003CBIG-bench-task-name>\n```\n\n### GPT-4 评估\n\n```bash\n# 安装 OpenAI Python API\npip3 install openai\nexport OPENAI_API_KEY=\"...\"  # 在此处填写你的 OpenAI API 密钥\n\n# GPT-4 评估\npython3 -m safe_rlhf.evaluate.gpt4 \\\n    --red_corner_model_name_or_path output\u002Fsft \\\n    --blue_corner_model_name_or_path output\u002Fppo-lag\n```\n\n## 未来计划\n\n- [X] Beaver-7B 检查点已在 Hugging Face 上发布。\n- [X] 发布 Safe RLHF 论文预印本。\n- [ ] 我们将逐步发布完整的 Safe-RLHF 数据集。\n- [ ] 使用 Safe-RLHF 训练更大的 LLM。\n- [ ] 支持内存高效的训练方法，如 LoRA、PEFT 等。\n\n## 引用\n\n如果您认为 Safe-RLHF 有用，或在您的研究中使用了 Safe-RLHF（模型、代码、数据集等），请考虑在您的出版物中引用以下文献：\n\n```bibtex\n@inproceedings{safe-rlhf,\n  title={Safe RLHF: Safe Reinforcement Learning from Human Feedback},\n  author={Josef Dai and Xuehai Pan and Ruiyang Sun and Jiaming Ji and Xinbo Xu and Mickel Liu and Yizhou Wang and Yaodong Yang},\n  booktitle={The Twelfth International Conference on Learning Representations},\n  year={2024},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=TyFrPOKYXw}\n}\n@inproceedings{beavertails,\n  title={BeaverTails: Towards Improved Safety Alignment of {LLM} via a Human-Preference Dataset},\n  author={Jiaming Ji and Mickel Liu and Juntao Dai and Xuehai Pan and Chi Zhang and Ce Bian and Boyuan Chen and Ruiyang Sun and Yizhou Wang and Yaodong Yang},\n  booktitle={Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year={2023},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=g0QovXbFw3}\n}\n```\n\n## 北大对齐团队\n\n以下所有同学贡献均等，顺序按姓名字母排序：\n\n- [Juntao Dai](https:\u002F\u002Fgithub.com\u002Fcalico-1226)\n- [Jiaming Ji](https:\u002F\u002Fgithub.com\u002Fzmsn-2077)\n- [Xuehai Pan](https:\u002F\u002Fgithub.com\u002FXuehaiPan)\n- [Ruiyang Sun](https:\u002F\u002Fgithub.com\u002Frockmagma02)\n\n全体由 [Yizhou Wang](https:\u002F\u002Fcfcs.pku.edu.cn\u002Fenglish\u002Fpeople\u002Ffaculty\u002Fyizhouwang\u002Findex.htm) 和 [Yaodong Yang](https:\u002F\u002Fwww.yangyaodong.com\u002F) 指导。  \n特别感谢：我们感谢 [Ms. Yi Qu](https:\u002F\u002Fwww.xiaohongshu.com\u002Fuser\u002Fprofile\u002F58ee23c96a6a695050dcf276) 设计的海狸标志。
\n\n### 致谢  \u003C!-- omit in toc -->\n\n本仓库受益于 [LLaMA](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Flarge-language-model-llama-meta-ai)、[斯坦福 Alpaca](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fstanford_alpaca)、[DeepSpeed](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed) 和 [DeepSpeed-Chat](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeedExamples\u002Ftree\u002FHEAD\u002Fapplications\u002FDeepSpeed-Chat) 的优秀工作。感谢他们为推动 LLM 研究的民主化所做出的努力。Safe-RLHF 及其相关资源以爱构建并开源 🤗❤️。\n\n本项目得到北京大学的支持与资助。\n\n\u003Ctable width=\"100%\" cellspacing=\"0\" cellpadding=\"0\">\n  \u003Ctr align=\"center\" valign=\"middle\">\n    \u003Ctd width=\"40%\">\n      \u003Ca href=\"https:\u002F\u002Fwww.ai.pku.edu.cn\u002F\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_5a028ebad8db.png\" width=\"100%\"\u002F>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n    \u003Ctd width=\"60%\">\n      \u003Ca href=\"https:\u002F\u002Fcfcs.pku.edu.cn\u002F\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FPKU-Alignment_safe-rlhf_readme_36cf835427b2.png\" width=\"100%\"\u002F>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## 许可证\n\nSafe-RLHF 采用 Apache License 2.0 开源。","# safe-rlhf 快速上手指南\n\n`safe-rlhf` (Beaver) 是由北京大学 PKU-Alignment 团队开发的高度模块化开源框架，旨在通过 **Safe RLHF**（安全强化学习人类反馈）技术，训练既“有帮助”又“无害”的大语言模型。它支持从 SFT、奖励\u002F成本模型训练到 Safe RLHF 全流程，并提供大规模人工标注的安全偏好数据集。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+)\n*   **Python**: 3.8 或更高版本\n*   **GPU**: 支持 CUDA 的 NVIDIA GPU（官方完整流程在 8 块 NVIDIA A800-80GB 上测试；显存不足时可启用 DeepSpeed ZeRO-Offload，见各训练脚本的 `--offload` 参数）\n*   **前置依赖**:\n    *   PyTorch (与您的 CUDA 版本匹配)\n    *   DeepSpeed (框架后端依赖)\n    *   Hugging Face Transformers, Datasets\n\n> **国内加速建议**：\n> 推荐使用国内镜像源安装 Python 依赖，并配置 Hugging Face 镜像以加速模型和数据集下载。\n> ```bash\n> # 设置 Hugging Face 镜像 (可选)\n> export HF_ENDPOINT=https:\u002F\u002Fhf-mirror.com\n> ```\n\n## 安装步骤\n\n### 1. 克隆项目代码\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf.git\ncd safe-rlhf\n```\n\n### 2. 创建虚拟环境并安装依赖\n推荐使用仓库自带的 `conda-recipe.yaml` 创建独立环境（README 文档化的安装方式），依赖将被自动安装：\n\n```bash\n# 创建并激活 conda 环境（也可使用 mamba env create --file conda-recipe.yaml）\nconda env create --file conda-recipe.yaml\nconda activate safe-rlhf\n```\n\n如需容器化环境，可在安装 nvidia-docker 后执行：\n\n```bash\nmake docker-run\n```\n\n## 基本使用\n\n`safe-rlhf` 的核心流程通常包含三个阶段：**SFT (有监督微调)** -> **Reward\u002FCost Model Training (奖励\u002F成本模型训练)** -> **Safe RLHF (安全强化学习)**。\n\n以下命令均采用仓库 `scripts\u002F` 目录提供的训练脚本；您可能需要根据机器配置调整脚本中的 GPU 数量、批次大小等参数。\n\n### 1. 数据准备\n框架会自动从 Hugging Face 下载 `PKU-SafeRLHF` 数据集。如果您网络受限，可手动克隆数据集仓库后，通过 `DATASET[:PROPORTION[:PATH]]` 语法指定本地路径，或直接使用默认配置（需配置好镜像）。\n\n### 2. 运行 SFT (有监督微调)\n使用提供的脚本对预训练模型（如 LLaMA-7B）进行微调：\n\n```bash\nbash scripts\u002Fsft.sh \\\n    --model_name_or_path ~\u002Fmodels\u002Fllama-7b \\\n    --output_dir output\u002Fsft\n```
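（可选）在进入价值模型训练之前，可以先对 SFT 产物做一次简单的生成测试。以下为示意代码：假设检查点为标准 Hugging Face 格式（`scripts\u002Fsft.sh` 的默认输出），提示词格式参考 safe-rlhf 的对话模板，实际格式请以仓库代码为准。\n\n```python\n# 示意：加载 SFT 检查点并生成一条回复，验证权重可正常使用\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\ntokenizer = AutoTokenizer.from_pretrained('output\u002Fsft')\nmodel = AutoModelForCausalLM.from_pretrained('output\u002Fsft').to('cuda')\n\nprompt = 'BEGINNING OF CONVERSATION: USER: 你好，请简单介绍一下你自己。 ASSISTANT:'\ninputs = tokenizer(prompt, return_tensors='pt').to(model.device)\noutputs = model.generate(**inputs, max_new_tokens=64)\nprint(tokenizer.decode(outputs[0], skip_special_tokens=True))\n```\n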
\n\n### 4. Run Safe RLHF\nCombine the models above for the final safety-alignment training.\n\n```bash\ndeepspeed --num_gpus=8 \\\n    safe_rlhf\u002Ftrain\u002Frl.py \\\n    --model_name_or_path .\u002Foutput\u002Fsft_model \\\n    --reward_model_name_or_path .\u002Foutput\u002Freward_model \\\n    --cost_model_name_or_path .\u002Foutput\u002Fcost_model \\\n    --dataset_names PKU-Alignment\u002FPKU-SafeRLHF \\\n    --split train \\\n    --output_dir .\u002Foutput\u002Fsafe_rlhf_model \\\n    --per_device_prompt_batch_size 4 \\\n    --per_device_train_batch_size 4 \\\n    --learning_rate 1e-6 \\\n    --num_train_epochs 1 \\\n    --deepspeed ds_configs\u002Fstage3.json\n```
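\n\nConceptually, this stage maximizes the expected reward subject to a constraint on the expected cost by learning a Lagrange multiplier alongside the policy. The toy sketch below illustrates only that idea; it is not the repository's actual trainer code, and the zero cost budget, the multiplier parameterization, and the 1 \u002F (1 + lambda) rescaling are simplified assumptions following the Safe RLHF paper:\n\n```python\nimport torch\n\n# Toy PPO-Lagrangian sketch: maximize E[reward] subject to E[cost] \u003C= threshold.\nlog_lambda = torch.zeros(1, requires_grad=True)  # lambda = exp(log_lambda) stays positive\nlambda_optimizer = torch.optim.SGD([log_lambda], lr=0.1)\nthreshold = 0.0  # hypothetical cost budget\n\n\ndef shaped_advantage(reward_adv: torch.Tensor, cost_adv: torch.Tensor) -> torch.Tensor:\n    # Policy update signal: reward advantage penalized by the lambda-weighted cost\n    # advantage, rescaled by 1 \u002F (1 + lambda) to keep its magnitude stable.\n    lam = log_lambda.exp().detach()\n    return (reward_adv - lam * cost_adv) \u002F (1.0 + lam)\n\n\ndef update_lambda(mean_episode_cost: float) -> None:\n    # Dual ascent: gradient descent on -lambda * (cost - threshold) raises lambda\n    # when the constraint is violated and lowers it otherwise.\n    lambda_optimizer.zero_grad()\n    loss = -log_lambda.exp() * (mean_episode_cost - threshold)\n    loss.backward()\n    lambda_optimizer.step()\n```\n\nIn the full algorithm described in the Safe RLHF paper, the multiplier update alternates with the PPO policy update, so the policy is pulled toward helpfulness only as fast as the cost constraint allows.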
等模型。",[106],"DeepSpeed",[36,13,14,16],[109,110,111,112,113,114,115,116,117,118,119,120,121,65,122,123,124,125,126,127],"ai-safety","alpaca","datasets","deepspeed","large-language-models","llama","llm","llms","reinforcement-learning","reinforcement-learning-from-human-feedback","rlhf","transformers","vicuna","safe-reinforcement-learning","safe-reinforcement-learning-from-human-feedback","safety","gpt","transformer","beaver","2026-03-27T02:49:30.150509","2026-04-20T20:23:00.502656",[131,136,141],{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},45708,"安装或训练时遇到 'fused_adam.so: cannot open shared object file' 错误怎么办？","该问题通常由环境变量配置冲突或缓存文件导致，请按以下步骤解决：\n1. 清理环境变量：确保 `~\u002F.bashrc`（注意不是 `.baserc`）中没有多余的环境变量，尤其是不要提前初始化 conda 默认变量。仅保留以下必要配置并执行 `source` 生效：\n   export CUDA_HOME=\"\u002Fusr\u002Flocal\u002Fcuda-xx.x\"\n   export PATH=\"${CUDA_HOME}\u002Fbin${PATH:+:\"${PATH}\"}\"\n   export LD_LIBRARY_PATH=\"${CUDA_HOME}\u002Flib64:${CUDA_HOME}\u002Fextras\u002FCUPTI\u002Flib64${LD_LIBRARY_PATH:+:\"${LD_LIBRARY_PATH}\"}\"\n   export NCCL_P2P_DISABLE=1\n2. 清除构建缓存：在重新构建前，务必删除以下目录（每次构建失败后都需执行）：\n   rm -r ~\u002F.cache\u002Ftorch\n   rm -r ~\u002F.cache\u002Ftorch_extensions\n3. 重新创建环境：执行 `conda env create --file conda-recipe.yaml`。\n4. 依赖版本建议：mpi4py 建议指定版本为 3.1.3（可通过 `conda install mpi4py==3.1.3` 安装）。\n注意：初次构建耗时较长（可能达 3 小时），若中途断开需重复步骤 2 和 3。","https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf\u002Fissues\u002F2",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},45709,"运行 PPO 阶段时报错 'CUDA error: device-side assert triggered' 如何解决？","该错误通常与使用的 LLaMA 模型来源及 Tokenizer 配置有关，特别是使用 Hugging Face 上的 `decapoda-research\u002Fllama-7b-hf` 时。解决方案如下：\n1. 推荐方案：尽量使用 Meta 官方发布的原始 LLaMA checkpoint 和 Tokenizer，避免使用第三方转换版本。\n2. 若必须使用 `decapoda-research\u002Fllama-7b-hf`，需手动修改其配置文件以修复 special token 映射问题：\n   - 修改 `tokenizer_config.json`：\n     {\n       \"bos_token\": \"\u003Cs>\",\n       \"eos_token\": \"\u003C\u002Fs>\",\n       \"model_max_length\": 1000000000000000019884624838656,\n       \"tokenizer_class\": \"LlamaTokenizer\",\n       \"unk_token\": \"\u003Cunk>\"\n     }\n   - 检查并更新 `config.json` 中的架构配置以匹配当前 transformers 版本。\n3. 确保使用的 `transformers` 库版本较新，旧版本可能导致兼容性问题。","https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf\u002Fissues\u002F9",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},45710,"PPO 训练过程中 rollout 阶段的 generate 函数耗时过长是否正常？","是的，这在多卡分布式推理场景下是正常现象。`generate` 函数耗时远高于后续的前向传播计算（如计算 logits 和 reward），主要原因如下：\n1. 通信开销：在 inference（rollout）阶段，虽然不需要同步梯度，但每张卡之间仍需频繁通信以同步数据、中间层计算结果以及生成过程中的状态（特别是在 `synced_gpus=True` 模式下）。\n2. 生成特性：自回归生成（autoregressive generation）是串行的，无法像矩阵乘法那样高度并行化，且受限于显存带宽和卡间通信延迟。\n相比之下，后续的 `logits` 和 `reward` 计算主要是大规模矩阵运算，并行效率高，因此耗时较短。若需优化，可尝试调整 `max_length` 或优化分布式通信后端配置。","https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf\u002Fissues\u002F40",[]]