[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-tatsu-lab--alpaca_farm":3,"tool-tatsu-lab--alpaca_farm":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",149489,2,"2026-04-10T11:32:46",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":76,"owner_twitter":76,"owner_website":76,"owner_url":77,"languages":78,"stars":87,"forks":88,"last_commit_at":89,"license":90,"difficulty_score":32,"env_os":91,"env_gpu":92,"env_ram":91,"env_deps":93,"category_tags":102,"github_topics":103,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":109,"updated_at":110,"faqs":111,"releases":140},6275,"tatsu-lab\u002Falpaca_farm","alpaca_farm","A simulation framework for RLHF and alternatives. Develop your RLHF method without collecting human data. 
","AlpacaFarm 是一个专为“从人类反馈中学习”（RLHF）技术设计的仿真框架。它核心解决了当前大模型对齐研究中成本高、门槛大的痛点：传统 RLHF 方法依赖昂贵且耗时的人工数据收集与评估，而 AlpacaFarm 允许开发者在不采集真实人类数据的情况下，利用 GPT-4 等强大模型自动模拟偏好反馈并执行自动化评估。这使得研究人员能够以极低的成本快速迭代和验证新的对齐算法。\n\n该工具主要面向 AI 研究人员和算法开发者，特别是那些希望深入研究指令遵循模型优化，但受限于计算资源或数据获取渠道的团队。其独特亮点在于提供了一套完整的闭环系统：不仅包含经过验证的基准算法参考实现（如 PPO、Best-of-N），还集成了基于 API 模型的成对偏好模拟机制，以及标准化的自动评估流程。需要注意的是，由于数据集许可限制，AlpacaFarm 目前仅适用于学术研究用途，不支持商业应用。通过降低实验门槛，它致力于推动大模型对齐技术的普及与创新。","\u003Cp align=\"center\" width=\"100%\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_alpaca_farm_readme_1487816d037e.png\" alt=\"AlpacaFarm\" style=\"width: 50%; min-width: 300px; display: block; margin: auto;\">\n\u003C\u002Fp>\n\n# AlpacaFarm: A Simulation Framework for Methods that \u003Cbr\u002F>Learn from Human Feedback\n\n[![Code License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCode%20License-Apache_2.0-green.svg)](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002FLICENSE)\n[![Data License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FData%20License-CC%20By%20NC%204.0-red.svg)](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002FDATA_LICENSE)\n[![Python 3.10+](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpython-3.10+-blue.svg)](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002Frelease\u002Fpython-3100\u002F)\n[![Code style: black](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fcode%20style-black-000000.svg)](https:\u002F\u002Fgithub.com\u002Fpsf\u002Fblack)\n\n\n**Changing auto-annotators**: `text-davinci-003` is [now depreciated](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fdeprecations) by OpenAI, as a result, we can't use the original pool of annotators for automatically generating preferences (for fine-tuning or evaluation). We, therefore, switched to the GPT-4 annotator from [AlpacaEval 1](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval). 
All results should thus be compared to models from AlpacaEval 1 rather than the original AlpacaFarm results. Note that over-optimization might not be seen in this new setting (see Figure 4 in the [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14387)). We are sorry for the inconvenience caused.\n\n---\n\nResearch and development on learning from human feedback is difficult because methods\nlike [RLHF](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155) are complex and costly to run.\nAlpacaFarm is a simulator that enables research and development on learning from feedback at a fraction of the usual\ncost, promoting accessible research on instruction following and alignment.\n\nPlease read our [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14387)\nand [blog post](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F05\u002F22\u002Falpaca-farm.html) for details on our research findings.\n\nThis repo contains code for\n\n- [simulating preference feedback from language models such as GPT-4](#simulating-pairwise-preference)\n- [automated evaluation for instruction-following models](#running-automatic-evaluation)\n- [validated reference implementations of baseline methods such as PPO and best-of-n](#running-reference-methods)\n\nThe data needed to run our code is hosted on HuggingFace: \u003Chttps:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftatsu-lab\u002Falpaca_farm>.\n\n**Usage and License Notices**: AlpacaFarm is intended and licensed for research use only.\nThe dataset is CC BY NC 4.0 (allowing only non-commercial use) and models trained using the dataset should not be used\noutside of research purposes.\nThe weight diff is also CC BY NC 4.0 (allowing only non-commercial use).\n\n## The AlpacaFarm\n\n\u003Cbr>\n\u003Cp style=\"text-align:center;\">\n  \u003Cimg style=\"max-width:70%; height:auto;\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_alpaca_farm_readme_c3eaf3c2e599.jpg\" 
alt=\"Workflow\">\n\u003C\u002Fp>\n\nInstruction-following models are typically developed in 3 steps\n\n1. Supervised fine-tuning with demonstrations\n2. Learning from human feedback; usually pairwise preferences\n3. Human evaluation with interaction\n\nThe goal of AlpacaFarm is to provide three key components that tackles steps 2 and 3:\nLow-cost simulation of pairwise feedback from API models (e.g. GPT-4, ChatGPT), automated evaluations for methods\ndevelopment, and reference implementations of\nlearning algorithms for comparison and modification.\n\n## Installation\n\nTo install the stable release, run\n\n```bash\npip install alpaca-farm\n```\n\nTo install from the latest commit on `main` branch, run\n\n```bash\npip install git+https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm.git\n```\n\nTo enable FlashAttention and other optimizations, install\nthe [`flash-attn`](https:\u002F\u002Fgithub.com\u002FHazyResearch\u002Fflash-attention) and [`apex`](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fapex)\npackages.\n\n## Simulating pairwise preference\n\n**Notebook\nexample:** [![Using](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002Fexamples\u002Fauto_annotations.ipynb)\n\nFor all the evaluation and annotations we use [**AlpacaEval**](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval\u002Ftree\u002Fmain#making-a-new-evaluator) with our pool of automatic annotators and additional noise to simulate the variance of human annotations.\n\nTo get started, set the environment variable `OPENAI_API_KEY` to your OpenAI API key, and (optionally) `OPENAI_ORG` to\nthe\norganization ID.\nYou can do this by running\n\n```bash\nexport OPENAI_API_KEY=\"sk...\"\n```\n\nTo annotate the pairs of outputs of your model use the following code.\nFor more details or functions to use if you have outputs in different formats refer 
to\nthe [example notebook](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002Fexamples\u002Fauto_annotations.ipynb).\n\n```python\nfrom alpaca_farm.auto_annotations import PairwiseAutoAnnotator\nimport json\n\n# load some data\nwith open(\"examples\u002Fdata\u002Foutputs_pairs.json\") as f:\n    outputs_pairs = json.load(f)[:6]\nprint(outputs_pairs[-1:])\n# [{'instruction': 'If you could help me write an email to my friends inviting them to dinner on Friday, it would be greatly appreciated.',\n#   'input': '',\n#   'output_1': \"Dear Friends, \\r\\n\\r\\nI hope this message finds you well. I'm excited to invite you to dinner on Friday. We'll meet at 7:00 PM at [location]. I look forward to seeing you there. \\r\\n\\r\\nBest,\\r\\n[Name]\",\n#   'output_2': \"Hey everyone! \\n\\nI'm hosting a dinner party this Friday night and I'd love for all of you to come over. We'll have a delicious spread of food and some great conversations. \\n\\nLet me know if you can make it - I'd love to see you all there!\\n\\nCheers,\\n[Your Name]\"}]\n\nannotator = PairwiseAutoAnnotator()\nannotated = annotator.annotate_pairs(outputs_pairs)\n\nprint(annotated[-1:])\n# [{'instruction': 'If you could help me write an email to my friends inviting them to dinner on Friday, it would be greatly appreciated.', \n# 'input': '', \n# 'output_1': \"Dear Friends, \\r\\n\\r\\nI hope this message finds you well. I'm excited to invite you to dinner on Friday. We'll meet at 7:00 PM at [location]. I look forward to seeing you there. \\r\\n\\r\\nBest,\\r\\n[Name]\", \n# 'output_2': \"Hey everyone! \\n\\nI'm hosting a dinner party this Friday night and I'd love for all of you to come over. We'll have a delicious spread of food and some great conversations. 
\\n\\nLet me know if you can make it - I'd love to see you all there!\\n\\nCheers,\\n[Your Name]\",\n# 'annotator': 'chatgpt_2', \n# 'preference': 2}]\n```\n\nIf instead of pairs you have a list of sampled outputs, you can use the following.\n\n```python\nmultisample_outputs = [dict(instruction=\"repeat the following\", input=\"yes\", output=[\"yes\", \"no\", \"maybe\", \"repeat\"])]\nprint(annotator.annotate_samples(multisample_outputs))\n# [{'sample_id': 0, \n#   'instruction': 'repeat the following', \n#   'input': 'yes', \n#   'output_1': 'yes', \n#   'output_2': 'maybe', \n#   'annotator': 'chatgpt_2', \n#   'preference': 1}]\n```\n\n## Running automatic evaluation\n\nFor all the evaluation we use [**AlpacaEval**](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval\u002Ftree\u002Fmain#making-a-new-evaluator) with our pool of automatic annotators. \n\nTo get started, set the environment variable OPENAI_API_KEY to your OpenAI API key, and (optionally) OPENAI_ORG to the\norganization ID. You can do this by running\n\n```bash\nexport OPENAI_API_KEY=\"sk...\"\n```\n\nThe easiest to add your model to the Alpaca Leaderboard is to run the following code, which only requires having outputs\nfor your model on our eval data.\n\n```python\nfrom alpaca_farm.auto_annotations import alpaca_leaderboard\nimport datasets\n\n# predict on Alpaca eval data\nalpaca_eval_data = datasets.load_dataset(\"tatsu-lab\u002Falpaca_farm\", \"alpaca_farm_evaluation\")[\"eval\"]\n...  
# use the data to get outputs for your model and save it\npath_to_outputs = \"examples\u002Fdata\u002Feval_gpt-3.5-turbo-0301.json\"\n# outputs should be a list of json as such:\n# [{'instruction': 'What are the names of some famous actors that started their careers on Broadway?', 'input': '', 'output': 'Some famous actors that started their careers on Broadway are Hugh Jackman, Meryl Streep, Denzel Washington, Audra McDonald, and Lin-Manuel Miranda.', 'generator': 'gpt-3.5-turbo-0301', 'dataset': 'helpful_base', 'datasplit': 'eval'},\n# ...]\n\nalpaca_leaderboard(path_to_outputs, name=\"My fancy model\")\n#                               win_rate  standard_error  n_total  avg_length\n# gpt35_turbo_instruct             81.71            1.33      801        1018\n# alpaca-farm-ppo-sim-gpt4-20k     44.10            1.74      805         511\n# My fancy model                   41.54            2.01      597         327\n# alpaca-farm-ppo-human            41.24            1.73      805         803\n# alpaca-7b                        26.46            1.54      805         396\n# text_davinci_001                 15.17            1.24      804         296\n```\n\n## Running reference methods\n\nWe provide reference implementations of several methods for learning from pairwise feedback.\nExample code to run these methods can be found in the `examples\u002F` directory.\nThis includes [supervised fine-tuning](examples\u002Fsupervised.py), [reward modeling](examples\u002Freward_modeling.py)\n, [RLHF with PPO](examples\u002Frlhf_ppo.py), [best-of-n decoding](examples\u002Fbest_of_n.py) and more.\n\nBelow we give example commands for reproducing the model artifacts in our paper. 
Notes:\n\n- All training code is tested with FlashAttention enabled on a machine with 8 80GB A100 GPUs.\n- Best-of-n decoding was tested with a single 80GB GPU.\n- Supervised fine-tuning and reward modeling can fit on 4 80GB A100 GPUs, while PPO training currently requires at least\n  8\n  80GB GPUs.\n- Before running the code below, make sure to convert your LLaMA checkpoint and tokenizer into HuggingFace format and\n  store it at `\u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer>`.\n\n### Supervised fine-tuning (SFT)\n\nTo replicate our SFT10k model fine-tuned from LLaMA in the paper, run\n\n```bash\nbash examples\u002Fscripts\u002Fsft.sh \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer>\n```\n\nThe SFT10k model will be saved at `\u003Cyour_output_dir_for_sft10k>`, and the name of the wandb run will be `\u003Cyour_wandb_run_name>`.\n\n### Reward modeling\n\nTo replicate our reward models trained in the paper, run\n\n```bash\nbash examples\u002Fscripts\u002Freward_modeling.sh \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cpreference_dataset_name>\n```\n\nSet `\u003Cpreference_dataset_name>` to `\"alpaca_noisy_multi_preference\"` for simulated preference reward model, and\n`\"alpaca_human_preference\"` for human preference reward model.\n\n### RLHF with PPO\n\nTo replicate our RLHF PPO model trained with simulated reward model in the paper, run\n\n```bash\nbash examples\u002Fscripts\u002Frlhf_ppo.sh \\\n  \u003Cyour_output_dir_for_ppo> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Ckl_coef>\n```\n\n`\u003Cyour_output_dir_for_reward_model>` should point to either simulated reward model or human reward model trained\naccording\nto the previous step.\nNote the KL penalty coefficient for human reward PPO is much larger 
than for simulated PPO.\nSet `\u003Ckl_coef>` to `0.0067` for simulated PPO, and `0.02` for human PPO to recover our original results.\nPerformance of the PPO model is typically much better than SFT at 20-80 PPO steps (less than 4 passes through the entire\nset of instructions) and starts to decay with more PPO steps.\n\n### Best-of-n decoding\n\nTo replicate our best-of-n inference-time decoding results for the AlpacaFarm evaluation suite, run\n\n```bash\npython examples\u002Fbest_of_n.py \\\n  --task \"run_best_of_n\" \\\n  --decoder_name_or_path \u003Cyour_output_dir_for_decoder> \\  # Can be SFT model or even PPO tuned model.\n  --scorer_name_or_path \u003Cyour_output_dir_for_reward_model> \\\n  --num_return_sequences 16 \\  # This is the n in best-of-n.\n  --per_device_batch_size 4 \\  # Reduce this if you don't have enough memory.\n  --split \"eval\" \\\n  --mixed_precision \"bf16\" \\\n  --tf32 True \\\n  --flash_attn True \\\n  --output_path \u003Cyour_output_path_to_store_samples>\n```\n\nYou can then use the generated samples at `\u003Cyour_output_path_to_store_samples>` directly with our automated evaluation.\n\n### Expert Iteration\n\nTo replicate our expert iteration results for the AlpacaFarm evaluation suite, first produce best-of-n samples. 
Run\n\n```bash\npython examples\u002Fbest_of_n.py \\\n  --task \"run_best_of_n\" \\\n  --decoder_name_or_path \u003Cyour_output_dir_for_decoder> \\  # SFT10k model.\n  --scorer_name_or_path \u003Cyour_output_dir_for_reward_model> \\\n  --num_return_sequences 16 \\  # This is the n in best-of-n.\n  --per_device_batch_size 4 \\  # Reduce this if you don't have enough memory.\n  --split \"unlabeled\" \\\n  --mixed_precision \"bf16\" \\\n  --tf32 True \\\n  --flash_attn True \\\n  --output_path '\u003Cyour_output_dir_for_expiter_data>\u002Fbest_of_n_samples.json'\n```\n\nThen perform supervised fine-tuning from the SFT10k checkpoint with the best-of-n samples\n\n```bash\nbash examples\u002Fscripts\u002Fexpiter.sh \\\n  \u003Cyour_output_dir_for_expiter> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cyour_output_dir_for_expiter_data>\n```\n\n### Quark\n\nTo replicate our Quark results for the AlpacaFarm evaluation suite, run\n\n```bash\nbash examples\u002Fscripts\u002Frlhf_quark.sh \\\n  \u003Cyour_output_dir_for_quark> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Ckl_coef>\n```\n\n### [Direct Preference Optimization (DPO)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)\n\nTo replicate our DPO results for the AlpacaFarm evaluation suite, run\n\n```bash\nbash examples\u002Fscripts\u002Fdpo.sh \\\n  \u003Cyour_output_dir_for_dpo> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k>\n```\n\n### OpenAI models\n\nTo run the OpenAI reference models with our prompts and decoding hyperparameters, run\n\n```bash\npython examples\u002Foai_baselines.py \\\n  --model_name \u003Coai_model_name> \\\n  --save_path \u003Csave_path> \n```\n\nYou can then use the generated samples at `\u003Csave_path>` directly with our automated evaluation.\n\n## Downloading pre-tuned AlpacaFarm models\n\nWe provide model checkpoints for reward models and 
all our reference methods, listed in Table 2 of\nour [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14387). Concretely, we tune each reference method in AlpacaFarm simulation and on\nhuman preference data and release both versions. The current list of models\n(available [here](https:\u002F\u002Fhuggingface.co\u002Ftatsu-lab)) includes:\n\n- `sft10k`, the supervised learning base model that we collect preference data with.\n- `reward-model-sim`, the reward model trained on AlpacaFarm preference data.\n- `reward-model-human`, the reward model trained on human preference data.\n- `ppo-sim`, the best PPO checkpoint trained in simulation.\n- `ppo-human`, the best PPO checkpoint trained on human data.\n- `expiter-sim`, the best expert iteration checkpoint trained in simulation.\n- `expiter-human`, the best expert iteration checkpoint trained on human data.\n- `feedme-sim`, the FeedME method trained on simulated preferences.\n- `feedme-human`, the FeedME method trained on human preferences.\n- `reward-condition-sim`, the reward conditioning method trained on simulated preferences.\n\nTo download and recover these checkpoints, first make sure to have a LLaMA-7B\ncheckpoint [converted into the Hugging Face format](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fmodel_doc\u002Fllama)\n**with transformers>=4.29.2**.\nThen, run the following to download all AlpacaFarm models:\n\n```\npython -m pretrained_models.recover_model_weights \\\n  --llama-7b-hf-dir \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer> \\\n  --alpaca-farm-model-name all\n```\n\nOr, specify a particular model name to download just that model:\n\n```\npython -m pretrained_models.recover_model_weights \\\n  --llama-7b-hf-dir \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer> \\\n  --alpaca-farm-model-name \u003Cone_of_the_model_names_from_above> \\\n  --models-save-dir \u003Cdir_to_save_all_models>\n```\n\nTo download either of the reward models individually, 
you'll need to have `sft10k` downloaded first\nto `\u003Cdir_to_save_all_models>`.\n\n## Citation\n\nPlease consider citing our work if you use the data or code in this repo.\n\n```\n@misc{dubois2023alpacafarm,\n      title={AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback}, \n      author={Yann Dubois and Xuechen Li and Rohan Taori and Tianyi Zhang and Ishaan Gulrajani and Jimmy Ba and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto},\n      year={2023},\n      eprint={2305.14387},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG}\n}\n```\n\nIf you use `alpaca-farm>=0.2.0`, make sure to specify that the annotator changed (as `text-davinci-003` is deprecated). The preferences and win-rates are now from AlpacaEval 1 and are not comparable to the numbers from our paper. You can cite AlpacaEval as:\n\n```\n@misc{alpaca_eval,\n  author = {Xuechen Li and Tianyi Zhang and Yann Dubois and Rohan Taori and Ishaan Gulrajani and Carlos Guestrin and Percy Liang and Tatsunori B. 
Hashimoto },\n  title = {AlpacaEval: An Automatic Evaluator of Instruction-following Models},\n  year = {2023},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval}}\n}\n```","\u003Cp align=\"center\" width=\"100%\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_alpaca_farm_readme_1487816d037e.png\" alt=\"AlpacaFarm\" style=\"width: 50%; min-width: 300px; display: block; margin: auto;\">\n\u003C\u002Fp>\n\n# AlpacaFarm：一个用于研究从人类反馈中学习方法的仿真框架\n\n[![代码许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCode%20License-Apache_2.0-green.svg)](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002FLICENSE)\n[![数据许可证](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FData%20License-CC%20By%20NC%204.0-red.svg)](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002FDATA_LICENSE)\n[![Python 3.10+](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpython-3.10+-blue.svg)](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002Frelease\u002Fpython-3100\u002F)\n[![代码风格：black](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fcode%20style-black-000000.svg)](https:\u002F\u002Fgithub.com\u002Fpsf\u002Fblack)\n\n\n**自动标注器变更**：OpenAI已将`text-davinci-003` [弃用](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fdeprecations)，因此我们无法再使用原有的标注池来自动生成偏好（用于微调或评估）。为此，我们改用了来自[AlpacaEval 1](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval)的GPT-4标注器。所有结果应与AlpacaEval 
1中的模型进行比较，而非原始的AlpacaFarm结果。请注意，在这种新设置下可能不会出现过度优化现象（参见[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14387)中的图4）。由此带来的不便，我们深表歉意。\n\n---\n\n关于从人类反馈中学习的研究与开发颇具挑战性，因为像[RLHF](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155)这样的方法既复杂又成本高昂。AlpacaFarm是一个仿真平台，能够在远低于常规成本的情况下开展相关研究与开发，从而推动指令遵循和对齐领域的普惠性研究。\n\n有关我们的研究成果，请参阅我们的[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14387)和[博客文章](https:\u002F\u002Fcrfm.stanford.edu\u002F2023\u002F05\u002F22\u002Falpaca-farm.html)。\n\n本仓库包含以下内容的代码：\n\n- [模拟来自语言模型的偏好反馈，如GPT-4](#simulating-pairwise-preference)\n- [针对指令遵循模型的自动化评估](#running-automatic-evaluation)\n- [基准方法的验证参考实现，如PPO和best-of-n](#running-reference-methods)\n\n运行代码所需的数据托管在HuggingFace上：\u003Chttps:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftatsu-lab\u002Falpaca_farm>。\n\n**使用与许可说明**：AlpacaFarm仅限于科研用途，并据此授权使用。数据集采用CC BY NC 4.0许可（仅允许非商业用途），使用该数据集训练的模型不得用于科研以外的场景。权重差异同样适用CC BY NC 4.0许可（仅允许非商业用途）。\n\n## AlpacaFarm简介\n\n\u003Cbr>\n\u003Cp style=\"text-align:center;\">\n  \u003Cimg style=\"max-width:70%; height:auto;\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_alpaca_farm_readme_c3eaf3c2e599.jpg\" alt=\"Workflow\">\n\u003C\u002Fp>\n\n指令遵循模型通常分为三个步骤进行开发：\n\n1. 基于示范的监督微调\n2. 从人类反馈中学习；通常是成对偏好\n3. 
人工交互式评估\n\nAlpacaFarm的目标是提供三个关键组件，以解决第2步和第3步的问题：低成本地模拟来自API模型（如GPT-4、ChatGPT）的成对反馈，为方法开发提供自动化评估工具，以及提供用于比较和修改的学习算法参考实现。\n\n## 安装\n\n要安装稳定版本，请运行：\n\n```bash\npip install alpaca-farm\n```\n\n若要从`main`分支的最新提交安装，请运行：\n\n```bash\npip install git+https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm.git\n```\n\n为了启用FlashAttention及其他优化功能，还需安装[`flash-attn`](https:\u002F\u002Fgithub.com\u002FHazyResearch\u002Fflash-attention)和[`apex`](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002Fapex)这两个包。\n\n## 模拟成对偏好\n\n**笔记本示例**：[![使用](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002Fexamples\u002Fauto_annotations.ipynb)\n\n在所有的评估和标注中，我们使用[**AlpacaEval**](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval\u002Ftree\u002Fmain#making-a-new-evaluator)，结合我们的自动标注池及额外噪声，以模拟人类标注的变异性。\n\n开始之前，需将环境变量`OPENAI_API_KEY`设置为您的OpenAI API密钥，（可选）还将`OPENAI_ORG`设置为您所在的组织ID。您可以通过以下命令完成设置：\n\n```bash\nexport OPENAI_API_KEY=\"sk...\"\n```\n\n要标注您模型输出的成对样本，可使用如下代码。如需更多详细信息或处理不同格式输出的功能，请参阅[示例笔记本](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fblob\u002Fmain\u002Fexamples\u002Fauto_annotations.ipynb)。\n\n```python\nfrom alpaca_farm.auto_annotations import PairwiseAutoAnnotator\nimport json\n\n# 加载一些数据\nwith open(\"examples\u002Fdata\u002Foutputs_pairs.json\") as f:\n    outputs_pairs = json.load(f)[:6]\nprint(outputs_pairs[-1:])\n# [{'instruction': '如果您能帮我给朋友们写一封邀请他们周五共进晚餐的邮件，我将不胜感激。',\n#   'input': '',\n#   'output_1': \"亲爱的朋友们：\\r\\n\\r\\n希望这封信能带给您美好的问候。我非常高兴地邀请您们于本周五共进晚餐。我们将在[地点]晚上7点见面。期待与您们相聚。\\r\\n\\r\\n此致，\\r\\n[姓名]\",\n#   'output_2': \"大家好！\\n\\n我将于本周五晚上举办一场晚宴，诚挚邀请各位前来。届时我们将享用丰盛的美食，并畅聊一番。\\n\\n请告知我您是否能出席——我非常期待与各位见面！\\n\\n祝好，\\n[您的名字]\"}]\n\nannotator = PairwiseAutoAnnotator()\nannotated = annotator.annotate_pairs(outputs_pairs)\n\nprint(annotated[-1:])\n# [{'instruction': '如果您能帮我写一封邀请朋友周五共进晚餐的邮件，我将不胜感激。', \n# 'input': '', \n# 
'output_1': \"亲爱的朋友们：\\r\\n\\r\\n希望这封信能带给您美好的问候。我非常高兴地邀请您们于本周五共进晚餐。我们将在[地点]晚上7点见面。期待与您们相聚。\\r\\n\\r\\n此致，\\r\\n[姓名]\", \n# 'output_2': \"大家好！\\n\\n我将于本周五晚上举办一场晚宴，诚挚邀请各位前来。我们将准备美味佳肴，一起度过愉快的时光。\\n\\n请告知我您是否能出席——我非常期待与各位见面！\\n\\n祝好，\\n[您的名字]\", \n# 'annotator': 'chatgpt_2', \n# 'preference': 2}]\n```\n\n如果您拥有的不是成对样本，而是一组采样输出，则可以使用以下代码：\n\n```python\nmultisample_outputs = [dict(instruction=\"重复以下内容\", input=\"yes\", output=[\"yes\", \"no\", \"maybe\", \"repeat\"])]\nprint(annotator.annotate_samples(multisample_outputs))\n# [{'sample_id': 0, \n#   'instruction': '重复以下内容', \n#   'input': 'yes', \n#   'output_1': 'yes', \n#   'output_2': 'maybe', \n#   'annotator': 'chatgpt_2', \n#   'preference': 1}]\n```\n\n## 运行自动评估\n\n在所有评估中，我们使用 [**AlpacaEval**](https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval\u002Ftree\u002Fmain#making-a-new-evaluator) 及其自动标注器池。\n\n要开始使用，请将环境变量 `OPENAI_API_KEY` 设置为您的 OpenAI API 密钥，并（可选）将 `OPENAI_ORG` 设置为您所在的组织 ID。您可以通过运行以下命令来完成：\n\n```bash\nexport OPENAI_API_KEY=\"sk...\"\n```\n\n将您的模型添加到 Alpaca Leaderboard 的最简单方法是运行以下代码，该代码仅需要您在我们的评估数据上生成的模型输出。\n\n```python\nfrom alpaca_farm.auto_annotations import alpaca_leaderboard\nimport datasets\n\n# 在 Alpaca 评估数据上进行预测\nalpaca_eval_data = datasets.load_dataset(\"tatsu-lab\u002Falpaca_farm\", \"alpaca_farm_evaluation\")[\"eval\"]\n...  
# 使用这些数据为您的模型生成输出并保存\npath_to_outputs = \"examples\u002Fdata\u002Feval_gpt-3.5-turbo-0301.json\"\n# 输出应为如下格式的 JSON 列表：\n# [{'instruction': '哪些著名演员的职业生涯始于百老汇？', 'input': '', 'output': '一些职业生涯始于百老汇的著名演员包括休·杰克曼、梅丽尔·斯特里普、丹泽尔·华盛顿、奥德拉·麦克唐纳和林-曼努埃尔·米兰达。', 'generator': 'gpt-3.5-turbo-0301', 'dataset': 'helpful_base', 'datasplit': 'eval'},\n# ...]\n\nalpaca_leaderboard(path_to_outputs, name=\"我的酷炫模型\")\n#                               胜率  标准误差  总样本数  平均长度\n# gpt35_turbo_instruct             81.71            1.33      801        1018\n# alpaca-farm-ppo-sim-gpt4-20k     44.10            1.74      805         511\n# 我的酷炫模型                   41.54            2.01      597         327\n# alpaca-farm-ppo-human            41.24            1.73      805         803\n# alpaca-7b                        26.46            1.54      805         396\n# text_davinci_001                 15.17            1.24      804         296\n```\n\n## 运行参考方法\n\n我们提供了几种基于成对反馈学习方法的参考实现。这些方法的示例代码可在 `examples\u002F` 目录中找到。其中包括 [监督微调](examples\u002Fsupervised.py)、[奖励建模](examples\u002Freward_modeling.py)、[使用 PPO 的 RLHF](examples\u002Frlhf_ppo.py)、[最佳 n 次解码](examples\u002Fbest_of_n.py) 等。\n\n下面给出了一些用于复现我们论文中模型成果的示例命令。说明：\n\n- 所有训练代码均在启用 FlashAttention 的情况下，在配备 8 张 80GB A100 显卡的机器上进行了测试。\n- 最佳 n 次解码是在单张 80GB 显卡上测试的。\n- 监督微调和奖励建模可以在 4 张 80GB A100 显卡上完成，而 PPO 训练目前至少需要 8 张 80GB 显卡。\n- 在运行以下代码之前，请确保将您的 LLaMA 检查点和分词器转换为 HuggingFace 格式，并将其存储在 `\u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer>`。\n\n### 监督微调 (SFT)\n\n要复现我们在论文中从 LLaMA 微调得到的 SFT10k 模型，请运行以下命令：\n\n```bash\nbash examples\u002Fscripts\u002Fsft.sh \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer>\n```\n\nSFT10k 模型将被保存到 `\u003Cyour_output_dir>`，WandB 实验名称将为 `\u003Cyour_wandb_run_name>`。\n\n### 奖励建模\n\n要复现我们在论文中训练的奖励模型，请运行以下命令：\n\n```bash\nbash examples\u002Fscripts\u002Freward_modeling.sh \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_wandb_run_name> 
\\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cpreference_dataset_name>\n```\n\n将 `\u003Cpreference_dataset_name>` 设置为 `\"alpaca_noisy_multi_preference\"` 以获得模拟偏好奖励模型，或设置为 `\"alpaca_human_preference\"` 以获得人类偏好奖励模型。\n\n### 使用 PPO 的 RLHF\n\n要复现我们在论文中使用模拟奖励模型训练的 RLHF PPO 模型，请运行以下命令：\n\n```bash\nbash examples\u002Fscripts\u002Frlhf_ppo.sh \\\n  \u003Cyour_output_dir_for_ppo> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Ckl_coef>\n```\n\n`\u003Cyour_output_dir_for_reward_model>` 应指向根据上一步训练得到的模拟奖励模型或人类奖励模型。请注意，人类奖励 PPO 的 KL 惩罚系数远大于模拟 PPO。将 `\u003Ckl_coef>` 设置为 `0.0067` 用于模拟 PPO，设置为 `0.02` 用于人类 PPO，即可恢复我们的原始结果。通常情况下，PPO 模型在 20–80 个 PPO 步骤内（即不到四次遍历整个指令集）的表现会显著优于 SFT，而随着 PPO 步骤的增加，性能则会逐渐下降。\n\n### 最佳 n 次解码\n\n要复现我们在 AlpacaFarm 评估套件上的最佳 n 次推理时解码结果，请运行以下命令：\n\n```bash\npython examples\u002Fbest_of_n.py \\\n  --task \"run_best_of_n\" \\\n  --decoder_name_or_path \u003Cyour_output_dir_for_decoder> \\  # 可以是 SFT 模型，甚至经过 PPO 调优的模型。\n  --scorer_name_or_path \u003Cyour_output_dir_for_reward_model> \\\n  --num_return_sequences 16 \\  # 这就是最佳 n 次解码中的 n。\n  --per_device_batch_size 4 \\  # 如果内存不足，可以适当减少此值。\n  --split \"eval\" \\\n  --mixed_precision \"bf16\" \\\n  --tf32 True \\\n  --flash_attn True \\\n  --output_path \u003Cyour_output_path_to_store_samples>\n```\n\n随后，您可以直接将 `\u003Cyour_output_path_to_store_samples>` 中生成的样本用于我们的自动化评估。\n\n### 专家迭代\n\n要复现我们在 AlpacaFarm 评估套件上的专家迭代结果，首先需要生成最佳 n 次样本。请运行以下命令：\n\n```bash\npython examples\u002Fbest_of_n.py \\\n  --task \"run_best_of_n\" \\\n  --decoder_name_or_path \u003Cyour_output_dir_for_decoder> \\  # SFT10k 模型。\n  --scorer_name_or_path \u003Cyour_output_dir_for_reward_model> \\\n  --num_return_sequences 16 \\  # 这就是最佳 n 次解码中的 n。\n  --per_device_batch_size 4 \\  # 如果内存不足，可以适当减少此值。\n  --split \"unlabeled\" \\\n  --mixed_precision \"bf16\" \\\n  --tf32 True \\\n  --flash_attn True \\\n  --output_path 
'\u003Cyour_output_dir_for_expiter_data>\u002Fbest_of_n_samples.json'\n```\n\n然后，使用最佳 n 次样本从 SFT10k 检查点出发进行监督微调：\n\n```bash\nbash examples\u002Fscripts\u002Fexpiter.sh \\\n  \u003Cyour_output_dir_for_expiter> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cyour_output_dir_for_expiter_data>\n```\n\n### Quark\n\n要复现我们在 AlpacaFarm 评估套件上的 Quark 结果，请运行以下命令：\n\n```bash\nbash examples\u002Fscripts\u002Frlhf_quark.sh \\\n  \u003Cyour_output_dir_for_quark> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Ckl_coef>\n```\n\n### [直接偏好优化 (DPO)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18290)\n\n要复现我们在 AlpacaFarm 评估套件上的 DPO 结果，请运行以下命令：\n\n```bash\nbash examples\u002Fscripts\u002Fdpo.sh \\\n  \u003Cyour_output_dir_for_dpo> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k>\n```\n\n### OpenAI 模型\n\n要使用我们的提示和解码超参数运行 OpenAI 参考模型，请执行以下命令：\n\n```bash\npython examples\u002Foai_baselines.py \\\n  --model_name \u003Coai_model_name> \\\n  --save_path \u003Csave_path> \n```\n\n随后，您可以直接将 `\u003Csave_path>` 中生成的样本用于我们的自动化评估。\n\n## 下载预训练的 AlpacaFarm 模型\n\n我们提供了奖励模型以及所有参考方法的模型检查点，详见我们论文 [2] 的表 2。具体而言，我们在 AlpacaFarm 模拟环境及人类偏好数据上分别对每种参考方法进行了微调，并发布了两种版本的模型。当前可用的模型列表（[此处](https:\u002F\u002Fhuggingface.co\u002Ftatsu-lab)可查）包括：\n\n- `sft10k`：我们收集偏好数据所用的监督学习基础模型。\n- `reward-model-sim`：基于 AlpacaFarm 偏好数据训练的奖励模型。\n- `reward-model-human`：基于人类偏好数据训练的奖励模型。\n- `ppo-sim`：在模拟环境中训练的最佳 PPO 检查点。\n- `ppo-human`：基于人类数据训练的最佳 PPO 检查点。\n- `expiter-sim`：在模拟环境中训练的最佳专家迭代检查点。\n- `expiter-human`：基于人类数据训练的最佳专家迭代检查点。\n- `feedme-sim`：基于模拟偏好训练的 FeedME 方法。\n- `feedme-human`：基于人类偏好训练的 FeedME 方法。\n- `reward-condition-sim`：基于模拟偏好训练的奖励条件化方法。\n\n要下载并恢复这些检查点，首先请确保已拥有一个 [转换为 Hugging Face 格式](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Ftransformers\u002Fmain\u002Fmodel_doc\u002Fllama) 的 LLaMA-7B 检查点，且 `transformers>=4.29.2`。然后，运行以下命令以下载所有 AlpacaFarm 模型：\n\n```\npython -m 
pretrained_models.recover_model_weights \\\n  --llama-7b-hf-dir \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer> \\\n  --alpaca-farm-model-name all\n```\n\n或者，指定特定的模型名称以仅下载该模型：\n\n```\npython -m pretrained_models.recover_model_weights \\\n  --llama-7b-hf-dir \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer> \\\n  --alpaca-farm-model-name \u003Cone_of_the_model_names_from_above> \\\n  --models-save-dir \u003Cdir_to_save_all_models>\n```\n\n若要单独下载任一奖励模型，则需先将 `sft10k` 下载至 `\u003Cdir_to_save_all_models>`。\n\n## 引用\n\n如果您使用本仓库中的数据或代码，请考虑引用我们的工作。\n\n```\n@misc{dubois2023alpacafarm,\n      title={AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback}, \n      author={Yann Dubois and Xuechen Li and Rohan Taori and Tianyi Zhang and Ishaan Gulrajani and Jimmy Ba and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto},\n      year={2023},\n      eprint={2305.14387},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG}\n}\n```\n\n如果您使用 `alpaca-farm>=0.2.0` 版本，请务必注明标注者已变更（因为 `text-davinci-003` 已被弃用）。此时的偏好数据和胜率来自 AlpacaEval 1，与我们论文中的数值不可直接比较。您可以这样引用 AlpacaEval：\n\n```\n@misc{alpaca_eval,\n  author = {Xuechen Li and Tianyi Zhang and Yann Dubois and Rohan Taori and Ishaan Gulrajani and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto },\n  title = {AlpacaEval: An Automatic Evaluator of Instruction-following Models},\n  year = {2023},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_eval}}\n}\n```","# AlpacaFarm 快速上手指南\n\nAlpacaFarm 是一个用于研究“从人类反馈中学习”（如 RLHF）的仿真框架。它通过模拟成对偏好反馈和自动化评估，大幅降低了相关研究的成本和门槛。\n\n> **注意**：由于 OpenAI 已弃用 `text-davinci-003`，本项目现已切换至使用 GPT-4 作为自动标注器（基于 AlpacaEval 1）。所有结果应与 AlpacaEval 1 中的模型进行对比。\n\n## 1. 
环境准备\n\n*   **操作系统**: Linux (推荐) 或 macOS\n*   **Python 版本**: 3.10 或更高\n*   **硬件要求**:\n    *   基础推理\u002F标注：普通 CPU\u002FGPU 即可。\n    *   训练参考方法（如 PPO）：建议配备多张高性能 GPU（原文测试环境为 8x A100 80GB；SFT 和奖励建模至少需 4x A100 80GB）。\n*   **前置依赖**:\n    *   **OpenAI API Key**: 用于调用 GPT-4 进行自动标注和评估。\n    *   **FlashAttention & Apex** (可选但推荐): 用于加速训练和优化显存。\n\n## 2. 安装步骤\n\n### 安装稳定版\n```bash\npip install alpaca-farm\n```\n\n### 安装最新版（从 main 分支）\n```bash\npip install git+https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm.git\n```\n\n### 启用性能优化（可选）\n若需启用 FlashAttention 等优化，请额外安装以下包：\n```bash\npip install flash-attn apex\n```\n\n## 3. 基本使用\n\n### 步骤一：配置环境变量\n在使用自动标注或评估功能前，需设置 OpenAI API 密钥。\n\n```bash\nexport OPENAI_API_KEY=\"sk-...\"\n# 可选：设置组织 ID\nexport OPENAI_ORG=\"org-...\"\n```\n\n### 步骤二：模拟成对偏好标注 (Simulating Pairwise Preference)\n使用 `PairwiseAutoAnnotator` 对模型生成的输出对进行自动偏好打分。\n\n```python\nfrom alpaca_farm.auto_annotations import PairwiseAutoAnnotator\nimport json\n\n# 加载数据示例 (假设已有 outputs_pairs.json)\nwith open(\"examples\u002Fdata\u002Foutputs_pairs.json\") as f:\n    outputs_pairs = json.load(f)[:6]\n\n# 初始化标注器并执行标注\nannotator = PairwiseAutoAnnotator()\nannotated = annotator.annotate_pairs(outputs_pairs)\n\n# 查看结果\nprint(annotated[-1:])\n# 输出包含 'annotator' (标注者) 和 'preference' (偏好选择) 字段\n```\n\n如果只有单个指令的多个采样输出，可使用 `annotate_samples`：\n```python\nmultisample_outputs = [dict(instruction=\"repeat the following\", input=\"yes\", output=[\"yes\", \"no\", \"maybe\", \"repeat\"])]\nprint(annotator.annotate_samples(multisample_outputs))\n```\n\n### 步骤三：运行自动化评估 (Running Automatic Evaluation)\n将你的模型输出提交到 Alpaca 排行榜进行自动评估。\n\n```python\nfrom alpaca_farm.auto_annotations import alpaca_leaderboard\nimport datasets\n\n# 加载评估数据集\nalpaca_eval_data = datasets.load_dataset(\"tatsu-lab\u002Falpaca_farm\", \"alpaca_farm_evaluation\")[\"eval\"]\n\n# ... 
(此处省略模型推理代码，需生成 outputs 列表并保存为 JSON)\n# outputs 格式示例: [{'instruction': '...', 'input': '', 'output': '...', 'generator': 'my_model', ...}]\n\npath_to_outputs = \"examples\u002Fdata\u002Feval_gpt-3.5-turbo-0301.json\"\n\n# 提交评估\nalpaca_leaderboard(path_to_outputs, name=\"My fancy model\")\n```\n\n### 步骤四：运行参考方法 (Reference Methods)\n项目提供了 SFT、奖励建模、PPO 和 Best-of-N 的参考实现脚本。使用前请确保已将 LLaMA checkpoint 转换为 HuggingFace 格式。\n\n**1. 监督微调 (SFT)**\n```bash\nbash examples\u002Fscripts\u002Fsft.sh \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_path_to_hf_converted_llama_ckpt_and_tokenizer>\n```\n\n**2. 奖励建模 (Reward Modeling)**\n```bash\nbash examples\u002Fscripts\u002Freward_modeling.sh \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Cpreference_dataset_name>\n```\n*注：`\u003Cpreference_dataset_name>` 设为 `\"alpaca_noisy_multi_preference\"` (模拟偏好) 或 `\"alpaca_human_preference\"` (人类偏好)。*\n\n**3. 基于 PPO 的 RLHF**\n```bash\nbash examples\u002Fscripts\u002Frlhf_ppo.sh \\\n  \u003Cyour_output_dir_for_ppo> \\\n  \u003Cyour_wandb_run_name> \\\n  \u003Cyour_output_dir_for_reward_model> \\\n  \u003Cyour_output_dir_for_sft10k> \\\n  \u003Ckl_coef>\n```\n*注：模拟 PPO 的 `\u003Ckl_coef>` 设为 `0.0067`，人类偏好 PPO 设为 `0.02`。*\n\n**4. 
Best-of-N 解码**\n```bash\npython examples\u002Fbest_of_n.py \\\n  --task \"run_best_of_n\" \\\n  --decoder_name_or_path \u003Cyour_output_dir_for_decoder> \\\n  --scorer_name_or_path \u003Cyour_output_dir_for_reward_model> \\\n  --num_return_sequences 16 \\\n  --per_device_batch_size 4 \\\n  --split \"eval\" \\\n  --mixed_precision \"bf16\"\n```\n\n---\n**许可说明**：本工具及数据集仅限学术研究使用（非商业）。数据集遵循 CC BY NC 4.0 协议。","某初创 AI 团队正在研发一款垂直领域的智能客服模型，急需通过人类反馈强化学习（RLHF）来提升回答的准确性和亲和力，但面临资源匮乏的困境。\n\n### 没有 alpaca_farm 时\n- **数据成本高昂**：收集真实的人类偏好标注数据需要雇佣大量标注员或依赖众包平台，单次迭代成本高达数万美元，初创团队难以负担。\n- **开发周期漫长**：等待人工标注反馈往往需要数周时间，导致算法验证和模型迭代的周期被严重拉长，无法快速试错。\n- **实验门槛极高**：复现 PPO 等复杂的 RLHF 基线算法需要深厚的工程积累，团队需从零搭建训练框架，极易陷入底层代码调试而停滞不前。\n- **评估标准不一**：缺乏自动化的评估机制，每次模型效果验证都依赖人工抽检，主观性强且效率低下，难以量化改进幅度。\n\n### 使用 alpaca_farm 后\n- **零成本模拟反馈**：利用 alpaca_farm 内置的 GPT-4 自动标注器模拟人类偏好，无需采集任何真实人工数据即可生成高质量的对齐训练集，将数据成本降至接近零。\n- **即时迭代验证**：仿真环境可瞬间生成海量偏好数据，团队能在几小时内完成原本需数周的“训练 - 评估”循环，大幅加速算法优化进程。\n- **开箱即用基线**：直接调用 alpaca_farm 提供的已验证参考实现（如 PPO、Best-of-N），团队可专注于核心策略改进，无需重复造轮子。\n- **自动化客观评估**：通过框架自带的自动评估流程，实时量化模型在指令遵循上的表现，确保每一次迭代都有据可依，显著提升研发效率。\n\nalpaca_farm 通过高保真的仿真环境，让资源有限的团队也能以极低的成本高效开展前沿的 RLHF 对齐研究。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_alpaca_farm_8aa7ce22.png","tatsu-lab","Tatsu's shared repositories","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Ftatsu-lab_d160c91d.png","Tatsu's shared repos",null,"https:\u002F\u002Fgithub.com\u002Ftatsu-lab",[79,83],{"name":80,"color":81,"percentage":82},"Python","#3572A5",100,{"name":84,"color":85,"percentage":86},"JavaScript","#f1e05a",0,843,63,"2026-04-05T00:13:24","Apache-2.0","未说明","需要 NVIDIA GPU。测试环境为 80GB A100 GPU：SFT 和奖励建模需 4 张，PPO 训练需至少 8 张，Best-of-n 解码需 1 张。需安装 flash-attn 和 apex 以启用优化。",{"notes":94,"python":95,"dependencies":96},"1. 代码仅在配备 80GB A100 GPU 的机器上经过测试，具体显存需求取决于任务（PPO 要求最高）。2. 运行前需将 LLaMA 检查点和分词器转换为 HuggingFace 格式。3. 必须设置 OPENAI_API_KEY 环境变量以使用自动标注和评估功能（依赖 GPT-4\u002FChatGPT）。4. 数据集和模型权重仅限非商业研究用途 (CC BY NC 4.0)。5. 
原文中 text-davinci-003 已弃用，现默认使用 GPT-4 作为标注器。","3.10+",[97,98,99,100,101],"alpaca-farm","flash-attn","apex","datasets","openai",[35,14],[104,105,106,107,108],"deep-learning","instruction-following","large-language-models","reinforcement-learning-from-human-feedback","natural-language-processing","2026-03-27T02:49:30.150509","2026-04-10T22:37:09.941826",[112,117,122,127,132,136],{"id":113,"question_zh":114,"answer_zh":115,"source_url":116},28390,"使用 decapoda-research\u002Fllama-7b-hf 检查点加载模型时，生成的文本乱码或无法正常工作怎么办？","这通常是由于 LLaMA 检查点转换版本不匹配导致的。请尝试以下步骤：\n1. 使用 `transformers==4.29.2` 重新转换 LLaMA 检查点（不要使用 4.34.0 或其他版本）。\n2. 运行权重恢复命令重新构建模型权重：\n```bash\npython -m pretrained_models.recover_model_weights \\\n  --llama-7b-hf-dir \u003C你的 llama 模型路径> \\\n  --alpaca-farm-model-name \u003C模型名称，如 sft10k 或 ppo-sim> \\\n  --models-save-dir \u003C保存路径>\n```\n如果问题仍然存在，可能是预下载的 wdiff 文件有问题，建议重新生成。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fissues\u002F41",{"id":118,"question_zh":119,"answer_zh":120,"source_url":121},28391,"在双卡或多卡环境下使用 Flash-LLaMA 进行生成时，遇到 \"probability tensor contains either inf, nan or element \u003C 0\" 错误如何解决？","该项目官方主要支持并使用 FSDP (Fully Sharded Data Parallel)，对于原生的 Huggingface Model Parallelism 支持有限且容易出现此类数值错误。\n建议解决方案：\n1. 优先改用 FSDP 进行多卡训练或推理，这是项目维护者推荐的方式。\n2. 
如果必须使用模型并行，请确保代码中 `position_ids` 已正确移动到对应设备上（即在相关代码行后添加 `.to(tensor.device)`），但这可能无法完全解决所有兼容性问题。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fissues\u002F48",{"id":123,"question_zh":124,"answer_zh":125,"source_url":126},28392,"运行标注脚本时出现 \"BaseAnnotator.__init__() got an unexpected keyword argument 'other_keys_to_keep'\" 报错怎么办？","这是由于 `alpaca-eval` 包版本更新导致参数不兼容引起的（特别是 davinci-003 被弃用后）。\n临时解决方法：\n找到报错的文件（通常在 `alpaca_farm\u002Fauto_annotations\u002Feval.py` 或相关调用处），注释掉传递 `other_keys_to_keep` 参数的代码行即可正常运行。\n长期解决：等待项目更新以适配最新的 AlpacaEval 版本。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fissues\u002F85",{"id":128,"question_zh":129,"answer_zh":130,"source_url":131},28393,"如何使用 AlpacaFarm 配合 Llama-2-70b 或其他新版 Llama 模型？","原始的 `recover_model_weights` 脚本是专门为 LLaMA-1 (7B) 架构设计的，直接传入 Llama-2-70b 的路径会导致配置加载错误（如 `CONFIG_MAPPING` KeyError）。\n由于 Llama-2 的架构细节和分词器可能与原版不同，且 70B 模型体量巨大，不能直接复用针对 7B 模型的权重恢复脚本。用户需要自行修改 `pretrained_models\u002Frecover_model_weights.py` 中的模型加载逻辑以适配新的配置文件结构，或者直接使用 HuggingFace 上的完整微调模型而非差分权重。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fissues\u002F69",{"id":133,"question_zh":134,"answer_zh":135,"source_url":116},28394,"在恢复模型权重时遇到 \"integrity_check\" 错误是什么原因？","该错误通常表明基础检查点（base checkpoint）与预期的格式不一致。最常见的原因是基础模型是使用不兼容的 `transformers` 版本（如 4.34.0）转换的。\n解决方法：请确保使用 `transformers==4.29.2` 重新转换原始的 Meta LLaMA 检查点，然后再次运行权重恢复命令。",{"id":137,"question_zh":138,"answer_zh":139,"source_url":121},28395,"AlpacaFarm 推荐使用的多 GPU 并行策略是什么？","项目维护者明确表示，整个项目开发和测试主要使用的是 FSDP (Fully Sharded Data Parallel)。对于原生的模型并行（Model Parallelism），团队没有太多经验且不保证稳定性。如果遇到多卡相关问题，强烈建议参考项目中 FSDP 的配置和使用方式进行调整。",[141,146,151,156,161,166,171,176,181,185,189,193,197],{"id":142,"version":143,"summary_zh":144,"released_at":145},189322,"v0.2.0","## 变更内容\n* 【破坏性变更】弃用 text-davinci-003：将自动标注器更换为 AlpacaEval 1，由 @YannDubs 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F88 中提交\n\n\n**完整变更日志**: 
https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.12...v0.2.0","2024-02-24T08:58:26",{"id":147,"version":148,"summary_zh":149,"released_at":150},189323,"v0.1.12","## 变更内容\n* 修复 README 中 DPO 训练脚本的 bug。由 @stceum 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F83 中完成。\n* 杂项：基于 husky 添加 commitlint 配置。由 @lxuechen 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F84 中完成。\n\n## 新贡献者\n* @stceum 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F83 中完成了首次贡献。\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.11...v0.1.12","2023-12-05T17:40:49",{"id":152,"version":153,"summary_zh":154,"released_at":155},189324,"v0.1.11","## 变更内容\n* 修复：低级错误，由 @lxuechen 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F82 中提交\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.10...v0.1.11","2023-12-04T22:45:34",{"id":157,"version":158,"summary_zh":159,"released_at":160},189325,"v0.1.10","## 变更内容\n* @YannDubs 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F74 中修正了一个小拼写错误\n* @lxuechen 实现了 DPO 功能，相关更改见 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F81\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.9...v0.1.10","2023-12-02T22:42:01",{"id":162,"version":163,"summary_zh":164,"released_at":165},189326,"v0.1.9","## 变更内容\n* [BUG] 修复了带输入\u002F无输入时的填充问题，由 @YannDubs 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F62 中提交\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.8...v0.1.9","2023-08-04T10:37:35",{"id":167,"version":168,"summary_zh":169,"released_at":170},189327,"v0.1.8","## 变更内容\n* 修复梯度为零的 bug。由 @lxuechen 在 
https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F58 中解决 #57。\n* 修复 recover_model_weights.py 中的日志 bug，由 @rtaori 在 https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fpull\u002F59 中完成。\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Falpaca_farm\u002Fcompare\u002Fv0.1.6...v0.1.8","2023-07-09T01:33:03",{"id":172,"version":173,"summary_zh":174,"released_at":175},189328,"v0.1.6","在 AlpacaFarm 中使用 AlpacaEval 中的标注逻辑进行成对偏好模拟。","2023-06-23T08:09:07",{"id":177,"version":178,"summary_zh":179,"released_at":180},189329,"v0.1.5","修复因不当导入导致的循环导入问题","2023-06-15T04:34:10",{"id":182,"version":183,"summary_zh":76,"released_at":184},189330,"v0.1.4","2023-06-12T23:54:15",{"id":186,"version":187,"summary_zh":76,"released_at":188},189331,"v0.1.3","2023-06-11T21:49:56",{"id":190,"version":191,"summary_zh":76,"released_at":192},189332,"v0.1.2","2023-06-11T20:32:25",{"id":194,"version":195,"summary_zh":76,"released_at":196},189333,"v0.1.1","2023-06-11T20:28:25",{"id":198,"version":199,"summary_zh":200,"released_at":201},189334,"v0.1.0","首次公开发布。","2023-05-23T16:06:19"]