[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-predibase--llm_distillation_playbook":3,"tool-predibase--llm_distillation_playbook":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",143909,2,"2026-04-07T11:33:18",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 
助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":72,"owner_website":78,"owner_url":79,"languages":80,"stars":89,"forks":90,"last_commit_at":91,"license":76,"difficulty_score":32,"env_os":92,"env_gpu":92,"env_ram":92,"env_deps":93,"category_tags":96,"github_topics":76,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":97,"updated_at":98,"faqs":99,"releases":100},5235,"predibase\u002Fllm_distillation_playbook","llm_distillation_playbook","Best practices for distilling large language models.","llm_distillation_playbook 是一份专注于大语言模型（LLM）知识蒸馏的实战指南，旨在帮助开发者将庞大、昂贵的教师模型高效转化为轻量、快速且低成本的学生模型。当前，虽然大模型能力强大，但其高昂的算力成本和推理延迟阻碍了大规模生产应用；同时，社区中关于如何成功实施蒸馏的建议往往零散且充满试错成本。这份手册系统性地解决了这一痛点，将来自 Google 和 Predibase 等团队的工业界经验与学术研究成果相结合，提供了一套可落地的最佳实践。\n\n它特别适合具备深度学习基础、致力于将 LLM 
应用于生产环境的工程师和机器学习从业者。内容涵盖了从理解小模型局限性、构建日志基础设施、定义评估标准，到优化师生模型质量、数据多样性平衡及生产部署监控等全流程建议。其独特亮点在于不仅关注算法本身，更强调了“数据工程”与“生产运维”的重要性，例如利用真实日志或合成数据引导训练、逐项参数实验以及上线后的错误分析策略。通过遵循 llm_distillation_playbook，团队可以减少盲目猜测，以更科学的方法打造出既保留大模型智能又具备高效推理能力的定制化模型。","\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_774638639cb6.png\" height=250>\n\u003C\u002Fp>\n\n# LLM Distillation Playbook\n\n**Justin Zhao\u003Csup>&dagger;\u003C\u002Fsup>, Wael Abid\u003Csup>&dagger;\u003C\u002Fsup>**\n\n&dagger; Predibase, MLX team\n\n[Slides (February 1st, 2024)](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F15qrqENfi1TFR-kjK1IU5YnJ9etHaW_Q_\u002Fview?usp=drive_link)\n\n## Table of Contents\n\n- [LLM Distillation Playbook](#llm-distillation-playbook)\n  - [Table of Contents](#table-of-contents)\n  - [Who is this document for?](#who-is-this-document-for)\n  - [Why a distillation playbook?](#why-a-distillation-playbook)\n  - [Commitment to open source](#commitment-to-open-source)\n  - [Key Concepts](#key-concepts)\n  - [Best practices](#best-practices)\n    - [1. Understand the limitations of smaller models.](#1-understand-the-limitations-of-smaller-models)\n    - [2. Build good logging infrastructure.](#2-build-good-logging-infrastructure)\n      - [Bootstrap datasets with real logs.](#bootstrap-datasets-with-real-logs)\n      - [Bootstrap datasets with synthetic data.](#bootstrap-datasets-with-synthetic-data)\n    - [3. Define clear evaluation criteria.](#3-define-clear-evaluation-criteria)\n    - [4. Maximize the quality of your teacher model.](#4-maximize-the-quality-of-your-teacher-model)\n    - [5. Maximize the quality of your training data.](#5-maximize-the-quality-of-your-training-data)\n    - [6. The best datasets are diverse and balanced.](#6-the-best-datasets-are-diverse-and-balanced)\n    - [7. Start simple and small.](#7-start-simple-and-small)\n    - [8. 
Assess the marginal utility of having more data.](#8-assess-the-marginal-utility-of-having-more-data)\n    - [9. Consider how you want to serve your student.](#9-consider-how-you-want-to-serve-your-student)\n    - [10. Experiment broadly, one parameter at a time.](#10-experiment-broadly-one-parameter-at-a-time)\n    - [11. Look at the model's individual mistakes.](#11-look-at-the-models-individual-mistakes)\n    - [12. Actually deploy and monitor your models in production.](#12-actually-deploy-and-monitor-your-models-in-production)\n      - [Options for model deployment](#options-for-model-deployment)\n      - [Infrastructure safeguards](#infrastructure-safeguards)\n  - [Contributing](#contributing)\n\n## Who is this document for?\n\nThis document is for engineers and ML practitioners interested in **LLM distillation** for production applications. We assume familiarity with deep learning fundamentals and large language models (LLMs). While the advice in this guide is adaptable to other settings like academic research, our focus is on how to most effectively distill LLMs for production applications.\n\n## Why a distillation playbook?\n\nAlmost every organization we’ve worked with has built at least one novel internal application using LLMs; one larger company we spoke to had built 70 prototypes in a week. \n\nEveryone is building their prototype using large language models, however as LLMs become increasingly capable and integral to various applications, the need for more efficient, smaller counterparts has never been more pronounced.\n\nThis shift is driven by the compelling performance of LLMs, juxtaposed with the significant costs, resource demands, and slower operational speeds of large models. 
In response, distilling these models into more efficient, smaller versions presents a solution that balances capability with cost-effectiveness and speed.\n\nDespite significant interest in model distillation, we find there is still an astonishing amount of toil and guesswork involved in actually getting distilled models to work well in practice. Anecdotes and snippets of advice are spread across arxiv, huggingface, discord, substack, and social media, but the systemization and centralization of these recommendations remains to be seen.\n\nThe advice in this document draws from our experience distilling language models at Google and Predibase, combined with any SLM\u002FLLM research we could find on the topic. We are hopeful that these strategies for the efficient refinement of LLMs provide practitioners and enthusiasts with ideas that are practical, grounded in academic research, and helpful for the growing development and utilization of open source language models.\n\nThis is a living document. We anticipate making periodic improvements, both small and large. If you’d like to be notified, please watch our repository (see [instructions](https:\u002F\u002Fdocs.github.com\u002Fen\u002Faccount-and-profile\u002Fmanaging-subscriptions-and-notifications-on-github\u002Fsetting-up-notifications\u002Fconfiguring-notifications#configuring-your-watch-settings-for-an-individual-repository)).\n\n## Commitment to open source\n\nAt Predibase, we believe that the future is fine-tuned, specialized, and **open source** LLMs. Open source is in the DNA of the company. 
As a company, we maintain:\n\n- [Ludwig](https:\u002F\u002Fgithub.com\u002Fludwig-ai\u002Fludwig): Low-code framework for building custom LLMs, neural networks, and other AI models.\n- [LoRAX](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Florax): Multi-LoRA inference server that scales to 1000s of fine-tuned LLMs.\n\n\u003Cdetails>\u003Csummary>\u003Cem>[More about Predibase]\u003C\u002Fem>\u003C\u002Fsummary>\n\nPredibase is a [managed platform](https:\u002F\u002Fpredibase.com) that's built on top of open source. If you are interested in a managed solution for fine-tuning and deploying LMs, you can sign up for a free trial [here](https:\u002F\u002Fpredibase.com).\n\n\u003C\u002Fdetails>\n\n## Key Concepts\n\nBefore we delve into the best practices for distilling large language models (LLMs), let's define terminology commonly used for model distillation and its applications.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_c1f8d1c3cb64.jpeg)\n\n\u003Cp align=\"center\" >\u003Ci>Illustration of model distillation. \u003Ca href=https:\u002F\u002Fmagazine.sebastianraschka.com\u002Fp\u002Fresearch-papers-in-november-2023>Image source\u003C\u002Fa>.\u003C\u002Fi>\u003C\u002Fp>\n\n**Model distillation** enables the refinement and compression of large language models into more manageable, cost-effective versions without a significant loss in performance. \n\n**Large Language Models (LLMs)**: Advanced AI models (see [full list](https:\u002F\u002Fgithub.com\u002FHannibal046\u002FAwesome-LLM)) trained on vast amounts of text data. They seem to have a deep understanding of language, and can be trained to follow instructions or other tasks involving text.\n\n**Teacher Model**: A capable larger model that we aim to transfer to the smaller model.\n\n**Student Model**: The smaller model that the teacher model is distilled into.\n\n## Best practices\n\n### 1. 
Understand the limitations of smaller models.\n\n***Summary**: Model distillation is an empirical science and is not guaranteed to work well in all cases. The effectiveness of model distillation depends on the task and data.*\n\nThere is substantial and growing evidence that smaller models outperform zero-shot and few-shot GPT-4 when fine-tuned on golden labels ([1](https:\u002F\u002Fpredibase.com\u002Fblog\u002Fhow-to-fine-tune-llama-70b-for-structured-json-generation-with-ludwig), [2](https:\u002F\u002Fwww.anyscale.com\u002Fblog\u002Ffine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2)). However, due to their limited size, smaller models might not capture the full depth and nuance of language as effectively as their larger counterparts.\n\nIn a canonical model distillation set up where the student model is trained on the raw outputs of the teacher model (also called imitation learning), it is more often the case that the student model will, at best, match the teacher model's quality. \n\nIn [The False Promise of Imitating Proprietary LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15717), researchers found that for certain tasks, smaller student models deceptively learned to mimic their teachers' style while falling short on factual correctness.\n\n\u003Cp align=\"center\" >\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_51966bcdc3cb.png\" \u002F>\n    \u003Cp align=\"center\" >\u003Ci>Spectrum of NLP tasks. The broader the domain and higher required precision, the more difficult the problem, and the less likely distillation will \"just work\".\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\nIn truth, the effectiveness of model distillation depends largely on the specific task and data. Students are likely more disadvantaged than their larger pre-trained teachers when it comes to tasks that span broader domains or require substantial reasoning abilities. 
Conversely, for tasks that are straightforward and narrowly defined, out-of-the-box imitation learning may be entirely adequate for attaining competitive student models.\n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw toxic comment classification]\u003C\u002Fem>\u003C\u002Fsummary>\n\nTo demonstrate and contextualize the best practices of LLM distillation that we will explore in the subsequent sections of this post, we use the [Jigsaw toxic comment classification dataset](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge).\n\nThe [Jigsaw dataset](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge) was created to train models to classify offensive comments. It consists of 160K real comments from the internet and has a mix of offensive and non-offensive examples.\n\nThe original dataset contains fine-grained labels for each comment: `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, and `identity_hate`. We collapse all the columns into one column `is_bad` to obtain a binary classification dataset.[^1]\n\n[^1]: While there are established state-of-the-art (SOTA) text classification models [specifically designed for the Jigsaw dataset](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge\u002Fleaderboard), our intention here is not to surpass these benchmarks. Rather, we utilize this dataset as an illustrative tool to demonstrate and contextualize the best practices of LLM distillation.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_b6b89d9ee0d8.png)\n\u003Cp align=\"center\" >\u003Ci>Model accuracy on a balanced test set comparing zero-shot performance of GPT-* models with OSS LLMs fine-tuned using a random subset of 10K examples.\u003C\u002Fi>\u003C\u002Fp>\n\n\u003C\u002Fdetails>\n\n### 2. 
Build good logging infrastructure.\n\n***Summary**: Have basic logging infrastructure for your LLMs in production. If logs are limited due to low traffic, PII, or other constraints, synthetic data generation may be a viable option for bootstrapping a dataset for fine-tuning.*\n\nIf you haven't already implemented logging in your application, you really should. Tokens are expensive and [data is oil](https:\u002F\u002Fwww.quora.com\u002FWho-should-get-credit-for-the-quote-data-is-the-new-oil). \n\n\u003Cp align=\"center\" >\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_8399c8aab6fa.png\" \u002F>\n    \u003Cp align=\"center\" >\u003Ci>Example of basic logging infrastructure with a Model-as-a-Service (MaaS) serverless teacher model. Stream requests and responses from your MaaS endpoint to a storage solution like Amazon S3 or Snowflake.\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n#### Bootstrap datasets with real logs.\n\nCollecting logs from production traffic that's sent to your teacher models is a great, lean option for bootstrapping a dataset for fine-tuning.[^2]\n\n[^2]: Always review the terms of service and usage policies of LLM providers when logging their outputs. 
While OpenAI permits the use of their models for academic or exploratory work, it's advisable to seek clarification for specific use cases and production settings.\n\nSee a lightweight example of asynchronously logging requests and responses to S3 in a streamlit app [here](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Fllm_distillation_playbook\u002Ftree\u002Fmain\u002Fapp).\n\n#### Bootstrap datasets with synthetic data.\n\nFor applications with limited data either due to low traffic, PII, or other constraints, **synthetic data generation** may be a viable option for fine-tuning data.\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_0e176b828c68.png\">\n  \u003Cp align=\"center\" >\u003Ci>Bootstrap your dataset with synthetic data. The biggest challenge with synthetic data is to ensure that the examples produced are varied and non-repetitive.\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\nPapers like [Self-Instruct](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10560), [Alpacare](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14558.pdf) or Microsoft's [phi-1](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.11644.pdf)\u002F[phi-1.5](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05463.pdf)\u002F[phi-2](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fphi-2-the-surprising-power-of-small-language-models\u002F) show how synthetic datasets, generated through creative variations of seed queries to GPT models, can be used to fine-tune compelling smaller models.\n\n> \"We speculate that the creation of synthetic datasets will become, in the near future, an important technical skill and a central topic of research in AI.\" ~ [phi 1.5 technical report](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05463.pdf)\n\n### 3. 
Define clear evaluation criteria.\n\n***Summary:** Effective evaluation of distilled models requires clearly defined criteria that align with your specific application's needs. The choice of evaluation metrics should reflect the nature of the problem and the desired outcomes of the model.*\n\nThis is a well-known best practice for machine learning, but it's worth reiterating because it's so important.\n\n**Tailoring evaluation to the application**: Effective evaluation requires clearly defined criteria that align with your specific application's needs. For instance, LLMs for JSON generation tasks might focus on checking for schema adherence, extraction tasks might focus on accuracy or recall, and other language generation tasks might use BLEURT, ROUGE, or perplexity. The key is to select metrics that best represent the success of the model in its intended environment.\n\n**The emergence of LLMs as judges**: There's a growing trend of using LLMs themselves to assess model outputs, especially in scenarios where traditional metrics might fall short or where manual evaluation by human raters is too expensive. This approach can be compelling but requires [careful consideration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.05685) to account for potential LLM biases.\n\n**Consistency and diversity in test sets**: Establishing clear test sets is critical. These sets should be diverse enough to cover various aspects of model performance yet consistent enough to allow for reliable tracking over time. 
Avoid changing your test sets frequently, as consistency is key when comparing performance across different models and iterations.\n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw toxic comment classification]\u003C\u002Fem>\u003C\u002Fsummary>\n\nSampling a test set randomly from the [Jigsaw dataset](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge) gives us a dataset with the distribution of: 90% non-toxic, 10% toxic.\n\nThis distribution might match what we expect our hypothetical application to receive (mostly non-toxic comments), we want to be sure that any model we put in production is equally good at detecting both offensive and non-offensive comments.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_f8bcd8ad4e44.png)\n\nLet's formalize 2 different test sets:\n1. `test-indist`: An in-distribution test set with 90% non-bad examples and 10% toxic examples, drawn from the original test set.\n2. `test-balanced`: An explicitly balanced test set with 50% non-toxic and 50% toxic examples, drawn from the original test set.\n\nBy measuring models on both of these test sets simultaneously, we can track how well a candidate model classifies comments overall, as well as how well these classifications would fare in a traffic-realistic setting.\n\n\u003C\u002Fdetails>\n\n### 4. Maximize the quality of your teacher model.\n\n***Summary:** The quality of your teacher model's outputs serves as an upper limit for the performance of your distilled student model. 
Invest in maximizing the quality of your teacher model's performance as much as possible.*\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_8ed7c9ca0d14.png\">\n  \u003Cp align=\"center\" >\u003Ci>Get your teacher model as good as it can be before feeding its outputs for the student to imitate.\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n**Choose a good teacher:** The choice of the teacher model is a critical first step. Opt for a model that demonstrates the highest accuracy and understanding of your task. GPT-4 is generally great, but it's worth checking to see if there's a better foundation model out there for your use case that may be better specialized to your task.\n\n| Metric          | zephyr-7b-alpha | Mixtral-8x7B-Instruct-v0.1 | Llama-2-70b-hf | Yi-34B-200K | CodeLlama-34b-Instruct-hf | GPT-3.5 | GPT-4 | Gemini     |\n| --------------- | :-------------: | :------------------------: | -------------- | ----------- | ------------------------- | ------- | ----- | ---------- |\n| Overall average |      59.5       |            72.6            | 67.9           | 70.8        | 57.3                      | 70.9    | 88.3  | 90.7       |\n| ARC             |      61.0       |            70.2            | 67.3           | 65.4        | 54.3                      | 82.9    | 94.9  | unreported |\n| HellaSwag       |      84.0       |            87.6            | 87.3           | 85.6        | 76.9                      | 79.4    | 92.4  | 87.8       |\n| MMLU            |      61.4       |            71.2            | 69.8           | 76.1        | 55.5                      | 67.4    | 83.7  | 90.0       |\n| TruthfulQA      |      57.9       |            64.6            | 44.9           | 53.6        | 44.4                      | 61.4    | 79.7  | unreported |\n| Winogrande      |      78.6       |            81.4            | 83.7           | 82.6        | 74.6           
           | 65.8    | 87.1  | unreported |\n| GSM8K           |      14.0       |            60.7            | 54.1           | 61.6        | 38.0                      | 68.2    | 92.1  | 94.4       |\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_73c599cf790b.png\">\n  \u003Cp align=\"center\" >\u003Ci>Sources: \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FHuggingFaceH4\u002Fopen_llm_leaderboard\">Open LLM Leaderboard\u003C\u002Fa>, \u003Ca href=\"https:\u002F\u002Fcrfm.stanford.edu\u002Fhelm\u002Flite\u002Flatest\u002F#\u002Fleaderboard\">HELM leaderboard\u003C\u002Fa> \u003Ca href=\"https:\u002F\u002Fthe-decoder.com\u002Fgpt-4-crushes-other-llms-according-to-new-benchmark-suite\u002F\">GPT-4 \"crushes\" other LLMs according to new benchmark suite\u003C\u002Fa>, \u003Ca href=\"https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fgemini\u002Fgemini_1_report.pdf\">Gemini: A Family of Highly Capable Multimodal Models\u003C\u002Fa>\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n**Choose a good prompt:** Iterating on your prompts and prompt parameters can significantly enhance the quality of the teacher model's outputs. Thoughtfully crafted prompts often lead to more accurate and contextually relevant responses, which in turn, provide better training material for the student model. \n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw toxic comment classification]\u003C\u002Fem>\u003C\u002Fsummary>\n\nVarying the LLMs, prompts, and temperature can a significant effect on teacher model accuracy.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_3cb6408fc1ef.png)\n\nThe best temperature for this dataset appears to be 0.0. 
This makes sense because a higher temperature boosts the creativity of the outputs of the LLM, which likely isn't useful in a binary classification setting.\n\nWe highlight accuracy on `test-balanced` with two different prompts:\n\nSimple prompt:\n```\nGiven the input text below, please classify whether it is toxic. Provide your classification in a JSON format with the key 'is_bad'. If the comment is indeed toxic, set is_bad=1, otherwise set is_bad=0.\n\nInput text: '%s'\n\nOutput:\n```\n\nChain-of-Thought prompt:\n```\nFor a well-intentioned content moderation app, we want to flag a comment if it may contain toxic, insulting, obscene, or threatening language. Given the input text, please classify its toxicity. Please format your response in the form of a python dictionary with the following keys:\n\n1. 'is_bad': If the comment is indeed toxic, use set is_bad=1, otherwise set is_bad=0.\n2. 'reason': Provide an appropriate amount of detail for for your judgment.\n\nInput text: '%s'\nOutput:\n```\n\nHere were our results:\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_772267be6392.png)\n\nThe `simple_prompt` seems to be more aligned with human labels than a more sophisticated `Chain-of-Thought` prompt.\nThe quality gap between the two prompts is smaller when using GPT-4, however it does appear that a more sophisticated prompt does not always lead to better quality. Perhaps the additional reasoning that is spurred by Chain-of-Thought prompting increases the rate of false positives.\n\n\u003C\u002Fdetails>\n\n### 5. Maximize the quality of your training data.\n\n***Summary:** If you can continue enhancing the quality of your training data, with or without involvement from teachers, you absolutely should. Consider how you might fundamentally improve the quality of your data.*\n\nMost mistakes made by converged student models can be traced back to issues with the source data. 
For student models, addressing data quality issues at the source is typically more efficient than trying to correct these issues with auxiliary systems.\n\nHere are some of the most popular techniques.\n\n| Technique                                        | Difficulty | General applicability | Manual labor | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |\n| ------------------------------------------------ | ---------- | --------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| Manually fix or curate your data.                | ★          | ★★★★★                 | ★★★★★        | Manually fix and revise bad outputs. Annotate new data. Simple but labor-intensive, this method ensures high-quality, error-free training material.                                                                                                                                                                                                                                                                                                                                                        
|\n| Filter data based on rules.                      | ★★         | ★★★★                  | ★★★          | Employ basic rules (length criteria, regex patterns) to eliminate poor-quality data. While setting up rules is straightforward, identifying the right criteria can be time-consuming.                                                                                                                                                                                                                                                                                                                      |\n| Rank your data with auxiliary systems (or LLMs). | ★★★        | ★★★                   | ★            | Use an auxiliary system, such as another model, to assess and rank data quality. For example, Microsoft's phi-1 model employs GPT-4 to score training examples, using a classifier to prioritize higher-value data, and drop the bottom X% of examples. Also see section 2.1 of [this paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.04512).                                                                                                                                                                            |\n| Enrich data with explanation traces.             | ★★★        | ★★                    | ★            | Collect reasoning data. If your task requires non-trivial reasoning, you may find similar performance gains from including explanation traces or chain-of-thought (CoT) outputs from the teacher.                                                                                                                                                                                                                                                                                                          |\n| Aggregate your teachers.                         
| ★★★★       | ★                     | ★            | For recursively-definable tasks such as summarization, use [chaining](https:\u002F\u002Fblog.langchain.dev\u002Ffine-tuning-chatgpt-surpassing-gpt-4-summarization\u002F). For tasks with exact answers, take a majority vote (see the [MedPrompt paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16452), [Self-consistency paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)). ![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_32c8015802fa.png) By training your student on the consolidated outputs of multiple teachers, you enable your student model to leapfrog any single teacher. |\n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw comment toxicity data quality experiments]\u003C\u002Fem>\u003C\u002Fsummary>\n\nTo assess the impact of data quality on model performance, we can derive 6 subsets of training data from the Jigsaw dataset, and train models for each of them.\n\n- A (1.1k rows): in-distribution, GPT labels.\n- B (2.2k rows): A + 1.1k rows in-distribution Gold labels.\n- C (2.1k rows): B filtered to remove GPT errors.\n- D (3.2k rows): B + 1k rows with Gold toxic labels.\n- E (5k rows): Larger in-distribution dataset, GPT labels.\n- F (10k rows): Largest in-distribution dataset, GPT labels.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_e35613b96c01.png)\n\u003Cp align=\"center\">\u003Ci>Model performance on a balanced test set.\u003C\u002Fi>\u003C\u002Fp>\n\nPerformance improves both when we add high-quality human-labeled examples as well as when incorrect teacher-labeled examples are removed.\n\n\u003C\u002Fdetails>\n\n### 6. The best datasets are diverse and balanced.\n\n***Summary:** Try to make your dataset diverse, non-repetitive, and balanced. 
The more scenarios and complexities your dataset covers, the more likely the distilled student will generalize in an unbiased way.*\n\nOne of the main challenges in creating a high-quality dataset is ensuring that the examples are varied and non-repetitive. The training data for your student model should cover a wide range of scenarios, and they should vary in their level of difficulty, complexity, and style.\n\n**Diversity** is important for several reasons: it exposes the language model to different cases that it needs to be able to handle, it reduces the risk of overfitting or memorizing specific patterns or solutions, and it increases the generalization and robustness of the model to unseen or novel situations.\n\n**Balance** is just as important. If certain cases are sparsely represented in the overall dataset, it may be challenging for your student model to learn these effectively.\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_c1cc106a2c9a.png\">\n  \u003Cp align=\"center\">\u003Ci>Datasets bootstrapped from real logs can also be variation or balance-deficient. For logs-based datasets, having too many examples from power users could be detrimental to overall dataset representation. Debias logs with random mutation, augment rarer examples with paraphrasing or back-translation, or manually add missing cases.\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\nIt's not essential to know or address all data distribution issues upfront, but it is useful to anticipate them. 
Trust that if you've picked good test sets, meaningful biases in student models should become apparent during evaluation, and these can often be addressed with adjustments to training data.\n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw toxic comment classification]\u003C\u002Fem>\u003C\u002Fsummary>\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_4c81273f54cc.png)\n\u003Cp align=\"center\">\u003Ci>Model performance on a balanced test set.\u003C\u002Fi>\u003C\u002Fp>\n\nPerfectly balanced is not necessary, nor necessarily better.\n\nFor example, it could be that the non-toxic examples are more difficult to detect than the toxic ones, so the model may very well benefit from having more examples of more difficult classes while having fewer examples of easier classes.\n\nUpfront, it's hard to know what the best \"balance\" is, or, for non-classification tasks, how to measure or productively change the balance of the dataset in the first place.\n\nThe higher-level idea is that if you have good test set(s), then when you do model evaluation with (unintentionally) imbalanced training data, you’ll be able to spot bias patterns that clue you into dataset distribution adjustments.\n\n\u003C\u002Fdetails>\n\n### 7. Start simple and small.\n\n***Summary**: Start with smaller, simpler model configurations that are quick to train so that you can debug issues with your setup, iterate quickly, and establish good benchmarks for comparing to more complex model configurations later.*\n\n**Embrace the power of the smallest, simplest model.** Not just a matter of efficiency; it's a strategic approach to model development. 
Smaller, simpler models are significantly quicker to train and understand, allowing for the fastest iteration and feedback.\n\n**Avoid the trap of cool, but complicated large models.**  One of the most common pitfalls in model training is starting with too large and too complex model configurations. These will be harder to understand, slow down iteration velocity, and extend experiment cycle times.\n\n**The value of naive baselines.** Always begin with naive, simple baseline models. These serve as a clear benchmark to measure the performance of subsequent more sophisticated model configurations.\n\n### 8. Assess the marginal utility of having more data.\n\n***Summary:** Meaningful fine-tuning results are often achieved with datasets ranging from a few hundred to tens of thousands of examples as a rule of thumb. To answer the question more concretely for your task, run an ablation study varying dataset size and extrapolate.*\n\n> \"How much data do I need for fine-tuning my model?\" ~ One of the most common questions that we get asked.\n\nIn earnest, it really depends, influenced by factors such as task difficulty, output variability, reasoning complexity, example length, task alignment with pre-training data, and hyperparameters. Some problems require minimal data for convergence, while others demand extensive training without converging at all.\n\nTo determine a good dataset size for your specific case, conduct an ablation experiment holding other training parameters constant, and varying dataset size (e.g., 5%, 10%, 25%, 50%, 75%, 100%).\n\nSuch experiments can reveal the marginal utility of having additional data for fine-tuning. 
If increasing data quantity doesn't yield much improvement, it's advisable to reevaluate other aspects of the training pipeline to identify potential areas for enhancement.\n\nIf you do find that the marginal utility of having more data is high, then consider data augmentation techniques like [back translation](https:\u002F\u002Fgithub.com\u002FQData\u002FTextAttack), or manually annotating more data.\n\n\u003Cdetails>\u003Csummary>\u003Cem>[Case study: Jigsaw toxic comment classification]\u003C\u002Fem>\u003C\u002Fsummary>\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_49e9183f3b67.png)\n\u003Cp align=\"center\">\u003Ci>Model performance on a balanced test set.\u003C\u002Fi>\u003C\u002Fp>\n\nWhile there's a big jump in performance from 1.1K examples to 5K examples, the jump in quality from 5K to 10K is rather marginal. Based on these scores, we can roughly extrapolate that there is diminishing marginal utility for adding more training data for this model configuration beyond 10K examples.\n\n\u003C\u002Fdetails>\n\n### 9. Consider how you want to serve your student.\n\n***Summary:** While not crucial to decide upfront, have a model serving plan in mind to prioritize experiments with models that can ultimately be served.*\n\nIf you are planning to deploy multiple LLMs in production, it's beneficial to explore parameter-efficient fine-tuning (PEFT) techniques. PEFT, such as LoRA (Low-Rank Adaptation), involves training only a fraction of the model's weights, unlike full fine-tuning, which requires a dedicated set of GPU resources for each model. 
LoRA has been shown to [achieve performance on par with full fine-tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.09685), making it a viable option for efficient deployment.\n\n[The LoRA Exchange (LoRAX)](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Florax), for example, is a serving solution optimized for serving numerous fine-tuned models using shared GPU resources. LoRAX stands out from traditional large language model serving methods by its ability to accommodate over a hundred task-specific, fine-tuned models on a single GPU. This capability significantly reduces the cost and complexity of serving fine-tuned models. LoRAX is especially suited for parameter-efficient fine-tuned models, offering a streamlined solution for deployment.\n\nWhile full fine-tuning with larger models might yield the highest absolute quality, the trade-off in terms of increased costs or serving latency might not justify the marginal gains in performance.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_eff3e93a6e5a.png)\n\u003Cp align=\"center\" >\u003Ci>Serving adapter-based LLMs with LoRAX.\u003C\u002Fi>\u003C\u002Fp>\n\nConsider the target serving architecture early in the model development process. The type of model you choose may greatly influence how it will be served and should inform how to prioritize experiments.\n\n### 10. Experiment broadly, one parameter at a time.\n\n***Summary:** Exploration over exploitation: spend most of your time and energy to gain insight into the problem. Change one variable at a time, and try not to rathole.*\n\nTips for running lots of experiments at once:\n- Stay organized with model repositories or spreadsheets.\n- Parallelize but only change one parameter at a time.\n- Expect some toil and guesswork.\n- Optimize for iteration speed (simple -> complex, small -> large)\n\nThe following suggestions came about as we tried to crystallize our own approach to fine-tuning LLMs. 
This is far from a comprehensive list, but here are some of our favorite ideas for exploration.\n\n| Category               | Idea                                | Impact to Quality | Impact to Speed | Complexity | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n| ---------------------- | ----------------------------------- | ----------------- | --------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| Architecture parameter | Foundation model                    | ★★★★★             | ★★              | ★          | Try out a few different foundation models and see what happens to your student models performance. Like teachers, different foundation models may be inherently closer to your task than others.                                                                                                                                                                                                                                                           
                                                                                                             |\n| Architecture parameter | Precision and quantization          | ★★★★              | ★★★★            | ★★         | Reducing precision significantly decreases the model's size, enabling it to train with larger batch sizes and thus higher throughput. While quantization can sometimes lead to a slight decrease in model accuracy due to the reduced precision, it's not always the case. In our experiments, oftentimes the trade-off is minimal compared to the gains in speed and size reduction.                                                                                                                                                                                   |\n| Architecture parameter | Adapter parameters (rank and alpha) | ★★★★              | ★★★             | ★★         | Rank in LoRA determines the size of the low-rank matrices that are used to approximate the full-rank weight matrices in the model. A higher rank can increase the model's capacity to learn complex patterns but at the cost of more parameters to train. Conversely, a lower rank is more parameter-efficient but limits the model's expressiveness.                                                                                                                                                                                                                   |\n| Architecture parameter | Base model size                     | ★★★               | ★★★★★           | ★          | Experiment with different sizes to get a sense of the trade-off between model performance and model size. Some tasks may benefit significantly from larger models due to the task's complexity. However, larger models are more likely to overfit to training data, especially if the dataset is not large or varied enough, or the gains in quality may be marginal. 
Increasingly, it's                                                                                                                                                                                |\n| Architecture parameter | Prompt                              | ★★                | ★★              | ★          | Prompts have an outsized impact with teacher models, but in supervised fine-tuning (SFT) and the weights of the model are updated directly, wordsmithing the prompt is not as directly impactful to quality.                                                                                                                                                                                                                                                                                                                                                            |\n| Training parameter     | Epochs                              | ★★★★★             | ★★★★★           | ★          | Simply training a model for longer (more epochs) will often result in a better model.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| Training parameter     | Learning rate (LR) and LR schedule  | ★★★★★             | ★               | ★          | An optimal learning rate ensures that the model learns efficiently without missing or overshooting the optimal weights. A proper warm-up can improve model training stability and performance while decay helps maintain the balance between learning complex patterns and avoiding overfitting to the training data.                                    
                                                                                                                                                                                                               |\n| Training parameter     | Max sequence length                 | ★★★               | ★★★             | ★          | For long-tailed data, consider truncating your data to maximize GPU utilization.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |\n| Training parameter     | Batch size                          | ★★★               | ★★★★★           | ★          | Max out your GPU. Choose the highest batch size that doesn't OOM.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |\n| Training strategy      | Curriculum learning                 | ★★★★              | ★★★             | ★★★★★      | Progressive learning, also known as curriculum learning, is a training strategy where the model is fine-tuned in a series of stages, each with a different kind of training data, typically progressing from more general or noisier data to more specific, high-quality, or in-domain data. 
Progressive learning mirrors the natural way humans learn: starting from broad concepts and gradually focusing on more specific and complex ideas. Example of progressive learning from [orca-2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.11045): ![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_9ccd7e5f1d6e.png) |\n| Training strategy      | RLHF\u002FRLAIF\u002FDPO                      | ★★★★              | ★★★★★           | ★★★★★      | RLHF\u002FRLHAIF\u002FDPO, also called \"preference tuning\" where the model undergoes reinforcement learning to align better to human preferences. This was originally popularized by OpenAI, however it's extremely costly, and seems like a last mile optimization. We have yet to speak with a company who has a critical need for this level of optimization. High-level diagram of [RLHF vs. RLAIF](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00267): ![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_6161965aaab2.png)                                                                                                                |\n\n### 11. Look at the model's individual mistakes.\n\n***Summary:** While aggregate metrics and advanced automated evaluation methods provide a broad overview of model performance, manually reviewing examples of your model's outputs brings unparalleled value for a deeper qualitative understanding of model performance.*\n\nEspecially in generative contexts where model performance can't be neatly summarized with a clear-cut metric, taking the time to delve into specific examples of where and how your model makes mistakes is not just a step in the evaluation process; it's a critical component of the model development journey.\n\n**Identify Specific Errors:** Only by examining individual examples where the model errs, you can start to categorize and understand the nature of these mistakes. 
Is the model consistently struggling with certain types of inputs? Are there specific patterns or contexts where errors are more frequent or likely?\n\n**Uncover Data Issues:** Often, patterns in mistakes can be traced back to issues in data preparation or inadequate representation in the training set. Identifying these issues early can save significant resources and time that might otherwise be spent on futile parameter optimization. There's nothing more frustrating or wasteful than to spend hundreds of GPU hours optimizing modeling parameters when you uncover an issue with data quality.\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_ec1642492938.png)\n\n\u003Cp align=\"center\" >\u003Ci>Loss curves for fine-tuned LLMs will all look like this, yet the qualitative differences between these checkpoints can be substantial.\u003C\u002Fi>\u003C\u002Fp>\n\n### 12. Actually deploy and monitor your models in production.\n\n***Summary:** While test sets provide a controlled environment for evaluation, the true test of your model’s effectiveness is how it performs with actual users and real-time inputs. Deploy your model and observe its performance in a real-world setting!*\n\nDeploy and monitor your models in production… actually. Whether you are a researcher, an engineer, or somewhere in between, there's a lot to learn from going through the due diligence of productionizing your model for real.\n\n#### Options for model deployment\n\n- **Live Experiment and Gradual Rollout:** Begin by directing a small percentage of traffic (e.g., 1%, then 10%) to the student model. Closely monitor changes in key application metrics like latency and user interactions before scaling up. Other names: incremental\u002Fcanary rollout.\n- **Dark Launch:** Continue using the teacher model in production but route a portion of traffic to the student model in the background. 
Compare instances where the student model’s predictions differ from the teacher’s to evaluate the student's quality readiness. Other names: shadow deployment.\n- **Hybrid Launch:** If the teacher model outperforms the student model, consider a hybrid deployment. The student model can handle simpler, less resource-intensive queries, while the teacher model addresses more complex requests. This approach balances efficiency with quality. Other names: blue-green deployment.\n\n#### Infrastructure safeguards\n\n- **Monitor Inputs:** Fine-tuned models, being more specialized, can be sensitive to feature drift. \n- **Monitor Outputs:** Establish failsafe mechanisms to scrutinize generated outputs. LLMs in production are often accompanied by rules-based or model-based systems to identify issues and trigger fallbacks. Be aware that using another LLM for output monitoring can add latency.\n- **Maintain Logs:** Continue logging the inputs and outputs for any production LLMs. Logs will be invaluable for future model refinements or re-distillations.\n\n## Contributing\n\nWe'd love to hear your feedback!\n\n- If you like the playbook, please [leave a star](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fget-started\u002Fexploring-projects-on-github\u002Fsaving-repositories-with-stars#starring-a-repository)! You can also reach us by pinging the [Ludwig slack](https:\u002F\u002Fludwig-ai.slack.com\u002Fjoin\u002Fshared_invite\u002Fzt-mrxo87w6-DlX5~73T2B4v_g6jj0pJcQ) or the [LoRAX Discord](https:\u002F\u002Fdiscord.gg\u002FCBgdrGnZjy), or finding us on LinkedIn. Testimonials help us justify creating more resources like this.\n\n- If anything seems incorrect, please file an issue to start a discussion. For questions or other messages where an issue isn't appropriate, please open a new discussion topic on GitHub.\n\n- This is a living document. We anticipate making periodic improvements, both small and large. 
If you’d like to be notified, please watch our repository (see [instructions](https:\u002F\u002Fdocs.github.com\u002Fen\u002Faccount-and-profile\u002Fmanaging-subscriptions-and-notifications-on-github\u002Fsetting-up-notifications\u002Fconfiguring-notifications#configuring-your-watch-settings-for-an-individual-repository)).\n\n- Are there other best practices missing from this list? Feel free to create a PR! We promise to review your suggestions with expediency.\n","\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_774638639cb6.png\" height=250>\n\u003C\u002Fp>\n\n# LLM蒸馏操作手册\n\n**Justin Zhao\u003Csup>&dagger;\u003C\u002Fsup>, Wael Abid\u003Csup>&dagger;\u003C\u002Fsup>**\n\n&dagger; Predibase, MLX团队\n\n[幻灯片（2024年2月1日）](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F15qrqENfi1TFR-kjK1IU5YnJ9etHaW_Q_\u002Fview?usp=drive_link)\n\n## 目录\n\n- [LLM蒸馏操作手册](#llm-distillation-playbook)\n  - [目录](#table-of-contents)\n  - [本文档面向哪些读者？](#who-is-this-document-for)\n  - [为何需要蒸馏操作手册？](#why-a-distillation-playbook)\n  - [对开源的承诺](#commitment-to-open-source)\n  - [关键概念](#key-concepts)\n  - [最佳实践](#best-practices)\n    - [1. 了解小型模型的局限性。](#1-understand-the-limitations-of-smaller-models)\n    - [2. 构建良好的日志记录基础设施。](#2-build-good-logging-infrastructure)\n      - [使用真实日志数据构建初始数据集。](#bootstrap-datasets-with-real-logs)\n      - [使用合成数据构建初始数据集。](#bootstrap-datasets-with-synthetic-data)\n    - [3. 明确评估标准。](#3-define-clear-evaluation-criteria)\n    - [4. 尽可能提升教师模型的质量。](#4-maximize-the-quality-of-your-teacher-model)\n    - [5. 尽可能提升训练数据的质量。](#5-maximize-the-quality-of-your-training-data)\n    - [6. 最佳的数据集应兼具多样性和平衡性。](#6-the-best-datasets-are-diverse-and-balanced)\n    - [7. 从简单、小规模开始。](#7-start-simple-and-small)\n    - [8. 评估增加更多数据的边际效用。](#8-assess-the-marginal-utility-of-having-more-data)\n    - [9. 考虑如何部署学生模型。](#9-consider-how-you-want-to-serve-your-student)\n    - [10. 
广泛实验，一次调整一个超参数。](#10-experiment-broadly-one-parameter-at-a-time)\n    - [11. 分析模型的具体错误。](#11-look-at-the-models-individual-mistakes)\n    - [12. 真正将模型部署到生产环境中并进行监控。](#12-actually-deploy-and-monitor-your-models-in-production)\n      - [模型部署选项](#options-for-model-deployment)\n      - [基础设施保障措施](#infrastructure-safeguards)\n  - [贡献](#contributing)\n\n## 本文档面向哪些读者？\n\n本文档面向对**LLM蒸馏**及其在生产应用中感兴趣的技术工程师和机器学习从业者。我们假定读者已掌握深度学习基础以及大型语言模型（LLMs）的相关知识。尽管本指南中的建议也可适用于学术研究等其他场景，但我们的重点在于如何更高效地将LLMs蒸馏为适合生产环境的轻量级模型。\n\n## 为何需要蒸馏操作手册？\n\n我们合作过的几乎所有组织都至少开发过一款基于LLMs的内部应用；其中一家大型企业甚至在一周内就完成了70个原型开发。\n\n如今，几乎每个人都在利用大型语言模型构建自己的原型系统。然而，随着LLMs的能力不断增强，并逐渐成为各类应用的核心组件，对更高效、更小型替代方案的需求也愈发迫切。\n\n这种需求的产生源于LLMs卓越的性能表现，同时也与其高昂的成本、巨大的资源消耗以及较慢的运行速度形成了鲜明对比。因此，通过蒸馏技术将这些大型模型压缩为更高效、更轻量的版本，能够在保证性能的同时显著降低计算成本并提升运行效率。\n\n尽管模型蒸馏备受关注，但我们发现，在实际操作中，要让蒸馏后的模型真正发挥良好效果，仍然需要大量的试错与经验积累。关于蒸馏的零散经验和建议散布于arXiv、Hugging Face、Discord、Substack以及社交媒体等多个平台，但目前尚未形成系统化、集中化的指导文档。\n\n本文档中的建议源自我们在Google和Predibase公司进行语言模型蒸馏的经验，同时结合了我们所能找到的所有关于SLM\u002FLLM蒸馏的研究成果。我们希望这些高效优化LLMs的方法能够为从业者和爱好者提供切实可行、基于学术研究且具有实用价值的思路，从而助力开源语言模型的持续发展与广泛应用。\n\n本文档是一个持续更新的活文档。我们计划定期对其进行小幅或大幅改进。如果您希望及时获取更新通知，请关注我们的仓库（详见[说明](https:\u002F\u002Fdocs.github.com\u002Fen\u002Faccount-and-profile\u002Fmanaging-subscriptions-and-notifications-on-github\u002Fsetting-up-notifications\u002Fconfiguring-notifications#configuring-your-watch-settings-for-an-individual-repository)）。\n\n## 对开源的承诺\n\n在Predibase，我们坚信未来的方向是经过微调、高度专业化且**开源**的LLMs。开源理念深深植根于公司的基因之中。作为一家公司，我们维护着以下项目：\n\n- [Ludwig](https:\u002F\u002Fgithub.com\u002Fludwig-ai\u002Fludwig)：用于构建自定义LLMs、神经网络及其他AI模型的低代码框架。\n- [LoRAX](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Florax)：可扩展至数千个微调后LLMs的多LoRA推理服务器。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[关于Predibase的更多信息]\u003C\u002Fem>\u003C\u002Fsummary>\n\nPredibase是一个基于开源技术构建的[托管平台](https:\u002F\u002Fpredibase.com)。如果您对微调和部署LLMs的托管解决方案感兴趣，可以在此处注册免费试用[这里](https:\u002F\u002Fpredibase.com)。\n\n\u003C\u002Fdetails>\n\n## 
关键概念\n\n在深入探讨大型语言模型（LLMs）蒸馏的最佳实践之前，让我们先明确一些用于模型蒸馏及其相关应用的常用术语。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_c1f8d1c3cb64.jpeg)\n\n\u003Cp align=\"center\" >\u003Ci>模型蒸馏示意图。\u003Ca href=https:\u002F\u002Fmagazine.sebastianraschka.com\u002Fp\u002Fresearch-papers-in-november-2023>图片来源\u003C\u002Fa>。\u003C\u002Fi>\u003C\u002Fp>\n\n**模型蒸馏**是指在不显著降低性能的前提下，将大型语言模型精简并压缩为更易于管理、更具成本效益的版本的技术。\n\n**大型语言模型（LLMs）**：经过海量文本数据训练的先进AI模型（参见[完整列表](https:\u002F\u002Fgithub.com\u002FHannibal046\u002FAwesome-LLM)）。它们对语言具有深刻的理解能力，并可通过训练完成指令遵循或其他文本相关任务。\n\n**教师模型**：功能强大、规模较大的原始模型，我们将从中提取知识并迁移到较小的学生模型中。\n\n**学生模型**：被蒸馏后的较小模型，其性能接近但低于教师模型。\n\n## 最佳实践\n\n### 1. 理解小型模型的局限性。\n\n***摘要**：模型蒸馏是一门经验科学，并非在所有情况下都能取得良好效果。模型蒸馏的有效性取决于具体任务和数据。*\n\n有大量且不断增加的证据表明，经过金标准标签微调后，小型模型的表现优于零样本和少样本的GPT-4（[1](https:\u002F\u002Fpredibase.com\u002Fblog\u002Fhow-to-fine-tune-llama-70b-for-structured-json-generation-with-ludwig)、[2](https:\u002F\u002Fwww.anyscale.com\u002Fblog\u002Ffine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2)）。然而，由于其规模有限，小型模型可能无法像大型模型那样有效地捕捉语言的全部深度和细微差别。\n\n在经典的模型蒸馏设置中，学生模型基于教师模型的原始输出进行训练（也称为模仿学习），通常情况下，学生模型最多只能达到与教师模型相当的水平。\n\n在论文《模仿专有大模型的虚假承诺》（[The False Promise of Imitating Proprietary LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15717)）中，研究人员发现，对于某些任务，小型学生模型会欺骗性地学会模仿教师模型的风格，但在事实准确性上却表现不佳。\n\n\u003Cp align=\"center\" >\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_51966bcdc3cb.png\" \u002F>\n    \u003Cp align=\"center\" >\u003Ci>NLP任务的谱系。领域越广泛、精度要求越高，问题就越复杂，蒸馏方法“直接奏效”的可能性也就越小。\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n事实上，模型蒸馏的效果在很大程度上取决于具体的任务和数据。当任务涉及更广泛的领域或需要较强的推理能力时，学生模型往往不如其大型预训练教师模型。相反，对于那些简单且定义明确的任务，开箱即用的模仿学习就足以训练出具有竞争力的学生模型。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw有毒评论分类]\u003C\u002Fem>\u003C\u002Fsummary>\n\n为了展示并 contextualize 我们将在本文后续章节中探讨的大模型蒸馏最佳实践，我们使用了 
[Jigsaw有毒评论分类数据集](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge)。\n\n[Jigsaw数据集](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge)旨在训练模型对攻击性评论进行分类。它包含来自互联网的16万条真实评论，既有攻击性的也有非攻击性的例子。\n\n原始数据集为每条评论提供了细粒度的标签：`toxic`、`severe_toxic`、`obscene`、`threat`、`insult` 和 `identity_hate`。我们将所有这些列合并为一个名为 `is_bad` 的列，从而得到一个二分类数据集。[^1]\n\n[^1]: 尽管已经存在针对 Jigsaw 数据集专门设计的先进文本分类模型（[SOTA 模型](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge\u002Fleaderboard)），但我们在此的目的并不是超越这些基准。相反，我们利用该数据集作为示例工具，以展示和 contextualize 大模型蒸馏的最佳实践。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_b6b89d9ee0d8.png)\n\u003Cp align=\"center\" >\u003Ci>在平衡测试集上，比较 GPT-* 模型的零样本性能与使用随机抽取的1万条样本微调后的开源大模型的准确率。\u003C\u002Fi>\u003C\u002Fp>\n\n\u003C\u002Fdetails>\n\n### 2. 构建良好的日志记录基础设施。\n\n***摘要**：在生产环境中为您的大模型配备基本的日志记录基础设施。如果由于流量低、PII 或其他限制导致日志数据不足，合成数据生成可能是用于微调的数据集构建的一种可行方案。*\n\n如果您尚未在应用中实现日志记录，那么现在确实应该开始实施了。Token 成本高昂，而“数据就是石油”（[data is oil](https:\u002F\u002Fwww.quora.com\u002FWho-should-get-credit-for-the-quote-data-is-the-new-oil)）。\n\n\u003Cp align=\"center\" >\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_8399c8aab6fa.png\" \u002F>\n    \u003Cp align=\"center\" >\u003Ci>基于模型即服务（MaaS）无服务器教师模型的基本日志记录基础设施示例。将来自 MaaS 端点的请求和响应流式传输到 Amazon S3 或 Snowflake 等存储解决方案中。\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n#### 使用真实日志构建数据集。\n\n从发送至教师模型的生产流量中收集日志，是为微调构建数据集的一个极佳且轻量级的选择。[^2]\n\n[^2]: 在记录大模型的输出时，请务必仔细阅读大模型提供商的服务条款和使用政策。虽然 OpenAI 允许将其模型用于学术或探索性工作，但对于特定的使用场景和生产环境，建议进一步确认。\n\n您可以在 [这里](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Fllm_distillation_playbook\u002Ftree\u002Fmain\u002Fapp) 查看一个在 Streamlit 应用中异步将请求和响应记录到 S3 的轻量级示例。\n\n#### 使用合成数据构建数据集。\n\n对于因流量低、PII 或其他限制而导致数据不足的应用程序，**合成数据生成**可能是用于微调数据集的一种可行方案。\n\n\u003Cp align=\"center\">\n  \u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_0e176b828c68.png\">\n  \u003Cp align=\"center\" >\u003Ci>使用合成数据构建您的数据集。合成数据的最大挑战在于确保生成的示例多样化且不重复。\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n诸如 [Self-Instruct](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10560)、[Alpacare](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14558.pdf) 以及微软的 [phi-1](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.11644.pdf)\u002F[phi-1.5](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05463.pdf)\u002F[phi-2](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fphi-2-the-surprising-power-of-small-language-models\u002F) 等论文都展示了如何通过向 GPT 模型输入种子查询并进行创造性变体，生成合成数据集，进而用于微调出令人信服的小型模型。\n\n> “我们推测，在不久的将来，合成数据的生成将成为一项重要的技术技能，也是人工智能研究的核心议题。” ~ [phi 1.5 技术报告](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05463.pdf)\n\n### 3. 定义清晰的评估标准。\n\n***摘要：** 对蒸馏模型的有效评估，需要制定与具体应用场景需求相一致的明确标准。评估指标的选择应反映问题的本质以及对模型期望达到的效果。*\n\n这是一项广为人知的机器学习最佳实践，但由于其重要性，仍需再次强调。\n\n**根据应用量身定制评估**：有效的评估必须有清晰定义的标准，且这些标准要与特定的应用需求相契合。例如，用于生成 JSON 数据的大型语言模型可能会侧重于检查是否符合数据模式；信息抽取任务则可能关注准确率或召回率；而其他语言生成任务则可采用 BLEURT、ROUGE 或困惑度等指标。关键在于选择最能体现模型在其预期使用环境中表现成功的指标。\n\n**LLM 作为评估者的新趋势**：目前有一种日益增长的趋势，即利用大型语言模型本身来评估模型输出，尤其是在传统指标难以全面衡量，或由人工标注员进行手动评估成本过高的场景下。这种方法颇具吸引力，但同时也需要[审慎考虑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.05685)，以避免 LLM 可能存在的偏见影响评估结果。\n\n**测试集的一致性与多样性**：建立清晰的测试集至关重要。这些测试集应当足够多样化，能够覆盖模型性能的各个方面；同时也要保持一致性，以便长期可靠地跟踪模型表现。切勿频繁更换测试集，因为在比较不同模型及不同迭代版本的表现时，一致性是核心要素。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw 恶意评论分类]\u003C\u002Fem>\u003C\u002Fsummary>\n\n从 [Jigsaw 数据集](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fjigsaw-toxic-comment-classification-challenge) 中随机采样得到的测试集，其分布为：90% 非恶意评论，10% 恶意评论。\n\n这一分布或许符合我们假设的应用场景中实际接收到的数据分布（大部分为非恶意评论）。然而，我们希望确保投入生产环境的任何模型，都能同样出色地识别出恶意和非恶意评论。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_f8bcd8ad4e44.png)\n\n接下来，我们正式定义两个不同的测试集：\n1. 
`test-indist`：一个属于原分布的测试集，包含 90% 的非恶意样本和 10% 的恶意样本，直接取自原始测试集。\n2. `test-balanced`：一个显式平衡的测试集，包含 50% 非恶意样本和 50% 恶意样本，同样取自原始测试集。\n\n通过同时在这两个测试集上评估模型，我们可以追踪候选模型的整体分类效果，以及在更贴近真实流量环境下的分类表现如何。\n\n\u003C\u002Fdetails>\n\n### 4. 尽可能提升教师模型的质量。\n\n***总结：** 教师模型输出的质量是你蒸馏得到的学生模型性能的上限。请尽最大努力提升教师模型的性能。*\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_8ed7c9ca0d14.png\">\n  \u003Cp align=\"center\" >\u003Ci>在将教师模型的输出用于学生模型模仿之前，务必使教师模型达到尽可能好的水平。\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n**选择优秀的教师模型：** 教师模型的选择是至关重要的第一步。应选择在你的任务上表现出最高准确率和理解能力的模型。GPT-4通常表现优异，但你也应该评估是否存在更适合你特定应用场景的基础模型，这些模型可能在你的任务上更具针对性。\n\n| 指标          | zephyr-7b-alpha | Mixtral-8x7B-Instruct-v0.1 | Llama-2-70b-hf | Yi-34B-200K | CodeLlama-34b-Instruct-hf | GPT-3.5 | GPT-4 | Gemini     |\n| --------------- | :-------------: | :------------------------: | -------------- | ----------- | ------------------------- | ------- | ----- | ---------- |\n| 总体平均      |      59.5       |            72.6            | 67.9           | 70.8        | 57.3                      | 70.9    | 88.3  | 90.7       |\n| ARC             |      61.0       |            70.2            | 67.3           | 65.4        | 54.3                      | 82.9    | 94.9  | 未报告     |\n| HellaSwag       |      84.0       |            87.6            | 87.3           | 85.6        | 76.9                      | 79.4    | 92.4  | 87.8       |\n| MMLU            |      61.4       |            71.2            | 69.8           | 76.1        | 55.5                      | 67.4    | 83.7  | 90.0       |\n| TruthfulQA      |      57.9       |            64.6            | 44.9           | 53.6        | 44.4                      | 61.4    | 79.7  | 未报告     |\n| Winogrande      |      78.6       |            81.4            | 83.7           | 82.6        | 74.6                      | 65.8    | 87.1  | 未报告     |\n| GSM8K           |      14.0       |            60.7     
       | 54.1           | 61.6        | 38.0                      | 68.2    | 92.1  | 94.4       |\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_73c599cf790b.png\">\n  \u003Cp align=\"center\" >\u003Ci>资料来源： \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002FHuggingFaceH4\u002Fopen_llm_leaderboard\">开放LLM排行榜\u003C\u002Fa>, \u003Ca href=\"https:\u002F\u002Fcrfm.stanford.edu\u002Fhelm\u002Flite\u002Flatest\u002F#\u002Fleaderboard\">HELM排行榜\u003C\u002Fa> \u003Ca href=\"https:\u002F\u002Fthe-decoder.com\u002Fgpt-4-crushes-other-llms-according-to-new-benchmark-suite\u002F\">根据新的基准测试套件，GPT-4“碾压”其他大语言模型\u003C\u002Fa>, \u003Ca href=\"https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fgemini\u002Fgemini_1_report.pdf\">Gemini：一系列功能强大的多模态模型\u003C\u002Fa>\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n**选择优质的提示词：** 不断优化提示词及其参数，可以显著提升教师模型输出的质量。精心设计的提示词通常能生成更准确、更符合上下文的响应，从而为学生模型提供更好的训练素材。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw有毒评论分类]\u003C\u002Fem>\u003C\u002Fsummary>\n\n调整大语言模型、提示词以及温度设置，会对教师模型的准确率产生显著影响。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_3cb6408fc1ef.png)\n\n对于该数据集而言，最佳温度似乎是0.0。这是合理的，因为较高的温度会增强大语言模型输出的创造性，而这在二分类任务中往往并不适用。\n\n我们以两种不同的提示词为例，展示其在`test-balanced`数据集上的准确率：\n\n简单提示词：\n```\n给定以下输入文本，请判断其是否具有毒性。请以JSON格式给出分类结果，键名为'is_bad'。如果评论确实具有毒性，则设置is_bad=1；否则，设置is_bad=0。\n\n输入文本：%s\n\n输出：\n```\n\n思维链提示词：\n```\n对于一款旨在维护良好网络环境的内容审核应用，我们需要标记可能包含有毒、侮辱性、淫秽或威胁性语言的评论。请根据给定的输入文本，对其中的毒性进行分类。请以Python字典的形式作答，包含以下两个键：\n\n1. 'is_bad'：如果评论确实具有毒性，则设置is_bad=1；否则，设置is_bad=0。\n2. 
'reason'：请针对您的判断提供适量的详细说明。\n\n输入文本：%s\n输出：\n```\n\n以下是我们的实验结果：\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_772267be6392.png)\n\n结果显示，与更为复杂的“思维链”提示相比，“简单提示”与人工标注的匹配度更高。尽管使用GPT-4时，两种提示之间的差距有所缩小，但这表明更复杂的提示并不一定带来更好的效果。或许，“思维链”提示所引发的额外推理反而增加了误判的概率。\n\n\u003C\u002Fdetails>\n\n### 5. 最大化训练数据的质量。\n\n***总结：** 如果你能够持续提升训练数据的质量，无论是否需要教师参与，都应当这样做。请思考如何从根本上提高数据质量。*\n\n大多数收敛后的学生模型所犯的错误，都可以追溯到源数据的问题上。对于学生模型而言，在源头解决数据质量问题，通常比试图通过辅助系统来纠正这些问题更为高效。\n\n以下是一些最常用的技术。\n\n| 技术                                        | 难度 | 普适性 | 手工劳动 | 描述                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |\n| ------------------------------------------------ | ---------- | --------------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 手动修复或精选你的数据。                | ★          | ★★★★★                 | ★★★★★        | 手动修正并修改不良输出。标注新数据。这种方法简单但耗时费力，能够确保训练材料高质量且无错误。                                                                                                                                                                                                              
                                                                                                                                          |\n| 基于规则过滤数据。                      | ★★         | ★★★★                  | ★★★          | 使用基本规则（如长度限制、正则表达式模式）来剔除低质量数据。虽然设置规则相对容易，但确定合适的筛选标准可能较为耗时。                                                                                                                                                                                                                                                                                                                      |\n| 利用辅助系统（或大语言模型）对数据进行排序。 | ★★★        | ★★★                   | ★            | 使用辅助系统，例如另一个模型，来评估和排序数据质量。例如，微软的phi-1模型利用GPT-4为训练样本打分，再通过分类器优先选择高价值数据，并丢弃得分最低的X%样本。更多信息参见[这篇论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.04512)的第2.1节。                                                                                                                                                                            |\n| 通过解释轨迹丰富数据。             | ★★★        | ★★                    | ★            | 收集推理数据。如果你的任务需要复杂的推理能力，那么在训练数据中加入教师提供的解释轨迹或思维链（CoT）输出，可能会带来类似的性能提升。                                                                                                                                                                                                                                                                                                          |\n| 聚合多个教师的输出。                         | ★★★★       | ★                     | ★            | 对于可递归定义的任务，例如摘要生成，可以使用[链式方法](https:\u002F\u002Fblog.langchain.dev\u002Ffine-tuning-chatgpt-surpassing-gpt-4-summarization\u002F)。而对于有明确答案的任务，则可以采用多数投票机制（参见[MedPrompt论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16452)和[自洽性论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11171)）。![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_32c8015802fa.png) 通过让学生模型基于多位教师的综合输出进行训练，它就能超越任何单一教师的表现。 
|\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw评论毒性数据质量实验]\u003C\u002Fem>\u003C\u002Fsummary>\n\n为了评估数据质量对模型性能的影响，我们可以从Jigsaw数据集中提取6个子集，并分别为每个子集训练模型。\n\n- A（1,100行）：分布内数据，由GPT标注。\n- B（2,200行）：A加上1,100行分布内Gold标注数据。\n- C（2,100行）：B经过过滤，去除了GPT标注中的错误。\n- D（3,200行）：B加上1,000行带有Gold毒性标签的数据。\n- E（5,000行）：更大的分布内数据集，由GPT标注。\n- F（10,000行）：最大的分布内数据集，由GPT标注。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_e35613b96c01.png)\n\u003Cp align=\"center\">\u003Ci>模型在平衡测试集上的表现。\u003C\u002Fi>\u003C\u002Fp>\n\n无论是加入高质量的人工标注样本，还是移除教师标注中的错误样本，模型性能都会有所提升。\n\n\u003C\u002Fdetails>\n\n### 6. 最佳数据集应具备多样性和平衡性。\n\n***总结：** 尽量使你的数据集多样化、非重复且平衡。数据集涵盖的场景和复杂度越高，蒸馏得到的学生模型就越有可能以无偏的方式进行泛化。*\n\n创建高质量数据集的主要挑战之一，是确保示例的多样性和非重复性。学生模型的训练数据应当覆盖广泛的场景，并在难度、复杂性和风格上有所差异。\n\n**多样性**之所以重要，有以下几个原因：它能让语言模型接触到各种需要处理的情况，降低过拟合或记忆特定模式与解决方案的风险，同时提高模型对未见或新情况的泛化能力和鲁棒性。\n\n**平衡性**同样关键。如果某些情况在整个数据集中所占比例过少，学生模型可能难以有效学习这些内容。\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_c1cc106a2c9a.png\">\n  \u003Cp align=\"center\">\u003Ci>从真实日志中构建的数据集也可能存在多样性或平衡性不足的问题。对于基于日志的数据集而言，过多来自高级用户的示例可能会损害整体数据分布的代表性。可以通过随机变异来去偏，用释义或反向翻译扩充稀有样本，或者手动添加缺失的案例。\u003C\u002Fi>\u003C\u002Fp>\n\u003C\u002Fp>\n\n并非必须事先了解并解决所有数据分布问题，但提前预见到这些问题是有益的。只要选择了合适的测试集，在评估过程中，学生模型中的显著偏差通常会显现出来，而这些偏差往往可以通过调整训练数据来解决。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw有毒评论分类]\u003C\u002Fem>\u003C\u002Fsummary>\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_4c81273f54cc.png)\n\u003Cp align=\"center\">\u003Ci>模型在平衡测试集上的表现。\u003C\u002Fi>\u003C\u002Fp>\n\n完全平衡并非必要，也未必更好。\n\n例如，非有毒样本可能比有毒样本更难识别，因此模型或许能从提供更多较难类别的样本、减少较易类别样本中受益。\n\n在初期阶段，很难确定最佳的“平衡”状态，或者对于非分类任务来说，如何衡量或有效地改变数据集的平衡。\n\n更宏观的观点是，如果你拥有优质的测试集，那么即使使用（无意中）不平衡的训练数据进行模型评估，你也能够发现偏差模式，从而为调整数据分布提供线索。\n\n\u003C\u002Fdetails>\n\n### 7. 
从简单小规模开始。\n\n***总结：** 从较小、较简单的模型配置入手，这类模型训练速度快，便于调试设置中的问题、快速迭代，并为后续比较更复杂的模型配置建立良好的基准。*\n\n**拥抱最小、最简单模型的力量。** 这不仅是效率问题，更是一种战略性建模方法。小型、简单的模型训练和理解起来都快得多，能够实现最快的迭代和反馈。\n\n**避免陷入酷炫但复杂的大型模型陷阱。** 模型训练中最常见的误区之一，就是一开始就选择过大、过复杂的模型配置。这样的模型不仅难以理解，还会拖慢迭代速度，延长实验周期。\n\n**朴素基线的价值。** 始终从朴素、简单的基线模型开始。这些基线模型可以作为清晰的基准，用来衡量后续更复杂模型配置的表现。\n\n### 8. 评估增加更多数据的边际效用。\n\n***总结：** 一般来说，有意义的微调效果往往可以在几百到几万条示例的数据集上实现。要针对你的具体任务更明确地回答这个问题，可以进行一项消融实验，改变数据集大小并加以推断。*\n\n> “我需要多少数据才能微调我的模型？”——这是我们经常被问到的问题之一。\n\n实际上，这确实因任务而异，受任务难度、输出多样性、推理复杂性、示例长度、任务与预训练数据的契合度以及超参数等因素影响。有些问题只需少量数据就能收敛，而另一些问题则需要大量训练却始终无法收敛。\n\n为了确定适合你具体情况的数据集规模，可以在保持其他训练参数不变的情况下，通过改变数据集大小（如5%、10%、25%、50%、75%、100%）来进行消融实验。这类实验可以帮助你了解增加额外数据进行微调的边际效用。如果增加数据量带来的提升有限，建议重新评估训练流程的其他方面，寻找潜在的改进空间。\n\n如果你发现增加更多数据的边际效用较高，则可以考虑采用数据增强技术，比如[反向翻译](https:\u002F\u002Fgithub.com\u002FQData\u002FTextAttack)，或者手动标注更多数据。\n\n\u003Cdetails>\u003Csummary>\u003Cem>[案例研究：Jigsaw有毒评论分类]\u003C\u002Fem>\u003C\u002Fsummary>\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_49e9183f3b67.png)\n\u003Cp align=\"center\">\u003Ci>模型在平衡测试集上的表现。\u003C\u002Fi>\u003C\u002Fp>\n\n从1,100条示例到5,000条示例，性能有了显著提升；然而，从5,000条到10,000条，质量提升则相对有限。根据这些结果，我们可以大致推断出，对于该模型配置而言，当训练数据超过10,000条后，继续增加数据的边际效用将逐渐减弱。\n\n\u003C\u002Fdetails>\n\n### 9. 
思考你希望如何部署和提供学生模型的服务。\n\n***总结：** 虽然不必一开始就做出决定，但最好心中有一个明确的模型部署方案，以便优先尝试那些最终可以高效部署的模型。*\n\n如果你计划在生产环境中部署多个大语言模型，探索参数高效的微调（PEFT）技术将大有裨益。与需要为每个模型单独配备GPU资源的全量微调不同，PEFT技术（如LoRA，即低秩适应）只需训练模型中的一小部分权重。研究表明，LoRA能够[达到与全量微调相当的性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.09685)，因此是一种极具吸引力的高效部署方案。\n\n例如，[LoRA Exchange（LoRAX）](https:\u002F\u002Fgithub.com\u002Fpredibase\u002Florax)就是一个专为共享GPU资源而优化的多模型部署解决方案。与传统的大型语言模型部署方式相比，LoRAX的最大优势在于它能够在单个GPU上同时支持上百个针对特定任务微调过的模型。这一特性显著降低了微调模型部署的成本和复杂性。LoRAX尤其适合参数高效的微调模型，为模型部署提供了一套简化的解决方案。\n\n尽管使用更大规模的模型进行全量微调可能会带来最高的绝对质量，但从成本或推理延迟的角度来看，这种提升可能并不足以弥补额外的开销和代价。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_eff3e93a6e5a.png)\n\u003Cp align=\"center\" >\u003Ci>使用LoRAX部署基于适配器的大语言模型。\u003C\u002Fi>\u003C\u002Fp>\n\n在模型开发的早期阶段就应考虑目标部署架构。你选择的模型类型将极大影响其部署方式，并应指导你如何确定实验的优先级。\n\n### 10. 广泛实验，一次只改变一个参数。\n\n***总结：** 探索胜于利用：将大部分时间和精力投入到对问题的理解中。每次只调整一个变量，尽量避免陷入细节泥潭。*\n\n同时进行大量实验的技巧：\n- 使用模型仓库或电子表格保持井然有序。\n- 尽量并行化实验，但每次只改变一个参数。\n- 预期会有一些繁琐的工作和试错过程。\n- 优先优化迭代速度（从简单到复杂、从小到大）。\n\n以下建议是我们尝试总结自身大语言模型微调方法时得出的。这绝非一份全面的清单，但其中包含了一些我们最喜欢的探索思路。\n\n| 类别               | 思路                                | 对质量的影响 | 对速度的影响 | 复杂度 | 描述                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n| ---------------------- | ----------------------------------- | ----------------- | --------------- | ---------- | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 架构参数             | 基础模型                    | ★★★★★             | ★★              | ★          | 尝试几种不同的基础模型，观察对学生模型性能的影响。就像教师一样，不同的基础模型可能天生就比其他模型更贴近你的任务。                                                                                                                                                                                                                                                                                                                                                                        |\n| 架构参数             | 精度与量化          | ★★★★              | ★★★★            | ★★         | 降低精度会显著减小模型大小，使其能够使用更大的批量进行训练，从而提高吞吐量。虽然量化有时会因精度降低而导致模型准确率略有下降，但并非总是如此。在我们的实验中，这种权衡通常远小于速度提升和模型尺寸缩减带来的收益。                                                                                                                                                                                   |\n| 架构参数             | Adapter 参数（秩与 alpha） | ★★★★              | ★★★             | ★★         | LoRA 中的秩决定了用于近似模型中全秩权重矩阵的低秩矩阵的大小。较高的秩可以增强模型学习复杂模式的能力，但需要训练的参数更多；而较低的秩则更节省参数，但会限制模型的表达能力。                                                                                                                                                                                                                   |\n| 架构参数             | 基础模型规模                     | ★★★               | ★★★★★           | ★          | 
通过尝试不同规模，感受模型性能与模型大小之间的权衡。某些任务由于其复杂性，可能会从更大规模的模型中获益显著。然而，较大的模型更容易对训练数据过拟合，尤其是在数据集不够大或不够多样化的情况下；或者性能提升可能并不明显。越来越多的……                                                                                                                                                                                |\n| 架构参数             | 提示词                              | ★★                | ★★              | ★          | 在教师模型中，提示词的影响非常大；但在监督微调（SFT）中，模型的权重会被直接更新，因此精心设计提示词对质量的直接影响较小。                                                                                                                                                                                                                                                                                                                                                            |\n| 训练参数     | Epochs                              | ★★★★★             | ★★★★★           | ★          | 只要让模型多训练几个 epoch，通常就能得到更好的模型。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| 训练参数     | 学习率（LR）及学习率调度  | ★★★★★             | ★               | ★          | 适当的学习率能确保模型高效学习，既不会错过最优权重，也不会过度调整。合理的预热策略可以提升训练稳定性和性能，而学习率衰减则有助于在学习复杂模式与避免过拟合之间保持平衡。                                                                                                                                                                                                                                                   |\n| 训练参数     | 最大序列长度                 | ★★★               | ★★★             | ★          | 对于长尾数据，可以考虑截断数据以最大化 GPU 利用率。                                                         
                                                                                                                                                                                                                                                                                                                                                                                                                               |\n| 训练参数     | 批量大小                          | ★★★               | ★★★★★           | ★          | 充分利用 GPU 资源。选择一个不会导致显存溢出的最大批量。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |\n| 训练策略      | 课程式学习                 | ★★★★              | ★★★             | ★★★★★      | 渐进式学习，也称为课程式学习，是一种训练策略，即模型在多个阶段逐步微调，每个阶段使用不同类型的训练数据，通常是从更通用或噪声较多的数据开始，逐渐过渡到更具体、高质量或领域内数据。渐进式学习模拟了人类自然的学习方式：先从宽泛的概念入手，再逐步聚焦到更具体和复杂的主题。以下为 [orca-2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.11045) 中的渐进式学习示例：![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_9ccd7e5f1d6e.png) |\n| 训练策略      | RLHF\u002FRLAIF\u002FDPO                      | ★★★★              | ★★★★★           | ★★★★★      | RLHF\u002FRLHAIF\u002FDPO，也被称为“偏好调优”，是指通过强化学习使模型更好地符合人类偏好。这一方法最初由 OpenAI 推广开来，但成本极高，更像是最后一步优化。我们尚未遇到有迫切需求达到这种优化程度的企业。[RLHF 与 RLAIF](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.00267) 的高层级示意图如下：![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_6161965aaab2.png)                                                                                                                
|\n\n### 11. 查看模型的个别错误。\n\n***总结：** 尽管聚合指标和先进的自动化评估方法能够提供对模型性能的宏观概览，但手动审查模型输出的具体示例，对于深入理解模型性能的定性特征具有无可比拟的价值。*\n\n尤其是在生成式场景中，模型性能往往难以用单一明确的指标来概括，因此花时间仔细分析模型在哪些地方、以何种方式出现错误，不仅是评估流程中的一个步骤，更是模型开发过程中至关重要的环节。\n\n**识别具体错误：** 只有通过检查模型出错的个别案例，才能开始对这些错误进行分类，并理解其本质。模型是否始终在处理某些特定类型的输入时遇到困难？是否存在某些特定的模式或情境，使得错误更容易发生？\n\n**发现数据问题：** 很多时候，错误的模式可以追溯到数据准备阶段的问题，或是训练集中数据代表性不足。及早发现这些问题，可以节省大量资源和时间，避免将精力浪费在无效的参数调优上。没有什么比耗费数百小时的GPU算力去优化模型参数，却最终发现是数据质量问题更令人沮丧和浪费的了。\n\n![img](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_readme_ec1642492938.png)\n\n\u003Cp align=\"center\" >\u003Ci>微调后的LLM损失曲线看起来都差不多，但这些检查点之间的定性差异可能非常巨大。\u003C\u002Fi>\u003C\u002Fp>\n\n### 12. 真正将模型部署并监控于生产环境。\n\n***总结：** 虽然测试集提供了一个受控的评估环境，但检验模型真正有效性的标准，是它在面对真实用户和实时输入时的表现。将你的模型部署到生产环境中，观察其在实际场景中的表现吧！*\n\n把你的模型真正部署并监控于生产环境……真的。无论你是研究人员、工程师，还是介于两者之间的人，亲身体验将模型投入生产的过程，都能让你学到很多宝贵的经验。\n\n#### 模型部署选项\n\n- **在线实验与逐步上线：** 首先将一小部分流量（例如1%，然后10%）引导至学生模型。在扩大规模之前，密切监控关键应用指标的变化，如延迟和用户交互等。其他名称：渐进式\u002F金丝雀发布。\n- **暗启动：** 继续在生产环境中使用教师模型，但在后台将一部分流量路由到学生模型。比较学生模型与教师模型预测结果不同的情况，以评估学生模型的质量是否已达到上线标准。其他名称：影子部署。\n- **混合部署：** 如果教师模型的表现优于学生模型，可以考虑采用混合部署的方式。学生模型负责处理较为简单、资源消耗较低的查询，而教师模型则处理更复杂的请求。这种方式能够在效率和质量之间取得平衡。其他名称：蓝绿部署。\n\n#### 基础设施保障措施\n\n- **监控输入：** 经过微调的模型由于更加专业化，可能会对特征漂移较为敏感。\n- **监控输出：** 建立故障保护机制，对生成的输出进行严格审查。生产环境中的LLM通常会配备基于规则或基于模型的系统，用于识别问题并触发回退机制。需要注意的是，使用另一款LLM来进行输出监控可能会增加延迟。\n- **维护日志：** 对所有投入生产的LLM持续记录输入和输出。这些日志对于未来的模型优化或蒸馏工作将极为重要。\n\n## 贡献\n我们非常期待您的反馈！\n\n- 如果您喜欢这份指南，请[给它点个赞](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fget-started\u002Fexploring-projects-on-github\u002Fsaving-repositories-with-stars#starring-a-repository)！您也可以通过Ludwig Slack频道（[链接](https:\u002F\u002Fludwig-ai.slack.com\u002Fjoin\u002Fshared_invite\u002Fzt-mrxo87w6-DlX5~73T2B4v_g6jj0pJcQ)）、LoRAX Discord服务器（[链接](https:\u002F\u002Fdiscord.gg\u002FCBgdrGnZjy)），或者LinkedIn联系我们。您的评价将帮助我们证明继续创建类似资源的必要性。\n  \n- 如果您发现任何不准确之处，请提交一个问题以展开讨论。对于不适合通过问题形式提出的内容或疑问，请在GitHub上开启一个新的讨论主题。\n\n- 
本指南是一个持续更新的文档。我们预计会定期进行大小不一的改进。如果您希望及时收到更新通知，请关注我们的仓库（参见[说明](https:\u002F\u002Fdocs.github.com\u002Fen\u002Faccount-and-profile\u002Fmanaging-subscriptions-and-notifications-on-github\u002Fsetting-up-notifications\u002Fconfiguring-notifications#configuring-your-watch-settings-for-an-individual-repository))。\n\n- 这份清单中是否还遗漏了其他最佳实践？欢迎随时创建PR！我们承诺会尽快审阅您的建议。","# LLM Distillation Playbook 快速上手指南\n\n`llm_distillation_playbook` 并非一个直接可执行的软件包，而是一份由 Predibase MLX 团队编写的**最佳实践指南与代码示例集合**。它旨在帮助工程师将大型语言模型（LLM）的知识蒸馏到更小、更高效的学生模型中，适用于生产环境。\n\n本指南基于其提供的核心概念和案例研究（如 Jigsaw 有毒评论分类），整理出适合中国开发者的快速启动流程。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+) 或 macOS。\n*   **Python 版本**: Python 3.9 或更高版本。\n*   **硬件要求**:\n    *   **训练\u002F微调**: 建议配备 NVIDIA GPU (显存 >= 16GB 用于较小模型，>= 24GB 更佳) 或使用 Apple Silicon (M1\u002FM2\u002FM3 Max\u002FUltra) 配合 MLX 框架。\n    *   **推理**: 单张消费级显卡即可运行蒸馏后的学生模型。\n*   **前置依赖**:\n    *   `git`: 用于克隆仓库。\n    *   `pip` 或 `conda`: 包管理工具。\n    *   (可选) Hugging Face Account: 用于访问部分受限模型或数据集。\n\n> **国内加速建议**:\n> *   建议使用国内镜像源安装 Python 依赖（如清华源、阿里源）。\n> *   访问 Hugging Face 模型或数据集时，建议配置镜像站（如 `hf-mirror.com`）或使用代理。\n\n## 安装步骤\n\n该项目主要包含文档、幻灯片以及用于演示最佳实践的参考代码（如 Streamlit 日志记录示例）。\n\n1.  **克隆仓库**\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fpredibase\u002Fllm_distillation_playbook.git\n    cd llm_distillation_playbook\n    ```\n\n2.  **创建虚拟环境**\n    ```bash\n    python -m venv venv\n    source venv\u002Fbin\u002Factivate  # Windows 用户请使用: venv\\Scripts\\activate\n    ```\n\n3.  
**安装基础依赖**\n    虽然根目录可能没有统一的 `requirements.txt`（因为不同章节涉及不同技术栈），但您可以安装通用的数据处理和深度学习库以运行示例代码（参考 `app` 目录示例）：\n    \n    ```bash\n    # 使用国内镜像源加速安装\n    pip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple \\\n        torch torchvision torchaudio \\\n        transformers datasets accelerate peft \\\n        streamlit pandas scikit-learn \\\n        boto3 snowflake-connector-python\n    ```\n\n    *注：如果您计划使用 Apple MLX 框架进行实验，请参照 [MLX 官方文档](https:\u002F\u002Fml-explore.github.io\u002Fmlx\u002Fbuild\u002Fhtml\u002Finstall.html) 单独安装 `mlx` 和 `mlx-lm`。*\n\n## 基本使用\n\n本项目的核心价值在于遵循其 **Best Practices（最佳实践）** 流程。以下以最简单的“构建日志基础设施并准备数据”为例，展示如何开始蒸馏流程。\n\n### 1. 理解核心流程\n蒸馏不仅仅是训练，更是一个系统工程。请遵循以下简化路径：\n1.  **日志记录**: 收集生产环境中教师模型（Teacher Model）的输入输出。\n2.  **数据清洗**: 将日志转化为训练数据集。\n3.  **小规模实验**: 使用少量数据验证蒸馏可行性。\n4.  **全量训练与评估**: 扩大数据量并部署学生模型。\n\n### 2. 运行日志记录示例 (Bootstrap with Real Logs)\n项目提供了一个基于 Streamlit 的轻量级示例，展示如何异步记录请求和响应到存储系统（如 S3），这是构建蒸馏数据集的第一步。\n\n**启动示例应用：**\n```bash\ncd app\nstreamlit run app.py\n```\n*在应用中，您可以模拟发送请求给教师模型（如通过 API 调用 GPT-4 或本地 LLM），并将交互数据保存为 JSONL 格式，作为后续微调的“黄金数据集”。*\n\n### 3. 
数据准备与简单蒸馏策略\n根据指南中的 **Jigsaw 案例研究**，您可以按照以下逻辑编写训练脚本（伪代码逻辑）：\n\n**步骤 A: 准备数据**\n将收集的日志或合成数据转换为 `datasets` 库兼容的格式。\n```python\nfrom datasets import load_dataset\n\n# 示例：加载处理后的二分类数据集 (参考 Jigsaw 案例)\n# 实际使用中请替换为您从日志中清洗出的数据路径\ndataset = load_dataset(\"csv\", data_files={\"train\": \"teacher_logs_cleaned.csv\"})\n\n# 确保数据多样且平衡 (Best Practice #6)\n# dataset = balance_dataset(dataset) \n```\n\n**步骤 B: 选择教师与学生模型**\n*   **Teacher**: 高性能大模型 (如 Llama-3-70B, GPT-4)。\n*   **Student**: 待蒸馏的小模型 (如 Llama-3-8B, Phi-3, Mistral-7B)。\n\n**步骤 C: 执行蒸馏 (模仿学习)**\n使用 `transformers` 或 `Ludwig` (Predibase 开源框架) 进行训练。核心思想让学生模型学习教师模型的输出分布（Logits）或生成结果。\n\n```python\n# 这是一个概念性示例，具体训练循环需参考 HuggingFace TRLO 或 SFTTrainer\nfrom transformers import TrainingArguments, Trainer\n\ntraining_args = TrainingArguments(\n    output_dir=\".\u002Fstudent_model\",\n    per_device_train_batch_size=8,\n    num_train_epochs=3,\n    learning_rate=2e-5,\n    # 关键：使用教师模型生成的标签进行监督微调 (Imitation Learning)\n)\n\n# trainer = Trainer(\n#     model=student_model,\n#     args=training_args,\n#     train_dataset=dataset[\"train\"],\n#     # 如果做逻辑蒸馏，需自定义 loss 函数以匹配 teacher logits\n# )\n\n# trainer.train()\n```\n\n### 4. 
下一步行动\n*   **阅读完整指南**: 务必查看仓库中的 `README.md` 全文，特别是 \"Best practices\" 章节的 12 条建议（如：从小规模开始、关注边缘效用、监控生产环境等）。\n*   **查看幻灯片**: 访问提供的 [Google Slides 链接](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F15qrqENfi1TFR-kjK1IU5YnJ9etHaW_Q_\u002Fview) 获取更深入的架构图解。\n*   **探索 Ludwig**: 如果需要低代码实现蒸馏，可以参考 Predibase 维护的 [Ludwig](https:\u002F\u002Fgithub.com\u002Fludwig-ai\u002Fludwig) 框架。\n\n> **注意**: 蒸馏效果高度依赖于任务类型和数据质量。对于需要复杂推理的任务，小模型可能无法完全复现大模型的能力，请务必遵循指南第一条：**理解小模型的局限性**。","某电商初创团队希望将昂贵的云端大模型替换为低成本本地小模型，以支撑其 7x24 小时智能客服系统。\n\n### 没有 llm_distillation_playbook 时\n- 团队盲目尝试各种蒸馏参数，缺乏系统指导，导致多次实验失败且无法定位原因。\n- 训练数据仅靠少量人工标注，未利用真实日志或合成数据扩充，致使小模型在长尾问题上表现极差。\n- 缺乏清晰的评估标准，仅凭感觉判断模型好坏，上线后频繁出现答非所问的情况。\n- 忽视小模型的能力边界，强行让其处理复杂推理任务，导致响应延迟高且准确率低下。\n- 部署前未进行充分的错误案例分析，生产环境中突发大量客诉，运维压力剧增。\n\n### 使用 llm_distillation_playbook 后\n- 遵循“从简单开始”和“单变量实验”原则，快速锁定最优蒸馏配置，研发效率提升 3 倍。\n- 依据指南构建日志基础设施，通过真实对话日志与合成数据混合训练，显著覆盖边缘场景。\n- 建立了明确的量化评估体系，确保小模型在特定客服场景下的表现可衡量、可追踪。\n- 深入理解小模型局限性，重新设计任务拆解流程，使轻量级模型也能稳定处理复杂咨询。\n- 在部署前系统性分析个体错误并建立监控机制，上线首周零重大事故，客户满意度稳步上升。\n\nllm_distillation_playbook 将原本充满试错的蒸馏过程转化为标准化的工程实践，帮助团队以最低成本实现了大模型能力的平滑迁移。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpredibase_llm_distillation_playbook_b6b89d9e.png","predibase","Predibase","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fpredibase_9b5990da.png","",null,"nicolasberenganrubrik","www.predibase.com","https:\u002F\u002Fgithub.com\u002Fpredibase",[81,85],{"name":82,"color":83,"percentage":84},"Jupyter Notebook","#DA5B0B",70.3,{"name":86,"color":87,"percentage":88},"Python","#3572A5",29.7,618,55,"2026-04-05T16:22:57","未说明",{"notes":94,"python":92,"dependencies":95},"该文档主要是一份关于大语言模型（LLM）蒸馏的最佳实践指南和策略手册，而非一个可直接运行的软件工具包。文中提到的代码示例（如日志记录脚本）仅用于演示特定流程。文档内容侧重于方法论（如数据构建、评估标准、教师\u002F学生模型选择），未提供具体的运行环境配置、硬件需求或依赖库版本列表。",[92],[35,14],"2026-03-27T02:49:30.150509","2026-04-08T03:55:42.917173",[],[]]