[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-varungodbole--prompt-tuning-playbook":3,"tool-varungodbole--prompt-tuning-playbook":64},[4,17,25,39,48,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":10,"last_commit_at":23,"category_tags":24,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":26,"name":27,"github_repo":28,"description_zh":29,"stars":30,"difficulty_score":10,"last_commit_at":31,"category_tags":32,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[33,34,35,36,14,37,15,13,38],"图像","数据工具","视频","插件","其他","音频",{"id":40,"name":41,"github_repo":42,"description_zh":43,"stars":44,"difficulty_score":45,"last_commit_at":46,"category_tags":47,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[14,33,13,15,37],{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":45,"last_commit_at":54,"category_tags":55,"status":16},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 
适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",74939,"2026-04-05T23:16:38",[15,33,13,37],{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":45,"last_commit_at":62,"category_tags":63,"status":16},2181,"OpenHands","OpenHands\u002FOpenHands","OpenHands 是一个专注于 AI 驱动开发的开源平台，旨在让智能体（Agent）像人类开发者一样理解、编写和调试代码。它解决了传统编程中重复性劳动多、环境配置复杂以及人机协作效率低等痛点，通过自动化流程显著提升开发速度。\n\n无论是希望提升编码效率的软件工程师、探索智能体技术的研究人员，还是需要快速原型验证的技术团队，都能从中受益。OpenHands 提供了灵活多样的使用方式：既可以通过命令行（CLI）或本地图形界面在个人电脑上轻松上手，体验类似 Devin 的流畅交互；也能利用其强大的 Python SDK 自定义智能体逻辑，甚至在云端大规模部署上千个智能体并行工作。\n\n其核心技术亮点在于模块化的软件智能体 SDK，这不仅构成了平台的引擎，还支持高度可组合的开发模式。此外，OpenHands 在 SWE-bench 基准测试中取得了 77.6% 的优异成绩，证明了其解决真实世界软件工程问题的能力。平台还具备完善的企业级功能，支持与 Slack、Jira 等工具集成，并提供细粒度的权限管理，适合从个人开发者到大型企业的各类用户场景。",70612,"2026-04-05T11:12:22",[15,14,13,36],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":80,"owner_email":79,"owner_twitter":81,"owner_website":82,"owner_url":83,"languages":79,"stars":84,"forks":85,"last_commit_at":86,"license":87,"difficulty_score":88,"env_os":89,"env_gpu":89,"env_ram":89,"env_deps":90,"category_tags":93,"github_topics":79,"view_count":45,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":94,"updated_at":95,"faqs":96,"releases":97},863,"varungodbole\u002Fprompt-tuning-playbook","prompt-tuning-playbook","A playbook for effectively prompting post-trained LLMs","prompt-tuning-playbook 是一份专为优化后训练大语言模型（LLM）提示效果而编写的实战指南。针对当前提示词工程常依赖经验试错、缺乏系统性策略的痛点，这份文档整合了多位研究者的多年实战心得，旨在帮助用户掌握更高效的模型交互方法。\n\n内容涵盖了预训练与后训练的本质区别、提示词编写风格指南、系统指令迭代流程以及 LLM 适用场景分析等核心模块。它特别适合具备一定 LLM 使用基础，希望深入理解原理并提升效果的开发者、研究人员及 AI 爱好者。虽然主要基于 Gemini 模型的经验总结，但其提供的思维模型和最佳实践对其他模型同样具有广泛的参考价值。\n\nprompt-tuning-playbook 不追求绝对真理，而是作为当前技术认知的快照，鼓励社区共同探索更科学的提示策略。通过梳理具体的心法与流程，它将提示词的“玄学”转化为可落地的工程实践，助力用户释放大模型的真正潜力。","# LLM Prompt Tuning Playbook\n\n**Varun Godbole, Ellie Pavlick**\n\n## Table of Contents\n\n- [Who is this document for?](#who-is-this-document-for)\n- [Why a tuning playbook?](#why-a-tuning-playbook)\n- [Background: Pre-training vs. Post-training](#background-pre-training-vs-post-training)\n   - [Pre-training](#pre-training)\n      - [The \"Cinematic Universe\" Intuition of Pre-training](#the-cinematic-universe-intuition-of-pre-training)\n   - [Post-training](#post-training)\n      - [Post-training Data Collection](#post-training-data-collection)\n- [Considerations for Prompting](#considerations-for-prompting)\n- [A rudimentary \"style guide\" for prompts](#a-rudimentary-style-guide-for-prompts)\n- [Procedure for iterating on new system instructions](#procedure-for-iterating-on-new-system-instructions)\n- [Some thoughts on when LLMs are useful](#some-thoughts-on-when-llms-are-useful)\n- [More Resources](#more-resources)\n- [Acknowledgements](#acknowledgements)\n\n## Who is this document for?\n\nThis document is for anyone who would like to get better at prompting post-trained LLMs. We assume that readers have had some basic interactions with some sort of LLM (e.g. Gemini), but we do not assume a rigorous technical understanding.\n\nThe first half of the document provides mental models on the nature of post-training and prompting. 
The second half of this document provides more concrete prescriptions and a high-level procedure for tuning prompts. Given the pace of innovation with LLMs, we suspect that the second half is likely to go stale a lot faster than the first half.\n\n## Why a tuning playbook?\n\nThis playbook was inspired by the [Deep Learning Tuning Playbook](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftuning_playbook), a guide for tuning hyperparameters for deep learning workloads.\n\nThe “art” of prompting, much like the broader field of deep learning, is empirical at best and alchemical at worst. While LLMs are rapidly transforming numerous applications, effective prompting strategies remain an open question for the field. This document was born out of a few years of working with LLMs, and countless requests for prompt engineering assistance. It represents an attempt to consolidate and share both helpful intuitions and practical prompting techniques.\n\nWe are a pair of researchers and engineers that have worked with LLMs for a few years. Having said that, this document shouldn’t be viewed as a definitive truth nor should it be viewed as the collective position of the Gemini post-training team. Rather, it’s a collection of our personal observations and best practices. We hope that this playbook will act as a snapshot of our current thinking, which might get updated in the future on a best effort basis as our beliefs change and as new knowledge becomes available.\n\nWe hope that by writing down our concrete set of mental models and processes, the community can work together to find better and more systematic prompting strategies.\n\nThis playbook is exclusively focused on the various post-trained versions of Gemini. Anecdotally, some of the prescriptions in this document might generalize to other models. But we have less experience with them.\n\n## Background: Pre-training vs. Post-training\n\n### Pre-training\n\n“Pre-training” is an old concept from deep learning. Essentially:\n\n1. You have a small dataset that you actually care about (i.e. Dataset A), and a large Dataset B that isn’t actually A, but similar in at least some important aspects. For example, A could involve a small amount of mammography images and B could be a large academic dataset of natural images like ImageNet.\n2. You train a model on the large Dataset B with the hope that it will learn some generally useful features. You then “fine-tune” it on Dataset A to get better performance on A’s validation set than if you trained the model directly from scratch on A. That is, you simply continue training on Dataset A using the same training procedure that you had used on Dataset B. This way, by the time your model encounters examples from Dataset A, it's able to make better use of them because it already knows a lot of generally-useful stuff from its extensive experience on Dataset B. \n3. To be more concrete, consider the mammography example again. By pretraining on the large set of readily-available images from the internet, your model can learn basic things like how to segment objects in an image, or how to recognize concepts regardless of their location within an image. These are important image processing skills that will be useful for your mammography application, but likely require lots of data in order to learn, and are not specific to mammograms. 
If you tried to teach your model these skills using only your (expensive to obtain, limited in supply) mammography data, it might never learn them, and thus never achieve its best performance. But if you pretrain on everyday images, your model can come to your mammography data armed with these general skills and ready to use your specialized data to learn only specialized skills that couldn’t be learned elsewhere.\n\nOne of the key ideas of training LLMs is to use “language modeling” -- that is, predicting the next word in a sentence -- as a pretraining task. It turns out that if you train a model to take an arbitrary piece of text from the internet, and do a good job of predicting the next word, the model implicitly learns a very rich structure of the world that’s been reflected within the web.\n\nThis seems easy enough to understand, until we try to answer the question: what world does the internet reflect? To try to wrap our heads around this question (and its answer) we suggest a useful if somewhat fanciful metaphor: the Cinematic Universe.\n\n#### The “Cinematic Universe” Intuition of Pre-training\n\nLarge language models must learn about what the world is like by reading about the world in text. Text, though, has never been constrained to describe only things that are “true” in the conventional sense. Much attention is paid to misinformation or incorrect statements, but there are also lots of very innocent and desirable reasons why text does not and should not reflect a single factual reality corresponding to a single state of the world.\n\nFor example, consider the statement “Aragorn eventually becomes the king of Gondor”. Is that statement true? That depends. For example, it depends on some temporality. Moreover, whether that statement makes sense is also contingent on the broader premise or context within which it's being discussed. If the premise is Lord of the Rings (LoTR), then yeah, you could argue that this is a fact. But imagine that you’re instead talking within the premise of the Marvel Cinematic Universe. Then it’s not clearly factual. If you’re in the non-fictional cinematic universe compatible with what we conventionally consider “true”, then the statement we made about Aragorn is not true. It’s not true because Aragorn and Gondor are fictional characters that you can’t find on Earth. If you’re in the Marvel Cinematic Universe, then it’s also not true for a similar reason. But if you’re in the LoTR cinematic universe, then it becomes true.\n\nThis issue – i.e., the issue of struggling to define what it means for something to be “true” and with respect to what world – is not new to LLMs. It relates to a long history of philosophical and linguistic theory and argument. This history and theory is a worthwhile rabbit hole (see, e.g., [this overview](https:\u002F\u002Fplato.stanford.edu\u002Fentries\u002Ftruth\u002F)). But, for practical purposes regarding prompting LLMs, it can be oversimplified as: Whether a statement is true or not depends on the “cinematic universe” that acts as the backdrop of the statement.\n\nFor the purposes of this document, you can think of the pretraining corpus as an approximation of the set union of all the cinematic universes produced by human culture. Or, more accurately, the cultures that heavily participate with the pretraining data sources like the web.\n\n> [!IMPORTANT]\n> you can think of the pretraining corpus as an approximation of the set union of all the cinematic universes produced by human culture. 
Or, more accurately, the cultures that heavily participate with the pretraining data sources like the web.\n\nWhen you give the model a fixed context window (i.e. prefix), it will try to infer from that prefix what universe it is in, and it will then behave in accordance with the rules, conventions, and facts of that universe. If you provide a prompt with very strong signals about context, it will be easier for the LLM to recognize the script. For example, consider a prompt like “*The concrete jungle where dreams are made of isn't just a catchy lyric – it's the electric truth of New York City. From the soaring skyscrapers that pierce the clouds to the vibrant pulse of its diverse neighborhoods, NYC offers an experience unlike any other on Earth*”, i.e., the first two lines of a blog post that I might write about NYC.) In this case, the model has very strong constraints on style and topic that will influence how it proceeds with the generation.\n\nBut, if your prompt is highly generic – like “Hi, how are you?” — the LLM might not have enough context to understand which cinematic universe it’s supposed to be in. “Hi, how are you?” probably occurs in all kinds of contexts in the diverse corpora it was trained on. That is, there are many “modes” in the probability density function used to decode a generation. Or to put it in simpler terms, it sees many possibilities that it could role-play as. The text “Hi, how are you?”, or even something much longer, doesn’t give it enough context to disambiguate this.\n\nThat’s where post-training comes in. \n\n### Post-training\n\nPost-training provides the LLM with guidance about the “default” universe within which it exists. Rather than asking the LLM to infer this universe from a prompt alone, post training can constrain the LLM to make certain assumptions or resolve ambiguities in consistent ways. There are many reasons this is necessary for making models useful. For example, LLMs might need to be told that, by default, they follow instructions. Otherwise, given a prompt like “*Write a report about George Washington*”, an LLM without post-training might happily generate a continuation of the instruction, e.g., something like “*It's due by 4:59pm on Friday*”, rather than generate the report that was requested. But post-training can be used to impose other defaults as well, such as influencing the model’s default behavior to be more consistent with social norms, however defined, ideally to make it a safer or more productive tool for its particular assumed use cases.\n\nWe really like Murray Shanahan’s articulation that one way to conceptualize what these models might be doing is that they’re engaging in a form of [role-playing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16367) that’s a function of their overall training recipe. Our intuition is that post-training teaches these models a coherent and default role to play in diverse deployment settings.\n\n> [!IMPORTANT]\n> post-training teaches these models a coherent and default role to play in diverse deployment settings.\n\nHere’s a non-exhaustive list of what they might learn during post-training, ranging from the mundane and practical to the subjective and personal.\n\n* **That the model should follow a specific format.** For example, [Gemma’s formatter](https:\u002F\u002Fai.google.dev\u002Fgemma\u002Fdocs\u002Fformatting) teaches it that it’s in a cinematic universe where there’s always a conversation between it and some arbitrary human user. 
In that universe, the role it’s being asked to play is described in the system instructions. Depending on the formatter, in each conversation, the human’s turn is always first.\n* **That the model should “follow instructions” from the user.** That is, if the user gives it a string prompting it to “write an essay about a dog”, it should actually do that, rather than respond to the user with an increasingly bossy continuation of the instruction.\n* **That the model should match the “real world” (as opposed to some other cinematic universe).** Post-training is often used to improve the model’s factuality by aligning its implicit or default cinematic universe to one that most users are likely to care about. For example, if you ask it “Where was $CELEBRITY born in?”, it should assume by default that we’re talking about the “real world” rather than some fan fiction world that it might have encountered online that shares a celebrity with the same name.\n* **That it should be “safe”.** The internet is a complex web of normative standards. Needless to say, a fair amount of the internet would not be considered sanitary within the context of most global commercial deployments. Post-training helps align the model to a chosen distribution that can embody a range of safety policies, thereby imposing a normative standard on what the model should or shouldn’t generate. Ultimately, it is not possible for a model to generate something sufficiently complex without making some assumptions about norms.\n\n#### Post-training Data Collection\n\n**Broad Takeaway -** these models are ultimately trained and evaluated by human raters. When instructing a post-trained LLM, you are implicitly asking a digital role-player (i.e. the LLM) to role-play as a human rater (i.e. the person generating the post-training data) who is getting paid to role-play as an AI Assistant.\n\nThis section is a massive oversimplification. Substantially longer documents could be written about the complexities and vagaries of tasking human annotators with post-training LLMs. Our goal in this section is to provide an overall intuition for human annotation in this context, since it directly impacts how one thinks about prompting.\n\nFrom the perspective of the AI developer, the process of human data collection for post-training is roughly:\n1. Create a dataset of a diverse range of input examples–i.e., prompts describing tasks that an LLM might be asked to do. This could be anything from “reformat this data as json” to “help me plan my wedding”. (This might come from your own intuition, or from human raters themselves!)\n2. Create a pool of human “raters” whose job is to tell a model what to do for these tasks. The rater’s job might be to write the gold-standard answers for these input examples, e.g., actually provide wedding-planning tips themself. Or it might be to view different responses generated by the model and rank them from best to worst. At different points in post-training, models can use different types of human-generated data.\n3. Write some guidelines on how these raters should do this job. Often, the developer will include examples or specific details about the task and context to help the raters understand the task better. \n4. Collect this data and “post-train” the pre-trained model on it.\n5. Ship it.\n\nA large part of why LLMs are able to “act human” is because these statistical models are fitted to a large dataset of carefully collected demonstrations of human behavior. 
The pre-training phase, model architecture, learning algorithm, etc provide the core infrastructure and underlying capability for the model. But post-training provides the overall orientation of the model (via human demonstrations) which dictates how it will actually behave when it actually is deployed.\n\n> [!IMPORTANT]\n> A large part of why LLMs are able to “act human” is because these statistical models are fitted to a large dataset of carefully collected demonstrations of human behavior.\n\nPost-training teams spend a substantial amount of time on quality control on their data. A lot of effort goes into matching raters with the prompts for which they are best suited. For example, to provide a good demonstration of how to respond to a prompt containing a hard Python debugging problem, it's necessary to find a rater who is themself a good Python programmer.\n\nCollecting “high quality” data from human raters is extremely challenging. Some reasons include:\n* **Rating examples can be boring compared to other jobs that require the same skills:** If you are an excellent Python programmer, it is probably more fun to work on your own coding projects than to spend 8 hours a day debugging programs which will be used to train an AI system. If you are a talented poet, you likely want to write your own poetry, not rank AI poems from best to worst. Of course, if you spend ~8 hours a day rating, you’ll get paid for it. But rating examples can be incredibly repetitive, and raters often have incentives based on throughput. There can be challenges with feelings of agency or ownership as well – you don’t always know how that data changed the model’s overall quality, whether that data is going to get thrown away, what model it’s used for, etc. It’s possible that your only relationship to the data’s utility is whether your supervisor tells you that you did a good job. If you don’t have a clear narrative of why being there in that job makes a positive change in the broader world, you might not find it a very meaningful use of our time. That might impact your unconscious enthusiasm for doing a “good” job, even if in the abstract you want to do a good job.\n* **Defining what a “good” response looks like for a given task is very challenging.** This is especially true when the definition necessarily intersects with existing norms about factuality, good expository writing or some other capability. The “boundaries” between good\u002Fbad for most interesting human artifacts is actually quite nebulous and contingent on many normative factors. (E.g., imagine instructing an AI system to serve as a PR agent for a firm. The full complexity of social reality is really hard to nail down into a clear set of propositions. This is a job that ultimately requires “good judgment” and how it is executed will vary substantially across people and contexts.) This makes it very hard for the developer to write good instructions for the task, and very subjective for the human raters to determine how to interpret those instructions. (There’s a parallel with how the common law system works. It’s extremely difficult to write down legislation that can anticipate a large array of edge-cases. So society uses the judicial system to arbitrate edge-cases and create precedents.)\n* **The raters might not understand the task.** Hiring is hard for any job, and rating is no different. Even with a lot of effort spent on recruiting, there is no guarantee that the raters will have the skills to do the task. This could be for any number of reasons. 
For example, because the designer didn’t anticipate the complexity of the tasks raters would get (i.e., hired someone with entry-level software engineering skills when the tasks actually require an experienced software engineer) or because the rater themself doesn’t realize the limitations of their knowledge (e.g., answering a question based on what they learned in a college biology class which is not in fact aligned with more recent research). \n* **People make mistakes.** Professors release exams with wrong answers. Physicians mis-diagnose more often when they are hungry or tired. People lose focus at work when they have distractions in their home life. For all kinds of reasons, human performance on a task can be less than “gold standard”. This means even despite the best efforts, AI systems will sometimes be trained on wrong or misleading data.\n\n## Considerations for Prompting\n\n**Broad Takeaway -** When you're writing system instructions and prompts, you are writing them for something like the aggregated spirit of the post-training team’s rater pool, seeded by the aggregated spirit of the pre-training corpus. If we write instructions that the average rater (within that specific domain) is likely able to understand, comprehend and faithfully follow, the model is more likely to follow our instructions.\n\n> [!IMPORTANT]\n> When you're writing system instructions and prompts, you are writing them for something like the aggregated spirit of the post-training team’s rater pool, seeded by the aggregated spirit of the pre-training corpus.\n\nWhen we write system instructions, it’s helpful to imagine that there’s a friendly, well-meaning and competent rater prepared to role-play AI on the other side of the screen. The text we provide is all they’ve got. That is, when we make an API call to Gemini, imagine that there’s a human rater on the other side that will carefully read our prompt and provide a response. When constructing prompts, it’s extremely helpful to take on their perspective and to consider our instructions in that light. For example, suppose the instructions are about generating Python code. If we randomly picked a competent Python engineer off the street and asked them to respond to these instructions, would they understand what we want?\n\nThis metaphor is useful for helping you write good instructions. It isn’t necessarily a good metaphor of how the AI system is following them. For example, it starts to break down when we consider that this proverbial rater might have access to all human knowledge. They simply lack the wisdom and context to peer beyond the prompt that we’ve provided. It also is the case that the AI system might not “understand” the prompt the way the human rater would. Instead, the AI system is going to bring to bear its very powerful statistical models for providing an output for your input. But those statistical models are optimized to work on instructions written for well-meaning and competent human raters, and so providing anything other than a prompt that was written for well-meaning and competent human raters is likely to confuse the AI system. There is nothing AI hates more than going “out of distribution”!\n\nGiven this, here are some considerations to help you improve the instructions in our prompts. The considerations below will likely go stale very quickly as models get better. 
We’d suggest attempting to align with the overall spirit of this bullet list rather than the letter.\n\n* **Are the instructions clear, legible, concise and explicit?** For example, suppose our instructions are about some Python coding task. If we picked up a random Python expert off the street and asked them to pretend to be Gemini, are our instructions good enough for them to immediately understand what we mean without asking any obvious clarifying questions?\n   * \u003Cins>Bad:\u003C\u002Fins> Write a Python function that computes prime numbers.\n   * \u003Cins>Good:\u003C\u002Fins> Write a Python function that computes prime numbers from 1 to 100. Include pytype annotations for the generated function and use 2-space indentation.\n\n* **Are the instructions self-contradictory or otherwise hard to follow?** Would a bored, hungry, tired, etc. rater actually read our overly verbose instructions and faithfully follow them? Note that there is often substantial quality control involved in making sure all the instructions in a given prompt are followed. But humans are humans. Are our instructions actually “easy” to follow? Or do they contain needless indirection, verbosity, etc? When the authors of this playbook write down instructions, we often ask ourselves whether another employee in our company could faithfully follow them if the instructions were presented to them with no additional context.\n   * \u003Cins>Bad:\u003C\u002Fins> Don't write a story about a mean dog, unless it's friendly, and also sad, but not really that sad, and make it long even but not too long. Also the dog should be named Bob, or maybe Susan, doesn't matter. Write it about a cat too. But that’s not actually as important, but make the dog fluffy.\n   * \u003Cins>Good:\u003C\u002Fins> Write a short story (200-300 words) about a loyal golden retriever named Buddy who gets lost in the woods during a family camping trip. The story should focus on Buddy's journey and his determination to find his way back to his family.\n\n* **Are there too many instructions in a given system instruction?** We’ve noticed an inverse relationship between the number of instructions in a prompt and the model’s ability to faithfully follow all of them. Although we’ve definitely seen many cases where a model was able to follow long chains of instructions reasonably well. This is just a rule of thumb, and one that applies when writing instructions for humans, too. Basically, its best to break up tasks into subtasks if you can. It’s difficult to provide good\u002Fbad examples for this consideration, since it’s heavily dependent on the model under consideration, but here is the spirit of what we mean.\n   * \u003Cins>Bad:\u003C\u002Fins> Read each article and, for each key idea, rate it on a scale of 1-10 for how important it is to understanding the general point. Then for anything rated higher than 7, summarize it in the form of a Chinese social media post.\n   * \u003Cins>Good:\u003C\u002Fins> *Call the AI model for each subtask separately (what follows is not an example of a literal prompt for the model).* 1) Break the article into a list of main ideas. 2) Rate each idea on a scale of 1-10. 3) Take the top ones and translate them into Chinese. 4) Take the Chinese text and convert it into a social media post.\n\n* **Use positive instructions rather than negative instructions.** The \"bad\" example below says what the model shouldn’t do. But it doesn’t say what the model should do. 
The \"good\" example very explicitly lays out what the model should do. There are actually parallels here between effective human-to-human communication that one learns in contexts like teacher training or couple’s therapy. We should try to imagine that we’re attempting to communicate with someone that wants to give us what we want. But we need to give them very explicit guidelines on what “success” means, rather than telling them to “avoid failure”.\n   * \u003Cins>Bad:\u003C\u002Fins> “Don’t ever end your response with a full stop.”\n   * \u003Cins>Good:\u003C\u002Fins> “Your response should always end with an exclamation mark or a question mark”.\n\n* **Good system instructions can act as “reminders” for the model.** When iterating on a new prompt, it’s really important to consider a very diverse range of input examples. It’s very common to see people write prompts that might work for ~60-70% of possible model inputs, but are either unspecified or vague for the remaining ~30-40%. It’s often worth giving the model an explicit set of positive instructions of what it should do in those situations. We often create a separate section in our system instructions called “Additional Considerations” or “Additional Assumptions” that contains a bullet list of specifications for these edge cases.\n\n* **Prompts are the new hyperparameters, and you will likely never find the “best” one.** For example, tuning the learning rate correctly can make a huge difference in the final performance of a model on the validation set for a specific compute budget. Similarly, the difference between a “good” prompt and a “bad” prompt can have a substantial impact on the system’s final performance. In the same vein, there’s probably always a “slightly better” prompt with more prompt tuning. And we’ll never know if we’ve found the “best one”. As discussed in the section below, we can leverage similar intuitions as the ones used in the [Deep Learning Tuning Playbook](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftuning_playbook) to systematically find “better” prompts than whatever baseline we currently have. The playbook speaks of “trial budgets” within the context of hyperparameter search. For prompting, it can be quite useful to create a timebox for our experimentation.\n\n* **Experiment with giving the model explicit instructions for saying “I don’t know”.** For example, suppose we’re working on some sort of multi-class text classification task. We have some criteria for how an input example should get mapped to each of the classes. It can be really helpful to create an additional “unknown” or “edge case” class. And to provide an explicit positive instruction for the model to classify the input into this class if it thinks that the instructions are unclear for correctly classifying this example. We can then look at the logs to see when\u002Fhow this happened, and improve the prompt accordingly.\n\n* **Prompts can be deeply coupled with the checkpoint that they were developed on.** That is, if we take a prompt from Gemini 1.5 Flash and run it on Gemini 1.0 Pro, it might not work the “same way” and might have very different aggregate behavior on an eval. This sort of makes sense. Our mental model is that the prompts we write in natural language are akin to the parameters we’d train if we instead did SGD. To what extent this is true is a question of open research. 
Models are sort of like the machine, the post-training procedure is sort of like a compiler and the system instructions are sort of like computer code. Except that the machine and compiler are totally fluid, and a given model can hold many different combinations of these. We suspect that the ecosystem will organically converge towards some consensus structure around how instructions look, that rapidly changes across time and remains reasonably backwards compatible in a combinatorially explosive way. This is analogous to the x86 instruction set remaining relatively stable across time, as compared to the explosion in diversity of programming languages built on top of it.\n\n## A rudimentary “style guide” for prompts\nAt this point, we suspect that well-tuned prompts executed on a frontier model are sufficient for a large number of ML workloads that would have previously required a bespoke trained model.\n\nAs inference costs, latency, context window sizes, etc. continue to improve, prompting will become increasingly ubiquitous. Programming language style guides emerged with the observation that software is primarily written for other software engineers to facilitate maintainability, etc. We’re not sure what the equivalent of a programming language style guide is for prompting. But there are many interesting possibilities. Markdown files containing prompts might end up being treated as a separate “language” with a specific file extension in version control systems. Or perhaps programming languages will end up having more “transparent” integrations to model inference. It’s too early to make definitive predictions!\n\nWe don’t have a full-fledged style-guide to present here. But we thought we’d share the following observations:\n* **Consider using markdown:** Most version control systems are good at rendering markdown files. Therefore, it can be useful to store each prompt in a separate markdown file. And make judicious use of markdown headings, etc to organize the contents of each of our prompts.\n* **Think about others!** We’d encourage thinking about the saved prompts as being primarily for other maintainers of the prompt rather than just the LLM. As models get better, we’ll hopefully need to spend less energy on idiosyncratic hacks to steer the model towards our desired behavior. As discussed above, that might also naturally lend itself to better prompts.\n* **Simplicity:** The “technical debt” incurred by our prompt is proportional to its length and overall complexity. Prompts are intimately tied to a specific checkpoint. Every time the underlying model changes, it’s worth quantitatively and qualitatively checking that our prompt still works. It’s really worth keeping the prompts as simple, terse and direct as possible. Allow the implicit assumptions made for the model to work in our favor “for free”. Don’t try to add more details to a prompt if a simpler prompt would do. This makes the explicit assumption that the underlying model that executes a prompt won’t be carelessly decoupled from the prompt. Of course, we don’t know if the model will make the same sets of implicit assumptions for different input examples. As the deployment of our prompt grows more “serious”, implementing more rigorous quantitative evaluations becomes unavoidably crucial.\n* **Prefer zero-shot instructions over few-shot instructions:** This is in the same spirit of simplicity. Zero-shot are easier to understand, debug and reason about. 
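To make the "consider using markdown" suggestion above concrete, here is a minimal sketch of keeping a system instruction in its own markdown file under version control and loading it at call time. The directory layout, file name, and `generate` stub are assumptions made for illustration, not anything the playbook prescribes.

```python
# Illustrative sketch only: store each prompt as prompts/<name>.md and load it
# at call time. File names and the generate() stub are assumptions; swap in
# your own model client.
from pathlib import Path

PROMPT_DIR = Path("prompts")

def load_prompt(name: str) -> str:
    """Read a system instruction stored as prompts/<name>.md."""
    return (PROMPT_DIR / f"{name}.md").read_text(encoding="utf-8")

def generate(system_instruction: str, user_input: str) -> str:
    """Placeholder for a real LLM call."""
    return "(model response)"

if __name__ == "__main__":
    # Create a tiny example prompt file so the sketch runs end to end.
    PROMPT_DIR.mkdir(exist_ok=True)
    (PROMPT_DIR / "entity_extraction.md").write_text(
        "# Entity extraction\n\nExtract the names of people mentioned in the user's text.\n",
        encoding="utf-8",
    )
    system_instruction = load_prompt("entity_extraction")
    print(generate(system_instruction, "Ada Lovelace met Charles Babbage in London."))
```

Keeping the prompt in its own file also lets diffs and code review apply to prompt changes the same way they apply to code.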
Few-shot examples, especially when they contain full conversations, are best when the “letter” of our explicit instructions are inadequate to capture the “spirit” of our instructions. We’ve seen plenty of cases where few-shoting is worse than zero-shot. We shouldn’t just assume that few-shotting will be better, and we should be empirical about this. We’d suggest using few-shot examples as a last resort since they can substantially affect the readability of a prompt.\n   * Instead of full blown few-shot examples, prefer to weave in examples into the prose of our instructions. Consider the following system instruction that uses “For example” at the end of the instruction:\n      * *Always start your response to the user with something passive aggressive. For example, start with something like “Oh that’s what you want? I’m not saying you’re wrong. But I mean, sure, if that’s what you really want.” But keep it fresh and use a different start to each response, based on what’s in the user’s message.*\n\n## Procedure for iterating on new system instructions\n\nPrompting is inherently iterative. It’s spiritually very similar to training a model using some validation set. Except that instead of needing JAX, etc. you can do by writing clear prose.\n\nSimilar to writing prose, we’ve found that it’s best to break things down into separate generate+edit phases.\n\nWe’ve found that most users attempting to prompt a model don’t already have a clean validation set with which they can benchmark the model’s responses. This is fine for rapidly building an MVP. But eventually, formal quantitative evaluations are invaluable for tracking model performance. Even with the best prompt+model combination, it’s impossible to deterministically guarantee a model’s behavior. It’s important to design product surfaces that can fail gracefully when the underlying model does something unexpected.\n\nA tool like [AI Studio](https:\u002F\u002Faistudio.google.com\u002Fapp\u002Fprompts\u002Fnew_chat) can be invaluable for iterating on a new set of system instructions.\n\n1. Start with a small sample of really diverse input examples for the problem, and try to have some intuition of what the desired output behavior is likely to be. This could be something like ~10-50 examples.\n\n2. Start with the simplest system instruction that might satisfy every input example, on the smallest and cheapest model available. For now, that’s likely something like Gemini Flash 8B.\n   * When we say “simplest”, we mean simplest. Make it as terse and short as possible without losing any clarity.\n\n3. Run the system instruction on the first input example.\n\n4. Try to “overfit” on that specific example.\n   * That is, add “reminders” to the original system instruction until the model reliably produces a “good enough” response. When we say reminders, we literally mean reminders. Find any specific deficiencies in the model’s response to the original system instructions. Now add instructions to it calling out those specific behaviors. Do so one at a time. For example, suppose the original instruction tells the model to extract all the named entities from an input string. And we notice that it’s also extracting the names of buildings, places, etc. but our intention in this golden example was to only extract the names of people. We can edit\u002Famend the original instruction to reflect this.\n   * If it can’t produce a good enough response to the first input prompt, try the second input prompt. 
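As a hedged sketch of steps (1)-(4) above, the loop below runs one candidate system instruction over a small set of diverse input examples and prints each response for manual inspection, so that "reminders" can be added one at a time. The examples, the model name, and the `generate` stub are illustrative assumptions rather than real APIs.

```python
# Illustrative sketch only: run the current candidate system instruction over
# a small, diverse set of input examples and inspect each response by hand.
# The examples, model name, and generate() stub are assumptions, not real APIs.

INPUT_EXAMPLES = [
    "Extract the people mentioned: 'Ada Lovelace wrote to Charles Babbage.'",
    "Extract the people mentioned: 'The Eiffel Tower was finished in 1889.'",
]

SYSTEM_INSTRUCTION = """Extract the named people from the user's text.
Reminder: only include people, never places or buildings.
Return one name per line, or the single word NONE."""

def generate(system_instruction: str, user_input: str, model: str = "cheapest-available") -> str:
    """Placeholder for a real LLM call on the smallest / cheapest model available."""
    return "NONE"

if __name__ == "__main__":
    for example in INPUT_EXAMPLES:
        response = generate(SYSTEM_INSTRUCTION, example)
        print(f"--- input ---\n{example}\n--- response ---\n{response}\n")
        # Inspect the response; if it misbehaves, add one reminder to
        # SYSTEM_INSTRUCTION and rerun before moving to the next example.
```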
If the model consistently fails on most of our input examples, go back to step (2) and try a better model. For example, if we started with Flash, try Pro.\n\n5. Once we can successfully overfit to the first example, try the next input example.\n   * At this point, we might notice that some of our instructions are too overfit to the first example. Perhaps they need more nuance. For example, perhaps they need explicit instructions to clarify some underspecification, or some if\u002Felse\u002Fthen clauses.\n   * Repeat this entire process until we’ve found a system instruction that works on all of your diverse input examples.\n\n6. During this process, we might have haphazardly edited the system instructions prose to make it work on all of our input examples. Now is the time to clean up that prose. For example, add section headings, fix spelling errors, etc. Once we’ve done this, verify that the instructions still work on all of our input examples. If they don’t, systematically debug them using the process above. But instead of the MVP system instructions from step (2) as your base, use the current state of the cleaned up prose.\n   * It’d be unsurprising to enter a scenario where a “messy” prompt works better than a “clean” one on some model. This invites a number of questions:\n      * How important is the maintenance of this prompt relative to its performance?\n      * Do we have monitoring for the behavior of the model in production? Is this deployment important enough to merit that?\n\n7. Perhaps someday, this entire process of going from input examples to a system instruction will be reliably automated by a meta-optimizer for generating instructions. That would be amazing, but we shouldn’t be fooled! There’s no free lunch here. There’s always a knob that needs to be dialed when working with ML systems. There’s always some back-and-forth qualitative work that needs to go into ensuring that our model’s sense of relevancy (i.e “cinematic universe”) is aligned with what we as developers find relevant when presented with a particular dataset of input examples. It’ll never be possible to wish away doing the hard work of qualitative analysis for the same reasons that it’ll never be possible to wish away the hard work of training a new member of our engineering team. Increasing the abstraction of tooling merely shifts the focus and activation energy of the qualitative work to a different level of abstraction. But this isn’t to say that such tooling couldn’t be useful!\n\n> [!IMPORTANT]\n> It’ll never be possible to wish away doing the hard work of qualitative analysis for the same reasons that it’ll never be possible to wish away the hard work of training a new member of our engineering team.\n\n## Some thoughts on when LLMs are useful\n\nWe’ve spent a lot of words talking about how we can think about prompting. It’s also worth spending a minute talking about “why” an LLM might prove to be useful. Although this field is profoundly nascent and innovating quickly. So this section might become rapidly stale in a few months.\n\nLLMs are best where the answer is hard to make, but is easy to check. We’d recommend [this post](https:\u002F\u002Fwww.linkedin.com\u002Fpulse\u002Fhmec-principle-finding-sweet-spot-generative-ai-gorgolewski-ph-d--3zl5e) by Chris Gorgolewski. In our experience if we’re using an LLM for a problem where that’s not true, we’ll run into issues.\n\nRather than writing a monolithic prompt that’s hard to understand, debug, etc. 
it’s better to decompose the problem into sub-problems and chain inferences together. If we make each sub-problem small enough, they’re usually better specified and easier to check\u002Fevaluate.\n\nThere will likely be substantial innovation on hardware, business models, etc. to continue unlocking more inference capacity. We should go where the ball is going to be, rather than where it is right now.\n\nIt seems more future-proof to assume inference costs will soon be “too cheap to meter”, “good enough” or oriented around “value” rather than “cost”. That is, if we’re making a trade-off between a more valuable feature that would require more inference calls to work reliably, rather than a substantially less valuable feature that would cost less, it seems more future proof to orient towards the former. And perhaps grapple with the current unit economics by limiting rollouts, etc. We wouldn’t be surprised if we start to see something equivalent to Moore's Law but for price reductions for LLM inference.\n\n## More Resources\n\nThere’s a lot of good resources online about prompting. There’s too many to cite here. We’re not the only ones thinking about prompting. This playbook is merely a collection of our informal thoughts. But there’s a lot of great work being done all over the internet. For example:\n* [AI prompt engineering: A deep dive](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=T9aRN5JkmL8) by Anthropic.\n* [This guide](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fprompt-engineering) from OpenAI.\n* [This guide](https:\u002F\u002Fservices.google.com\u002Ffh\u002Ffiles\u002Fmisc\u002Fgemini-for-google-workspace-prompting-guide-101.pdf) for Gemini.\n\nThinking about the differences between [“high-context” and “low-context” cultures](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FHigh-context_and_low-context_cultures) has also been really interesting. We’ve anecdotally found that there’s a substantial intersection between people that seem effective at prompting models, and communication patterns that are useful in low-context cultures. This makes sense to us, because the LLM doesn’t really know which cinematic universe it should be operating in. It needs explicit instructions to inform its role-play of who it is, where it is, and what it should find relevant in its environment.\n\nOn a more personal note, practicing [Nonviolent Communication](https:\u002F\u002Fwww.amazon.com\u002FNonviolent-Communication-Language-Life-Changing-Relationships\u002Fdp\u002F189200528X) can be really helpful. It teaches the difference between observable behaviors and internal narratives. This distinction can be really helpful to articulate instructions in terms of observable behavior. 
Although needless to say, we didn’t practice Nonviolent Communication to get better at prompting models.\n\n## Acknowledgements\n* Thank you to Slav Petrov and Sian Gooding for reviewing this document during a final review.\n* Thank you to Anna Bortsova for providing lots of helpful comments\u002Ffeedback during the drafting of the final version of the doc.\n* Thank you to Jennimaria Palomaki, James Wexler, Vera Axelrod for suggesting lots of helpful edits.\n","# LLM 提示词调优指南 (LLM Prompt Tuning Playbook)\n\n**Varun Godbole, Ellie Pavlick**\n\n## 目录\n\n- [这份文档适合谁？](#who-is-this-document-for)\n- [为什么需要一份调优指南？](#why-a-tuning-playbook)\n- [背景：预训练与后训练](#background-pre-training-vs-post-training)\n   - [预训练](#pre-training)\n      - [预训练的“电影宇宙”直觉](#the-cinematic-universe-intuition-of-pre-training)\n   - [后训练](#post-training)\n      - [后训练数据收集](#post-training-data-collection)\n- [关于提示词的考量](#considerations-for-prompting)\n- [提示词的基础“风格指南”](#a-rudimentary-style-guide-for-prompts)\n- [迭代新系统指令的流程](#procedure-for-iterating-on-new-system-instructions)\n- [关于何时使用 LLM 的一些思考](#some-thoughts-on-when-llms-are-useful)\n- [更多资源](#more-resources)\n- [致谢](#acknowledgements)\n\n## 这份文档适合谁？\n\n本文档适用于任何希望提升对后训练大语言模型（Post-trained LLMs）进行提示（Prompting）能力的人。我们假设读者已经与某种大语言模型（LLM，Large Language Model）有过一些基本的交互（例如 Gemini），但我们不假设读者具备严谨的技术理解能力。\n\n文档的前半部分提供了关于后训练和提示本质的心智模型（Mental Models）。文档的后半部分提供了更具体的建议和调优提示词的高级流程。鉴于 LLM 的创新速度，我们怀疑后半部分的内容可能会比前半部分更快地过时。\n\n## 为什么需要一份调优指南？\n\n本指南受 [深度学习调优指南 (Deep Learning Tuning Playbook)](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftuning_playbook) 启发，该指南用于指导深度学习（Deep Learning）工作负载的超参数（Hyperparameters）调优。\n\n提示（Prompting）的“艺术”，就像更广泛的深度学习领域一样，最好的情况是经验性的，最坏的情况则如同炼金术般玄妙。尽管 LLM 正在迅速改变众多应用，但有效的提示策略对该领域而言仍是一个开放性问题。本文档源于我们与 LLM 合作数年的经验，以及无数次寻求提示工程（Prompt Engineering）协助的请求。它代表了整合并分享有益直觉和实用提示技术的一次尝试。\n\n我们是两位与 LLM 合作了数年的研究人员和工程师。话虽如此，本文档不应被视为绝对真理，也不应被视为 Gemini 后训练团队的集体立场。相反，它是我们要个人观察和最佳实践（Best Practices）的集合。我们希望这本指南能作为我们当前思维的一个快照，随着我们信念的改变和新知识的出现，未来可能会在尽力而为的基础上进行更新。\n\n我们希望通过写下我们具体的一套心智模型和流程，社区可以共同努力找到更好、更系统的提示策略。\n\n本指南专门针对 Gemini 的各种后训练版本。据经验表明，本文档中的一些建议可能泛化到其他模型。但我们对其他模型的实践经验较少。\n\n## 背景：预训练与后训练\n\n### 预训练（Pre-training）\n\n“预训练（Pre-training）”是深度学习中的一个古老概念。本质上：\n\n1. 你有一个真正关心的小数据集（即数据集 A），以及一个大型数据集 B，它实际上不是 A，但在至少某些重要方面相似。例如，A 可能涉及少量乳腺 X 光图像，而 B 可能是像 ImageNet 这样的大型自然图像学术数据集。\n2. 你在大型数据集 B 上训练模型，希望它能学到一些通用的有用特征。然后你在数据集 A 上对其进行“微调（fine-tune）”，以便在 A 的验证集上获得比直接在 A 上从头训练模型更好的性能。也就是说，你只是使用之前在数据集 B 上使用过的相同训练过程，继续在数据集 A 上进行训练。通过这种方式，当你的模型遇到来自数据集 A 的示例时，它能够更好地利用它们，因为它已经从在数据集 B 上的广泛经验中知道了许多通用的有用知识。\n3. 
为了更具体，再次考虑乳腺 X 光的例子。通过在互联网上大量现成的图像上进行预训练，你的模型可以学习基础知识，例如如何在图像中分割对象，或者如何识别概念而不受其在图像中位置的影响。这些是重要的图像处理技能，对你的乳腺 X 光应用很有用，但可能需要大量数据才能学会，并且并非乳腺 X 光所特有。如果你试图仅使用你的（获取成本高、供应有限）乳腺 X 光数据来教模型这些技能，它可能永远学不会，因此永远无法达到最佳性能。但是，如果你在日常生活图像上进行预训练，你的模型就可以带着这些通用技能来到你的乳腺 X 光数据面前，准备好利用你的专用数据仅学习那些在其他地方无法学到的专业技能。\n\n训练大语言模型（LLMs, Large Language Models）的关键思想之一是将“语言建模（language modeling）”——即预测句子中的下一个词——作为预训练任务。事实证明，如果你训练一个模型从互联网上获取任意文本片段，并很好地完成预测下一个词的任务，该模型就会隐式地学习到网络中反映出的非常丰富的世界结构。\n\n这似乎很容易理解，直到我们试图回答这个问题：互联网反映了什么样的世界？为了尝试理解这个问题（及其答案），我们建议一个有用但略显虚构的隐喻：电影宇宙（Cinematic Universe）。\n\n#### 预训练的“电影宇宙”直觉\n\n大语言模型必须通过阅读关于世界的文本来了解世界是什么样的。然而，文本从未被限制为只描述传统意义上“真实”的事物。人们非常关注虚假信息或不正确的陈述，但也有许多非常无辜且合理的理由，使得文本不应该也不应反映对应于单一世界状态的单一事实现实。\n\n例如，考虑这句话：“阿拉贡最终成为了刚铎的国王”。这句话是真的吗？这取决于情况。例如，它取决于某种时间性。此外，这句话是否有意义也取决于讨论它的更广泛的前提或背景。如果前提是《指环王》（LoTR），那么是的，你可以认为这是一个事实。但想象一下，你是在漫威电影宇宙（Marvel Cinematic Universe）的前提下谈论。那么这就不是明显的事实了。如果你处于与我们通常认为“真实”相符的非虚构电影宇宙中，那么我们关于阿拉贡的陈述就不是真的。这不是真的，因为阿拉贡和刚铎是你无法在地球上找到的虚构角色。如果你处于漫威电影宇宙中，出于类似的原因，这也是不真实的。但如果你处于 LoTR 电影宇宙中，那么它就变成了真的。\n\n这个问题——即难以定义某物“真实”的含义以及相对于哪个世界的问题——对 LLM 来说并不新鲜。它与哲学和语言学理论及争论的悠久历史有关。这段历史和理论是一个值得深入挖掘的领域（参见，例如，[这个概述](https:\u002F\u002Fplato.stanford.edu\u002Fentries\u002Ftruth\u002F)）。但是，就提示 LLM 的实际用途而言，它可以被过度简化为：一个陈述是否真实取决于作为该陈述背景的“电影宇宙”。\n\n为了本文档的目的，你可以将预训练语料库（corpus）视为人类文化产生的所有电影宇宙的集合并集的近似值。或者更准确地说，是那些积极参与预训练数据源（如网络）的文化。\n\n> [!IMPORTANT]\n> 你可以将预训练语料库视为人类文化产生的所有电影宇宙的集合并集的近似值。或者更准确地说，是那些积极参与预训练数据源（如网络）的文化。\n\n当你给模型一个固定的上下文窗口（context window）（即前缀（prefix））时，它会尝试从前缀中推断它处于哪个宇宙，然后它将按照该宇宙的规则、惯例和事实行事。如果你提供一个具有非常强上下文信号的提示（prompt），LLM 将更容易识别剧本。例如，考虑这样一个提示：\"*梦想成真的水泥丛林不仅仅是一句朗朗上口的歌词——它是纽约市（New York City）的电光真理。从刺破云层的摩天大楼到其多样化社区充满活力的脉搏，NYC 提供了一种地球上独一无二的体验*\"，即我可能会写的关于 NYC 的博客文章的前两行。）在这种情况下，模型在风格和主题上有很强的约束，这将影响它如何进行生成。\n\n但是，如果你的提示非常通用——比如“嗨，你好吗？”——LLM 可能没有足够的上下文来理解它应该处于哪个电影宇宙中。“嗨，你好吗？”可能出现在它接受训练的多样语料库的各种上下文中。也就是说，用于解码生成的概率密度函数中有许多“模式（modes）”。或者简单来说，它看到了许多它可以扮演的可能性。文本“嗨，你好吗？”，甚至更长的内容，都没有给它提供足够的上下文来消除这种歧义。\n\n这就是后训练（post-training）发挥作用的地方。\n\n### 后训练（Post-training）\n\nPost-training（后训练）为 LLM（大语言模型）提供了关于其存在的“默认”宇宙的指引。与其要求 LLM 仅从 Prompt（提示词）中推断这个宇宙，Post-training 可以约束 LLM 以一致的方式做出某些假设或解决歧义。有许多原因使得这对于使模型有用是必要的。例如，可能需要告诉 LLM，默认情况下它们遵循指令。否则，给定一个像\"*Write a report about George Washington*\"这样的 Prompt，未经 Post-training 的 LLM 可能会愉快地生成指令的延续，例如类似\"*It's due by 4:59pm on Friday*\"的内容，而不是生成所要求的报告。但 Post-training 也可用于施加其他默认设置，例如影响模型的默认行为以更符合社会规范（无论如何定义），理想情况下使其成为特定假设用例更安全或更有生产力的工具。\n\n我们非常喜欢 Murray Shanahan 的阐述，他认为概念化这些模型可能在做的事情的一种方式就是，它们正在参与一种 [角色扮演](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16367)，这是其整体训练方案（training recipe）的函数。我们的直觉是，Post-training 教会这些模型在各种部署环境中扮演连贯且默认的角色。\n\n> [!IMPORTANT]\n> Post-training 教会这些模型在各种部署环境中扮演连贯且默认的角色。\n\n以下是它们在 Post-training 期间可能学到的内容的非详尽列表，范围从平凡实用的到主观个人的。\n\n* **模型应遵循特定格式。** 例如，[Gemma’s formatter](https:\u002F\u002Fai.google.dev\u002Fgemma\u002Fdocs\u002Fformatting) 教导它处于一个电影宇宙（cinematic universe）中，其中总是存在它与某个任意人类用户之间的对话。在那个宇宙中，被要求扮演的角色在 System instructions（系统指令）中描述。根据 Formatter（格式化器），在每次对话中，人类的回合总是第一。\n* **模型应“遵循”用户的指令。** 也就是说，如果用户给它一个字符串 Prompt 让它“写一篇关于狗的文章”，它应该实际上这样做，而不是用越来越专横的指令延续来响应用户。\n* **模型应与“现实世界”匹配（而不是其他电影宇宙）。** Post-training 通常用于通过将模型的隐式或默认电影宇宙与大多数用户可能关心的宇宙对齐来提高模型的事实性（factuality）。例如，如果你问它\"$CELEBRITY 出生在哪里？”，它应该默认假设我们在谈论“现实世界”，而不是它可能在网络上遇到的、拥有同名名人的粉丝小说世界。\n* **它应该是“安全”的。** 互联网是一个复杂的规范性标准网络。不用说，相当多的互联网内容在全球商业部署的背景下不会被认为适宜。Post-training 有助于将模型对齐到选定的分布（distribution），该分布可以体现一系列安全策略，从而对模型应该或不应该生成什么施加规范性标准。最终，模型不可能在不就规范做出一些假设的情况下生成足够复杂的内容。\n\n#### Post-training Data Collection（后训练数据收集）\n\n**Broad Takeaway（主要收获）** - 这些模型最终是由 Human raters（人类评分员）训练和评估的。当指导 Post-trained LLM 时，你是在隐式地请求一个数字角色扮演者（即 
LLM）扮演人类评分员（即生成 Post-training 数据的人）的角色，而这个人是为了扮演 AI 助手而被付费的。\n\n本节是一个巨大的简化。关于任务人类标注员进行 LLM Post-training 的复杂性和微妙之处，可以写出长得多得多的文档。本节的目标是提供此背景下人类标注的整体直觉，因为它直接影响人们如何思考 Prompting（提示工程）。\n\n从 AI 开发者的角度来看，Post-training 的人类数据收集过程大致如下：\n1. 创建一个多样化的输入示例数据集–即，描述 LLM 可能被要求执行的任务的 Prompts。这可以是任何内容，从“将此数据重新格式化为 json\"到“帮我计划我的婚礼”。（这可能来自你自己的直觉，或者来自人类评分员本身！）\n2. 创建一个人类“评分员”池，他们的工作是告诉模型对于这些任务该做什么。评分员的工作可能是为这些输入示例编写黄金标准答案，例如，实际自己提供婚礼策划技巧。或者可能是查看模型生成的不同响应并从最好到最差进行排名。在 Post-training 的不同阶段，模型可以使用不同类型的人类生成数据。\n3. 编写一些指南，说明这些评分员应该如何完成这项工作。通常，开发者会包含示例或关于任务和上下文的具体细节，以帮助评分员更好地理解任务。\n4. 收集这些数据并在其上\"Post-train\"Pre-trained（预训练）模型。\n5. 发布它。\n\nLLM 能够“表现得像人”的一个很大原因是因为这些统计模型拟合了大量精心收集的人类行为演示数据集。Pre-training 阶段、模型架构、学习算法等提供了模型的核心基础设施和底层能力。但 Post-training 提供了模型的整体方向（通过人类演示），这决定了它在实际部署时的实际行为。\n\n> [!IMPORTANT]\n> LLM 能够“表现得像人”的一个很大原因是因为这些统计模型拟合了大量精心收集的人类行为演示数据集。\n\nPost-training 团队花费大量时间在其数据上进行质量控制。很多精力投入到将评分员与他们最适合的 Prompts 相匹配。例如，为了提供一个如何响应包含困难 Python 调试问题的 Prompt 的良好演示，有必要找到一个本身就是优秀 Python 程序员的评分员。\n\n从人类评分者（human raters）那里收集“高质量”数据极具挑战性。原因包括：\n* **与需要相同技能的其他工作相比，评分示例可能很枯燥：** 如果你是一位优秀的 Python 程序员，从事自己的编码项目可能比每天花 8 小时调试用于训练 AI 系统的程序更有趣。如果你是一位才华横溢的诗人，你可能想写自己的诗，而不是将 AI 写的诗从最好到最差进行排名。当然，如果你每天花约 8 小时进行评分，你会因此获得报酬。但评分示例可能极其重复，而且评分者的激励通常基于产出量（throughput）。此外，自主权（agency）或所有权（ownership）的感觉也可能存在挑战——你并不总是知道这些数据如何改变了模型的整体质量，这些数据是否会被丢弃，它被用于哪个模型等。你与数据效用的唯一关系可能是你的主管告诉你做得好不好。如果你没有一个清晰的叙事（narrative）来说明为什么在那份工作中能起到对更广泛世界的积极改变作用，你可能不会觉得这是对我们时间的有意义的使用。这可能会影响你做一份“好”工作的无意识热情，即使你在抽象层面上想做一份好工作。\n* **定义特定任务的“良好”响应是什么样子的非常具有挑战性。** 当定义必然与关于事实性（factuality）、说明性写作（expository writing）或其他某些能力的现有规范相交时，这一点尤其正确。对于大多数有趣的人工制品而言，“好\u002F坏”之间的“界限”实际上相当模糊，并且取决于许多规范性因素。（例如，想象指示 AI 系统充当一家公司的公关代理人（PR agent）。社会现实的全部复杂性真的很难钉入一套清晰的命题中。这项工作最终需要“良好的判断力”，其执行方式在不同人和情境下会有很大差异。）这使得开发者很难为该任务编写良好的指令，也让人类评分者在确定如何解释这些指令时非常主观。（这与普通法体系（common law system）的工作方式有相似之处。写下能够预见大量边缘情况（edge-cases）的立法（legislation）极其困难。因此，社会使用司法体系（judicial system）来仲裁边缘情况并创建先例（precedents）。）\n* **评分者可能不理解任务。** 任何工作的招聘都很困难，评分工作也不例外。即使在招聘上花费了大量精力，也不能保证评分者具备完成任务的技能。这可能出于多种原因。例如，因为设计者没有预料到评分者将获得的任务的复杂性（即，雇佣了一位具有入门级软件工程技能（software engineering skills）的人，而任务实际上需要一位经验丰富的软件工程师），或者因为评分者自己没有意识到自己知识的局限性（例如，根据他们在大学生物课上学到的内容回答问题，而这实际上与最新的研究不一致）。\n* **人会犯错。** 教授发布的考试中有错误答案。医生在饥饿或疲劳时更容易误诊。当人们在家庭生活中受到干扰时，在工作中会失去专注力。出于各种原因，人类在任务上的表现可能低于“黄金标准（gold standard）”。这意味着即使尽了最大努力，AI 系统有时也会在错误或误导性的数据上进行训练。\n\n\n\n## 提示词（Prompting）编写注意事项\n\n**主要结论 -** 当你编写系统指令（system instructions）和提示词（prompts）时，你实际上是在为后训练团队（post-training team）评分池（rater pool）的聚合精神而写，其种子来源于预训练语料库（pre-training corpus）的聚合精神。如果我们编写的指令是特定领域内的平均评分者能够理解、领会并忠实遵循的，那么模型更有可能遵循我们的指令。\n\n> [!IMPORTANT]\n> 当你编写系统指令和提示词时，你实际上是在为后训练团队评分池的聚合精神而写，其种子来源于预训练语料库的聚合精神。\n\n当我们编写系统指令时，想象屏幕另一端有一位友好、善意且胜任的评分者准备扮演 AI 角色，这很有帮助。我们提供的文本是他们拥有的全部。也就是说，当我们向 Gemini 模型发起 API 调用（API call）时，想象另一端有一位人类评分者会仔细阅读我们的提示词并提供回复。在构建提示词时，站在他们的角度考虑我们的指令非常有帮助。例如，假设指令是关于生成 Python 代码的。如果我们随机找一位胜任的 Python 工程师，让他们回应这些指令，他们会明白我们要什么吗？\n\n这个比喻有助于帮助你编写良好的指令。但它不一定能很好地比喻 AI 系统是如何遵循这些指令的。例如，当我们考虑到这位所谓的评分者可能拥有所有人类知识时，这个比喻就开始失效了。他们只是缺乏智慧和上下文来超越我们提供的提示词进行洞察。此外，AI 系统可能不会像人类评分者那样“理解”提示词。相反，AI 系统将利用其非常强大的统计模型（statistical models）为你的输入提供输出。但是，这些统计模型是为针对善意且胜任的人类评分者编写的指令而优化的，因此，提供任何非为此类评分者编写的提示词可能会让 AI 系统感到困惑。没有什么比“分布外（out of distribution）”的情况更让 AI 讨厌的了！\n\n鉴于此，以下是一些帮助你改进提示词中指令的注意事项。随着模型的改进，以下注意事项可能会很快过时。我们建议尝试与这份列表的整体精神保持一致，而不是拘泥于字面意思。\n\n* **指令是否清晰、易读、简洁且明确？** 例如，假设我们的指令是关于某个 Python 编码任务的。如果我们随机找一位 Python 专家，让他们假装成 Gemini，我们的指令是否足够好，让他们在不问任何明显的澄清问题的情况下立即明白我们的意思？\n   * \u003Cins>差：\u003C\u002Fins> 编写一个计算质数的 Python 函数。\n   * \u003Cins>好：\u003C\u002Fins> 编写一个计算 1 到 100 之间质数的 Python 函数。为生成的函数包含 pytype 类型注解（pytype annotations），并使用 2 空格缩进。\n\n* 
**指令是否自相矛盾或难以遵循？** 一个无聊、饥饿、疲惫等的评估员（rater）真的会阅读我们过于冗长的指令并忠实遵循吗？请注意，为了确保给定 Prompt（提示词）中的所有指令都被遵循，通常涉及大量的质量控制。但人毕竟是人。我们的指令实际上是否“容易”遵循？还是它们包含了不必要的迂回、冗长等？当本指南（Playbook）的作者写下指令时，我们经常问自己，如果将这些指令在没有额外上下文的情况下呈现给我们公司的另一位员工，他们能否忠实遵循。\n   * \u003Cins>Bad:\u003C\u002Fins> Don't write a story about a mean dog, unless it's friendly, and also sad, but not really that sad, and make it long even but not too long. Also the dog should be named Bob, or maybe Susan, doesn't matter. Write it about a cat too. But that’s not actually as important, but make the dog fluffy.\n   * \u003Cins>Good:\u003C\u002Fins> Write a short story (200-300 words) about a loyal golden retriever named Buddy who gets lost in the woods during a family camping trip. The story should focus on Buddy's journey and his determination to find his way back to his family.\n\n* **给定的系统指令（System Instruction）中是否包含了指令过多？** 我们注意到，Prompt（提示词）中的指令数量与模型忠实遵循所有指令的能力之间存在反比关系。尽管我们确实见过许多模型能够相当好地遵循长链条指令的案例。这只是一个经验法则，也适用于为人类编写指令的情况。基本上，如果可以的话，最好将任务分解为子任务。很难为此考虑提供好坏示例，因为它很大程度上取决于所考虑的模型，但这里是我们所指的精神实质。\n   * \u003Cins>Bad:\u003C\u002Fins> Read each article and, for each key idea, rate it on a scale of 1-10 for how important it is to understanding the general point. Then for anything rated higher than 7, summarize it in the form of a Chinese social media post.\n   * \u003Cins>Good:\u003C\u002Fins> *Call the AI model for each subtask separately (what follows is not an example of a literal prompt for the model).* 1) Break the article into a list of main ideas. 2) Rate each idea on a scale of 1-10. 3) Take the top ones and translate them into Chinese. 4) Take the Chinese text and convert it into a social media post.\n\n* **使用正向指令而非负向指令。** 下面的“坏”例子说明了模型不应该做什么。但它没有说明模型应该做什么。“好”的例子非常明确地列出了模型应该做什么。这里实际上与人们在教师培训或伴侣治疗等情境中学到的有效人际沟通有相似之处。我们应该试着想象我们正在试图与一个想给我们想要东西的人沟通。但我们需要给他们关于什么是“成功”的非常明确的指导，而不是告诉他们要“避免失败”。\n   * \u003Cins>Bad:\u003C\u002Fins> “Don’t ever end your response with a full stop.”\n   * \u003Cins>Good:\u003C\u002Fins> “Your response should always end with an exclamation mark or a question mark”.\n\n* **良好的系统指令可以充当模型的“提醒”。** 在迭代新 Prompt（提示词）时，考虑非常多样化的输入示例非常重要。很常见的是，人们编写的 Prompt（提示词）可能对约 60-70% 的可能模型输入有效，但对剩余的约 30-40% 未指定或模糊不清。通常值得给模型一套明确的正向指令，说明在这些情况下它应该做什么。我们通常在系统指令中创建一个名为“额外考虑”或“额外假设”的独立部分，其中包含针对这些边缘情况的规格要点列表。\n\n* **Prompt（提示词）是新的超参数（Hyperparameters），你可能永远找不到“最好”的一个。** 例如，正确调整学习率可以对特定计算预算下模型在验证集上的最终性能产生巨大影响。同样，“好”Prompt（提示词）和“坏”Prompt（提示词）之间的差异会对系统的最终性能产生重大影响。同理，通过更多的 Prompt（提示词）调优，可能总存在一个“稍好”的 Prompt（提示词）。而且我们永远不会知道是否找到了“最好的一个”。如下文所述，我们可以利用与 [深度学习调优指南](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftuning_playbook) 中使用的类似直觉，系统地找到比我们当前基线“更好”的 Prompt（提示词）。该指南在超参数搜索的背景下提到了“试验预算”。对于提示工程（Prompting），为我们的实验创建一个时间盒（Timebox）可能会很有用。\n\n* **尝试给予模型明确的“我不知道”的指令。** 例如，假设我们正在处理某种多类文本分类任务。我们有一些标准，规定输入示例应如何映射到每个类别。创建一个额外的“未知”或“边缘情况”类别可能非常有帮助。并提供明确的正向指令，如果模型认为指令对于正确分类此示例不清楚，则将输入分类到此类别。然后我们可以查看日志，了解何时\u002F如何发生这种情况，并相应地改进 Prompt（提示词）。\n\n* **Prompt（提示词）可能与其开发的检查点（Checkpoint）深度耦合。** 也就是说，如果我们从 Gemini 1.5 Flash 获取一个 Prompt（提示词）并在 Gemini 1.0 Pro 上运行它，它可能不会以“相同的方式”工作，并且在评估（Eval）上可能表现出非常不同的聚合行为。这在某种程度上是有道理的。我们的心理模型是，我们用自然语言编写的 Prompt（提示词）类似于如果我们进行随机梯度下降（SGD）时会训练的参数。这在多大程度上是正确的，是一个开放研究的问题。模型有点像机器，后训练过程有点像编译器，而系统指令有点像计算机代码。只不过机器和编译器是完全流动的，给定的模型可以持有这些的许多不同组合。我们怀疑生态系统将有机地收敛到关于指令外观的某种共识结构，这种结构随时间快速变化，并以组合爆炸的方式保持合理的向后兼容性。这类似于 x86 指令集随时间保持相对稳定，相比之下，构建在其之上的编程语言的多样性呈爆炸式增长。\n\n## 提示词（Prompts）的初步“风格指南”\n目前，我们推测在前沿模型（frontier model）上执行的经过良好调整的提示词（Prompts），足以应对大量此前需要定制训练模型（bespoke trained model）的机器学习（ML）工作负载。\n\n随着推理成本（inference costs）、延迟（latency）、上下文窗口大小（context window 
sizes）等持续改善，提示词技术（prompting）将变得越来越普遍。编程语言风格指南的出现源于这样一个观察：软件主要是为其他软件工程师编写的，以便于维护等。我们不确定提示词技术（prompting）中相当于编程语言风格指南的是什么。但存在许多有趣的可能性。包含提示词的 Markdown 文件最终可能会被视为版本控制系统（version control systems）中具有特定文件扩展名的独立“语言”。或者，编程语言最终可能会与模型推理（model inference）有更“透明”的集成。现在做出确切的预测还为时过早！\n\n我们这里没有提供一份完整的风格指南。但我们想分享以下观察结果：\n* **考虑使用 Markdown：** 大多数版本控制系统都擅长渲染 Markdown 文件。因此，将每个提示词存储在单独的 Markdown 文件中可能是有用的。并且明智地使用 Markdown 标题等来组织我们每个提示词的内容。\n* **为他人着想！** 我们鼓励将保存的提示词视为主要是为了其他提示词维护者，而不仅仅是为了大语言模型（LLM）。随着模型变得更好，我们希望我们需要花费更少的精力在独特的取巧方法上来引导模型达到我们期望的行为。如上所述，这也可能自然地促成更好的提示词。\n* **简洁性：** 我们的提示词产生的“技术债务”与其长度和整体复杂度成正比。提示词与特定的检查点（checkpoint）紧密相连。每当底层模型发生变化时，都值得定量和定性地检查我们的提示词是否仍然有效。真的值得让提示词尽可能简单、简练和直接。让模型所做的隐含假设免费地为我们所用。如果简单的提示词就能解决问题，就不要试图向提示词中添加更多细节。这明确假设执行提示词的底层模型不会与提示词被随意解耦。当然，我们不知道模型是否会对不同的输入示例做出相同的隐含假设集。随着我们提示词的部署变得更加“严肃”，实施更严格的定量评估变得不可避免地至关重要。\n* **优先选择零样本（zero-shot）指令而非少样本（few-shot）指令：** 这与简洁性的精神一致。零样本指令更容易理解、调试和推理。少样本示例，尤其是当它们包含完整对话时，在我们显式指令的“字面意思”不足以捕捉指令“精神”时最为适用。我们见过很多少样本效果不如零样本的情况。我们不应仅仅假设少样本会更好，而应该对此进行实证研究。我们建议将少样本示例作为最后手段，因为它们会严重影响提示词的可读性。\n   * 与其使用完整的少样本示例，不如将示例编织进我们指令的正文中。考虑以下在指令末尾使用“例如”的系统指令（system instruction）：\n      * *始终用某种被动攻击性的内容开始你对用户的回复。例如，以类似“哦，这就是你想要的？我不是说你错了。但是，好吧，如果你真的想要的话。”这样的话开头。但要保持新鲜感，根据用户消息中的内容，每次回复使用不同的开头。*\n\n## 迭代新系统指令的流程\n\n提示工程（Prompting）本质上是迭代的。它在本质上非常类似于使用某些验证集（validation set）来训练模型。只不过，你不需要 JAX 等工具，而是可以通过编写清晰的文本来完成。\n\n与撰写文本类似，我们发现最好将任务分解为独立的生成（generate）和编辑（edit）阶段。\n\n我们发现，大多数尝试对模型进行提示的用户并没有一个干净的验证集来基准测试模型的响应。这对于快速构建最小可行性产品（MVP）来说是可以的。但最终，正式的定量评估对于跟踪模型性能是无价的。即使拥有最佳的提示词 + 模型组合，也不可能确定性地保证模型的行为。重要的是要设计能够优雅处理失败的产品界面，以防底层模型做出意外行为。\n\n像 [AI Studio](https:\u002F\u002Faistudio.google.com\u002Fapp\u002Fprompts\u002Fnew_chat) 这样的工具对于迭代新的系统指令集可能非常有价值。\n\n1. 从该问题的一组真正多样化的输入示例小样本开始，并尝试直观地了解期望的输出行为可能是什么。这可能是大约 ~10-50 个示例。\n\n2. 从最简单的系统指令开始，该指令可能满足每个输入示例，并使用可用中最小且最便宜的模型。目前，这很可能是像 Gemini Flash 8B 这样的模型。\n   * 当我们说“最简单”时，我们就是指最简单。在不损失任何清晰度的情况下，使其尽可能简洁简短。\n\n3. 在第一个输入示例上运行系统指令。\n\n4. 尝试对该特定示例进行“过拟合（overfit）”。\n   * 也就是说，向原始系统指令添加“提醒”，直到模型可靠地产生“足够好”的响应。当我们说提醒时，我们字面意思就是指提醒。找出模型对原始系统指令响应的任何具体缺陷。现在向其添加指令，指出这些具体行为。一次做一个。例如，假设原始指令告诉模型从输入字符串中提取所有命名实体（named entities）。我们注意到它也在提取建筑物、地点等的名称，但在这个黄金示例中，我们的意图是只提取人名。我们可以编辑\u002F修改原始指令以反映这一点。\n   * 如果它无法对第一个输入提示产生足够好的响应，请尝试第二个输入提示。如果模型在我们的大多数输入示例上一贯失败，请回到步骤 (2) 并尝试更好的模型。例如，如果我们从 Flash 开始，请尝试 Pro。\n\n5. 一旦我们能够成功对第一个示例进行过拟合，请尝试下一个输入示例。\n   * 此时，我们可能会注意到其中一些指令对第一个示例过拟合得太严重了。也许它们需要更多的细微差别。例如，也许它们需要明确的指令来澄清某些规格不足（underspecification），或者一些 if\u002Felse\u002Fthen 子句。\n   * 重复此整个过程，直到我们找到适用于所有多样化输入示例的系统指令。\n\n6. 在此过程中，我们可能随意编辑了系统指令的文本，使其适用于所有输入示例。现在是清理该文本的时候了。例如，添加章节标题、修正拼写错误等。完成此操作后，验证指令是否仍然适用于所有输入示例。如果不适用，请使用上述过程系统地调试它们。但不要以步骤 (2) 中的 MVP 系统指令为基础，而是使用当前清理后的文本状态。\n   * 出现某种情况并不令人惊讶：在某些模型上，“混乱”的提示词比“干净”的效果更好。这引发了一系列问题：\n      * 相对于其性能，维护此提示词有多重要？\n      * 我们是否有针对生产环境中模型行为的监控？此次部署是否重要到值得这样做？\n\n7. 
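（辅助工具，并非必需步骤）当输入示例变多后，手动逐条运行会很繁琐。可以用一小段脚本把“在每个输入示例上运行当前系统指令”这一步批量化，这样每次修改指令后都能快速做回归检查。下面是一个最小草图；其中 call_model 是一个假设存在的封装函数，代表对你所用模型 API（例如 Gemini API 客户端）的一次调用，函数名与签名均为本文虚构，需替换为实际实现：\n\n
```python\n# 批量回归检查的最小草图（示意用）。\n# call_model(system_instruction, user_input) 为假设的封装函数，\n# 请替换为你实际使用的模型调用。\nfrom typing import Callable, List, Tuple\n\n\ndef batch_check(\n    system_instruction: str,\n    examples: List[str],\n    call_model: Callable[[str, str], str],\n) -> List[Tuple[str, str]]:\n  # 对每个输入示例运行当前系统指令，返回（输入，输出）对，供人工或脚本检查。\n  results: List[Tuple[str, str]] = []\n  for example in examples:\n    results.append((example, call_model(system_instruction, example)))\n  return results\n```\n\n8. 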
也许有一天，从输入示例到系统指令的整个流程将由用于生成指令的元优化器（meta-optimizer）可靠地自动化。那将很棒，但我们不应被愚弄！这里没有免费的午餐。在使用机器学习（ML）系统时，总有一个旋钮需要调节。总有一些反复的定性工作需要进行，以确保我们的模型的相关性感知（即“电影宇宙”）与我们作为开发者在面对特定输入示例数据集时发现的相关内容保持一致。永远不可能凭空消除定性分析的艰苦工作，原因就像永远不可能凭空消除培训我们工程团队新成员的艰苦工作一样。提高工具的抽象程度仅仅将定性工作的焦点和“活化能”转移到了不同的抽象层级。但这并不意味着此类工具不能有用！\n\n> [!IMPORTANT]\n> 永远不可能凭空消除定性分析的艰苦工作，原因就像永远不可能凭空消除培训我们工程团队新成员的艰苦工作一样。\n\n## 关于大语言模型（LLMs）何时有用的几点思考\n\n我们已经花了很多篇幅讨论如何思考提示工程。也值得花一分钟谈谈“为什么”LLM 可能会证明是有用的。尽管这个领域非常新兴且创新迅速。因此，本节内容可能在几个月内迅速过时。\n\nLLM 最适合于答案难以生成但易于检查的场景。我们推荐 Chris Gorgolewski 的 [这篇文章](https:\u002F\u002Fwww.linkedin.com\u002Fpulse\u002Fhmec-principle-finding-sweet-spot-generative-ai-gorgolewski-ph-d--3zl5e)。根据我们的经验，如果我们将 LLM 用于不符合这一条件的场景，我们会遇到问题。\n\n与其编写难以理解、调试等的单体提示词，不如将问题分解为子问题并将推理链式连接起来。如果我们使每个子问题足够小，它们通常定义得更好，也更易于检查\u002F评估。\n\n硬件、商业模式等方面可能会有重大创新，以继续解锁更多的推理能力。我们应该去球将要落到的地方，而不是它现在所在的地方。\n\n假设推理成本很快会变得“便宜到无法计量”、“足够好”或围绕“价值”而非“成本”导向，似乎更具未来保障。也就是说，如果我们在一个更有价值但需要更多推理调用才能可靠工作的功能，与一个价值低得多但成本更低的功能之间做权衡，那么以前者为导向似乎更具未来保障。并且或许通过限制发布范围等方式来解决当前的单位经济问题。如果我们开始看到类似于摩尔定律的现象，但是是针对 LLM 推理的价格下降，我们也不会感到惊讶。\n\n## 更多资源\n\n关于提示词（Prompting）的在线优质资源非常丰富，这里无法一一列举。我们并非唯一在思考提示词工程的人。本指南仅仅汇集了我们一些非正式的想法。但在互联网上确实有非常多的优秀成果。例如：\n* [AI 提示工程：深度解析](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=T9aRN5JkmL8) by Anthropic.\n* [本指南](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fprompt-engineering) from OpenAI.\n* [本指南](https:\u002F\u002Fservices.google.com\u002Ffh\u002Ffiles\u002Fmisc\u002Fgemini-for-google-workspace-prompting-guide-101.pdf) for Gemini.\n\n思考“高语境”和“低语境”文化之间的差异也颇有趣味。我们经验性地发现，那些善于向模型提问的人，与那些擅长在低语境文化中进行有效沟通的人之间，存在着显著的交集。这很好理解，因为大语言模型（LLM）其实并不清楚它应该处于哪个“电影宇宙”中运作。它需要明确的指令来明确其角色扮演的身份、所处位置，以及环境中哪些信息是相关的。\n\n从个人经验来看，练习 [非暴力沟通（Nonviolent Communication）](https:\u002F\u002Fwww.amazon.com\u002FNonviolent-Communication-Language-Life-Changing-Relationships\u002Fdp\u002F189200528X) 会非常有帮助。它教会了人们区分可观察的行为和内心的独白。这种区分对于用可观察的行为来表述指令非常有帮助。虽然不言而喻，我们练习非暴力沟通最初并不是为了更擅长向模型提问。\n\n## 致谢\n* 感谢 Slav Petrov 和 Sian Gooding 在最终审查期间审阅了本文档。\n* 感谢 Anna Bortsova 在文档最终版本的起草过程中提供了许多有益的评论\u002F反馈。\n* 感谢 Jennimaria Palomaki, James Wexler, Vera Axelrod 提出了许多有用的修改建议。","# prompt-tuning-playbook 快速上手指南\n\n本指南旨在帮助开发者快速理解并使用 `prompt-tuning-playbook`。该工具主要是一份关于后训练大语言模型（Post-trained LLMs，如 Gemini）提示词调优的最佳实践文档与方法论集合，而非传统的可执行代码库。\n\n## 环境准备\n\n由于本项目主要为文档资源，对环境要求极低：\n\n- **系统要求**：任何支持文本浏览的操作系统（Windows, macOS, Linux）。\n- **前置依赖**：\n  - 互联网连接（用于访问在线文档或克隆仓库）。\n  - （可选）Git 客户端（用于本地离线阅读）。\n  - Markdown 阅读器（如 VS Code, Typora 或浏览器）。\n\n## 安装步骤\n\n本项目无需编译或安装依赖包，主要通过获取源代码或在线查看方式使用。\n\n1. **在线查看**\n   直接访问项目的 GitHub 页面或文档链接阅读 README 文件。\n\n2. **本地克隆（推荐）**\n   若需离线参考，可使用 Git 将仓库克隆到本地：\n\n   ```bash\n   git clone \u003Crepository-url>\n   cd prompt-tuning-playbook\n   ```\n\n   > 请将 `\u003Crepository-url>` 替换为实际的项目地址。\n\n## 基本使用\n\n本“工具”的核心价值在于其提供的思维模型和操作流程。以下是基于文档内容的核心使用方法：\n\n### 1. 理解核心概念\n在阅读具体技巧前，先建立正确的心理模型：\n- **预训练 (Pre-training)**：理解模型通过互联网数据学习到了多种“电影宇宙”（Cinematic Universe），即不同的语境和事实体系。\n- **后训练 (Post-training)**：理解后训练赋予了模型默认的“角色”和行为准则（如遵循指令、符合社会规范）。\n\n### 2. 应用提示词风格指南\n参考文档中的 **\"A rudimentary 'style guide' for prompts\"** 部分优化你的 Prompt：\n- **避免模糊**：不要使用过于通用的开场白（如 \"Hi, how are you?\"），这会让模型难以确定所处的“宇宙”。\n- **提供强信号**：在 Prompt 中明确风格、主题和上下文约束。\n\n**示例：**\n\n```text\n# 较差的 Prompt\n请写一篇关于纽约的文章。\n\n# 推荐的 Prompt\nThe concrete jungle where dreams are made of isn't just a catchy lyric – it's the electric truth of New York City. From the soaring skyscrapers that pierce the clouds to the vibrant pulse of its diverse neighborhoods, NYC offers an experience unlike any other on Earth. 
\n(续写以下内容，保持这种充满活力的博客风格...)\n```\n\n### 3. 迭代系统指令\n按照文档中的 **\"Procedure for iterating on new system instructions\"** 流程进行优化：\n1. 定义具体的任务目标。\n2. 编写初始系统指令。\n3. 测试并观察模型行为是否符合预期的“默认宇宙”。\n4. 根据反馈调整指令，直到模型行为一致且安全。\n\n通过遵循上述步骤，你可以更系统地利用该 Playbook 提升与大语言模型的交互效果。","某电商技术团队正在紧急上线一个智能售后助手。他们要求大模型能够精准理解用户复杂的退换货诉求，并以专业且具同理心的语气进行回复。\n\n### 没有 prompt-tuning-playbook 时\n- 提示词编写高度依赖个人直觉，导致模型在不同测试集上的表现极不稳定。\n- 缺乏统一的指令规范，使得售前咨询与售后处理的回复风格严重割裂。\n- 调试过程如同黑盒操作，工程师耗费数天时间却只能依靠运气获得理想效果。\n- 混淆了预训练能力与后训练目标，导致模型在特定垂直领域的专业知识调用失败。\n\n### 使用 prompt-tuning-playbook 后\n- 严格遵循文档中的思维模型，构建了模块化的系统指令框架，确保持续稳定的输出质量。\n- 应用推荐的风格指南，统一了全链路的交互语气，显著提升了用户体验的一致性。\n- 执行结构化的迭代流程，能够快速隔离变量，将单点问题的修复时间从数天缩短至数小时。\n- 清晰界定后训练阶段的优化边界，使模型能更精准地调用内部知识解决具体业务逻辑。\n\n通过引入这套经过验证的方法论，团队成功将提示词工程从不可控的“艺术创作”转化为了可维护、可评估的标准化工程实践。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fvarungodbole_prompt-tuning-playbook_07b12964.png","varungodbole","Varun Godbole","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fvarungodbole_07967c3d.png","ex-Google DeepMind. Spent a decade doing deep learning research and was a core member of Gemini's modeling team from day one.",null,"New York","VarunGodbole","www.varungodbole.com","https:\u002F\u002Fgithub.com\u002Fvarungodbole",900,38,"2026-03-18T03:44:30","NOASSERTION",1,"未说明",{"notes":91,"python":89,"dependencies":92},"该仓库为提示词调优指南文档（Playbook），非可执行代码库。内容主要提供关于 LLM 预训练、后训练及提示词工程的心智模型与最佳实践，专注于 Gemini 系列模型。无需安装特定依赖或配置运行环境，适合阅读参考。",[],[15],"2026-03-27T02:49:30.150509","2026-04-06T07:23:22.755932",[],[]]