[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-JindongGu--Awesome-Prompting-on-Vision-Language-Model":3,"tool-JindongGu--Awesome-Prompting-on-Vision-Language-Model":65},[4,23,32,40,49,57],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":22},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,2,"2026-04-05T10:45:23",[13,14,15,16,17,18,19,20,21],"图像","数据工具","视频","插件","Agent","其他","语言模型","开发框架","音频","ready",{"id":24,"name":25,"github_repo":26,"description_zh":27,"stars":28,"difficulty_score":29,"last_commit_at":30,"category_tags":31,"status":22},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[17,13,20,19,18],{"id":33,"name":34,"github_repo":35,"description_zh":36,"stars":37,"difficulty_score":29,"last_commit_at":38,"category_tags":39,"status":22},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",74913,"2026-04-05T10:44:17",[19,13,20,18],{"id":41,"name":42,"github_repo":43,"description_zh":44,"stars":45,"difficulty_score":46,"last_commit_at":47,"category_tags":48,"status":22},3215,"awesome-machine-learning","josephmisiti\u002Fawesome-machine-learning","awesome-machine-learning 是一份精心整理的机器学习资源清单，汇集了全球优秀的机器学习框架、库和软件工具。面对机器学习领域技术迭代快、资源分散且难以甄选的痛点，这份清单按编程语言（如 Python、C++、Go 等）和应用场景（如计算机视觉、自然语言处理、深度学习等）进行了系统化分类，帮助使用者快速定位高质量项目。\n\n它特别适合开发者、数据科学家及研究人员使用。无论是初学者寻找入门库，还是资深工程师对比不同语言的技术选型，都能从中获得极具价值的参考。此外，清单还延伸提供了免费书籍、在线课程、行业会议、技术博客及线下聚会等丰富资源，构建了从学习到实践的全链路支持体系。\n\n其独特亮点在于严格的维护标准：明确标记已停止维护或长期未更新的项目，确保推荐内容的时效性与可靠性。作为机器学习领域的“导航图”，awesome-machine-learning 以开源协作的方式持续更新，旨在降低技术探索门槛，让每一位从业者都能高效地站在巨人的肩膀上创新。",72149,1,"2026-04-03T21:50:24",[20,18],{"id":50,"name":51,"github_repo":52,"description_zh":53,"stars":54,"difficulty_score":46,"last_commit_at":55,"category_tags":56,"status":22},2234,"scikit-learn","scikit-learn\u002Fscikit-learn","scikit-learn 是一个基于 Python 构建的开源机器学习库，依托于 SciPy、NumPy 等科学计算生态，旨在让机器学习变得简单高效。它提供了一套统一且简洁的接口，涵盖了从数据预处理、特征工程到模型训练、评估及选择的全流程工具，内置了包括线性回归、支持向量机、随机森林、聚类等在内的丰富经典算法。\n\n对于希望快速验证想法或构建原型的数据科学家、研究人员以及 Python 开发者而言，scikit-learn 是不可或缺的基础设施。它有效解决了机器学习入门门槛高、算法实现复杂以及不同模型间调用方式不统一的痛点，让用户无需重复造轮子，只需几行代码即可调用成熟的算法解决分类、回归、聚类等实际问题。\n\n其核心技术亮点在于高度一致的 API 
设计风格，所有估算器（Estimator）均遵循相同的调用逻辑，极大地降低了学习成本并提升了代码的可读性与可维护性。此外，它还提供了强大的模型选择与评估工具，如交叉验证和网格搜索，帮助用户系统地优化模型性能。作为一个由全球志愿者共同维护的成熟项目，scikit-learn 以其稳定性、详尽的文档和活跃的社区支持，成为连接理论学习与工业级应用的最",65628,"2026-04-05T10:10:46",[20,18,14],{"id":58,"name":59,"github_repo":60,"description_zh":61,"stars":62,"difficulty_score":10,"last_commit_at":63,"category_tags":64,"status":22},3364,"keras","keras-team\u002Fkeras","Keras 是一个专为人类设计的深度学习框架，旨在让构建和训练神经网络变得简单直观。它解决了开发者在不同深度学习后端之间切换困难、模型开发效率低以及难以兼顾调试便捷性与运行性能的痛点。\n\n无论是刚入门的学生、专注算法的研究人员，还是需要快速落地产品的工程师，都能通过 Keras 轻松上手。它支持计算机视觉、自然语言处理、音频分析及时间序列预测等多种任务。\n\nKeras 3 的核心亮点在于其独特的“多后端”架构。用户只需编写一套代码，即可灵活选择 TensorFlow、JAX、PyTorch 或 OpenVINO 作为底层运行引擎。这一特性不仅保留了 Keras 一贯的高层易用性，还允许开发者根据需求自由选择：利用 JAX 或 PyTorch 的即时执行模式进行高效调试，或切换至速度最快的后端以获得最高 350% 的性能提升。此外，Keras 具备强大的扩展能力，能无缝从本地笔记本电脑扩展至大规模 GPU 或 TPU 集群，是连接原型开发与生产部署的理想桥梁。",63927,"2026-04-04T15:24:37",[20,14,18],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":71,"readme_en":72,"readme_zh":73,"quickstart_zh":74,"use_case_zh":75,"hero_image_url":76,"owner_login":77,"owner_name":78,"owner_avatar_url":79,"owner_bio":80,"owner_company":81,"owner_location":82,"owner_email":78,"owner_twitter":78,"owner_website":83,"owner_url":84,"languages":78,"stars":85,"forks":86,"last_commit_at":87,"license":78,"difficulty_score":46,"env_os":88,"env_gpu":89,"env_ram":89,"env_deps":90,"category_tags":93,"github_topics":94,"view_count":10,"oss_zip_url":78,"oss_zip_packed_at":78,"status":22,"created_at":98,"updated_at":99,"faqs":100,"releases":101},2567,"JindongGu\u002FAwesome-Prompting-on-Vision-Language-Model","Awesome-Prompting-on-Vision-Language-Model","This repo lists relevant papers summarized in our survey paper:  A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models.","Awesome-Prompting-on-Vision-Language-Model 是一个专注于视觉 - 语言模型提示工程（Prompt Engineering）的开源资源库。它系统性地整理了相关前沿论文，旨在帮助研究者快速掌握如何利用提示技术，让大规模预训练模型更好地适应新任务，而无需重新训练整个模型。\n\n该资源库主要解决了在多模态人工智能研究中，如何高效设计和应用提示以激发模型潜力的难题。它将复杂的提示方法归纳为“硬提示”（如任务指令、少样本学习）和“软提示”（如提示微调），并覆盖了三大类主流模型：多模态到文本生成模型（如 Flamingo）、图文匹配模型（如 CLIP）以及文生图模型（如 Stable Diffusion）。\n\n这一工具特别适合人工智能领域的研究人员、算法工程师以及对多模态大模型感兴趣的高级开发者使用。通过查阅其中分类清晰的论文列表和技术综述，用户可以迅速了解不同场景下的最佳实践，避免重复造轮子。其独特的亮点在于提供了一份基于系统性调研的知识地图，不仅区分了不同的融合模块架构，还深入剖析了各类提示策略的适用场景，是探索视觉 - 语言基础模型不可或缺的理论指南与实","Awesome-Prompting-on-Vision-Language-Model 是一个专注于视觉 - 语言模型提示工程（Prompt Engineering）的开源资源库。它系统性地整理了相关前沿论文，旨在帮助研究者快速掌握如何利用提示技术，让大规模预训练模型更好地适应新任务，而无需重新训练整个模型。\n\n该资源库主要解决了在多模态人工智能研究中，如何高效设计和应用提示以激发模型潜力的难题。它将复杂的提示方法归纳为“硬提示”（如任务指令、少样本学习）和“软提示”（如提示微调），并覆盖了三大类主流模型：多模态到文本生成模型（如 Flamingo）、图文匹配模型（如 CLIP）以及文生图模型（如 Stable Diffusion）。\n\n这一工具特别适合人工智能领域的研究人员、算法工程师以及对多模态大模型感兴趣的高级开发者使用。通过查阅其中分类清晰的论文列表和技术综述，用户可以迅速了解不同场景下的最佳实践，避免重复造轮子。其独特的亮点在于提供了一份基于系统性调研的知识地图，不仅区分了不同的融合模块架构，还深入剖析了各类提示策略的适用场景，是探索视觉 - 语言基础模型不可或缺的理论指南与实践手册。","\n\n# Awesome Prompting on Vision-Language Models\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_e31dfd33163a.png\" width=\"100%\" height=\"100%\">\n\n## # :nerd_face: What is Prompting on Vision-Language Models?\nPrompt engineering is a technique that involves augmenting a large pre-trained model with task-specific hints, known as prompts, to adapt the model to new tasks. 
This repo aims to provide **a comprehensive survey** of cutting-edge research in prompt engineering on **three** types of vision-language models (VLMs): **multimodal-to-text generation models** (*e.g.*, Flamingo), **image-text matching models** (*e.g.*, CLIP), and **text-to-image generation models** (*e.g.*, Stable Diffusion) (Fig. 1).\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_551ea06ea72e.png\">\n\n\u003Cp align=\"center\"> \u003Ci>Fig. 1: This work focuses on three main types of vision-language models.\u003C\u002Fi> \t\t \u003C\u002Fp>\n\n### Reference\n\nThis repo lists relevant papers summarized in our survey: \n\n**A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models.**  *[Jindong Gu](https:\u002F\u002Fjindonggu.github.io\u002F), [Zhen Han](https:\u002F\u002Fsites.google.com\u002Fview\u002Fzhenhan\u002Fhome?authuser=0), [Shuo Chen](https:\u002F\u002Fchenxshuo.github.io\u002F), [Ahmad Beirami](https:\u002F\u002Fsites.google.com\u002Fview\u002Fbeirami), [Bailan He](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=n5zUQtAAAAAJ&hl=en), [Gengyuan Zhang](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=LN2tYr0AAAAJ&hl=en), [Ruotong Liao](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=XFQv_oYAAAAJ&hl=en), [Yao Qin](https:\u002F\u002Fcseweb.ucsd.edu\u002F~yaq007\u002F), [Volker Tresp](https:\u002F\u002Fwww.dbs.ifi.lmu.de\u002F~tresp\u002F), [Philip Torr](https:\u002F\u002Ftorrvision.com\u002Findex.html)*. Preprint 2023. [[pdf]](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12980)\n\nIf you find our paper and repo helpful to your research, please cite the following paper:\n```latex\n@article{gu2023survey,\n  title={A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models},\n  author={Gu, Jindong and Han, Zhen and Chen, Shuo and Beirami, Ahmad and He, Bailan and Zhang, Gengyuan and Liao, Ruotong and Qin, Yao and Tresp, Volker and Torr, Philip},\n  journal={arXiv preprint arXiv:2307.12980},\n  year={2023}\n}\n```\n\n\n\n## # :paperclips: Awesome Papers\n\n- [Prompting Model in Multimodal-to-Text Generation](#prompting-model-in-multimodal-to-text-generation-eg-on-flamingo)\n- [Prompting Model in Image-Text Matching](#prompting-model-in-image-text-matching-eg-on-clip)\n\n- [Prompting Model in Text-to-Image Generation](#prompting-model-in-text-to-image-generation-eg-on-stable-diffusion)\n\n\n\n### Prompting Models in Multimodal-to-Text Generation (*e.g.* on Flamingo)\n\nThere are two main types of fusion module approaches based on the integration of visual and textual modalities: **encoder-decoder as a multi-modal fusion module** and **decoder-only as a multi-modal fusion module**. Prompting methods can be divided into **two main categories**  (Fig. 2) based on the readability of the templates: **hard prompt** and **soft prompt**. Hard prompt encompasses four subcategories: *task instruction, in-context learning,* *retrieval-based prompting, and chain-of-thought prompting*. Soft prompts are classified into two strategies: *prompt tuning* and *prefix token tuning*, based on whether they internally add new tokens to the model's architecture or simply append them to the input. 
this study primarily concentrates on prompt methods that avoid altering the base model.\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_ac1361c47548.png\">\n\n\u003Cp align=\"center\">  \u003Ci>Fig. 2 : Classification of prompting methods.\u003C\u002Fi> \t\t \u003C\u002Fp>\n\n\n\n| Title                                                        | Venue       | Year | Code if available                                            | Comment                                              |\n| :----------------------------------------------------------- | ----------- | ---- | ------------------------------------------------------------ | ---------------------------------------------------- |\n| [Unifying Vision-and-Language Tasks via Text Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.02779) | ICML        | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fj-min\u002FVL-T5)                     | Encoder-decoder fusion; Text prefixes as prompt      |\n| [SimVLM: Simple Visual Language Model Pretraining with Weak Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.10904) | ICLR        | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FYulongBonjour\u002FSimVLM)            | Encoder-decoder fusion; Text prefixes as prompt      |\n| [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03052) | ICML        | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FOFA-Sys\u002FOFA)                     | Encoder-decoder fusion; Text prefixes as prompt      |\n| [PaLI: A Jointly-Scaled Multilingual Language-Image Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.06794) | ICLR        | 2023 | ---                                                          | Encoder-decoder fusion; Instruction prompt           |\n| [Multimodal Few-Shot Learning with Frozen Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.13884) | NeurIPS     | 2021 | [Page](https:\u002F\u002Ffh295.github.io\u002Ffrozen.html)                  | Decoder-only fusion; Image conditional prefix tuning |\n| [Flamingo: a Visual Language Model for Few-Shot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198) | NeurIPS     | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_flamingo)     | Decoder-only fusion; Text prompts;                   |\n| [MAGMA -- Multimodal Augmentation of Generative Models through Adapter-based Finetuning](https:\u002F\u002Faclanthology.org\u002F2022.findings-emnlp.179\u002F) | EMNLP       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FAleph-Alpha\u002Fmagma)               | Decoder-only fusion; Image conditional prefix tuning |\n| [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597) | ICML        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2) | Decoder-only fusion; Image conditional prefix tuning |\n| [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage-models.pdf) | OpenAI Blog | 2019 | [Github](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-2)                    | Task instruction prompt                              |\n| [The Turking Test: Can Language Models Understand 
Instructions?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11982) | arXiv       | 2020 | ---                                                          | Task instruction prompt                              |\n| [Language Models are Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165) | NeurIPS     | 2020 | ---                                                          | In-context learning                                  |\n| [Learning To Retrieve Prompts for In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08633) | NAACL-HLT   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FOhadRubin\u002FEPR)                   | Retrieval-based prompting                            |\n| [Unified Demonstration Retriever for In-Context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04320) | ACL         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FKaiLv69\u002FUDR)                     | Retrieval-based prompting                            |\n| [Compositional Exemplars for In-context Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05698) | ICML        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Ficl-ceil)                 | Retrieval-based prompting                            |\n| [Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903) | NeurIPS     | 2022 | ---                                                          | Chain-of-thought prompting                           |\n| [Automatic Chain of Thought Prompting in Large Language Models]() | ICLR        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Famazon-research\u002Fauto-cot)        | Chain-of-thought prompting                           |\n| [The Power of Scale for Parameter-Efficient Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691) | EMNLP       | 2021 | ---                                                          | Prompt tuning                                        |\n| [Learning How to Ask: Querying LMs with Mixtures of Soft Prompts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06599) | NAACL-HLT   | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fhiaoxui\u002Fsoft-prompts)            | Prompt tuning                                        |\n| [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00190) | ACL         | 2021 | [Github](https:\u002F\u002Fgithub.com\u002FXiangLi1999\u002FPrefixTuning)        | Prefix tuning                                        |\n| [Prompt Tuning for Generative Multimodal Pretrained Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.02532) | ACL         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FOFA-Sys\u002FOFA)                     | Prompt tuning on OFA                                 |\n| [Language Is Not All You Need: Aligning Perception with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045) | NeurIPS       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)                 | Textual instruction prompts                          |\n| [Benchmarking Robustness of Adaptation Methods on Pre-trained Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02080) | NeurIPS       | 2024 | [Page](https:\u002F\u002Fadarobustness.github.io\u002F)                     | Robustness of prompt tuning on VLMs                  |\n| [Towards Robust Prompts on Vision-Language 
Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08479) | NextGenAISafety@ICLR       | 2024 | ---                                                          | Robustness of prompt tuning on VLMs                  |\n| [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.06500.pdf) | NeurIPS | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Finstructblip) | Prompt tuning |\n| [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.08485.pdf) | NeurIPS | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA) | |\n| [Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12966.pdf) | arXiv | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-VL) | Prompt tuning |\n| [Shikra: Unleashing Multimodal LLM’s Referential Dialogue Magic](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.15195.pdf) | arXiv  | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshikras\u002Fshikra) | |\n| [MINIGPT-4: ENHANCING VISION-LANGUAGE UNDERSTANDING WITH ADVANCED LARGE LANGUAGE MODELS](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10592.pdf) | ICLR | 2023 | [Github](https:\u002F\u002Fminigpt-4.github.io\u002F) | Prompt tuning |\n\n\n## Prompting Model in Image-Text Matching (*e.g.* on CLIP)\n\nDepending on the target of prompting, existing methods can be classified into three categories: **prompting the text encoder**, **prompting the visual encoder**, or **jointly prompting both branches** as shown in Fig. 2 . These approaches aim to enhance the flexibility and task-specific performance of VLMs.\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_1cedb71c8412.png\">\n\n\u003Cp align=\"center\">  \u003Ci>Fig. 2: Classification of prompting methods on Image-Text Matching VLMs. 
\u003C\u002Fi> \t\t \u003C\u002Fp>\n\n\n\n| Title                                                        | Venue   | Year | Code if available                                            | Comment                                            |\n| ------------------------------------------------------------ | ------- | ---- | ------------------------------------------------------------ | -------------------------------------------------- |\n| [Learning Transferable Visual Models From Natural Language Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) | ICML    | 2021 | [Github](https:\u002F\u002Fgithub.com\u002FOpenAI\u002FCLIP)                     | Hard text prompts; Prompt for Image classification |\n| [Delving into the Openness of CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01986) | ACL       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Flancopku\u002Fclip-openness)                      | Hard text prompts for understanding           |\n| [Test-Time Prompt Tuning for Zero-Shot Generalization in Vision-Language Models](https:\u002F\u002Fopenreview.net\u002Fforum?id=e8PVEkSa4Fq) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | Soft text prompts                                  |\n| [Learning to Prompt for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01134) | IJCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)                | Soft text prompts                                  |\n| [Prompting Visual-Language Models for Efficient Video Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04478) | ECCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fju-chen\u002FEfficient-Prompt)        | Soft text prompts                                  |\n| [Multitask Vision-Language Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | Soft text prompts                                  |\n| [Conditional Prompt Learning for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05557) | CVPR    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)                | Soft text prompts                                  |\n| [Visual Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.12119)     | ECCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKMnP\u002Fvpt)                        | Visual patch-wise prompts                          |\n| [Exploring Visual Prompts for Adapting Large-Scale Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17274) | arXiv   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fhjbahng\u002Fvisual_prompting)        | Visual patch-wise prompts                          |\n| [Multitask Vision-Language Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | Visual patch-wise prompts                          |\n| [Unleashing the Power of Visual Prompting At the Pixel Level](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10556) | TMLR   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002FEVP)                   | Visual patch-wise prompts                          |\n| [Diversity-Aware Meta Visual Prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08138) | CVPR    | 2023 | 
[Github](https:\u002F\u002Fgithub.com\u002Fshikiw\u002FDAM-VP)                   | Visual patch-wise prompts                          |\n| [CPT: Colorful Prompt Tuning for Pre-trained Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.11797) | AI open   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FCPT)                      | Visual annotation prompts                          |\n| [What does CLIP know about a red circle? Visual prompt engineering for VLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06712) | ICCV   | 2023 | ---                                                          | Visual annotation prompts                          |\n| [Visual Prompting via Image Inpainting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00647) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Famirbar\u002Fvisual_prompting)        | Visual annotation prompts                          |\n| [Unified Vision and Language Prompt Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07225) | arXiv   | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fyuhangzang\u002FUPT)                  | Coupled unified prompting                          |\n| [Multitask Vision-Language Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | Decoupled unified prompting                        |\n| [MaPLe: Multi-modal Prompt Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03117) | CVPR    | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fmuzairkhattak\u002Fmultimodal-prompt-learning) | Decoupled unified prompting                        |\n| [Understanding Zero-shot Adversarial Robustness for Large-Scale Models](https:\u002F\u002Fopenreview.net\u002Fforum?id=P4bXCawRi5J) | ICLR    | 2023 | [Code](https:\u002F\u002Fwww.catalyzex.com\u002Fpaper\u002Farxiv:2212.07016\u002Fcode) | Adversarial robustness of prompt                   |\n| [Visual Prompting for Adversarial Robustness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06284) | ICASSP  | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FPhoveran\u002Fvp-for-adversarial-robustness) | Adversarial robustness of prompt                   |\n| [Align before Fuse: Vision and Language Representation Learning with Momentum Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.07651) | NeurIPS | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FALBEF\u002F)               | Image-Text Matching Model                          |\n| [Unsupervised Prompt Learning for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03649) | arXiv   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Ftonyhuang2022\u002FUPL)               | Unspervised learnable prompts                      |\n| [Test-Time Prompt Tuning for Zero-Shot Generalization in Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07511) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | Learnable prompt                                   |\n| [Prompt Pre-Training with Over Twenty-Thousand Classes for Open-Vocabulary Visual Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04704) | NeurIPS       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fprompt-pretraining)   | Prompt Pre-Training  |\n| [Consistency-guided Prompt Learning for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.01195) | 
ICLR    | 2024 | --- | Decoupled unified prompting                        |\n| [Improving Adaptability and Generalizability of Efficient Transfer Learning for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.15569) | arXiv       | 2023 | ---   | Learnable prompt  |\n| [Efficient Test-Time Prompt Tuning for Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.05775) | arXiv | 2024 | --- | Efficient test-time prompt tuning |\n| [Progressive Visual Prompt Learning with Contrastive Feature Re-formation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08386) | IJCV | 2024 | [GitHub](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FProVP) | Visual prompt tuning |\n| [AWT: Transferring Vision-Language Models via Augmentation, Weighting, and Transportation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04603) | NeurIPS | 2024 | [GitHub](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FAWT) | LLM prompt generation; Optimal transport |\n\n\n### Applications & Responsible AI\n\n| Title                                                        | Venue         | Year | Code if available                                            | Comment                                                      |\n| ------------------------------------------------------------ | ------------- | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| [LMPT: Prompt Tuning with Class-Specific Embedding Loss for Long-tailed Multi-Label Visual Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04536) | ALVR       | 2024 | [Github](https:\u002F\u002Fgithub.com\u002Frichard-peng-xia\u002FLMPT)                      | Prompts for long-tailed multi-label image classification           |\n| [Test-Time Prompt Tuning for Zero-Shot Generalization in Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07511) | NeurIPS       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | Learnable prompt; Prompts for image classification           |\n| [LPT: Long-tailed Prompt Tuning for Image Classification](https:\u002F\u002Fopenreview.net\u002Fforum?id=8pOVAeo8ie) | ICLR          | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FDongSky\u002FLPT)                     | Prompts for long-tailed image classification                 |\n| [Texts as Images in Prompt Tuning for Multi-Label Image Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.12739) | CVPR          | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fguozix\u002FTaI-DPT)                  | Prompts for multi-label image classification and detection   |\n| [DualCoOp: Fast Adaptation to Multi-Label Recognition with Limited Annotations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.09541) | NeurIPS       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fsunxm2357\u002FDualCoOp)              | Prompts for multi-label image classification and recognition |\n| [Visual Prompt Tuning for Few-Shot Text Classification](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.492.pdf) | ICCL          | 2022 | ---                                                          | Visual prompts for text classification                       |\n| [Open-vocabulary Object Detection via Vision and Language Knowledge Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.13921) | ICLR          | 2021 | 
[Github](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftpu\u002Ftree\u002Fmaster\u002Fmodels\u002Fofficial\u002Fdetection\u002Fprojects\u002Fvild) | Prompts for object detection                                 |\n| [Learning to Prompt for Open-Vocabulary Object Detection with Vision-Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14940) | CVPR          | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fdyabel\u002Fdetpro)                   | Prompts for object detection                                 |\n| [PromptDet: Towards Open-vocabulary Detection using Uncurated Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.16513) | ECCV          | 2022 | [Github](https:\u002F\u002Ffcjian.github.io\u002Fpromptdet)                 | Prompts for object detection                                 |\n| [Optimizing Continuous Prompts for Visual Relationship Detection by Affix-Tuning](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9815128) | IEEE Access   | 2022 | ---                                                          | Soft prompts for visual relation detection                   |\n| [Towards Open-vocabulary Scene Graph Generation with Prompt-based Finetuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.08165) | ECCV          | 2022 | ---                                                          | Soft prompts for visual relation detection                   |\n| [Compositional Prompt Tuning with Motion Cues for Open-vocabulary Video Relation Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00268) | ICLR          | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FDawn-LX\u002FOpenVoc-VidVRD)          | Relation Prompts for video open-vocabulary relation detection |\n| [DenseCLIP: Language-Guided Dense Prediction with Context-Aware Prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.01518) | CVPR          | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fraoyongming\u002FDenseCLIP)           | Class-conditioned text prompts for semantic segmentation     |\n| [Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643)         | ICCV          | 2023 | [Github](https:\u002F\u002Fsegment-anything.com\u002F)                      | Promptable queries for semantic segmentation                 |\n| [Domain Adaptation via Prompt Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.06687) | IEEE         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FLeapLabTHU\u002FDAPrompt)             | Domain-specific textual prompts for domain adaptation        |\n| [Visual Prompt Tuning for Test-time Domain Adaptation]()     | arXiv         | 2022 | ---                                                          | Prompts for domain adaptation                                |\n| [Learning to Prompt for Continual Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08654) | CVPR          | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fl2p)             | Prompts for continual learning                               |\n| [DualPrompt: Complementary Prompting for Rehearsal-free Continual Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.04799) | ECCV          | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fl2p)             | Prompts for continual learning                               |\n| [Prompt Vision Transformer for Domain Generalization]()      | arXiv         | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fzhengzangw\u002FDoPrompt)             | Prompts for domain generalization        
                    |\n| [Understanding Zero-Shot Adversarial Robustness for Large-Scale Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07016) | ICLR         | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fcvlab-columbia\u002FZSRobust4FoundationModel) | Visual prompt tuning under adversarial attack                |\n| [Visual Prompting for Adversarial Robustness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06284) | ICASSP        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FPhoveran\u002Fvp-for-adversarial-robustness) | Visual prompting to improve the adversarial robustness       |\n| [Exploring the Universal Vulnerability of Prompt-based Learning Paradigm](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05239) | NAACL         | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fleix28\u002Fprompt-universal-vulnerability) | Visual prompting vulnerability                               |\n| [Poisoning and Backdooring Contrastive Learning](https:\u002F\u002Fopenreview.net\u002Fforum?id=iC4UHbQ01Mp) | ICLR          | 2022 | ---                                                          | Backdoor and poisoning attacks on CLIP                       |\n| [BadEncoder: Backdoor Attacks to Pre-trained Encoders in Self-Supervised Learning](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9833644) | IEEE          | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fjjy1994\u002FBadEncoder)              | Backdoor attack on CLIP                                      |\n| [CleanCLIP: Mitigating Data Poisoning Attacks in Multimodal Contrastive Learning ](https:\u002F\u002Fopenreview.net\u002Fforum?id=GfgCNeVRFhV) | ICLR Workshop | 2023 | ---                                                          | Defense backdoor attacks on CLIP                             |\n| [Debiasing Vision-Language Models via Biased Prompts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00070) | arXiv         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fchingyaoc\u002Fdebias_vl)             | Prompts to alleviate bias                                    |\n\n\n\n## Prompting Model in Text-to-Image Generation (*e.g.* on Stable Diffusion)\n| Title                                                        | Venue            | Year | Code if available                                            | Comment                                                      |\n| ------------------------------------------------------------ | ---------------- | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| [Diffusion Models Beat GANs on Image Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.05233) | NeurIPS          | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fguided-diffusion)         | Diffusion models on image generation                         |\n| [Denoising Diffusion Probabilistic Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11239) | NeurIPS          | 2020 | [Github](https:\u002F\u002Fgithub.com\u002Fhojonathanho\u002Fdiffusion)          | Diffusion models on image generation                         |\n| [SuS-X: Training-Free Name-Only Transfer of Vision-Language 
Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.16198.pdf) | ICCV             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fvishaal27\u002Fsus-X\u002F)                | Diffusion models on image generation                         |\n| [Investigating Prompt Engineering in Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15462) | NeurIPS Workshop | 2022 | ---                                                          | Semantic prompt design                                       |\n| [DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic Segmentation Using Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11681) | IEEE\u002FCVF            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fweijiawu\u002FDiffuMask)              | Diversify generation with prompt; Prompts for synthetic data generation |\n| [Is synthetic data from generative models ready for image recognition?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07574) | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FCVMI-Lab\u002FSyntheticData)          | Diversify generation with prompt                             |\n| [An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01618) | ICLR             | 2023 | [Github](https:\u002F\u002Ftextual-inversion.github.io\u002F)               | Complex control of synthesis results via prompts             |\n| [DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.12242) | CVPR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fdreambooth)               | Complex control of synthesis results via prompts             |\n| [Multi-Concept Customization of Text-to-Image Diffusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04488) | CVPR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fadobe-research\u002Fcustom-diffusion) | Complex control of synthesis results via prompts             |\n| [Prompt-to-Prompt Image Editing with Cross Attention Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01626) | ICLR            | 2023 | ---                                                          | Complex control of synthesis results via prompts             |\n| [Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.05032) | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshunk031\u002Ftraining-free-structured-diffusion-guidance) | Controllable text-to-image generation                        |\n| [Diffusion Self-Guidance for Controllable Image Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00986) | NeurIPS            | 2023 | [Page](https:\u002F\u002Fdave.ml\u002Fselfguidance\u002F)                        | Controllable text-to-image generation                        |\n| [Imagic: Text-Based Real Image Editing with Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09276) | CVPR             | 2023 | [Github](https:\u002F\u002Fimagic-editing.github.io\u002F)                  | Controllable text-to-image generation                        |\n| [Adding Conditional Control to Text-to-Image Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05543) | IEEE\u002FCVF            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Flllyasviel\u002FControlNet)           | Controllable 
text-to-image generation                        |\n| [Prompt-to-Prompt Image Editing with Cross Attention Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01626) | ICLR            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fprompt-to-prompt)         | Complex control of synthesis results via prompts             |\n| [ImaginaryNet: Learning Object Detectors without Real Images and Annotations](https:\u002F\u002Fopenreview.net\u002Fforum?id=9MbhFHqrti9) | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fkodenii\u002FImaginaryNet)            | Prompts for synthetic data generation                        |\n| [Is synthetic data from generative models ready for image recognition?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07574) | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FCVMI-Lab\u002FSyntheticData)          | Prompts for synthetic data generation                        |\n| [Make-A-Video: Text-to-Video Generation without Text-Video Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14792) | ICLR             | 2023 | [Page](https:\u002F\u002Fmakeavideo.studio\u002F)                           | Prompts for text-to-video generation                         |\n| [Imagen Video: High Definition Video Generation with Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02303) | arXiv            | 2022 | [Page](https:\u002F\u002Fimagen.research.google\u002Fvideo\u002F)                | Prompts for text-to-video generation                         |\n| [FateZero: Fusing Attentions for Zero-shot Text-based Video Editing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09535) | ICCV            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FChenyangQiQi\u002FFateZero)           | Prompts for text-to-video generation                         |\n| [Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11565) | ICCV             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshowlab\u002FTune-A-Video)            | Prompts for text-to-video generation                         |\n| [DiffRF: Rendering-Guided 3D Radiance Field Diffusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01206) | CVPR             | 2023 | [Page](https:\u002F\u002Fsirwyver.github.io\u002FDiffRF\u002F)                   | Prompts for text-to-3D generation                            |\n| [DreamFusion: Text-to-3D using 2D Diffusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14988) | ICLR notable top 5%            | 2023 | [Page](https:\u002F\u002Fdreamfusion3d.github.io\u002F)                     | Prompts for text-to-3D generation                            |\n| [Dream3D: Zero-Shot Text-to-3D Synthesis Using 3D Shape Prior and Text-to-Image Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14704) | CVPR             | 2023 | [Page](https:\u002F\u002Fbluestyle97.github.io\u002Fdream3d\u002F)               | Prompts for text-to-3D generation                            |\n| [MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.15001.pdf) | IEEE            | 2024 | [Page](https:\u002F\u002Fmingyuan-zhang.github.io\u002Fprojects\u002FMotionDiffuse.html) | Prompts for text-to-motion generation                        |\n| [FLAME: Free-form Language-based Motion Synthesis & Editing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00349) | AAAI             | 2023 | 
[Github](https:\u002F\u002Fgithub.com\u002Fkakaobrain\u002Fflame)                | Prompts for text-to-motion generation                        |\n| [MDM: Human Motion Diffusion Model]()                        | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FGuyTevet\u002Fmotion-diffusion-model) | Prompts for text-to-motion generation                        |\n| [Zero-shot Generation of Coherent Storybook from Plain Text Story using Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.03900) | arXiv            | 2023 | ---                                                          | Prompts for complex tasks                                    |\n| [Multimodal Procedural Planning via Dual Text-Image Prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01795) | ICLR            | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FYujieLu10\u002FMPP)                   | Prompts for complex tasks                                    |\n| [Prompt Stealing Attacks Against Text-to-Image Generation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.09923) | USENIX Security Symposium            | 2023 | ---                                                          | Prompts for responsible AI                                   |\n| [Membership Inference Attacks Against Text-to-image Generation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00968) | ICLR            | 2023 | ---                                                          | Membership attacks against text-to-image models              |\n| [Are Diffusion Models Vulnerable to Membership Inference Attacks?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01316) | ICML             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fjinhaoduan\u002FSecMI)                | Membership attacks against text-to-image models              |\n| [A Reproducible Extraction of Training Images from Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.08694) | arXiv            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fryanwebster90\u002Fonestep-extraction) | Membership attacks against text-to-image models              |\n| [Fair Diffusion: Instructing Text-to-Image Generation Models on Fairness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10893) | arXiv            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fml-research\u002FFair-Diffusion)      | Prompts on text-to-image models considering fairness         |\n| [Social Biases through the Text-to-Image Generation Lens](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06034) | AAAI\u002FACM            | 2023 | ---                                                          | Prompts on text-to-image models considering biases           |\n| [T2IAT: Measuring Valence and Stereotypical Biases in Text-to-Image Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00905) | ACL              | 2023 | ---                                                          | Prompts on text-to-image models considering biases           |\n| [Stable Bias: Analyzing Societal Representations in Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11408) | NeurIPS            | 2023 | ---                                                          | Prompts on text-to-image models considering biases           |\n| [A Pilot Study of Query-Free Adversarial Attack Against Stable 
Diffusion](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023W\u002FAML\u002Fpapers\u002FZhuang_A_Pilot_Study_of_Query-Free_Adversarial_Attack_Against_Stable_Diffusion_CVPRW_2023_paper.pdf) | CVPR             | 2023 | ---                                                          | Adversarial robustness of text-to-image models               |\n| [Diffusion Models for Imperceptible and Transferable Adversarial Attack]() | ICLR            | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FWindVChen\u002FDiffAttack)            | Adversarial robustness of text-to-image models               |\n| [Diffusion Models for Adversarial Purification](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.07460) | ICML             | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FNVlabs\u002FDiffPure)                 | Adversarial robustness of text-to-image models               |\n| [Rickrolling the Artist: Injecting Backdoors into Text Encoders for Text-to-Image Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.02408) | ICCV            | 2023 | ---                                                          | Backdoor attack on text-to-image models                      |\n| [Text-to-Image Diffusion Models can be Easily Backdoored through Multimodal Data Poisoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04175) | ACM MM            | 2023 | ---                                                          | Backdoor attack on text-to-image models                      |\n| [Personalization as a Shortcut for Few-Shot Backdoor Attack against Text-to-Image Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10701) | AAAI           | 2024 | ---                                                          | Backdoor attack on text-to-image models                      |\n\n\n## # :mailbox_with_mail: Contact \n\nPlease contact us (jindong.gu@outlook.com, chenshuo.cs@outlook.com) if \n- you would like to add your papers in this repo,\n- you find any mistakes in this repo, \n- you have any suggestions for this repo. 
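
## # :bulb: A Minimal Hard-Prompt Example

As a concrete illustration of the hard text prompts surveyed above (the hand-written "a photo of a {class}" templates used for zero-shot image classification with CLIP), the sketch below shows the general pattern. It is only a sketch: it assumes the `openai/CLIP` Python package (`pip install git+https://github.com/openai/CLIP.git`), PyTorch, and a local image file `example.jpg`; the package choice, the class list, and the file name are illustrative assumptions rather than part of this repo.

```python
import torch
import clip  # assumed: the openai/CLIP reference implementation
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Hard text prompts: a fixed, human-readable template wrapped around each class name.
class_names = ["cat", "dog", "car"]
text = clip.tokenize([f"a photo of a {name}" for name in class_names]).to(device)

# Placeholder image path; replace with a real file.
image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)

with torch.no_grad():
    logits_per_image, _ = model(image, text)     # image-to-text similarity scores
    probs = logits_per_image.softmax(dim=-1)[0]  # zero-shot class probabilities

for name, p in zip(class_names, probs.tolist()):
    print(f"{name}: {p:.3f}")
```

Soft-prompt methods listed in the tables above (e.g., CoOp) keep this zero-shot pipeline but replace the hand-written template with learnable context embeddings tuned on a small labeled set.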
\n\n","# 视觉-语言模型中的优秀提示工程\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_e31dfd33163a.png\" width=\"100%\" height=\"100%\">\n\n## # :nerd_face: 什么是视觉-语言模型的提示工程？\n提示工程是一种技术，通过为大型预训练模型添加特定任务的提示（即“提示词”），使其能够适应新的任务。本仓库旨在提供一份关于**三种**视觉-语言模型（VLMs）上提示工程前沿研究的**全面综述**：**多模态到文本生成模型**（如Flamingo）、**图像-文本匹配模型**（如CLIP）以及**文本到图像生成模型**（如Stable Diffusion）（图1）。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_551ea06ea72e.png\">\n\n\u003Cp align=\"center\"> \u003Ci>图1：本工作聚焦于三大类视觉-语言模型。\u003C\u002Fi> \u003C\u002Fp>\n\n### 参考文献\n\n本仓库列出了我们综述中总结的相关论文：\n\n**视觉-语言基础模型上的提示工程系统性综述。** *[Jindong Gu](https:\u002F\u002Fjindonggu.github.io\u002F)、[Zhen Han](https:\u002F\u002Fsites.google.com\u002Fview\u002Fzhenhan\u002Fhome?authuser=0)、[Shuo Chen](https:\u002F\u002Fchenxshuo.github.io\u002F)、[Ahmad Beirami](https:\u002F\u002Fsites.google.com\u002Fview\u002Fbeirami)、[Bailan He](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=n5zUQtAAAAAJ&hl=en)、[Gengyuan Zhang](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=LN2tYr0AAAAJ&hl=en)、[Ruotong Liao](https:\u002F\u002Fscholar.google.com\u002Fcitations?user=XFQv_oYAAAAJ&hl=en)、[Yao Qin](https:\u002F\u002Fcseweb.ucsd.edu\u002F~yaq007\u002F)、[Volker Tresp](https:\u002F\u002Fwww.dbs.ifi.lmu.de\u002F~tresp\u002F)、[Philip Torr](https:\u002F\u002Ftorrvision.com\u002Findex.html)*。预印本，2023年。[[pdf]](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.12980)\n\n如果您觉得我们的论文和仓库对您的研究有所帮助，请引用以下论文：\n```latex\n@article{gu2023survey,\n  title={A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models},\n  author={Gu, Jindong and Han, Zhen and Chen, Shuo, and Beirami, Ahmad and He, Bailan and Zhang, Gengyuan and Liao, Ruotong and Qin, Yao and Tresp, Volker and Torr, Philip}\n  journal={arXiv preprint arXiv:2307.12980},\n  year={2023}\n}\n```\n\n\n\n## # :paperclips: 精选论文\n\n- [多模态到文本生成中的提示方法](#prompting-model-in-multimodal-to-text-generation-eg-on-flamingo)\n- [图像-文本匹配中的提示方法](#prompting-model-in-image-text-matching-eg-on-clip)\n\n- [文本到图像生成中的提示方法](#prompting-model-in-text-to-image-generation-eg-on-stable-diffusion)\n\n\n\n### 多模态到文本生成中的提示方法（例如在Flamingo上）\n\n基于视觉和文本模态的融合方式，主要有两种融合模块方法：**编码器-解码器作为多模态融合模块**和**仅解码器作为多模态融合模块**。根据模板的可读性，提示方法可分为**两大类**（图2）：**硬提示**和**软提示**。硬提示又细分为四个子类别：*任务指令、上下文学习、基于检索的提示以及思维链提示*。而软提示则根据是否在模型架构内部添加新标记或仅将其附加到输入中，被划分为*提示调优*和*前缀标记调优*两种策略。本研究主要关注那些无需修改基础模型的提示方法。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_ac1361c47548.png\">\n\n\u003Cp align=\"center\"> \u003Ci>图2：提示方法分类。\u003C\u002Fi> \u003C\u002Fp>\n\n| 标题                                                        | 会议       | 年份 | 如有代码则提供代码链接                                            | 备注                                              |\n| :----------------------------------------------------------- | ----------- | ---- | ------------------------------------------------------------ | ---------------------------------------------------- |\n| [通过文本生成统一视觉-语言任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.02779) | ICML        | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fj-min\u002FVL-T5)                     | 编码器-解码器融合；文本前缀作为提示      |\n| [SimVLM：基于弱监督的简单视觉语言模型预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.10904) | ICLR        | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FYulongBonjour\u002FSimVLM)            | 
编码器-解码器融合；文本前缀作为提示      |\n| [OFA：通过简单的序列到序列学习框架统一架构、任务和模态](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03052) | ICML        | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FOFA-Sys\u002FOFA)                     | 编码器-解码器融合；文本前缀作为提示      |\n| [PaLI：联合扩展的多语言语言-图像模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.06794) | ICLR        | 2023 | ---                                                          | 编码器-解码器融合；指令提示           |\n| [使用冻结语言模型进行多模态少样本学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.13884) | NeurIPS     | 2021 | [页面](https:\u002F\u002Ffh295.github.io\u002Ffrozen.html)                  | 仅解码器融合；图像条件前缀调优         |\n| [Flamingo：用于少样本学习的视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198) | NeurIPS     | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_flamingo)     | 仅解码器融合；文本提示；                   |\n| [MAGMA——通过适配器微调增强生成模型的多模态能力](https:\u002F\u002Faclanthology.org\u002F2022.findings-emnlp.179\u002F) | EMNLP       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FAleph-Alpha\u002Fmagma)               | 仅解码器融合；图像条件前缀调优         |\n| [BLIP-2：利用冻结图像编码器和大型语言模型进行语言-图像预训练的自举](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597) | ICML        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Fblip2) | 仅解码器融合；图像条件前缀调优         |\n| [语言模型是无监督的多任务学习者](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage-models.pdf) | OpenAI博客 | 2019 | [Github](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fgpt-2)                    | 任务指令提示                              |\n| [图灵测试：语言模型能理解指令吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11982) | arXiv       | 2020 | ---                                                          | 任务指令提示                              |\n| [语言模型是少样本学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165) | NeurIPS     | 2020 | ---                                                          | 上下文学习                                  |\n| [学习检索用于上下文学习的提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08633) | NAACL-HLT   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FOhadRubin\u002FEPR)                   | 基于检索的提示                            |\n| [用于上下文学习的统一演示检索器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04320) | ACL         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FKaiLv69\u002FUDR)                     | 基于检索的提示                            |\n| [用于上下文学习的组合示例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05698) | ICML        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Ficl-ceil)                 | 基于检索的提示                            |\n| [思维链提示可激发大型语言模型的推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903) | NeurIPS     | 2022 | ---                                                          | 思维链提示                                |\n| [大型语言模型中的自动思维链提示]() | ICLR        | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Famazon-research\u002Fauto-cot)        | 思维链提示                                |\n| [规模对参数高效提示调优的力量](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691) | EMNLP       | 2021 | ---                                                          | 提示调优                                  |\n| [学会提问：用软提示混合物查询语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06599) | NAACL-HLT   | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fhiaoxui\u002Fsoft-prompts)            | 提示调优                                  |\n| [前缀调优：优化连续提示以进行生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.00190) | ACL         | 
2021 | [Github](https:\u002F\u002Fgithub.com\u002FXiangLi1999\u002FPrefixTuning)        | 前缀调优                                  |\n| [面向生成型多模态预训练模型的提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.02532) | ACL         | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FOFA-Sys\u002FOFA)                     | OFA上的提示调优                           |\n| [语言并非一切：将感知与语言模型对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045) | NeurIPS       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)                 | 文本指令提示                              |\n| [预训练视觉-语言模型上适应方法鲁棒性的基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02080) | NeurIPS       | 2024 | [页面](https:\u002F\u002Fadarobustness.github.io\u002F)                     | VLMs上提示调优的鲁棒性                  |\n| [迈向视觉-语言模型的稳健提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08479) | NextGenAISafety@ICLR       | 2024 | ---                                                          | VLMs上提示调优的鲁棒性                  |\n| [InstructBLIP：通过指令调优迈向通用视觉-语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.06500.pdf) | NeurIPS | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FLAVIS\u002Ftree\u002Fmain\u002Fprojects\u002Finstructblip) | 提示调优 |\n| [视觉指令调优](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.08485.pdf) | NeurIPS | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fhaotian-liu\u002FLLaVA) | |\n| [Qwen-VL：一款多功能的视觉-语言模型，用于理解、定位、文本阅读等](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12966.pdf) | arXiv | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen-VL) | 提示调优 |\n| [Shikra：释放多模态大模型的指代对话魔法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.15195.pdf) | arXiv  | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshikras\u002Fshikra) | |\n| [MINIGPT-4：利用先进大型语言模型增强视觉-语言理解](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10592.pdf) | ICLR | 2023 | [Github](https:\u002F\u002Fminigpt-4.github.io\u002F) | 提示调优 |\n\n## 图像-文本匹配中的提示方法（*如* CLIP 上）\n\n根据提示的目标不同，现有方法可分为三类：**提示文本编码器**、**提示视觉编码器**，或如图 2 所示的**联合提示双分支**。这些方法旨在提升多模态模型的灵活性和任务特定性能。\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_readme_1cedb71c8412.png\">\n\n\u003Cp align=\"center\">  \u003Ci>图 2：图像-文本匹配多模态模型中提示方法的分类。\u003C\u002Fi> \t\t \u003C\u002Fp>\n\n| 标题                                                        | 会议\u002F期刊   | 年份 | 如有代码，提供代码链接                                            | 备注                                            |\n| ------------------------------------------------------------ | ------- | ---- | ------------------------------------------------------------ | -------------------------------------------------- |\n| [从自然语言监督中学习可迁移的视觉模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) | ICML    | 2021 | [Github](https:\u002F\u002Fgithub.com\u002FOpenAI\u002FCLIP)                     | 硬文本提示；用于图像分类                         |\n| [深入探讨 CLIP 的开放性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01986) | ACL       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Flancopku\u002Fclip-openness)                      | 用于理解的硬文本提示           |\n| [视觉-语言模型零样本泛化中的测试时提示调优](https:\u002F\u002Fopenreview.net\u002Fforum?id=e8PVEkSa4Fq) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | 软文本提示                                  |\n| [为视觉-语言模型学习提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01134) | IJCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)                | 软文本提示               
                   |\n| [高效视频理解的视觉-语言模型提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04478) | ECCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fju-chen\u002FEfficient-Prompt)        | 软文本提示                                  |\n| [多任务视觉-语言提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | 软文本提示                                  |\n| [视觉-语言模型的条件提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05557) | CVPR    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKaiyangZhou\u002FCoOp)                | 软文本提示                                  |\n| [视觉提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.12119)     | ECCV    | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FKMnP\u002Fvpt)                        | 基于视觉补丁的提示                          |\n| [探索用于大规模模型适配的视觉提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17274) | arXiv   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fhjbahng\u002Fvisual_prompting)        | 基于视觉补丁的提示                          |\n| [多任务视觉-语言提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | 基于视觉补丁的提示                          |\n| [释放像素级视觉提示的力量](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10556) | TMLR   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002FEVP)                   | 基于视觉补丁的提示                          |\n| [多样性感知的元视觉提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08138) | CVPR    | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshikiw\u002FDAM-VP)                   | 基于视觉补丁的提示                          |\n| [CPT：预训练视觉-语言模型的彩色提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.11797) | AI open   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FCPT)                      | 视觉标注提示                          |\n| [CLIP对红色圆圈了解多少？VLM的视觉提示工程](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06712) | ICCV   | 2023 | ---                                                          | 视觉标注提示                          |\n| [通过图像修复进行视觉提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00647) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Famirbar\u002Fvisual_prompting)        | 视觉标注提示                          |\n| [统一的视觉与语言提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07225) | arXiv   | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fyuhangzang\u002FUPT)                  | 耦合的统一提示                              |\n| [多任务视觉-语言提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.11720) | WACV   | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FsIncerass\u002FMVLPT)                 | 解耦的统一提示                        |\n| [MaPLe：多模态提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03117) | CVPR    | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fmuzairkhattak\u002Fmultimodal-prompt-learning) | 解耦的统一提示                        |\n| [理解大规模模型的零样本对抗鲁棒性](https:\u002F\u002Fopenreview.net\u002Fforum?id=P4bXCawRi5J) | ICLR    | 2023 | [代码](https:\u002F\u002Fwww.catalyzex.com\u002Fpaper\u002Farxiv:2212.07016\u002Fcode) | 提示的对抗鲁棒性                   |\n| [用于对抗鲁棒性的视觉提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06284) | ICASSP  | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FPhoveran\u002Fvp-for-adversarial-robustness) | 提示的对抗鲁棒性                   |\n| [先对齐再融合：基于动量蒸馏的视觉与语言表征学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.07651) | NeurIPS | 2021 | 
[Github](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002FALBEF\u002F)               | 图像-文本匹配模型                          |\n| [视觉-语言模型的无监督提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03649) | arXiv   | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Ftonyhuang2022\u002FUPL)               | 可学习的无监督提示                      |\n| [视觉-语言模型零样本泛化的测试时提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07511) | NeurIPS | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | 可学习的提示                                   |\n| [面向开放词汇视觉识别的两万多个类别的提示预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04704) | NeurIPS       | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fprompt-pretraining)   | 提示预训练  |\n| [视觉-语言模型的一致性引导提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.01195) | ICLR    | 2024 | --- | 解耦的统一提示                        |\n| [提升视觉-语言模型高效迁移学习的适应性和泛化能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.15569) | arXiv       | 2023 | ---   | 可学习的提示  |\n| [视觉-语言模型的高效测试时提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.05775) | arXiv | 2024 | --- | 高效的测试时提示调优 |\n| [基于对比特征重构的渐进式视觉提示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08386) | IJCV | 2024 | [GitHub](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FProVP) | 视觉提示调优 |\n| [AWT：通过增强、加权和运输转移视觉-语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04603) | NeurIPS | 2024 | [GitHub](https:\u002F\u002Fgithub.com\u002FMCG-NJU\u002FAWT) | LLM提示生成；最优传输 |\n\n\n\n\n### 应用场景与负责任的人工智能\n\n| 标题                                                        | 会议\u002F期刊       | 年份 | 若有代码，提供代码链接                                            | 备注                                                      |\n| ------------------------------------------------------------ | ------------- | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| [LMPT：针对长尾多标签视觉识别的类别特定嵌入损失提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04536) | ALVR       | 2024 | [Github](https:\u002F\u002Fgithub.com\u002Frichard-peng-xia\u002FLMPT)                      | 用于长尾多标签图像分类的提示词           |\n| [视觉-语言模型零样本泛化中的测试时提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07511) | NeurIPS       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fazshue\u002FTPT)                      | 可学习提示词；用于图像分类的提示词           |\n| [LPT：面向图像分类的长尾提示调优](https:\u002F\u002Fopenreview.net\u002Fforum?id=8pOVAeo8ie) | ICLR          | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FDongSky\u002FLPT)                     | 用于长尾图像分类的提示词                 |\n| [多标签图像识别中以文本作为图像的提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.12739) | CVPR          | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fguozix\u002FTaI-DPT)                  | 用于多标签图像分类与检测的提示词   |\n| [DualCoOp：在标注数据有限的情况下快速适应多标签识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.09541) | NeurIPS       | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fsunxm2357\u002FDualCoOp)              | 用于多标签图像分类与识别的提示词 |\n| [少样本文本分类中的视觉提示调优](https:\u002F\u002Faclanthology.org\u002F2022.coling-1.492.pdf) | ICCL          | 2022 | ---                                                          | 用于文本分类的视觉提示词                       |\n| [通过视觉与语言知识蒸馏实现开放词汇目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.13921) | ICLR          | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftpu\u002Ftree\u002Fmaster\u002Fmodels\u002Fofficial\u002Fdetection\u002Fprojects\u002Fvild) | 用于目标检测的提示词                                 |\n| 
[利用视觉-语言模型学习提示以进行开放词汇目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.14940) | CVPR | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fdyabel\u002Fdetpro) | 用于目标检测的提示词 |\n| [PromptDet：基于未筛选图像实现开放词汇目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.16513) | ECCV | 2022 | [Github](https:\u002F\u002Ffcjian.github.io\u002Fpromptdet) | 用于目标检测的提示词 |\n| [通过前缀调优优化连续提示以进行视觉关系检测](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9815128) | IEEE Access | 2022 | --- | 用于视觉关系检测的软提示 |\n| [基于提示微调实现开放词汇场景图生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.08165) | ECCV | 2022 | --- | 用于视觉关系检测的软提示 |\n| [结合运动线索的组合式提示调优用于开放词汇视频关系检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00268) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FDawn-LX\u002FOpenVoc-VidVRD) | 用于视频开放词汇关系检测的关系提示词 |\n| [DenseCLIP：基于上下文感知提示的语言引导密集预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.01518) | CVPR | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fraoyongming\u002FDenseCLIP) | 用于语义分割的类别条件文本提示词 |\n| [Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643) | ICCV | 2023 | [Github](https:\u002F\u002Fsegment-anything.com\u002F) | 可提示查询的语义分割 |\n| [通过提示学习进行领域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.06687) | IEEE | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FLeapLabTHU\u002FDAPrompt) | 领域特定的文本提示词，用于领域适应 |\n| [测试时领域适应中的视觉提示调优]() | arXiv | 2022 | --- | 用于领域适应的提示词 |\n| [为持续学习学习提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08654) | CVPR | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fl2p) | 用于持续学习的提示词 |\n| [DualPrompt：无回放持续学习的互补提示调优](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.04799) | ECCV | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fl2p) | 用于持续学习的提示词 |\n| [用于领域泛化的提示视觉Transformer]() | arXiv | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fzhengzangw\u002FDoPrompt) | 用于领域泛化的提示词 |\n| [理解大规模模型的零样本对抗鲁棒性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07016) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fcvlab-columbia\u002FZSRobust4FoundationModel) | 对抗攻击下的视觉提示调优 |\n| [用于对抗鲁棒性的视觉提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06284) | ICASSP | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FPhoveran\u002Fvp-for-adversarial-robustness) | 通过视觉提示提升对抗鲁棒性 |\n| [探索基于提示的学习范式的通用脆弱性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05239) | NAACL | 2022 | [Github](https:\u002F\u002Fgithub.com\u002Fleix28\u002Fprompt-universal-vulnerability) | 视觉提示的脆弱性 |\n| [对比学习中的投毒与后门攻击](https:\u002F\u002Fopenreview.net\u002Fforum?id=iC4UHbQ01Mp) | ICLR | 2022 | --- | 对CLIP的后门和投毒攻击 |\n| [BadEncoder：自监督学习中预训练编码器的后门攻击](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9833644) | IEEE | 
2022 | [Github](https:\u002F\u002Fgithub.com\u002Fjjy1994\u002FBadEncoder) | 对CLIP的后门攻击 |\n| [CleanCLIP：缓解多模态对比学习中的数据投毒攻击](https:\u002F\u002Fopenreview.net\u002Fforum?id=GfgCNeVRFhV) | ICLR 研讨会 | 2023 | --- | 防御对CLIP的后门攻击 |\n| [通过有偏提示去偏视觉-语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00070) | arXiv | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fchingyaoc\u002Fdebias_vl) | 用于缓解偏见的提示词 |\n\n\n\n## 文本到图像生成中的提示工程（*例如* 在 Stable Diffusion 上）\n| 标题 | 会议\u002F期刊 | 年份 | 如有代码，提供链接 | 备注 |\n| ------------------------------------------------------------ | ------------------- | ---- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| [扩散模型在图像合成上超越 GAN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.05233) | NeurIPS | 2021 | [Github](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fguided-diffusion) | 扩散模型在图像生成中的应用 |\n| [去噪扩散概率模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11239) | NeurIPS | 2020 | [Github](https:\u002F\u002Fgithub.com\u002Fhojonathanho\u002Fdiffusion) | 扩散模型在图像生成中的应用 |\n| [SuS-X：视觉-语言模型的无训练仅名称迁移](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.16198.pdf) | ICCV | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fvishaal27\u002Fsus-X\u002F) | 扩散模型在图像生成中的应用 |\n| [探索扩散模型中的提示工程](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15462) | NeurIPS Workshop | 2022 | --- | 语义提示设计 |\n| [DiffuMask：利用扩散模型结合像素级标注进行语义分割图像合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11681) | IEEE\u002FCVF | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fweijiawu\u002FDiffuMask) | 通过提示实现多样化生成；用于合成数据生成的提示 |\n| [生成模型生成的合成数据是否已可用于图像识别？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07574) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FCVMI-Lab\u002FSyntheticData) | 通过提示实现多样化生成 |\n| [一张图胜过千言万语：利用文本反演个性化文本到图像生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01618) | ICLR | 2023 | [Github](https:\u002F\u002Ftextual-inversion.github.io\u002F) | 通过提示对生成结果进行复杂控制 |\n| [DreamBooth：针对特定主题的文本到图像扩散模型微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.12242) | CVPR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fdreambooth) | 通过提示对生成结果进行复杂控制 |\n| [文本到图像扩散模型的多概念自定义](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04488) | CVPR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fadobe-research\u002Fcustom-diffusion) | 通过提示对生成结果进行复杂控制 |\n| [基于交叉注意力控制的提示到提示图像编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01626) | ICLR | 2023 | --- 
| 通过提示对生成结果进行复杂控制 |\n| [面向组合式文本到图像合成的无训练结构化扩散引导](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.05032) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshunk031\u002Ftraining-free-structured-diffusion-guidance) | 可控的文本到图像生成 |\n| [扩散自我引导用于可控图像生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00986) | NeurIPS | 2023 | [页面](https:\u002F\u002Fdave.ml\u002Fselfguidance\u002F) | 可控的文本到图像生成 |\n| [Imagic：基于文本的扩散模型真实图像编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09276) | CVPR | 2023 | [Github](https:\u002F\u002Fimagic-editing.github.io\u002F) | 可控的文本到图像生成 |\n| [为文本到图像扩散模型添加条件控制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05543) | IEEE\u002FCVF | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Flllyasviel\u002FControlNet) | 可控的文本到图像生成 |\n| [基于交叉注意力控制的提示到提示图像编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01626) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fprompt-to-prompt) | 通过提示对生成结果进行复杂控制 |\n| [ImaginaryNet：无需真实图像和标注即可学习目标检测器](https:\u002F\u002Fopenreview.net\u002Fforum?id=9MbhFHqrti9) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fkodenii\u002FImaginaryNet) | 用于合成数据生成的提示 |\n| [生成模型生成的合成数据是否已可用于图像识别？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07574) | ICLR | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FCVMI-Lab\u002FSyntheticData) | 用于合成数据生成的提示 |\n| [Make-A-Video：无需文本-视频数据的文本到视频生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14792) | ICLR | 2023 | [页面](https:\u002F\u002Fmakeavideo.studio\u002F) | 用于文本到视频生成的提示 |\n| [Imagen Video：利用扩散模型生成高清视频](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02303) | arXiv | 2022 | [页面](https:\u002F\u002Fimagen.research.google\u002Fvideo\u002F) | 用于文本到视频生成的提示 |\n| [FateZero：融合注意力实现零样本基于文本的视频编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09535) | ICCV | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FChenyangQiQi\u002FFateZero) | 用于文本到视频生成的提示 |\n| [Tune-A-Video：面向文本到视频生成的图像扩散模型单样本（One-Shot）微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11565) | ICCV | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fshowlab\u002FTune-A-Video) | 用于文本到视频生成的提示 |\n| [DiffRF：渲染引导的 3D 辐射场扩散](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01206) | CVPR | 2023 | [页面](https:\u002F\u002Fsirwyver.github.io\u002FDiffRF\u002F) | 用于文本到 3D 生成的提示 |\n| [DreamFusion：使用 2D 扩散模型进行文本到 3D 生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.14988) | ICLR（notable top 5%） | 2023 | [页面](https:\u002F\u002Fdreamfusion3d.github.io\u002F) | 用于文本到 3D 生成的提示 |\n| [Dream3D：利用 3D 形状先验和文本到图像扩散模型实现零样本文本到 3D 合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14704) | CVPR | 2023 | [页面](https:\u002F\u002Fbluestyle97.github.io\u002Fdream3d\u002F) | 用于文本到 3D 生成的提示
      |\n| [MotionDiffuse：利用扩散模型进行文本驱动的人体运动生成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.15001.pdf) | IEEE            | 2024 | [页面](https:\u002F\u002Fmingyuan-zhang.github.io\u002Fprojects\u002FMotionDiffuse.html) | 用于文本到运动生成的提示                                   |\n| [FLAME：自由形式的语言驱动运动合成与编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.00349) | AAAI             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fkakaobrain\u002Fflame)                | 用于文本到运动生成的提示                                   |\n| [MDM：人体运动扩散模型]()                        | ICLR             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002FGuyTevet\u002Fmotion-diffusion-model) | 用于文本到运动生成的提示                                   |\n| [利用扩散模型从纯文本故事中零样本生成连贯的故事书](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.03900) | arXiv            | 2023 | ---                                                          | 用于复杂任务的提示                                         |\n| [通过双模态文本-图像提示进行多模态程序化规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01795) | ICLR            | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FYujieLu10\u002FMPP)                   | 用于复杂任务的提示                                         |\n| [针对文本到图像生成模型的提示窃取攻击](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.09923) | USENIX 安全研讨会            | 2023 | ---                                                          | 关于负责任 AI 的提示                                       |\n| [针对文本到图像生成模型的成员身份推断攻击](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.00968) | ICLR            | 2023 | ---                                                          | 针对文本到图像模型的成员身份攻击                           |\n| [扩散模型是否容易受到成员身份推断攻击？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01316) | ICML             | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fjinhaoduan\u002FSecMI)                | 针对文本到图像模型的成员身份攻击                           |\n| [可重复地从扩散模型中提取训练图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.08694) | arXiv            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fryanwebster90\u002Fonestep-extraction) | 针对文本到图像模型的成员身份攻击                           |\n| [公平扩散：指导文本到图像生成模型关注公平性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10893) | arXiv            | 2023 | [Github](https:\u002F\u002Fgithub.com\u002Fml-research\u002FFair-Diffusion)      | 关注公平性的文本到图像模型提示                             |\n| [透过文本到图像生成的视角看社会偏见](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.06034) | AAAI\u002FACM            | 2023 | ---                                                          | 关注偏见的文本到图像模型提示                               |\n| [T2IAT：衡量文本到图像生成中的效价与刻板印象偏见](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00905) | ACL              | 2023 | ---                                                          | 关注偏见的文本到图像模型提示                               |\n| [稳定偏见：分析扩散模型中的社会表征](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11408) | NeurIPS            | 2023 | ---                                                          | 关注偏见的文本到图像模型提示                               |\n| [关于 Stable Diffusion 的无查询对抗攻击试点研究](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023W\u002FAML\u002Fpapers\u002FZhuang_A_Pilot_Study_of_Query-Free_Adversarial_Attack_Against_Stable_Diffusion_CVPRW_2023_paper.pdf) | CVPR             | 2023 | ---                                                          | 文本到图像模型的对抗鲁棒性                                |\n| [用于不可察觉且可迁移对抗攻击的扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.07460) | ICLR            | 2024 | [Github](https:\u002F\u002Fgithub.com\u002FWindVChen\u002FDiffAttack)         
   | 文本到图像模型的对抗鲁棒性                                 |\n| [用于对抗净化的扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.07460) | ICML             | 2022 | [Github](https:\u002F\u002Fgithub.com\u002FNVlabs\u002FDiffPure)                 | 文本到图像模型的对抗鲁棒性                                 |\n| [艺术家的“Rickroll”：向文本编码器注入后门以用于文本到图像合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.02408) | ICCV            | 2023 | ---                                                          | 对文本到图像模型的后门攻击                                 |\n| [通过多模态数据投毒，文本到图像扩散模型很容易被植入后门](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04175) | ACM MM            | 2023 | ---                                                          | 对文本到图像模型的后门攻击                                 |\n| [个性化作为针对文本到图像扩散模型的少样本后门攻击捷径](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10701) | AAAI           | 2024 | ---                                                          | 对文本到图像模型的后门攻击                                 |\n\n## # :mailbox_with_mail: 联系方式\n\n如果您有以下情况，请联系我们（jindong.gu@outlook.com，chenshuo.cs@outlook.com）：\n- 您希望将您的论文添加到本仓库；\n- 您发现本仓库存在任何错误；\n- 您对本仓库有任何建议。","# Awesome-Prompting-on-Vision-Language-Model 快速上手指南\n\n本指南旨在帮助开发者快速了解并利用 **Awesome-Prompting-on-Vision-Language-Model** 资源库。该项目并非一个可直接安装的单一软件包，而是一个**系统性的综述资源库**，汇集了视觉 - 语言模型（VLM）提示工程领域的前沿论文、代码实现及分类方法。\n\n## 1. 环境准备\n\n由于本项目是论文与代码的索引集合，运行具体示例需要针对所选论文对应的独立模型环境进行配置。以下是通用的基础环境建议：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+), macOS, 或 Windows (WSL2)\n*   **Python 版本**: 3.8 或更高 (具体取决于所选子项目，如 Flamingo, CLIP, Stable Diffusion 等)\n*   **核心依赖**:\n    *   PyTorch (建议 1.10+)\n    *   Transformers (Hugging Face)\n    *   CUDA (如需 GPU 加速，建议 11.3+)\n*   **前置知识**: 了解多模态模型基础概念（如 Encoder-Decoder, Decoder-only, Hard\u002FSoft Prompting）。\n\n> **注意**：请根据您感兴趣的具体论文（如下文表格所示），前往其对应的 GitHub 仓库查看特定的 `requirements.txt`。\n\n## 2. 安装步骤\n\n本项目本身无需通过 `pip` 安装，主要通过克隆仓库获取文献列表和思维导图。\n\n### 克隆资源库\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fjindonggu\u002FAwesome-Prompting-on-Vision-Language-Model.git\ncd Awesome-Prompting-on-Vision-Language-Model\n```\n\n### 获取具体模型代码\n本仓库整理了三大类模型的提示工程方法。请选择您需要的方向，点击对应论文的 \"Code if available\" 链接进入子项目进行安装。\n\n**示例：以安装 Flamingo (Multimodal-to-Text) 为例**\n```bash\n# 进入你选择的子项目目录 (此处以 open_flamingo 为例)\ngit clone https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_flamingo.git\ncd open_flamingo\n\n# 创建虚拟环境并安装依赖\npython -m venv venv\nsource venv\u002Fbin\u002Factivate  # Windows 使用: venv\\Scripts\\activate\n\n# 安装依赖 (国内用户推荐使用清华源加速)\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n## 3. 基本使用\n\n本项目的核心价值在于**分类指引**。使用者应根据任务类型选择对应的提示策略。\n\n### 步骤一：确定模型类型与任务\n参考仓库中的分类逻辑（见 `README` 中的 Fig. 1 & Fig. 2）：\n1.  **多模态转文本生成 (Multimodal-to-Text)**: 如 Flamingo, BLIP-2。\n    *   *策略*: 硬提示 (Hard Prompt, 如指令、上下文学习) 或 软提示 (Soft Prompt, 如 Prefix Tuning)。\n2.  **图文匹配 (Image-Text Matching)**: 如 CLIP。\n    *   *策略*: 提示文本编码器、视觉编码器或两者联合提示。\n3.  **文生图生成 (Text-to-Image)**: 如 Stable Diffusion。\n    *   *策略*: 优化文本提示词以控制生成内容。\n\n### 步骤二：应用提示工程 (代码示例)\n\n以下展示如何在典型的 **CLIP (图文匹配)** 场景中应用**硬提示 (Hard Prompt)** 技术（基于零样本分类）：\n\n```python\nimport torch\nfrom PIL import Image\nimport clip\n\n# 1. 加载模型\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\nmodel, preprocess = clip.load(\"ViT-B\u002F32\", device=device)\n\n# 2. 准备图像\nimage = preprocess(Image.open(\"assets\u002Fpvlm-mindmap.png\")).unsqueeze(0).to(device)\n\n# 3. 
构建提示模板 (Hard Prompting 的核心)\n# 原始方法可能只用 \"a photo of a {}\", 这里使用更丰富的模板增强鲁棒性\ntemplates = [\n    \"a bad photo of a {}.\",\n    \"a photo of a nice {}.\",\n    \"a photo of the large {}.\",\n    \"a photo of the small {}.\",\n    \"a photo of a {}.\"\n]\n\nclasses = [\"cat\", \"dog\", \"bird\"]\n\n# 4. 生成所有模板下的文本嵌入并取平均\ntexts = []\nfor cls in classes:\n    for template in templates:\n        texts.append(clip.tokenize(template.format(cls)).to(device))\n\nwith torch.no_grad():\n    text_features = model.encode_text(torch.cat(texts, dim=0))\n    # 重塑并平均每个类别的特征\n    text_features = text_features.reshape(len(classes), len(templates), -1).mean(dim=1)\n    text_features \u002F= text_features.norm(dim=-1, keepdim=True)\n\n# 5. 推理\nwith torch.no_grad():\n    image_features = model.encode_image(image)\n    image_features \u002F= image_features.norm(dim=-1, keepdim=True)\n    \n    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n    print(\"预测结果:\", classes[similarity.argmax().item()])\n```\n\n### 步骤三：进阶研究\n若需尝试**软提示 (Soft Prompt \u002F Prompt Tuning)**，请参考仓库中列出的具体论文实现（如 `CoOp`, `Prefix-Tuning`），通常涉及在模型冻结参数的情况下，训练少量可学习的连续向量参数。\n\n*   **查阅论文**: 在本地 `README.md` 或 ArXiv 链接中查找 \"Prompt Tuning\" 相关章节。\n*   **复现代码**: 直接访问表格中提供的 GitHub 链接（如 `OFA`, `BLIP-2` 等项目），运行其官方提供的微调脚本。","某电商公司的算法团队正致力于优化其智能客服系统，希望利用视觉 - 语言模型（VLM）自动识别用户上传的商品破损图片并生成准确的理赔回复。\n\n### 没有 Awesome-Prompting-on-Vision-Language-Model 时\n- **选型迷茫**：面对 Flamingo、CLIP 和 Stable Diffusion 等不同类型的模型，团队难以快速确定哪种架构最适合“图像理解 + 文本生成”的特定任务。\n- **试错成本高**：缺乏系统的提示工程（Prompt Engineering）方法论，开发人员只能凭经验盲目尝试硬提示（Hard Prompt）或软提示（Soft Prompt），耗费数周调试效果仍不稳定。\n- **前沿技术缺失**：无法及时获取如“思维链（Chain-of-Thought）”或“上下文学习（In-context Learning）”在多模态领域的最新应用案例，导致生成的回复逻辑简单，难以处理复杂破损场景。\n- **资源分散**：需要手动在海量论文中筛选相关研究，效率极低且容易遗漏关键的技术实现细节。\n\n### 使用 Awesome-Prompting-on-Vision-Language-Model 后\n- **精准定位模型**：通过仓库清晰的分类导航，团队迅速锁定基于 Flamingo 的多模态生成方案，明确了技术路线。\n- **方法有据可依**：参考综述中总结的硬\u002F软提示策略，直接复用成熟的模板设计，将模型适配时间从数周缩短至几天。\n- **能力显著增强**：应用文中推荐的“思维链”提示技巧，模型不仅能识别破损，还能逐步推理损坏原因并生成富有同理心的专业回复，准确率大幅提升。\n- **一站式资源获取**：直接获取经过筛选的高质量论文列表及对应代码链接，快速复现了业界最先进的提示微调技术。\n\nAwesome-Prompting-on-Vision-Language-Model 将散乱的学术研究转化为可落地的工程指南，帮助开发者以最低成本释放视觉 - 语言模型的最大潜力。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJindongGu_Awesome-Prompting-on-Vision-Language-Model_e31dfd33.png","JindongGu",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FJindongGu_758666a9.jpg","Research Fellow the University of Oxford\r\nFaculty Researcher in Google DeepMind","University of Oxford","Oxford, UK","https:\u002F\u002Fjindonggu.github.io\u002F","https:\u002F\u002Fgithub.com\u002FJindongGu",507,37,"2026-03-30T00:48:49","","未说明",{"notes":91,"python":89,"dependencies":92},"该仓库是一个综述列表（Awesome List），主要整理了关于视觉语言模型提示工程的研究论文、代码链接和分类方法，本身不是一个可直接运行的单一软件工具或框架。因此 README 中未包含具体的操作系统、GPU、内存、Python 版本或依赖库的安装需求。用户若需运行列表中提到的具体模型（如 Flamingo, CLIP, Stable Diffusion 等），需参考各模型对应的独立代码仓库（表中 'Code if available' 列提供的链接）以获取具体的环境配置信息。",[],[18],[95,96,97],"foundation-models","prompt-engineering","vision-and-language","2026-03-27T02:49:30.150509","2026-04-06T07:13:55.094366",[],[]]