[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-yueliu1999--Awesome-Jailbreak-on-LLMs":3,"tool-yueliu1999--Awesome-Jailbreak-on-LLMs":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",140436,2,"2026-04-05T23:32:43",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":75,"owner_avatar_url":76,"owner_bio":77,"owner_company":78,"owner_location":79,"owner_email":80,"owner_twitter":81,"owner_website":82,"owner_url":83,"languages":81,"stars":84,"forks":85,"last_commit_at":86,"license":87,"difficulty_score":88,"env_os":89,"env_gpu":90,"env_ram":90,"env_deps":91,"category_tags":94,"github_topics":95,"view_count":23,"oss_zip_url":81,"oss_zip_packed_at":81,"status":16,"created_at":105,"updated_at":106,"faqs":107,"releases":136},2028,"yueliu1999\u002FAwesome-Jailbreak-on-LLMs","Awesome-Jailbreak-on-LLMs","Awesome-Jailbreak-on-LLMs is a collection of state-of-the-art, novel, exciting jailbreak methods on LLMs. It contains papers, codes, datasets, evaluations, and analyses.","Awesome-Jailbreak-on-LLMs 是一个汇集了最新大语言模型（LLM）绕过安全限制方法的开源资源库，涵盖攻击论文、代码、数据集与评估分析。它系统整理了针对模型对齐机制的多种突破技术，如基于推理诱导的“BadThink”、通过翻转提示实现的“FlipAttack”，以及针对多模态和RAG系统的新型攻击手段，帮助研究者深入理解模型的安全边界。该资源库解决了当前LLM安全评估中缺乏系统性攻击案例库的问题，为安全防护研究提供真实、前沿的测试基准。适合从事AI安全、模型对齐、红队测试的研究人员和开发者使用，也可辅助安全工程师设计更鲁棒的防御机制。资源中包含多篇顶会论文与可复现代码，尤其在“推理诱导攻击”和“多模态越狱”方向具有创新性，部分方法已在实际模型中验证有效。欢迎研究人员贡献新方法，共同推动AI安全领域的透明与进步。","# Awesome-Jailbreak-on-LLMs\n\nAwesome-Jailbreak-on-LLMs is a collection of state-of-the-art, novel, exciting jailbreak methods on LLMs. 
It contains papers, codes, datasets, evaluations, and analyses. Any additional things regarding jailbreak, PRs, issues are welcome and we are glad to add you to the contributor list [here](#contributors). Any problems, please contact yliu@u.nus.edu. If you find this repository useful to your research or work, it is really appreciated to star this repository and cite our papers [here](#Reference). :sparkles:\n\n\n## Reference\n\nIf you find this repository helpful for your research, we would greatly appreciate it if you could cite our papers. :sparkles:\n\n```\n@article{zhuzhenhao_GuardReasoner_Omni,\n  title={GuardReasoner-Omni: A Reasoning-based Multi-modal Guardrail for Text, Image, and Video},\n  author={Zhu, Zhenhao and Liu, Yue and Guo, Yanpei and Qu, Wenjie and Chen, Cancan and He, Yufei and Li, Yibo and Chen, Yulin and Wu, Tianyi and Xu, Huiying and others},\n  journal={arXiv preprint arXiv:2602.03328},\n  year={2026}\n}\n\n@article{liuyue_GuardReasoner_VL,\n  title={GuardReasoner-VL: Safeguarding VLMs via Reinforced Reasoning},\n  author={Liu, Yue and Zhai, Shengfang and Du, Mingzhe and Chen, Yulin and Cao, Tri and Gao, Hongcheng and Wang, Cheng and Li, Xinfeng and Wang, Kun and Fang, Junfeng and Zhang, Jiaheng and Hooi, Bryan},\n  journal={arXiv preprint arXiv:2505.11049},\n  year={2025}\n}\n\n@article{liuyue_GuardReasoner,\n  title={GuardReasoner: Towards Reasoning-based LLM Safeguards},\n  author={Liu, Yue and Gao, Hongcheng and Zhai, Shengfang and Jun, Xia and Wu, Tianyi and Xue, Zhiwei and Chen, Yulin and Kawaguchi, Kenji and Zhang, Jiaheng and Hooi, Bryan},\n  journal={arXiv preprint arXiv:2501.18492},\n  year={2025}\n}\n\n@article{liuyue_FlipAttack,\n  title={FlipAttack: Jailbreak LLMs via Flipping},\n  author={Liu, Yue and He, Xiaoxin and Xiong, Miao and Fu, Jinlan and Deng, Shumin and Hooi, Bryan},\n  journal={arXiv preprint arXiv:2410.02832},\n  year={2024}\n}\n\n@article{wang2025safety,\n  title={Safety in Large Reasoning Models: A Survey},\n  
author={Wang, Cheng and Liu, Yue and Li, Baolong and Zhang, Duzhen and Li, Zhongzhi and Fang, Junfeng},\n  journal={arXiv preprint arXiv:2504.17704},\n  year={2025}\n}\n```\n\n\n## Bookmarks\n\n- [Jailbreak Attack](#jailbreak-attack)\n  - [Attack on LRMs](#attack-on-lrms)\n  - [Black-box Attack](#black-box-attack)\n  - [White-box Attack](#white-box-attack)\n  - [Multi-turn Attack](#multi-turn-attack)\n  - [Attack on RAG-based LLM](#attack-on-rag-based-llm)\n  - [Multi-modal Attack](#multi-modal-attack)\n- [Jailbreak Defense](#jailbreak-defense)\n  - [Learning-based Defense](#learning-based-defense)\n  - [Strategy-based Defense](#strategy-based-defense)\n  - [Guard Model](#Guard-model)\n  - [Moderation API](#Moderation-API)\n- [Evaluation & Analysis](#evaluation--analysis)\n- [Application](#application)\n\n\n\n## Papers\n\n\n\n\n### Jailbreak Attack\n\n\n\n#### Attack on LRMs\n| Time    | Title                                                        | Venue |                  Paper                   |                             Code                             |\n| ------- | ------------------------------------------------------------ | :---: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2025.11 | **BadThink: Triggered Overthinking Attacks on Chain-of-Thought Reasoning in Large Language Models** | AAAI'26  | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.10714) | - |\n| 2025.08 | **Jinx: Unlimited LLMs for Probing Alignment Failures** | arXiv  | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.08243) | [models](https:\u002F\u002Fhuggingface.co\u002FJinx-org) |\n| 2025.07 | **BadReasoner: Planting Tunable Overthinking Backdoors into Large Reasoning Models for Fun or Profit** | arXiv  | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.18305) | [link](https:\u002F\u002Fgithub.com\u002FFZaKK\u002FBadReasoner) |\n| 2025.06 | **ExtendAttack: Attacking Servers of LRMs via Extending 
Reasoning** | AAAI'26  | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.13737) | [link](https:\u002F\u002Fgithub.com\u002Fzzh-thu-22\u002FExtendAttack) |\n| 2025.06 | **Excessive Reasoning Attack on Reasoning LLMs** | arXiv  | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.14374) | - |\n| 2025.03 | **Cats Confuse Reasoning LLM: Query Agnostic Adversarial Triggers for Reasoning Models** | arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01781) |- |\n| 2025.02 | **OverThink: Slowdown Attacks on Reasoning LLMs** | arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.02542#:~:text=We%20increase%20overhead%20for%20applications%20that%20rely%20on,the%20user%20query%20while%20providing%20contextually%20correct%20answers.) | [link](https:\u002F\u002Fgithub.com\u002Fakumar2709\u002FOVERTHINK_public) |\n| 2025.02 | **BoT: Breaking Long Thought Processes of o1-like Large Language Models through Backdoor Attack** | arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12202) | [link](https:\u002F\u002Fgithub.com\u002Fzihao-ai\u002FBoT) |\n| 2025.02 | **H-CoT: Hijacking the Chain-of-Thought Safety Reasoning Mechanism to Jailbreak Large Reasoning Models, Including OpenAI o1\u002Fo3, DeepSeek-R1, and Gemini 2.0 Flash Thinking** | arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12893) |[link](https:\u002F\u002Fgithub.com\u002Fdukeceicenter\u002Fjailbreak-reasoning-openai-o1o3-deepseek-r1) |\n| 2025.02 | **A Mousetrap: Fooling Large Reasoning Models for Jailbreak with Chain of Iterative Chaos** | arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.15806) |- |\n\n\n\n\n#### Black-box Attack\n\n| Time    | Title                                                       |  Venue  |                                                           Paper                                                            |                                          Code                                          |\n|---------| 
----------------------------------------------------------- | :-----: |:--------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|\n| 2026.03 | **Internal Safety Collapse in Frontier Large Language Models (ISC-Bench)** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.23509) | [link](https:\u002F\u002Fgithub.com\u002Fwuyoscar\u002FISC-Bench) |\n| 2025.10 | **BreakFun: Jailbreaking LLMs via Schema Exploitation** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.17904) | - |\n| 2025.07 | **Response Attack: Exploiting Contextual Priming to Jailbreak Large Language Models** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.05248) | [link](https:\u002F\u002Fgithub.com\u002FDtc7w3PQ\u002FResponse-Attack) |\n| 2025.05 | **Emoji Attack: Enhancing Jailbreak Attacks Against Judge LLM Detection** |   ICML'25     |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01077)                                          |                   [link](https:\u002F\u002Fgithub.com\u002Fzhipeng-wei\u002FEmojiAttack)                   |\n| 2025.05 | **FlipAttack: Jailbreak LLMs via Flipping (FlipAttack)** |   ICML'25     |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.02832)                                          |                    [link](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FFlipAttack)                    |\n| 2025.03 | **Playing the Fool: Jailbreaking LLMs and Multimodal LLMs with Out-of-Distribution Strategy (JOOD)** |   CVPR'25     |                                          [link](http:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.20823)                                           |                        [link](https:\u002F\u002Fgithub.com\u002Fnaver-ai\u002FJOOD)                        |\n| 
2025.02 | **StructTransform: A Scalable Attack Surface for Safety-Aligned Large Language Models** |   arXiv     |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11853)                                          |                  [link](https:\u002F\u002Fgithub.com\u002FStructTransform\u002FBenchmark)                  |\n| 2025.01 | **Jailbreaking LLMs' Safeguard with Universal Magic Words for Text Embedding Models** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18280) | - |\n| 2025.01 | **Understanding and Enhancing the Transferability of Jailbreaking Attacks** |   ICLR'25     |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.03052)                                          |                    [link](https:\u002F\u002Fgithub.com\u002Ftmllab\u002F2025_ICLR_PiF)                     |\n| 2024.11 | **The Dark Side of Trust: Authority Citation-Driven Jailbreak Attacks on Large Language Models** | arXiv |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.11407)                                          |                     [link](https:\u002F\u002Fgithub.com\u002FYancyKahn\u002FDarkCite)                      |\n| 2024.11 | **Playing Language Game with LLMs Leads to Jailbreaking** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.12762v1)                                         | [link](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002Fencode_jailbreaking_anonymous-B4C4\u002FREADME.md) |\n| 2024.11 | **GASP: Efficient Black-Box Generation of Adversarial Suffixes for Jailbreaking LLMs (GASP)** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14133v1)                                         |                        [link](https:\u002F\u002Fgithub.com\u002Fllm-gasp\u002Fgasp)                        |\n| 
2024.11 | **LLM STINGER: Jailbreaking LLMs using RL fine-tuned LLMs** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.08862v1)                                         |                                           -                                            |\n| 2024.11 | **SequentialBreak: Large Language Models Can be Fooled by Embedding Jailbreak Prompts into Sequential Prompt** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.06426v1)                                         |            [link](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FJailBreakAttack-4F3B\u002F)             |\n| 2024.11 | **Diversity Helps Jailbreak Large Language Models** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.04223v1)                                         |                                           -                                            |\n| 2024.11 | **Plentiful Jailbreaks with String Compositions** | arXiv |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01084v1)                                         |                                           -                                            |\n| 2024.11 | **Transferable Ensemble Black-box Jailbreak Attacks on Large Language Models** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.23558v1)                                         |          [link](https:\u002F\u002Fgithub.com\u002FYQYANG2233\u002FLarge-Language-Model-Break-AI)           |\n| 2024.11 | **Stealthy Jailbreak Attacks on Large Language Models via Benign Data Mirroring** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.21083)                                          |                                           -                  
                          |\n| 2024.10 | **Endless Jailbreaks with Bijection** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.01294v1)                                         |                                           -                                            |\n| 2024.10 | **Harnessing Task Overload for Scalable Jailbreak Attacks on Large Language Models** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.04190v1)                                         |                                           -                                            |\n| 2024.10 | **You Know What I'm Saying: Jailbreak Attack via Implicit Reference** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.03857v2)                                         |               [link](https:\u002F\u002Fgithub.com\u002FLucas-TY\u002Fllm_Implicit_reference)               |\n| 2024.10 | **Deciphering the Chaos: Enhancing Jailbreak Attacks via Adversarial Prompt Translation** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11317v1)                                         |           [link](https:\u002F\u002Fgithub.com\u002Fqizhangli\u002FAdversarial-Prompt-Translator)           |\n| 2024.10 | **AutoDAN-Turbo: A Lifelong Agent for Strategy Self-Exploration to Jailbreak LLMs (AutoDAN-Turbo)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.05295)                                          |                 [link](https:\u002F\u002Fgithub.com\u002FSaFoLab-WISC\u002FAutoDAN-Turbo)                  |\n| 2024.10 | **PathSeeker: Exploring LLM Security Vulnerabilities with a Reinforcement Learning-Based Jailbreak Approach (PathSeeker)** | arXiv |                                        
[link](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2409.14177)                                        |                                           -                                            |\n| 2024.10 | **Read Over the Lines: Attacking LLMs and Toxicity Detection Systems with ASCII Art to Mask Profanity** | arXiv |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.18708)                                          |                     [link](https:\u002F\u002Fgithub.com\u002FSerbernari\u002FToxASCII)                     |\n| 2024.09 | **AdaPPA: Adaptive Position Pre-Fill Jailbreak Attack Approach Targeting LLMs** | arXiv |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.07503)                                          |                       [link](https:\u002F\u002Fgithub.com\u002FYummy416\u002FAdaPPA)                       |\n| 2024.09 | **Effective and Evasive Fuzz Testing-Driven Jailbreaking Attacks against LLMs** | arXiv |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.14866)                                          |                                           -                                            |\n| 2024.09 | **Jailbreaking Large Language Models with Symbolic Mathematics** |   arXiv    |                                         [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.11445v1)                                         |                                           -                                            |\n| 2024.08 | **Play Guessing Game with LLM: Indirect Jailbreak Attack with Implicit Clues** |   ACL Findings'24    |                                   [link](https:\u002F\u002Faclanthology.org\u002F2024.findings-acl.304)                                   |                       [link](https:\u002F\u002Fgithub.com\u002Fczycurefun\u002FIJBR)                       |\n| 2024.08 | **Advancing Adversarial 
Suffix Transfer Learning on Aligned Large Language Models** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.14866)                                          |                                           -                                            |\n| 2024.08 | **Hide Your Malicious Goal Into Benign Narratives: Jailbreak Large Language Models through Neural Carrier Articles** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.11182)                                          |                                           -                                            |\n| 2024.08 | **h4rm3l: A Dynamic Benchmark of Composable Jailbreak Attacks for LLM Safety Assessment (h4rm3l)** |    arXiv   |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.04811)                                          |                      [link](https:\u002F\u002Fmdoumbouya.github.io\u002Fh4rm3l\u002F)                      |\n| 2024.08 | **EnJa: Ensemble Jailbreak on Large Language Models (EnJa)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.03603)                                          |                                           -                                            |\n| 2024.07 | **Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11682)                                          |               [link](https:\u002F\u002Fgithub.com\u002FTHU-KEG\u002FKnowledge-to-Jailbreak\u002F)               |\n| 2024.07 | **LLMs can be Dangerous Reasoners: Analyzing-based Jailbreak Attack on Large Language Models** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.16205)                                
          |                                                                                        |\n| 2024.07 | **Single Character Perturbations Break LLM Alignment** |   arXiv    |                                     [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.03232#page=3.00)                                     |                  [link](https:\u002F\u002Fgithub.com\u002Fhannah-aught\u002Fspace_attack)                  |\n| 2024.07 | **A False Sense of Safety: Unsafe Information Leakage in 'Safe' AI Responses** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.02551)                                          |                                           -                                            |\n| 2024.07 | **Virtual Context: Enhancing Jailbreak Attacks with Special Token Injection (Virtual Context)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.19845)                                          |                                           -                                            |\n| 2024.07 | **SoP: Unlock the Power of Social Facilitation for Automatic Jailbreak Attack (SoP)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01902)                                          |                    [link](https:\u002F\u002Fgithub.com\u002FYang-Yan-Yang-Yan\u002FSoP)                    |\n| 2024.06 | **Jailbreaking as a Reward Misspecification Problem**| ICLR'25|                                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.14393)                                                          |                  [link](https:\u002F\u002Fgithub.com\u002Fzhxieml\u002Fremiss-jailbreak)                   |\n| 2024.06 | **Improved Few-Shot Jailbreaking Can Circumvent Aligned Language Models and Their Defenses (I-FSJ)** |    NeurIPS'24   
 |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01288)                                          |                        [link](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FI-FSJ)                        |\n| 2024.06 | **When LLM Meets DRL: Advancing Jailbreaking Efficiency via DRL-guided Search (RLbreaker)** |   NeurIPS'24   |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08705)                                          |                                           -                                            |\n| 2024.06 | **Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast (Agent Smith)** |   ICML'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08567)                                          |                     [link](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FAgent-Smith)                     |\n| 2024.06 | **Covert Malicious Finetuning: Challenges in Safeguarding LLM Adaptation** |   ICML'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.20053)                                          |                                           -                                            |\n| 2024.06 | **ArtPrompt: ASCII Art-based Jailbreak Attacks against Aligned LLMs (ArtPrompt)** |   ACL'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11753)                                          |                      [link](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FArtPrompt)                       |\n| 2024.06 | **From Noise to Clarity: Unraveling the Adversarial Suffix of Large Language Model Attacks via Translation of Text Embeddings (ASETF)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16006)                   
                       |                                           -                                            |\n| 2024.06 | **CodeAttack: Revealing Safety Generalization Challenges of Large Language Models via Code Completion (CodeAttack)** |   ACL'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.07865)                                          |                                           -                                            |\n| 2024.06 | **Making Them Ask and Answer: Jailbreaking Large Language Models in Few Queries via Disguise and Reconstruction (DRA)** |   USENIX Security'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.18104)                                          |                        [link](https:\u002F\u002Fgithub.com\u002FLLM-DRA\u002FDRA\u002F)                         |\n| 2024.06 | **AutoJailbreak: Exploring Jailbreak Attacks and Defenses through a Dependency Lens (AutoJailbreak)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08705)                                          |                                           -                                            |\n| 2024.06 | **Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.02151)                                          |                [link](https:\u002F\u002Fgithub.com\u002Ftml-epfl\u002Fllm-adaptive-attacks)                |\n| 2024.06 | **GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts (GPTFUZZER)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.10253)                                          |                    [link](https:\u002F\u002Fgithub.com\u002Fsherdencooper\u002FGPTFuzz)       
             |\n| 2024.06 | **A Wolf in Sheep’s Clothing: Generalized Nested Jailbreak Prompts can Fool Large Language Models Easily (ReNeLLM)** |   NAACL'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.08268)                                          |                       [link](https:\u002F\u002Fgithub.com\u002FNJUNLP\u002FReNeLLM)                        |\n| 2024.06 | **QROA: A Black-Box Query-Response Optimization Attack on LLMs (QROA)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.02044)                                          |                          [link](https:\u002F\u002Fgithub.com\u002Fqroa\u002Fqroa)                          |\n| 2024.06 | **Poisoned LangChain: Jailbreak LLMs by LangChain (PLC)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18122)                                          |                 [link](https:\u002F\u002Fgithub.com\u002FCAM-FSS\u002Fjailbreak-langchain)                 |\n| 2024.05 | **Multilingual Jailbreak Challenges in Large Language Models** |   ICLR'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06474)                                          |          [link](https:\u002F\u002Fgithub.com\u002FDAMO-NLP-SG\u002Fmultilingual-safety-for-LLMs)           |\n| 2024.05 | **DeepInception: Hypnotize Large Language Model to Be Jailbreaker (DeepInception)** |   EMNLP'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03191)                                          |                  [link](https:\u002F\u002Fgithub.com\u002Ftmlr-group\u002FDeepInception)                   |\n| 2024.05 | **GPT-4 Jailbreaks Itself with Near-Perfect Success Using Self-Explanation (IRIS)** |   ACL'24    |                                          
[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13077)                                          |                                           -                                            |\n| 2024.05 | **GUARD: Role-playing to Generate Natural-language Jailbreakings to Test Guideline Adherence of LLMs (GUARD)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.03299)                                          |                                           -                                            |\n| 2024.05 | **\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models (DAN)** |   CCS'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.03825)                                          |                   [link](https:\u002F\u002Fgithub.com\u002Fverazuo\u002Fjailbreak_llms)                    |\n| 2024.05 | **Gpt-4 is too smart to be safe: Stealthy chat with llms via cipher (SelfCipher)** |   ICLR'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.06463)                                          |                    [link](https:\u002F\u002Fgithub.com\u002FRobustNLP\u002FCipherChat)                     |\n| 2024.05 | **Jailbreaking Large Language Models Against Moderation Guardrails via Cipher Characters (JAM)** | NeurIPS'24 |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.20413)                                          |                                           -                                            |\n| 2024.05 | **Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations (ICA)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06387)                                          |                                           
-                                            |\n| 2024.04 | **Many-shot jailbreaking (MSJ)** |   NeurIPS'24 Anthropic   | [link](https:\u002F\u002Fwww-cdn.anthropic.com\u002Faf5633c94ed2beb282f6a53c595eb437e8e7b630\u002FMany_Shot_Jailbreaking__2024_04_02_0936.pdf) |                                           -                                            |\n| 2024.04 | **PANDORA: Detailed LLM jailbreaking via collaborated phishing agents with decomposed reasoning (PANDORA)** |   ICLR Workshop'24    |                                      [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=9o06ugFxIj)                                      |                                           -                                            |\n| 2024.04 | **Fuzzllm: A novel and universal fuzzing framework for proactively discovering jailbreak vulnerabilities in large language models (FuzzLLM)** |   ICASSP'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05274)                                          |                     [link](https:\u002F\u002Fgithub.com\u002FRainJamesY\u002FFuzzLLM)                      |\n| 2024.04 | **Sandwich attack: Multi-language mixture adaptive attack on llms (Sandwich attack)** |   TrustNLP'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.07242)                                          |                                           -                                            |\n| 2024.03 | **Tastle: Distract large language models for automatic jailbreak attack (TASTLE)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08424)                                          |                                           -                                            |\n| 2024.03 | **DrAttack: Prompt Decomposition and Reconstruction Makes Powerful LLM Jailbreakers (DrAttack)** |   EMNLP'24    |              
                            [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16914)                                          |                      [link](https:\u002F\u002Fgithub.com\u002Fxirui-li\u002FDrAttack)                      |\n| 2024.02 | **PRP: Propagating Universal Perturbations to Attack Large Language Model Guard-Rails (PRP)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15911)                                          |                                           -                                            |\n| 2024.02 | **CodeChameleon: Personalized Encryption Framework for Jailbreaking Large Language Models (CodeChameleon)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16717)                                          |                  [link](https:\u002F\u002Fgithub.com\u002Fhuizhang-L\u002FCodeChameleon)                   |\n| 2024.02 | **PAL: Proxy-Guided Black-Box Attack on Large Language Models (PAL)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.09674)                                          |                         [link](https:\u002F\u002Fgithub.com\u002Fchawins\u002Fpal)                         |\n| 2024.02 | **Jailbreaking Proprietary Large Language Models using Word Substitution Cipher** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10601)                                          |                                           -                                            |\n| 2024.02 | **Query-Based Adversarial Prompt Generation** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.12329)                                          |                                           -                                            |\n| 
2024.02 | **Leveraging the Context through Multi-Round Interactions for Jailbreaking Attacks (Contextual Interaction Attack)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.09177)                                          |                                           -                                            |\n| 2024.02 | **Semantic Mirror Jailbreak: Genetic Algorithm Based Jailbreak Prompts Against Open-source LLMs (SMJ)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.14872)                                          |                                           -                                            |\n| 2024.02 | **Cognitive Overload: Jailbreaking Large Language Models with Overloaded Logical Thinking** |   NAACL'24    |                                    [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09827#page=10.84)                                     |                [link](https:\u002F\u002Fgithub.com\u002Fluka-group\u002FCognitiveOverload)                 |\n| 2024.01 | **Low-Resource Languages Jailbreak GPT-4** |   NeurIPS Workshop'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.02446)                                          |                                           -                                            |\n| 2024.01 | **How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs (PAP)** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06373)                                          |              [link](https:\u002F\u002Fgithub.com\u002FCHATS-lab\u002Fpersuasive_jailbreaker)               |\n| 2023.12 | **Tree of Attacks: Jailbreaking Black-Box LLMs Automatically (TAP)** |   NeurIPS'24   |                                          
[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.02119)                                          |                       [link](https:\u002F\u002Fgithub.com\u002FRICommunity\u002FTAP)                       |\n| 2023.12 | **Make Them Spill the Beans! Coercive Knowledge Extraction from (Production) LLMs** |   arXiv    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04782)                                          |                                           -                                            |\n| 2023.12 | **Ignore This Title and HackAPrompt: Exposing Systemic Vulnerabilities of LLMs through a Global Scale Prompt Hacking Competition** |   ACL'24    |                                   [link](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.302\u002F)                                    |                                           -                                            |\n| 2023.11 | **Scalable and Transferable Black-Box Jailbreaks for Language Models via Persona Modulation (Persona)** |   NeurIPS Workshop'23    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03348)                                          |                                           -                                            |\n| 2023.10 | **Jailbreaking Black Box Large Language Models in Twenty Queries (PAIR)** |   NeurIPS'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.08419)                                          |                [link](https:\u002F\u002Fgithub.com\u002Fpatrickrchao\u002FJailbreakingLLMs)                |\n| 2023.10 | **Adversarial Demonstration Attacks on Large Language Models (advICL)** |   EMNLP'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14950)                                          |                                           -               
                             |\n| 2023.10 | **MASTERKEY: Automated Jailbreaking of Large Language Model Chatbots (MASTERKEY)** |   NDSS'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.08715)                                          |                    [link](https:\u002F\u002Fgithub.com\u002FLLMSecurity\u002FMasterKey)                    |\n| 2023.10 | **Attack Prompt Generation for Red Teaming and Defending Large Language Models (SAP)** |   EMNLP'23    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12505)                                          |                        [link](https:\u002F\u002Fgithub.com\u002FAatrox103\u002FSAP)                        |\n| 2023.10 | **An LLM can Fool Itself: A Prompt-Based Adversarial Attack (PromptAttack)** |   ICLR'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.13345)                                          |                   [link](https:\u002F\u002Fgithub.com\u002FGodXuxilie\u002FPromptAttack)                   |\n| 2023.09 | **Multi-step Jailbreaking Privacy Attacks on ChatGPT (MJP)** |   EMNLP Findings'23    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.05197)                                          |           [link](https:\u002F\u002Fgithub.com\u002FHKUST-KnowComp\u002FLLM-Multistep-Jailbreak)            |\n| 2023.09 | **Open Sesame! 
Universal Black Box Jailbreaking of Large Language Models (GA)** |   Applied Sciences'24    |                                          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.01446)                                          |                                           -                                            |\n| 2023.05 | **Not what you’ve signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection** |   CCS'23    |                           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.12173?trk=public_post_comment-text)                            |                    [link](https:\u002F\u002Fgithub.com\u002Fgreshake\u002Fllm-security)                    |\n| 2022.11 | **Ignore Previous Prompt: Attack Techniques For Language Models (PromptInject)** |   NeurIPS WorkShop'22    |                                          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09527)                                          |                [link](https:\u002F\u002Fgithub.com\u002Fagencyenterprise\u002FPromptInject)                |\n\n\n\n\n\n\n\n\n\n\n\n\n\n#### White-box Attack\n\n| Year    | Title                                                        |      Venue       |                            Paper                             |                            Code                            |\n| ------- | ------------------------------------------------------------ | :--------------: | :----------------------------------------------------------: | :--------------------------------------------------------: |\n| 2025.08 | **Don’t Say No: Jailbreaking LLM by Suppressing Refusal (DSN)** | ACL'25 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.16369) |    [link](https:\u002F\u002Fgithub.com\u002FDSN-2024\u002FDSN) |\n| 2025.03 | **Guiding not Forcing: Enhancing the Transferability of Jailbreaking Attacks on LLMs via Removing Superfluous Constraints** | arXiv | 
[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01865) |    [link](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FTransferAttack) |\n| 2025.02 | **Improved techniques for optimization-based jailbreaking on large language models (I-GCG)** |   ICLR'25     | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.21018) |        [link](https:\u002F\u002Fgithub.com\u002FjiaxiaojunQAQ\u002FI-GCG)      |\n| 2024.12 | **Efficient Adversarial Training in LLMs with Continuous Attacks** | NeurIPS'24 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15589) | [link](https:\u002F\u002Fgithub.com\u002Fsophie-xhonneux\u002FContinuous-AdvTrain) |\n| 2024.11 | **AmpleGCG-Plus: A Strong Generative Model of Adversarial Suffixes to Jailbreak LLMs with Higher Success Rates in Fewer Attempts** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.22143v1)           |                          -                             |\n| 2024.11 | **DROJ: A Prompt-Driven Attack against Large Language Models** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.09125)           |                           [link](https:\u002F\u002Fgithub.com\u002FLeon-Leyang\u002FLLM-Safeguard)                              |\n| 2024.11 | **SQL Injection Jailbreak: a structural disaster of large language models** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01565)           |                           [link](https:\u002F\u002Fgithub.com\u002Fweiyezhimeng\u002FSQL-Injection-Jailbreak)                              |\n| 2024.10 | **Functional Homotopy: Smoothing Discrete Optimization via Continuous Parameters for LLM Jailbreak Attacks** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.04234)           |                           -                              |\n| 2024.10 | **AttnGCG: Enhancing Jailbreaking Attacks on LLMs with Attention Manipulation** |      arXiv       | 
          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.09040v1)           |                           [link](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002FAttnGCG-attack)                               |\n| 2024.10 | **Jailbreak Instruction-Tuned LLMs via end-of-sentence MLP Re-weighting** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.10150v1)           |                            -                              |\n| 2024.10 | **Boosting Jailbreak Transferability for Large Language Models (SI-GCG)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.15645v1)           |                            -                              |\n| 2024.10 | **Iterative Self-Tuning LLMs for Enhanced Jailbreaking Capabilities (ADV-LLM)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.18469v1)           |                             [link](https:\u002F\u002Fgithub.com\u002FSunChungEn\u002FADV-LLM)                               |\n| 2024.08 | **Probing the Safety Response Boundary of Large Language Models via Unsafe Decoding Path Generation (JVD)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.10668)           |                             -                              |\n| 2024.08 | **Jailbreak Open-Sourced Large Language Models via Enforced Decoding (EnDec)** |      ACL'24      | [link](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.299.pdf#page=4.96) |                             -                              |\n| 2024.07 | **Best-of-Venom: Attacking RLHF by Injecting Poisoned Preference Data** |      COLM'24       |           [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05530)           |    -    |\n| 2024.07 | **Refusal in Language Models Is Mediated by a Single Direction** |      arXiv       |           [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11717)           |    
[Link](https:\u002F\u002Fgithub.com\u002Fandyrdt\u002Frefusal_direction)    |\n| 2024.07 | **Revisiting Character-level Adversarial Attacks for Language Models** |     ICML'24      |           [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04346)           |       [link](https:\u002F\u002Fgithub.com\u002FLIONS-EPFL\u002FCharmer)        |\n| 2024.07 | **Badllama 3: removing safety finetuning from Llama 3 in minutes (Badllama 3)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01376)           |                             -                              |\n| 2024.07 | **SOS! Soft Prompt Attack Against Open-Source Large Language Models** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.03160)           |                             -                              |\n| 2024.06 | **COLD-Attack: Jailbreaking LLMs with Stealthiness and Controllability (COLD-Attack)** |     ICML'24      |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08679)           |      [link](https:\u002F\u002Fgithub.com\u002FYu-Fangxu\u002FCOLD-Attack)      |\n| 2024.05 | **Semantic-guided Prompt Organization for Universal Goal Hijacking against LLMs** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14189)           |                                                            |\n| 2024.05 | **Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization** |    NeurIPS'24    |           [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.09113)           |                             -                              |\n| 2024.05 | **AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models (AutoDAN)** |     ICLR'24      |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.04451)           |      [link](https:\u002F\u002Fgithub.com\u002FSheltonLiu-N\u002FAutoDAN)       |\n| 2024.05 | **AmpleGCG: Learning a Universal and 
Transferable Generative Model of Adversarial Suffixes for Jailbreaking Both Open and Closed LLMs (AmpleGCG)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.07921)           |     [link](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FAmpleGCG)      |\n| 2024.05 | **Boosting jailbreak attack with momentum (MAC)**            | ICLR Workshop'24 |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.01229)           |  [link](https:\u002F\u002Fgithub.com\u002Fweizeming\u002Fmomentum-attack-llm)  |\n| 2024.04 | **AdvPrompter: Fast Adaptive Adversarial Prompting for LLMs (AdvPrompter)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.16873)           |  [link](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fadvprompter)   |\n| 2024.03 | **Universal Jailbreak Backdoors from Poisoned Human Feedback** |     ICLR'24      |       [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=GxCGsxiAaK)       |                             -                              |\n| 2024.02 | **Attacking large language models with projected gradient descent (PGD)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.09154)           |                             -                              |\n| 2024.02 | **Open the Pandora's Box of LLMs: Jailbreaking LLMs through Representation Engineering (JRE)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06824)           |                             -                              |\n| 2024.02 | **Curiosity-driven red-teaming for large language models (CRT)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.19464)           | [link](https:\u002F\u002Fgithub.com\u002FImprobable-AI\u002Fcuriosity_redteam) |\n| 2023.12 | **AutoDAN: Interpretable Gradient-Based Adversarial Attacks on Large Language Models (AutoDAN)** |      arXiv       |           
[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.15140)           |    [link](https:\u002F\u002Fgithub.com\u002Frotaryhammer\u002Fcode-autodan)    |\n| 2023.10 | **Catastrophic jailbreak of open-source llms via exploiting generation** |     ICLR'24      |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06987)           |  [link](https:\u002F\u002Fgithub.com\u002FPrinceton-SysML\u002FJailbreak_LLM)  |\n| 2023.06 | **Automatically Auditing Large Language Models via Discrete Optimization (ARCA)** |     ICML'23      | [link](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fjones23a\u002Fjones23a.pdf) |     [link](https:\u002F\u002Fgithub.com\u002Fejones313\u002Fauditing-llms)     |\n| 2023.07 | **Universal and Transferable Adversarial Attacks on Aligned Language Models (GCG)** |      arXiv       |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15043)           |     [link](https:\u002F\u002Fgithub.com\u002Fllm-attacks\u002Fllm-attacks)     |\n\n\n\n\n\n#### Multi-turn Attack\n\n\n\n| Time    | Title                                                        |   Venue   |                  Paper                   |                             Code                             |\n| ------- | ------------------------------------------------------------ | :-------: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2025.04 | **Multi-Turn Jailbreaking Large Language Models via Attention Shifting** |   AAAI'25    | [link](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F34553) |    -      |\n| 2025.04 | **X-Teaming: Multi-Turn Jailbreaks and Defenses with Adaptive Multi-Agents** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.13203) |    [link](https:\u002F\u002Fgithub.com\u002Fsalman-lui\u002Fx-teaming)      |\n| 2025.04 | **Strategize Globally, Adapt Locally: A Multi-Turn Red Teaming Agent with Dual-Level Learning** |   
arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.01278) |      -    |\n| 2025.03 | **Foot-In-The-Door: A Multi-turn Jailbreak for LLMs** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.19820) |    [link](https:\u002F\u002Fgithub.com\u002FJinxiaolong1129\u002FFoot-in-the-door-Jailbreak)      |\n| 2025.03 | **Siege: Autonomous Multi-Turn Jailbreaking of Large Language Models with Tree Search** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.10619) |      -    |\n| 2024.11 | **MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.03814) |      -    |\n| 2024.10 | **Jigsaw Puzzles: Splitting Harmful Questions to Jailbreak Large Language Models (JSP)** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11459v1) |    [link](https:\u002F\u002Fgithub.com\u002FYangHao97\u002FJigSawPuzzles)      |\n| 2024.10 | **Multi-round jailbreak attack on large language** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11533v1) |     -      |\n| 2024.10 | **Derail Yourself: Multi-turn LLM Jailbreak Attack through Self-discovered Clues** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10700) |     [link](https:\u002F\u002Fgithub.com\u002Frenqibing\u002FActorAttack)      |\n| 2024.10 | **Automated Red Teaming with GOAT: the Generative Offensive Agent Tester** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.01606) |     -      |\n| 2024.09 | **LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.15221) |     [link](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FScaleAI\u002Fmhj)      |\n| 2024.09 | **RED QUEEN: Safeguarding Large Language Models against Concealed Multi-Turn Jailbreaking** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.17458) |       
[link](https:\u002F\u002Fgithub.com\u002Fkriti-hippo\u002Fred_queen)       |\n| 2024.08 | **FRACTURED-SORRY-Bench: Framework for Revealing Attacks in Conversational Turns Undermining Refusal Efficacy and Defenses over SORRY-Bench (Automated Multi-shot Jailbreaks)** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.16163) |     -      |\n| 2024.08 | **Emerging Vulnerabilities in Frontier Models: Multi-Turn Jailbreak Attacks** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.00137) | [link](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftom-gibbs\u002Fmulti-turn_jailbreak_attack_datasets) |\n| 2024.05 | **CoA: Context-Aware based Chain of Attack for Multi-Turn Dialogue LLM (CoA)** |   arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.05610) |           [link](https:\u002F\u002Fgithub.com\u002FYancyKahn\u002FCoA)           |\n| 2024.04 | **Great, Now Write an Article About That: The Crescendo Multi-Turn LLM Jailbreak Attack (Crescendo)** | Microsoft Azure | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01833) |                              -                               |\n\n\n\n\n\n\n\n#### Attack on RAG-based LLM\n\n\n\n| Time    | Title                                                        | Venue |                  Paper                   |                             Code                             |\n| ------- | ------------------------------------------------------------ | :---: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2024.09 | **Unleashing Worms and Extracting Data: Escalating the Outcome of Attacks against RAG-based Inference in Scale and Severity Using Jailbreaking** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.08045) | [link](https:\u002F\u002Fgithub.com\u002FStavC\u002FUnleashingWorms-ExtractingData) |\n| 2024.02 | **Pandora: Jailbreak GPTs by Retrieval Augmented Generation Poisoning (Pandora)** | 
arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08416) |                              -                               |\n\n\n\n\n\n\n\n#### Multi-modal Attack\n\n| Time | Title                                                        |  Venue  |                            Paper                             |                             Code                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2024.11 | **Jailbreak Attacks and Defenses against Multimodal Generative Models: A Survey** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.09259) | [link](https:\u002F\u002Fgithub.com\u002Fliuxuannan\u002FAwesome-Multimodal-Jailbreak) |\n| 2024.10 | **Chain-of-Jailbreak Attack for Image Generation Models via Editing Step by Step** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.03869) | - |\n| 2024.10 | **ColJailBreak: Collaborative Generation and Editing for Jailbreaking Text-to-Image Deep Generation** | NeurIPS'24 | [Link](https:\u002F\u002Fnips.cc\u002Fvirtual\u002F2024\u002Fposter\u002F94287) | - |\n| 2024.08 | **Jailbreaking Text-to-Image Models with LLM-Based Agents (Atlas)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.00523) |                              -                              |\n| 2024.07 | **Image-to-Text Logic Jailbreak: Your Imagination can Help You Do Anything** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.02534) |                              -                              |\n| 2024.06 | **Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.04031) |                              
[link](https:\u002F\u002Fgithub.com\u002FNY1024\u002FBAP-Jailbreak-Vision-Language-Models-via-Bi-Modal-Adversarial-Prompt)                               |\n| 2024.05 | **Voice Jailbreak Attacks Against GPT-4o** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.19103) |                              [link](https:\u002F\u002Fgithub.com\u002FTrustAIRLab\u002FVoiceJailbreakAttack)                               |\n| 2024.05 | **Automatic Jailbreaking of the Text-to-Image Generative AI Systems** |     ICML'24 Workshop    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.16567) | [link](https:\u002F\u002Fgithub.com\u002FKim-Minseon\u002FAPGP) |\n| 2024.04 | **Image hijacks: Adversarial images can control generative models at runtime** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.00236) |                              [link](https:\u002F\u002Fgithub.com\u002Feuanong\u002Fimage-hijacks)                               |\n| 2024.03 | **An image is worth 1000 lies: Adversarial transferability across prompts on vision-language models (CroPA)** |   ICLR'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09766) |                              [link](https:\u002F\u002Fgithub.com\u002FHaochen-Luo\u002FCroPA)                               |\n| 2024.03 | **Jailbreak in pieces: Compositional adversarial attacks on multi-modal language model** |   ICLR'24    | [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=plmBsXHxgR) |                              -                               |\n| 2024.03 | **Rethinking model ensemble in transfer-based adversarial attacks** |   ICLR'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.09105) |                              [link](https:\u002F\u002Fgithub.com\u002Fhuanranchen\u002FAdversarialAttacks)                               |\n| 2024.02 | **VLATTACK: Multimodal Adversarial Attacks on Vision-Language Tasks via Pre-trained Models** |   NeurIPS'23    | 
[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04655) |               [link](https:\u002F\u002Fgithub.com\u002Fericyinyzy\u002FVLAttack)                                         |\n| 2024.02 | **Jailbreaking Attack against Multimodal Large Language Model** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.02309) |                             -                          |\n| 2024.01 | **Jailbreaking GPT-4V via Self-Adversarial Attacks with System Prompts** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09127) |                             -                          |\n| 2024.03 | **Visual Adversarial Examples Jailbreak Aligned Large Language Models** |   AAAI'24    | [link](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F30150\u002F32038) |                              -                               |\n| 2023.12 | **OT-Attack: Enhancing Adversarial Transferability of Vision-Language Models via Optimal Transport Optimization (OT-Attack)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04403) |                              -                               |\n| 2023.12 | **FigStep: Jailbreaking Large Vision-language Models via Typographic Visual Prompts (FigStep)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.05608) |                              [link](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FFigStep)                               |\n| 2023.11 | **SneakyPrompt: Jailbreaking Text-to-image Generative Models** |   S&P'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.12082) |                              [link](https:\u002F\u002Fgithub.com\u002FYuchen413\u002Ftext2image_safety)                               |\n| 2023.11 | **On Evaluating Adversarial Robustness of Large Vision-Language Models** |   NeurIPS'23    | 
[link](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002Fa97b58c4f7551053b0512f92244b0810-Paper-Conference.pdf) |                              [link](https:\u002F\u002Fgithub.com\u002Fyunqing-me\u002FAttackVLM)                               |\n| 2023.10 | **How Robust is Google's Bard to Adversarial Image Attacks?** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.11751) |                              [link](https:\u002F\u002Fgithub.com\u002Fthu-ml\u002FAttack-Bard)                               |\n| 2023.08 | **AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning (AdvCLIP)** |   ACM MM'23    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07026) |                              [link](https:\u002F\u002Fgithub.com\u002FCGCL-codes\u002FAdvCLIP)                               |\n| 2023.07 | **Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models (SGA)** |   ICCV'23    | [link](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023\u002Fpapers\u002FLu_Set-level_Guidance_Attack_Boosting_Adversarial_Transferability_of_Vision-Language_Pre-training_Models_ICCV_2023_paper.pdf) |                              [link](https:\u002F\u002Fgithub.com\u002FZoky-2020\u002FSGA)                               |\n| 2023.07 | **On the Adversarial Robustness of Multi-Modal Foundation Models** |   ICCV Workshop'23    | [link](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023W\u002FAROW\u002Fpapers\u002FSchlarmann_On_the_Adversarial_Robustness_of_Multi-Modal_Foundation_Models_ICCVW_2023_paper.pdf) |                              -                               |\n| 2022.10 | **Towards Adversarial Attack on Vision-Language Pre-training Models** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.09391) |                              
[link](https:\u002F\u002Fgithub.com\u002Fadversarial-for-goodness\u002FCo-Attack)                               |\n\n\n\n\n\n\n\n\n\n\n\n\n\n### Jailbreak Defense\n\n#### Learning-based Defense\n| Time | Title                                                        |  Venue  |                            Paper                             |                             Code                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **Rethinking Jailbreak Detection of Large Vision Language Models with Representational Contrastive Scoring** | arXiv'25 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.12069) | [link](https:\u002F\u002Fgithub.com\u002Fsarendis56\u002FJailbreak_Detection_RCS) |\n| 2025.07 | **Reasoning as an Adaptive Defense for Safety** | NeurIPS'25 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.00971) | [link](https:\u002F\u002Ftraining-adaptive-reasoners-safety.github.io) |\n| 2025.04 | **JailDAM: Jailbreak Detection with Adaptive Memory for Vision-Language Model** | COLM'25 | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.03770) | [link](https:\u002F\u002Fgithub.com\u002FShenzheZhu\u002FJailDAM) |\n| 2024.12 | **Shaping the Safety Boundaries: Understanding and Defending Against Jailbreaks in Large Language Models** | arXiv'24 | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.17034) | - |\n| 2024.10 | **Safety-Aware Fine-Tuning of Large Language Models** | arXiv'24 | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.10014) | - |\n| 2024.10 | **MoJE: Mixture of Jailbreak Experts, Naive Tabular Classifiers as Guard for Prompt Attacks** | AAAI'24 | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.17699) | - |\n| 2024.08 | **BaThe: Defense against the Jailbreak Attack in Multimodal Large Language Models by Treating Harmful 
Instruction as Backdoor Trigger (BaThe)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.09093)  |                               -                              |\n| 2024.07 | **DART: Deep Adversarial Automated Red Teaming for LLM Safety** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.03876)  |                               -                              |\n| 2024.07 | **Eraser: Jailbreaking Defense in Large Language Models via Unlearning Harmful Knowledge (Eraser)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05880) |                              [link](https:\u002F\u002Fgithub.com\u002FZeroNLP\u002FEraser)                               |\n| 2024.07 | **Safe Unlearning: A Surprisingly Effective and Generalizable Solution to Defend Against Jailbreak Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.02855) |                              [link](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafeUnlearning)                               |\n| 2024.06 | **Adversarial Tuning: Defending Against Jailbreak Attacks for LLMs** | arXiv | [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.06622) | - |\n| 2024.06 | **Jatmo: Prompt Injection Defense by Task-Specific Finetuning (Jatmo)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17673) |                              [link](https:\u002F\u002Fgithub.com\u002Fwagner-group\u002Fprompt-injection-defense)                               |\n| 2024.06 | **Defending Large Language Models Against Jailbreaking Attacks Through Goal Prioritization (GoalPriority)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09096) |                              [link](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FJailbreakDefense_GoalPriority)                               |\n| 2024.06 | **Mitigating Fine-tuning based Jailbreak Attack with Backdoor Enhanced Safety Alignment** |   NeurIPS'24   | 
[link](https:\u002F\u002Fjayfeather1024.github.io\u002FFinetuning-Jailbreak-Defense\u002F) |                              [link](https:\u002F\u002Fgithub.com\u002FJayfeather1024\u002FBackdoor-Enhanced-Alignment)                               |\n| 2024.06 | **On Prompt-Driven Safeguarding for Large Language Models (DRO)** |   ICML'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.18018) |                              [link](https:\u002F\u002Fgithub.com\u002Fchujiezheng\u002FLLM-Safeguard)          |\n| 2024.06 | **Robust Prompt Optimization for Defending Language Models Against Jailbreaking Attacks (RPO)** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.17263) |                              -          |\n| 2024.06 | **Fight Back Against Jailbreaking via Prompt Adversarial Tuning (PAT)** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.06255) |                              [link](https:\u002F\u002Fgithub.com\u002Frain152\u002FPAT)          |\n| 2024.05 | **Towards Comprehensive and Efficient Post Safety Alignment of Large Language Models via Safety Patching (SAFEPATCHING)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.13820) |                              -          |\n| 2024.05 | **Detoxifying Large Language Models via Knowledge Editing (DINM)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.14472) |                              [link](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FEasyEdit\u002Fblob\u002Fmain\u002Fexamples\u002FSafeEdit.md)          |\n| 2024.05 | **Defending Large Language Models Against Jailbreak Attacks via Layer-specific Editing** |   arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.18166) | [link](https:\u002F\u002Fgithub.com\u002Fledllm\u002Fledllm) |\n| 2023.11 | **MART: Improving LLM Safety with Multi-round Automatic Red-Teaming (MART)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.07689) |         
                     -          |\n| 2023.11 | **Baseline defenses for adversarial attacks against aligned language models** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.14132) |                              -                               |\n| 2023.10 | **Safe rlhf: Safe reinforcement learning from human feedback** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12773) |                              [link](https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf)                                |\n| 2023.08 | **Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment (RED-INSTRUCT)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.09662) |                              [link](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fred-instruct)                                |\n| 2022.04 | **Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback** |   Anthropic    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.05862?spm=a2c6h.13046898.publish-article.36.6cd56ffaIPu4NQ&file=2204.05862) |                              -                                |\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n#### Strategy-based Defense\n\n\n\n| Time | Title                                                        |  Venue  |                            Paper                             |                             Code                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **Compressed but Compromised? 
A Study of Jailbreaking in Compressed LLMs** | NeurIPS-W | [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [BlogPost Link](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.09 | **LLM Jailbreak Detection for (Almost) Free!** | arXiv | [link](http:\u002F\u002Farxiv.org\u002Fabs\u002F2509.14558) | [link](https:\u002F\u002Fgithub.com\u002FGuoruiC\u002FFJD) |\n| 2025.05 | **Reasoning-to-Defend: Safety-Aware Reasoning Can Defend Large Language Models from Jailbreaking** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.12970?) | [link](https:\u002F\u002Fgithub.com\u002Fchuhac\u002FReasoning-to-Defend) |\n| 2024.11 | **Rapid Response: Mitigating LLM Jailbreaks with a Few Examples** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.07494v1) | [link](https:\u002F\u002Fgithub.com\u002Frapidresponsebench\u002Frapidresponsebench) |\n| 2024.10 | **RePD: Defending Jailbreak Attack through a Retrieval-based Prompt Decomposition Process (RePD)** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.08660v1) |  - |\n| 2024.10 | **Guide for Defense (G4D): Dynamic Guidance for Robust and Balanced Defense in Large Language Models (G4D)** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.17922v1) |  [link](https:\u002F\u002Fgithub.com\u002FIDEA-XL\u002FG4D) |\n| 2024.10 | **Jailbreak Antidote: Runtime Safety-Utility Balance via Sparse Representation Adjustment in Large Language Models** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2410.02298v1) | - |\n| 2024.09 | **HSF: Defending against Jailbreak Attacks with Hidden State Filtering** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2409.03788v1) | [link](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FHidden-State-Filtering-8652\u002F) |\n| 2024.08 | **EEG-Defender: Defending against Jailbreak through Early Exit Generation of Large Language 
Models (EEG-Defender)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.11308) |                              -                               |\n| 2024.08 | **Prefix Guidance: A Steering Wheel for Large Language Models to Defend Against Jailbreak Attacks (PG)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.08924) |                              [link](https:\u002F\u002Fgithub.com\u002Fweiyezhimeng\u002FPrefix-Guidance)                               |\n| 2024.08 | **Self-Evaluation as a Defense Against Adversarial Attacks on LLMs (Self-Evaluation)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.03234#page=2.47) |                              [link](https:\u002F\u002Fgithub.com\u002FLinlt-leon\u002Fself-eval)                               |\n| 2024.06 | **Defending LLMs against Jailbreaking Attacks via Backtranslation (Backtranslation)** |   ACL Findings'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16459) |                              [link](https:\u002F\u002Fgithub.com\u002FYihanWang617\u002FLLM-Jailbreaking-Defense-Backtranslation)                               |\n| 2024.06 | **SafeDecoding: Defending against Jailbreak Attacks via Safety-Aware Decoding (SafeDecoding)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08983) |                              [link](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FSafeDecoding)                               |\n| 2024.06 | **Defending Against Alignment-Breaking Attacks via Robustly Aligned LLM** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.14348) |                              -                               |\n| 2024.06 | **A Wolf in Sheep’s Clothing: Generalized Nested Jailbreak Prompts can Fool Large Language Models Easily (ReNeLLM)** |   NAACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.08268) |                              
[link](https:\u002F\u002Fgithub.com\u002FNJUNLP\u002FReNeLLM)                               |\n| 2024.06 | **SMOOTHLLM: Defending Large Language Models Against Jailbreaking Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03684) |                              [link](https:\u002F\u002Fgithub.com\u002Farobey1\u002Fsmooth-llm)                               |\n| 2024.05 | **Enhancing Large Language Models Against Inductive Instructions with Dual-critique Prompting (Dual-critique)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13733) |                             [link](https:\u002F\u002Fgithub.com\u002FDevoAllen\u002FINDust)                               |\n| 2024.05 | **PARDEN, Can You Repeat That? Defending against Jailbreaks via Repetition (PARDEN)** |   ICML'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.07932) |                             [link](https:\u002F\u002Fgithub.com\u002FEd-Zh\u002FPARDEN)                               |\n| 2024.05 | **LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked** |   ICLR Tiny Paper'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07308) |                             [link](https:\u002F\u002Fgithub.com\u002Fpoloclub\u002Fllm-self-defense)                               |\n| 2024.05 | **GradSafe: Detecting Unsafe Prompts for LLMs via Safety-Critical Gradient Analysis (GradSafe)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.13494) |                             [link](https:\u002F\u002Fgithub.com\u002Fxyq7\u002FGradSafe)                               |\n| 2024.05 | **Multilingual Jailbreak Challenges in Large Language Models** |   ICLR'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06474)  |                              [link](https:\u002F\u002Fgithub.com\u002FDAMO-NLP-SG\u002Fmultilingual-safety-for-LLMs)                               |\n| 2024.05 | **Gradient Cuff: Detecting Jailbreak 
Attacks on Large Language Models by Exploring Refusal Loss Landscapes** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.00867)  |                              -                            |\n| 2024.05 | **AutoDefense: Multi-Agent LLM Defense against Jailbreak Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.04783) |                             [link](https:\u002F\u002Fgithub.com\u002FXHMY\u002FAutoDefense)                               |\n| 2024.05 | **Bergeron: Combating adversarial attacks through a conscience-based alignment framework (Bergeron)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.00029) |                             [link](https:\u002F\u002Fgithub.com\u002Fmatthew-pisano\u002FBergeron)                               |\n| 2024.05 | **Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations (ICD)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06387)  |                              -                               |\n| 2024.04 | **Protecting your llms with information bottleneck** |   NeurIPS'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.13968)  |                              [link](https:\u002F\u002Fgithub.com\u002Fzichuan-liu\u002FIB4LLMs)                                |\n| 2024.04 | **Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs Without Fine-Tuning** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.10862)  |                              [link](https:\u002F\u002Fgithub.com\u002FCrystalEye42\u002Feval-safety)                                |\n| 2024.02 | **Certifying LLM Safety against Adversarial Prompting** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.02705) |                              [link](https:\u002F\u002Fgithub.com\u002Faounon\u002Fcertified-llm-safety)                              |\n| 2024.02 | **Break the Breakout: Reinventing 
LM Defense Against Jailbreak Attacks with Self-Refinement** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15180) |                              -                              |\n| 2024.02 | **Defending large language models against jailbreak attacks via semantic smoothing (SEMANTICSMOOTH)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16192) |                              [link](https:\u002F\u002Fgithub.com\u002FUCSB-NLP-Chang\u002FSemanticSmooth)                             |\n| 2024.01 | **Intention Analysis Makes LLMs A Good Jailbreak Defender (IA)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06561) |                              [link](https:\u002F\u002Fgithub.com\u002Falphadl\u002FSafeLLM_with_IntentionAnalysis)                               |\n| 2024.01 | **How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs (PAP)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06373) |                              [link](https:\u002F\u002Fgithub.com\u002FCHATS-lab\u002Fpersuasive_jailbreaker)                               |\n| 2023.12 | **Defending ChatGPT against jailbreak attack via self-reminders (Self-Reminder)** |   Nature Machine Intelligence    | [link](https:\u002F\u002Fxyq7.github.io\u002Fpapers\u002FNMI-JailbreakDefense.pdf) |                              [link](https:\u002F\u002Fgithub.com\u002Fyjw1029\u002FSelf-Reminder\u002F)                               |\n| 2023.11 | **Detecting language model attacks with perplexity** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.14132) |                             -                              |\n| 2023.10 | **RAIN: Your Language Models Can Align Themselves without Finetuning (RAIN)** |    ICLR'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.07124) |                              
[link](https:\u002F\u002Fgithub.com\u002FSafeAILab\u002FRAIN)                               |\n\n\n\n\n\n\n\n\n\n\n#### Guard Model\n\n| Time    | Title                                                        |   Venue    |                            Paper                             |                             Code                             |\n| ------- | ------------------------------------------------------------ | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2026.02 | **GuardReasoner-Omni: A Reasoning-based Multi-modal Guardrail for Text, Image, and Video** | arXiv'26  |          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.03328)          |                              [link](https:\u002F\u002Fgithub.com\u002Fzzh-thu-22\u002FGuardReasoner-Omni)   |\n| 2025.12 | **OmniGuard: Unified Omni-Modal Guardrails with Deliberate Reasoning** | arXiv'25 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.02306) | - |\n| 2025.10 | **Think Twice, Generate Once: Safeguarding by Progressive Self-Reflection (PSR)** | EMNLP'25  |          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.01270)          |                              [link](https:\u002F\u002Fgithub.com\u002FVietHoang1512\u002FPSR)                  |\n| 2025.05 | **GuardReasoner-VL: Safeguarding VLMs via Reinforced Reasoning (GuardReasoner-VL)** | NeurIPS'25  |          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.11049)          |                              [link](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FGuardReasoner-VL\u002F)                               |\n| 2025.04 | **X-Guard: Multilingual Guard Agent for Content Moderation (X-Guard)** | arXiv'25  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.08848)          |                              [link](https:\u002F\u002Fgithub.com\u002FUNHSAILLab\u002FX-Guard)                               |\n| 2025.02 | 
**ThinkGuard: Deliberative Slow Thinking Leads to Cautious Guardrails (ThinkGuard)** | arXiv'25  |          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13458)          |                             [link](https:\u002F\u002Fgithub.com\u002Fluka-group\u002FThinkGuard)                              |\n| 2025.02 | **Constitutional Classifiers: Defending against Universal Jailbreaks across Thousands of Hours of Red Teaming** | arXiv'25  |          [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18837)          |                             -                               |\n| 2025.01 | **GuardReasoner: Towards Reasoning-based LLM Safeguards (GuardReasoner)** | ICLR Workshop'25  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.18492)          |                              [link](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FGuardReasoner\u002F)                               |\n| 2024.12 | **Lightweight Safety Classification Using Pruned Language Models (Sentence-BERT)** | arXiv'24  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.13435)          |                              -                               |\n| 2024.11 | **GuardFormer: Guardrail Instruction Pretraining for Efficient SafeGuarding (GuardFormer)** | Meta  |          [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vr31i9pzQk)          |                              -                               |\n| 2024.11 | **Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding Conversations (LLaMA Guard 3 Vision)** | Meta  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.10414?)          
|                              [link](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002Fllama-recipes\u002Ftree\u002Fmain\u002Frecipes\u002Fresponsible_ai\u002Fllama_guard)                                |\n| 2024.11 | **AEGIS2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails (Aegis2.0)** | Nvidia, NeurIPS'24 Workshop  |          [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=0MvGCv35wi)          |                              -                               |\n| 2024.11 | **Lightweight Safety Guardrails Using Fine-tuned BERT Embeddings (Sentence-BERT)** | arXiv'24  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14398?)          |                              -                               |\n| 2024.11 | **STAND-Guard: A Small Task-Adaptive Content Moderation Model (STAND-Guard)** | Microsoft  |          [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.05214v1)          |                              -                               |\n| 2024.10 | **VLMGuard: Defending VLMs against Malicious Prompts via Unlabeled Data** |   arXiv    |         [link](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2410.00296v1)          |                              -                               |\n| 2024.09 | **AEGIS: Online Adaptive AI Content Safety Moderation with Ensemble of LLM Experts (Aegis)** |   Nvidia   |           [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05993)           | [link](https:\u002F\u002Fhuggingface.co\u002Fnvidia\u002FAegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) |\n| 2024.09 | **Llama 3.2: Revolutionizing edge AI and vision with open, customizable models (LLaMA Guard 3)** |    Meta    | [link](https:\u002F\u002Fai.meta.com\u002Fblog\u002Fllama-3-2-connect-2024-vision-edge-mobile-devices\u002F) |  [link](https:\u002F\u002Fhuggingface.co\u002Fmeta-llama\u002FLlama-Guard-3-1B)  |\n| 2024.08 | **ShieldGemma: Generative AI Content Moderation Based on Gemma (ShieldGemma)** |   Google   |    
       [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.21772)           |     [link](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Fshieldgemma-2b)     |\n| 2024.07 | **WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs (WildGuard)** | NeurIPS'24 |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18495)           |         [link](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fwildguard)         |\n| 2024.06 | **GuardAgent: Safeguard LLM Agents by a Guard Agent via Knowledge-Enabled Reasoning (GuardAgent)** | arXiv'24 |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09187)           |         -        |\n| 2024.06 | **R2-Guard: Robust Reasoning Enabled LLM Guardrail via Knowledge-Enhanced Logical Reasoning (R2-Guard)** |   arXiv    |           [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.05557)           |       [link](https:\u002F\u002Fgithub.com\u002Fkangmintong\u002FR-2-Guard)       |\n| 2024.04 | **Llama Guard 2**                                            |    Meta    | [link](https:\u002F\u002Fwww.llama.com\u002Fdocs\u002Fmodel-cards-and-prompt-formats\u002Fmeta-llama-guard-2\u002F) | [link](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002FPurpleLlama\u002Fblob\u002Fmain\u002FLlama-Guard2\u002FMODEL_CARD.md) |\n| 2024.03 | **AdaShield: Safeguarding Multimodal Large Language Models from Structure-based Attack via Adaptive Shield Prompting (AdaShield)** |  ECCV'24   |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09513)           |      [link](https:\u002F\u002Fgithub.com\u002FSaFoLab-WISC\u002FAdaShield)       |\n| 2023.12 | **Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations (LLaMA Guard)** |    Meta    |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.06674)           | [link](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002FPurpleLlama\u002Ftree\u002Fmain\u002FLlama-Guard) |\n\n\n\n\n\n#### Moderation 
API\n\n\n\n| Time    | Title                                                        |      Venue      |                            Paper                             |                             Code                             |\n| ------- | ------------------------------------------------------------ | :-------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2023.08 | **Using GPT-4 for content moderation (GPT-4)**               |     OpenAI      | [link](https:\u002F\u002Fopenai.com\u002Findex\u002Fusing-gpt-4-for-content-moderation\u002F) |                              -                               |\n| 2023.02 | **A Holistic Approach to Undesired Content Detection in the Real World (OpenAI Moderation Endpoint)** |   AAAI OpenAI   |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03274)           |   [link](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmoderation-api-release)   |\n| 2022.02 | **A New Generation of Perspective API: Efficient Multilingual Character-level Transformers (Perspective API)** |   KDD Google    |           [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.11176)           |             [link](https:\u002F\u002Fperspectiveapi.com\u002F)              |\n| -       | **Azure AI Content Safety**                                  | Microsoft Azure |                              -                               | [link](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Fproducts\u002Fai-services\u002Fai-content-safety\u002F) |\n| -       | **Detoxify**                                                 |   unitary.ai    |                              -                               |        [link](https:\u002F\u002Fgithub.com\u002Funitaryai\u002Fdetoxify)         |\n| -       | **promptfoo** - LLM red teaming framework with adaptive multi-turn attacks (PAIR, tree-of-attacks, crescendo) |   promptfoo    |                              -   
                            |        [link](https:\u002F\u002Fgithub.com\u002Fpromptfoo\u002Fpromptfoo)         |\n\n\n\n\n\n\n\n\n### Evaluation \\& Analysis\n| Time | Title                                                        |  Venue  |                            Paper                             |                             Code                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2026.02 | **AgentLeak: A Full-Stack Benchmark for Privacy Leakage in Multi-Agent LLM Systems** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.11510) |  [link](https:\u002F\u002Fgithub.com\u002FPrivatris\u002FAgentLeak)  |\n| 2026.02 | **babel-bench: Multilingual Classical Language Safety Benchmark for LLMs (babel-bench)** | ICLR'26 | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.22983) |  [link](https:\u002F\u002Fgithub.com\u002FMARUCIE\u002Fbabel-bench)  |\n| 2025.12 | **Compressed but Compromised? A Study of Jailbreaking in Compressed LLMs** | NeurIPS-W | [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [BlogPost Link](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.08 | **JADES: A Universal Framework for Jailbreak Assessment via Decompositional Scoring** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2508.20848) |  [link](https:\u002F\u002Ftrustairlab.github.io\u002Fjades.github.io\u002F)  |\n| 2025.06 | **Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense** | USENIX Security'25 | [link](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2502.00840) |  [link](https:\u002F\u002Fgithub.com\u002FKevin-Zh-CS\u002FQuadA)  |\n| 2025.05 | **Are Vision-Language Models Safe in the Wild? 
A Meme-Based Benchmark Study** | EMNLP'25 | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.15389) |  [link](https:\u002F\u002Fgithub.com\u002Foneonlee\u002FMeme-Safety-Bench)  |\n| 2025.05 | **PandaGuard: Systematic Evaluation of LLM Safety against Jailbreaking Attacks** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.13862) |  [link](https:\u002F\u002Fgithub.com\u002FBeijing-AISI\u002Fpanda-guard)  |\n| 2025.05 | **Assessing Safety Risks and Quantization-aware Safety Patching for Quantized Large Language Models** | ICML'25 | [link](https:\u002F\u002Ficml.cc\u002Fvirtual\u002F2025\u002Fposter\u002F44278) |  [link](https:\u002F\u002Fgithub.com\u002FThecommonirin\u002FQresafe)  |\n| 2025.02 | **GuidedBench: Equipping Jailbreak Evaluation with Guidelines** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.16903) |  [link](https:\u002F\u002Fgithub.com\u002FSproutNan\u002FAI-Safety_Benchmark)  |\n| 2024.12 | **Agent-SafetyBench: Evaluating the Safety of LLM Agents** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.14470) |  [link](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FAgent-SafetyBench)  |\n| 2024.11 | **Global Challenge for Safe and Secure LLMs Track 1** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14502v1) | -  |\n| 2024.11 | **JailbreakLens: Interpreting Jailbreak Mechanism in the Lens of Representation and Circuit** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.11114v1) | -  |\n| 2024.11 | **The VLLM Safety Paradox: Dual Ease in Jailbreak Attack and Defense** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.08410v1) | -  |\n| 2024.11 | **HarmLevelBench: Evaluating Harm-Level Compliance and the Impact of Quantization on Model Alignment** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.06835v1) | - |\n| 2024.11 | **ChemSafetyBench: Benchmarking LLM Safety on Chemistry Domain** | arXiv | 
[link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.16736) | [link](https:\u002F\u002Fgithub.com\u002FHaochenZhao\u002FSafeAgent4Chem) |\n| 2024.11 | **GuardBench: A Large-Scale Benchmark for Guardrail Models** | EMNLP'24 | [link](https:\u002F\u002Faclanthology.org\u002F2024.emnlp-main.1022.pdf) | [link](https:\u002F\u002Fgithub.com\u002FAmenRa\u002Fguardbench) |\n| 2024.11 | **What Features in Prompts Jailbreak LLMs? Investigating the Mechanisms Behind Attacks** | arXiv | [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.03343v1) | [link](https:\u002F\u002Fgithub.com\u002FNLie2\u002Fwhat_features_jailbreak_LLMs) |\n| 2024.11 | **Benchmarking LLM Guardrails in Handling Multilingual Toxicity** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.22153v1) | [link](https:\u002F\u002Fcommoncrawl.github.io\u002Fcc-crawl-statistics\u002Fplots\u002Flanguages.html) |\n| 2024.10 | **JAILJUDGE: A Comprehensive Jailbreak Judge Benchmark with Multi-Agent Enhanced Explanation Evaluation Framework** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.12855) | [link](https:\u002F\u002Fgithub.com\u002Fusail-hkust\u002FJailjudge) |\n| 2024.10 | **Do LLMs Have Political Correctness? 
Analyzing Ethical Biases and Jailbreak Vulnerabilities in AI Systems** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13334v1) | [link](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FPCJailbreak-F2B0\u002FREADME.md) |\n| 2024.10 | **A Realistic Threat Model for Large Language Model Jailbreaks** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.16222v1) | [link](https:\u002F\u002Fgithub.com\u002Fvalentyn1boreiko\u002Fllm-threat-model) |\n| 2024.10 | **ADVERSARIAL SUFFIXES MAY BE FEATURES TOO!** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.00451) | [link](https:\u002F\u002Fgithub.com\u002Fsuffix-maybe-feature\u002Fadver-suffix-maybe-features) |\n| 2024.09 | **JAILJUDGE: A COMPREHENSIVE JAILBREAK** | arXiv | [Link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=cLYvhd0pDY) | [Link](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002Fpublic_multiagents_judge-66CB\u002FREADME.md) |\n| 2024.09 | **Multimodal Pragmatic Jailbreak on Text-to-image Models** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.19149) | [link](https:\u002F\u002Fgithub.com\u002Fmultimodalpragmatic\u002Fmultimodalpragmatic\u002Ftree\u002Fmain) |\n| 2024.08 | **ShieldGemma: Generative AI Content Moderation Based on Gemma (ShieldGemma)** |    arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.21772) | [link](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Fshieldgemma-2b) |\n| 2024.08 | **MMJ-Bench: A Comprehensive Study on Jailbreak Attacks and Defenses for Vision Language Models (MMJ-Bench)** |    arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.08464) | [link](https:\u002F\u002Fgithub.com\u002Fthunxxx\u002FMLLM-Jailbreak-evaluation-MMJ-bench) |\n| 2024.08 | **Mission Impossible: A Statistical Perspective on Jailbreaking LLMs** | NeurIPS'24 | [Link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.01420) | - |\n| 2024.07 | **Operationalizing a Threat Model for Red-Teaming Large Language Models (LLMs)** 
|    arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14937) | [link](https:\u002F\u002Fgithub.com\u002Fdapurv5\u002Fawesome-llm-red-teaming) |\n| 2024.07 | **JailBreakV-28K: A Benchmark for Assessing the Robustness of MultiModal Large Language Models against Jailbreak Attacks** |    arXiv   | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03027) | [link](https:\u002F\u002Fgithub.com\u002FEddyLuo1232\u002FJailBreakV_28K) |\n| 2024.07 | **Jailbreak Attacks and Defenses Against Large Language Models: A Survey** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04295)  |                     -                         |\n| 2024.06 | **\"Not Aligned\" is Not \"Malicious\": Being Careful about Hallucinations of Large Language Models' Jailbreak** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11668) |                              [link](https:\u002F\u002Fgithub.com\u002FMeirtz\u002FBabyBLUE-llm)                               |\n| 2024.06 | **WildTeaming at Scale: From In-the-Wild Jailbreaks to (Adversarially) Safer Language Models (WildTeaming)** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18510) |                              [link](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fwildteaming)                               |\n| 2024.06 | **From LLMs to MLLMs: Exploring the Landscape of Multimodal Jailbreaking** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.14859) |                              -                               |\n| 2024.06 | **AI Agents Under Threat: A Survey of Key Security Challenges and Future Pathways** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.02630) |                              -                               |\n| 2024.06 | **MM-SafetyBench: A Benchmark for Safety Evaluation of Multimodal Large Language Models (MM-SafetyBench)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.17600) |        
                      -                              |\n| 2024.06 | **ArtPrompt: ASCII Art-based Jailbreak Attacks against Aligned LLMs (VITC)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11753) |                              [link](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FArtPrompt)                               |\n| 2024.06 | **Bag of Tricks: Benchmarking of Jailbreak Attacks on LLMs** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09324) |                             [link](https:\u002F\u002Fgithub.com\u002Fusail-hkust\u002FBag_of_Tricks_for_LLM_Jailbreaking)                              |\n| 2024.06 | **JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large Language and Vision-Language Models (JailbreakZoo)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01599) |                             [link](https:\u002F\u002Fgithub.com\u002FAllen-piexl\u002FJailbreakZoo)                              |\n| 2024.06 | **Fundamental limitations of alignment in large language models** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.11082) |                             -                              |\n| 2024.06 | **JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models (JailbreakBench)** |   NeurIPS'24   | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01318) |                             [link](https:\u002F\u002Fgithub.com\u002FJailbreakBench\u002Fjailbreakbench)                             |\n| 2024.06 | **Towards Understanding Jailbreak Attacks in LLMs: A Representation Space Analysis** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10794) |                             [link](https:\u002F\u002Fgithub.com\u002Fyuplin2333\u002Frepresentation-space-jailbreak)                              |\n| 2024.06 | **JailbreakEval: An Integrated Toolkit for Evaluating Jailbreak Attempts Against Large Language Models 
(JailbreakEval)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09321) |                              [link](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FJailbreakEval)                               |\n| 2024.05 | **Rethinking How to Evaluate Language Model Jailbreak** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.06407) |                             [link](https:\u002F\u002Fgithub.com\u002Fcontrollability\u002Fjailbreak-evaluation)                               |\n| 2024.05 | **Enhancing Large Language Models Against Inductive Instructions with Dual-critique Prompting (INDust)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13733) |                             [link](https:\u002F\u002Fgithub.com\u002FDevoAllen\u002FINDust)                               |\n| 2024.05 | **Prompt Injection attack against LLM-integrated Applications** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.05499) |                             -                               |\n| 2024.05 | **Tricking LLMs into Disobedience: Formalizing, Analyzing, and Detecting Jailbreaks** |   LREC-COLING'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14965) |                             [link](https:\u002F\u002Fgithub.com\u002FAetherPrior\u002FTrickLLM)                               |\n| 2024.05 | **LLM Jailbreak Attack versus Defense Techniques--A Comprehensive Study** |   NDSS'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.13457) |                             -                               |\n| 2024.05 | **Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13860) |                             -                               |\n| 2024.05 | **Detoxifying Large Language Models via Knowledge Editing (SafeEdit)** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.14472) |                  
           [link](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FEasyEdit\u002Fblob\u002Fmain\u002Fexamples\u002FSafeEdit.md)                               |\n| 2024.04 | **JailbreakLens: Visual Analysis of Jailbreak Attacks Against Large Language Models (JailbreakLens)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.08793) |                             -                               |\n| 2024.03 | **How (un) ethical are instruction-centric responses of LLMs? Unveiling the vulnerabilities of safety guardrails to harmful queries (TECHHAZARDQA)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15302) |                              [link](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSoftMINER-Group\u002FTechHazardQA)                               |\n| 2024.03 | **Don’t Listen To Me: Understanding and Exploring Jailbreak Prompts of Large Language Models** |   USENIX Security    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.17336) |                             -                               |\n| 2024.03 | **EasyJailbreak: A Unified Framework for Jailbreaking Large Language Models (EasyJailbreak)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.12171) |                              [link](https:\u002F\u002Fgithub.com\u002FEasyJailbreak\u002FEasyJailbreak)                               |\n| 2024.02 | **Comprehensive Assessment of Jailbreak Attacks Against LLMs** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.05668) |                             -                     |\n| 2024.02 | **SPML: A DSL for Defending Language Models Against Prompt Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11755) |                             -                     |\n| 2024.02 | **Coercing LLMs to do and reveal (almost) anything** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.14020) |                             -                     
|\n| 2024.02 | **A STRONGREJECT for Empty Jailbreaks (StrongREJECT)** |   NeurIPS'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10260) |                             [link](https:\u002F\u002Fgithub.com\u002Falexandrasouly\u002Fstrongreject)                      |\n| 2024.02 | **ToolSword: Unveiling Safety Issues of Large Language Models in Tool Learning Across Three Stages** |   ACL'24    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10753) |                             [link](https:\u002F\u002Fgithub.com\u002FJunjie-Ye\u002FToolSword)                      |\n| 2024.02 | **HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal (HarmBench)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.04249) |                              [link](https:\u002F\u002Fgithub.com\u002Fcenterforaisafety\u002FHarmBench)                               |\n| 2023.12 | **Goal-Oriented Prompt Attack and Safety Evaluation for LLMs** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.11830)  |                             [link](https:\u002F\u002Fgithub.com\u002Fliuchengyuan123\u002FCPAD)                              |\n| 2023.12 | **The Art of Defending: A Systematic Evaluation and Analysis of LLM Defense Strategies on Safety and Over-Defensiveness** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.00287)  |                            -                              |\n| 2023.12 | **A Comprehensive Survey of Attack Techniques, Implementation, and Mitigation Strategies in Large Language Models** |   UbiSec'23    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.10982)  |                            -                              |\n| 2023.11 | **Summon a Demon and Bind it: A Grounded Theory of LLM Red Teaming in the Wild** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.06237)  |                             -                               |\n| 2023.11 
| **How many unicorns are in this image? a safety evaluation benchmark for vision llms** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.16101)  |                             [link](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002Fvllm-safety-benchmark)                               |\n| 2023.11 | **Exploiting Large Language Models (LLMs) through Deception Techniques and Persuasion Principles** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.14876)  |                             -                               |\n| 2023.10 | **Explore, establish, exploit: Red teaming language models from scratch** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.09442)  |                             -                               |\n| 2023.10 | **Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.10844)  |                             -                               |\n| 2023.10 | **Fine-tuning aligned language models compromises safety, even when users do not intend to! 
(HEx-PHI)** |   ICLR'24 (oral)    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03693)  |                             [link](https:\u002F\u002Fgithub.com\u002FLLM-Tuning-Safety\u002FLLMs-Finetuning-Safety)                               |\n| 2023.08 | **Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment (RED-EVAL)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.09662) |                              [link](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fred-instruct)                                |\n| 2023.08 | **Use of LLMs for Illicit Purposes: Threats, Prevention Measures, and Vulnerabilities** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12833) |                              -                               |\n| 2023.07 | **Jailbroken: How Does LLM Safety Training Fail? (Jailbroken)** |   NeurIPS'23    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.02483#page=1.01)  |                             -                               |\n| 2023.08 | **Use of LLMs for Illicit Purposes: Threats, Prevention Measures, and Vulnerabilities** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12833)  |                              -                               |\n| 2023.08 | **From chatgpt to threatgpt: Impact of generative ai in cybersecurity and privacy** |   IEEE Access    | [link](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10198233?denied=)  |                              -                               |\n| 2023.07 | **Llm censorship: A machine learning challenge or a computer security problem?** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.10719)  |                              -                               |\n| 2023.07 | **Universal and Transferable Adversarial Attacks on Aligned Language Models (AdvBench)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15043)  |                              
[link](https:\u002F\u002Fgithub.com\u002Fllm-attacks\u002Fllm-attacks)                               |\n| 2023.06 | **DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models** |   NeurIPS'23    | [link](https:\u002F\u002Fblogs.qub.ac.uk\u002Fdigitallearning\u002Fwp-content\u002Fuploads\u002Fsites\u002F332\u002F2024\u002F01\u002FA-comprehensive-Assessment-of-Trustworthiness-in-GPT-Models.pdf)  |                              [link](https:\u002F\u002Fdecodingtrust.github.io\u002F)                               |\n| 2023.04 | **Safety Assessment of Chinese Large Language Models** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10436)  |                              [link](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)                              |\n| 2023.02 | **Exploiting Programmatic Behavior of LLMs: Dual-Use Through Standard Security Attacks** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.05733)  |                              -                               |\n| 2022.11 | **Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.07858)  |                              -                               |\n| 2022.02 | **Red Teaming Language Models with Language Models** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.03286)  |                              -                               |\n| 2026.03 | **Evaluation and Alignment, The Seminal Papers** | Manning | [link](https:\u002F\u002Fwww.manning.com\u002Fbooks\u002Fevaluation-and-alignment-the-seminal-papers) | -  |\n\n### Application\n\n\n| Time | Title                                                        |  Venue  |                            Paper                             |                             Code                             |\n| ---- | ------------------------------------------------------------ | :-----: | 
:----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **Compressed but Compromised? A Study of Jailbreaking in Compressed LLMs** | NeurIPS-W | [link](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [BlogPost Link](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.08 | **Beyond Jailbreaks: Revealing Stealthier and Broader LLM Security Risks Stemming from Alignment Failures** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07402) | [link](https:\u002F\u002Fjailflip.github.io\u002F) |\n| 2024.11 | **Attacking Vision-Language Computer Agents via Pop-ups** | arXiv | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.02391) | [link](https:\u002F\u002Fgithub.com\u002FSALT-NLP\u002FPopupAttack) |\n| 2024.10 | **Jailbreaking LLM-Controlled Robots (ROBOPAIR)** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13691v1) |        [link](https:\u002F\u002Frobopair.org\u002F)      |\n| 2024.10 | **SMILES-Prompting: A Novel Approach to LLM Jailbreak Attacks in Chemical Synthesis** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.15641v1) |        [link](https:\u002F\u002Fgithub.com\u002FIDEA-XL\u002FChemSafety)      |\n| 2024.10 | **Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.07137) |        [link](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FCheating-LLM-Benchmarks)      |\n| 2024.09 | **RoleBreak: Character Hallucination as a Jailbreak Attack in Role-Playing Systems** |   arXiv    | [link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.16727) |                              -  |\n| 2024.08 | **A Jailbroken GenAI Model Can Cause Substantial Harm: GenAI-powered Applications are Vulnerable to PromptWares (APwT)** |   arXiv    | 
[link](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.05061) |                              -  |\n\n\n\n## Other Related Awesome Repository\n\n- [Awesome-LM-SSP](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FAwesome-LM-SSP)\n- [llm-sp](https:\u002F\u002Fgithub.com\u002Fchawins\u002Fllm-sp)\n- [awesome-llm-security](https:\u002F\u002Fgithub.com\u002Fcorca-ai\u002Fawesome-llm-security)\n- [Awesome-LLM-Safety](https:\u002F\u002Fgithub.com\u002Fydyjya\u002FAwesome-LLM-Safety)\n- [Awesome-LRMs-Safety](https:\u002F\u002Fgithub.com\u002FWangCheng0116\u002FAwesome-LRMs-Safety)\n- [Awesome-LALMs-Jailbreak](https:\u002F\u002Fgithub.com\u002FWangCheng0116\u002FAwesome_LALMs_Jailbreak)\n- [Awesome-Embodied-AI-Safety](https:\u002F\u002Fgithub.com\u002Fx-zheng16\u002FAwesome-Embodied-AI-Safety)\n\n\n\n\n\n\n## Contributors\n\n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fyueliu1999\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_046e0fc42617.png\" alt=\"yueliu1999\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fbhooi\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_abf8b40670ad.png\" alt=\"bhooi\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fzqypku\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_1abc937ffe3e.png\" alt=\"zqypku\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FjiaxiaojunQAQ\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_82c6f307c87d.png\" alt=\"jiaxiaojunQAQ\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca 
href=\"https:\u002F\u002Fgithub.com\u002FHuang-yihao\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_7e269c23780e.png\" alt=\"Huang-yihao\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcsyuhao\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_28f34f9eb088.png\" alt=\"csyuhao\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fxszheng2020\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_65efddbbcb5b.png\" alt=\"xszheng2020\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fdapurv5\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_f5d91bf10a84.png\" alt=\"dapurv5\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FZYQ-Zoey77\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_14970bad82e5.png\" alt=\"ZYQ-Zoey77\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fmdoumbouya\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_e7b63d401e4b.png\" alt=\"mdoumbouya\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fxyliugo\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_ebffef2bc6f9.png\" alt=\"xyliugo\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca 
href=\"https:\u002F\u002Fgithub.com\u002Fzky001\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_a3c8c413336f.png\" alt=\"zky001\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\n\n\n\n\n\n\u003Cp align=\"right\">(\u003Ca href=\"#top\">back to top\u003C\u002Fa>)\u003C\u002Fp>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n","# 令人惊叹的LLM jailbreak方法集\n\nAwesome-Jailbreak-on-LLMs 是一个收集了当前最先进、新颖且激动人心的LLM jailbreak方法的集合。它包含论文、代码、数据集、评估和分析等内容。任何有关jailbreak的补充内容、PR或问题都欢迎提出，我们很乐意将您加入贡献者名单[这里](#contributors)。如有任何问题，请联系 yliu@u.nus.edu。如果您觉得这个仓库对您的研究或工作有所帮助，非常感谢您给本仓库点赞并引用我们的论文[这里](#Reference)。:sparkles:\n\n\n## 参考文献\n\n如果您觉得这个仓库对您的研究有帮助，我们非常感谢您能引用我们的论文。:sparkles:\n\n```\n@article{zhuzhenhao_GuardReasoner_Omni,\n  title={GuardReasoner-Omni: 一种基于推理的文本、图像和视频多模态安全防护},\n  author={朱振浩、刘悦、郭延培、曲文杰、陈灿灿、何宇飞、李一博、陈玉林、吴天一、徐慧颖等},\n  journal={arXiv预印本 arXiv:2602.03328},\n  year={2026}\n}\n\n@article{liuyue_GuardReasoner_VL,\n  title={GuardReasoner-VL: 通过强化推理保障VLMs的安全},\n  author={刘悦、翟圣芳、杜明哲、陈玉林、曹三、高洪成、王程、李新峰、王坤、方俊峰、张嘉恒、胡伊·布莱恩},\n  journal={arXiv预印本 arXiv:2505.11049},\n  year={2025}\n}\n\n@article{liuyue_GuardReasoner,\n  title={GuardReasoner: 向基于推理的LLM安全防护迈进},\n  author={刘悦、高洪成、翟圣芳、Jun夏、吴天一、薛志伟、陈玉林、川口健二、张嘉恒、胡伊·布莱恩},\n  journal={arXiv预印本 arXiv:2501.18492},\n  year={2025}\n}\n\n@article{liuyue_FlipAttack,\n  title={FlipAttack: 通过翻转实现LLM jailbreak},\n  author={刘悦、何晓欣、熊淼、傅金兰、邓淑敏、胡伊·布莱恩},\n  journal={arXiv预印本 arXiv:2410.02832},\n  year={2024}\n}\n\n@article{wang2025safety,\n  title={大型推理模型中的安全性：综述},\n  author={王程、刘悦、李宝龙、张杜真、李忠志、方俊峰},\n  journal={arXiv预印本 arXiv:2504.17704},\n  year={2025}\n}\n```\n\n\n## 书签\n\n- [Jailbreak攻击](#jailbreak-attack)\n  - [针对LRM的攻击](#attack-on-lrms)\n  - [黑盒攻击](#black-box-attack)\n  - [白盒攻击](#white-box-attack)\n  - [多轮攻击](#multi-turn-attack)\n  - [针对RAG-based LLM的攻击](#attack-on-rag-based-llm)\n  - [多模态攻击](#multi-modal-attack)\n- [Jailbreak防御](#jailbreak-defense)\n  - 
[基于学习的防御](#learning-based-defense)\n  - [基于策略的防御](#strategy-based-defense)\n  - [防护模型](#Guard-model)\n  - [审核API](#Moderation-API)\n- [评估与分析](#evaluation--analysis)\n- [应用](#application)\n\n\n\n## 论文\n\n\n\n\n### Jailbreak攻击\n\n\n\n#### 针对LRM的攻击\n| 时间    | 标题                                                        | 会议 |                  论文                   |                             代码                             |\n| ------- | ------------------------------------------------------------ | :---: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2025.11 | **BadThink: 触发过度思考攻击，针对大型语言模型中的链式思维推理** | AAAI'26  | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.10714) | - |\n| 2025.08 | **Jinx: 用于探测对齐失败的无限LLM** | arXiv  | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.08243) | [模型](https:\u002F\u002Fhuggingface.co\u002FJinx-org) |\n| 2025.07 | **BadReasoner: 在大型推理模型中植入可调过度思考后门，用于娱乐或牟利** | arXiv  | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.18305) | [链接](https:\u002F\u002Fgithub.com\u002FFZaKK\u002FBadReasoner) |\n| 2025.06 | **ExtendAttack: 通过扩展推理攻击LRM服务器** | AAAI'26  | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.13737) | [链接](https:\u002F\u002Fgithub.com\u002Fzzh-thu-22\u002FExtendAttack) |\n| 2025.06 | **过度推理攻击推理LLM** | arXiv  | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.14374) | - |\n| 2025.03 | **猫迷惑推理LLM：针对推理模型的查询无关对抗性触发器** | arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01781) |- |\n| 2025.02 | **OverThink: 针对推理LLM的减速攻击** | arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.02542#:~:text=We%20increase%20overhead%20for%20applications%20that%20rely%20on,the%20user%20query%20while%20providing%20contextually%20correct%20answers.) 
| [链接](https:\u002F\u002Fgithub.com\u002Fakumar2709\u002FOVERTHINK_public) |\n| 2025.02 | **BoT: 通过后门攻击破解o1类大型语言模型的长思维过程** | arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12202) | [链接](https:\u002F\u002Fgithub.com\u002Fzihao-ai\u002FBoT) |\n| 2025.02 | **H-CoT: 劫持链式思维安全推理机制，实现大型推理模型的jailbreak，包括OpenAI o1\u002Fo3、DeepSeek-R1和Gemini 2.0 Flash Thinking** | arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.12893) |[链接](https:\u002F\u002Fgithub.com\u002Fdukeceicenter\u002Fjailbreak-reasoning-openai-o1o3-deepseek-r1) |\n| 2025.02 | **捕鼠器：用迭代混沌链愚弄大型推理模型实现jailbreak** | arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.15806) |- |\n\n\n\n\n#### 黑盒攻击\n\n| 时间    | 标题                                                       |  venue  |                                                           论文                                                            |                                          代码                                          |\n|---------| ----------------------------------------------------------- | :-----: |:--------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------:|\n| 2026.03 | **前沿大型语言模型中的内部安全崩溃（ISC-Bench）** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.23509) | [链接](https:\u002F\u002Fgithub.com\u002Fwuyoscar\u002FISC-Bench) |\n| 2025.10 | **BreakFun：通过模式利用实现大型语言模型的越狱** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.17904) | - |\n| 2025.07 | **响应攻击：利用上下文提示增强大型语言模型的越狱能力** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.05248) | [链接](https:\u002F\u002Fgithub.com\u002FDtc7w3PQ\u002FResponse-Attack) |\n| 2025.05 | **表情符号攻击：增强针对法官大型语言模型检测的越狱攻击** |   ICML'25     |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01077)                                          |                   
[链接](https:\u002F\u002Fgithub.com\u002Fzhipeng-wei\u002FEmojiAttack)                   |\n| 2025.05 | **FlipAttack：通过翻转实现大型语言模型的越狱（FlipAttack）** |   ICML'25     |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.02832)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FFlipAttack)                    |\n| 2025.03 | **扮演傻瓜：使用分布外策略实现大型语言模型和多模态语言模型的越狱（JOOD）** |   CVPR'25     |                                          [链接](http:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.20823)                                           |                        [链接](https:\u002F\u002Fgithub.com\u002Fnaver-ai\u002FJOOD)                        |\n| 2025.02 | **StructTransform：面向安全对齐大型语言模型的可扩展攻击面** |   arXiv     |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11853)                                          |                  [链接](https:\u002F\u002Fgithub.com\u002FStructTransform\u002FBenchmark)                  |\n| 2025.01 | **用通用魔法词保护大型语言模型的越狱，适用于文本嵌入模型** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18280) | - |\n| 2025.01 | **理解和增强越狱攻击的可迁移性** |   ICLR'25     |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.03052)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002Ftmllab\u002F2025_ICLR_PiF)                     |\n| 2024.11 | **信任的阴暗面：基于权威引用的大型语言模型越狱攻击** | arXiv |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.11407)                                          |                     [链接](https:\u002F\u002Fgithub.com\u002FYancyKahn\u002FDarkCite)                      |\n| 2024.11 | **与大型语言模型玩语言游戏导致越狱** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.12762v1)                                         | 
[链接](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002Fencode_jailbreaking_anonymous-B4C4\u002FREADME.md) |\n| 2024.11 | **GASP：高效黑盒生成对抗后缀以实现大型语言模型的越狱（GASP）** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14133v1)                                         |                        [链接](https:\u002F\u002Fgithub.com\u002Fllm-gasp\u002Fgasp)                        |\n| 2024.11 | **LLM STINGER：利用强化学习微调的大型语言模型实现越狱** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.08862v1)                                         |                                           -                                            |\n| 2024.11 | **SequentialBreak：通过将越狱提示嵌入序列化提示中欺骗大型语言模型** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.06426v1)                                         |            [链接](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FJailBreakAttack-4F3B\u002F)             |\n| 2024.11 | **多样性有助于越狱大型语言模型** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.04223v1)                                         |                                           -                                            |\n| 2024.11 | **用字符串组合实现丰富的越狱** | arXiv |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01084v1)                                         |                                           -                                            |\n| 2024.11 | **可迁移的集成黑盒越狱大型语言模型攻击** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.23558v1)                                         |          [链接](https:\u002F\u002Fgithub.com\u002FYQYANG2233\u002FLarge-Language-Model-Break-AI)           |\n| 2024.11 | **通过良性数据镜像实现隐蔽的大型语言模型越狱攻击** |   arXiv    |                                          
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.21083)                                          |                                           -                                            |\n| 2024.10 | **通过双射实现无尽越狱** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.01294v1)                                         |                                           -                                            |\n| 2024.10 | **利用任务过载实现可扩展的大型语言模型越狱攻击** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.04190v1)                                         |                                           -                                            |\n| 2024.10 | **你知道我在说什么：通过隐式引用实现越狱攻击** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.03857v2)                                         |               [链接](https:\u002F\u002Fgithub.com\u002FLucas-TY\u002Fllm_Implicit_reference)               |\n| 2024.10 | **破解混沌：通过对抗性提示翻译增强越狱攻击** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11317v1)                                         |           [链接](https:\u002F\u002Fgithub.com\u002Fqizhangli\u002FAdversarial-Prompt-Translator)           |\n| 2024.10 | **AutoDAN-Turbo：一种终身代理，用于策略自我探索以实现大型语言模型的越狱（AutoDAN-Turbo）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.05295)                                          |                 [链接](https:\u002F\u002Fgithub.com\u002FSaFoLab-WISC\u002FAutoDAN-Turbo)                  |\n| 2024.10 | **PathSeeker：基于强化学习的越狱方法探索大型语言模型的安全漏洞（PathSeeker）** | arXiv |                                        [链接](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2409.14177)                                        |                                           -                                        
    |\n| 2024.10 | **读取行间内容：利用ASCII艺术掩盖脏话攻击大型语言模型和毒性检测系统** | arXiv |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.18708)                                          |                     [链接](https:\u002F\u002Fgithub.com\u002FSerbernari\u002FToxASCII)                     |\n| 2024.09 | **AdaPPA：针对大型语言模型的自适应位置预填充越狱攻击方法** | arXiv |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.07503)                                          |                       [链接](https:\u002F\u002Fgithub.com\u002FYummy416\u002FAdaPPA)                       |\n| 2024.09 | **有效且规避的模糊测试驱动的大型语言模型越狱攻击** | arXiv |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.14866)                                          |                                           -                                            |\n| 2024.09 | **用符号数学实现大型语言模型的越狱** |   arXiv    |                                         [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.11445v1)                                         |                                           -                                            |\n| 2024.08 | **与大型语言模型玩猜谜游戏：间接越狱攻击与隐含线索** |   ACL Findings'24    |                                   [链接](https:\u002F\u002Faclanthology.org\u002F2024.findings-acl.304)                                   |                       [链接](https:\u002F\u002Fgithub.com\u002Fczycurefun\u002FIJBR)                       |\n| 2024.08 | **推进对齐大型语言模型的对抗后缀迁移学习** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.14866)                                          |                                           -                                            |\n| 2024.08 | **将恶意目标隐藏在良性叙事中：通过神经载体文章实现大型语言模型的越狱** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.11182)                                          |     
                                      -                                            |\n| 2024.08 | **h4rm3l：可组合越狱攻击的动态基准，用于大型语言模型安全评估（h4rm3l）** |    arXiv   |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.04811)                                          |                      [链接](https:\u002F\u002Fmdoumbouya.github.io\u002Fh4rm3l\u002F)                      |\n| 2024.08 | **EnJa：大型语言模型的集成越狱（EnJa）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.03603)                                          |                                           -                                            |\n| 2024.07 | **知识到越狱：一点知识抵得上一次攻击** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11682)                                          |               [链接](https:\u002F\u002Fgithub.com\u002FTHU-KEG\u002FKnowledge-to-Jailbreak\u002F)               |\n| 2024.07 | **大型语言模型可能是危险的推理者：基于分析的大型语言模型越狱攻击** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.16205)                                          |                                                                                        |\n| 2024.07 | **单字符扰动破坏大型语言模型对齐** |   arXiv    |                                     [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.03232#page=3.00)                                     |                  [链接](https:\u002F\u002Fgithub.com\u002Fhannah-aught\u002Fspace_attack)                  |\n| 2024.07 | **虚假的安全感：‘安全’AI回复中的不安全信息泄露** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.02551)                                          |                                           -                                            |\n| 2024.07 | **虚拟上下文：通过特殊标记注入增强越狱攻击（虚拟上下文）** |   arXiv    |                                          
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.19845)                                          |                                           -                                            |\n| 2024.07 | **SoP：解锁社交促进的力量，实现自动越狱攻击（SoP）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01902)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002FYang-Yan-Yang-Yan\u002FSoP)                    |\n| 2024.06 | **越狱作为奖励错配问题**| ICLR'25|                                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.14393)                                                          |                  [链接](https:\u002F\u002Fgithub.com\u002Fzhxieml\u002Fremiss-jailbreak)                   |\n| 2024.06 | **改进的小样本越狱可绕过对齐语言模型及其防御措施（I-FSJ）** |    NeurIPS'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01288)                                          |                        [链接](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FI-FSJ)                        |\n| 2024.06 | **当大型语言模型遇上深度强化学习：通过深度强化学习引导的搜索提升越狱效率（RLbreaker）** |   NeurIPS'24   |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08705)                                          |                                           -                                            |\n| 2024.06 | **史密斯特工：一张图片可指数级快速越狱百万个多模态语言模型代理（史密斯特工）** |   ICML'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08567)                                          |                     [链接](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FAgent-Smith)                     |\n| 2024.06 | **隐蔽恶意微调：保障大型语言模型适配的挑战** |   ICML'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.20053)                                          |                          
                 -                                            |\n| 2024.06 | **ArtPrompt：基于ASCII艺术的对齐大型语言模型越狱攻击（ArtPrompt）** |   ACL'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11753)                                          |                      [链接](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FArtPrompt)                       |\n| 2024.06 | **从噪声到清晰：通过文本嵌入翻译揭示大型语言模型攻击的对抗后缀（ASETF）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16006)                                          |                                           -                                            |\n| 2024.06 | **CodeAttack：通过代码补全揭示大型语言模型的安全泛化挑战（CodeAttack）** |   ACL'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.07865)                                          |                                           -                                            |\n| 2024.06 | **让他们问并回答：通过伪装和重构实现少量查询中的大型语言模型越狱（DRA）** |   USENIX Security'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.18104)                                          |                        [链接](https:\u002F\u002Fgithub.com\u002FLLM-DRA\u002FDRA\u002F)                         |\n| 2024.06 | **AutoJailbreak：通过依赖性视角探索越狱攻击与防御措施（AutoJailbreak）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.08705)                                          |                                           -                                            |\n| 2024.06 | **用简单自适应攻击实现领先安全对齐大型语言模型的越狱** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.02151)                                          |                [链接](https:\u002F\u002Fgithub.com\u002Ftml-epfl\u002Fllm-adaptive-attacks)                |\n| 2024.06 | 
**GPTFUZZER：用自动生成的越狱提示对大型语言模型进行红队攻击（GPTFUZZER）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.10253)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002Fsherdencooper\u002FGPTFuzz)                    |\n| 2024.06 | **披着羊皮的狼：通用嵌套越狱提示可轻松欺骗大型语言模型（ReNeLLM）** |   NAACL'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.08268)                                          |                       [链接](https:\u002F\u002Fgithub.com\u002FNJUNLP\u002FReNeLLM)                        |\n| 2024.06 | **QROA：针对大型语言模型的黑盒查询-响应优化攻击（QROA）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.02044)                                          |                          [链接](https:\u002F\u002Fgithub.com\u002Fqroa\u002Fqroa)                          |\n| 2024.06 | **LangChain中毒：通过LangChain实现大型语言模型的越狱（PLC）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18122)                                          |                 [链接](https:\u002F\u002Fgithub.com\u002FCAM-FSS\u002Fjailbreak-langchain)                 |\n| 2024.05 | **大型语言模型中的多语言越狱挑战** |   ICLR'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06474)                                          |          [链接](https:\u002F\u002Fgithub.com\u002FDAMO-NLP-SG\u002Fmultilingual-safety-for-LLMs)           |\n| 2024.05 | **DeepInception：催眠大型语言模型成为越狱者（DeepInception）** |   EMNLP'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03191)                                          |                  [链接](https:\u002F\u002Fgithub.com\u002Ftmlr-group\u002FDeepInception)                   |\n| 2024.05 | **GPT-4通过自我解释几乎完美地实现自我越狱（IRIS）** |   ACL'24    |                                        
  [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13077)                                          |                                           -                                            |\n| 2024.05 | **GUARD：角色扮演生成自然语言越狱以测试大型语言模型的准则遵守情况（GUARD）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.03299)                                          |                                           -                                            |\n| 2024.05 | **“现在就做任何事”：刻画和评估大型语言模型上的野外越狱提示（DAN）** |   CCS'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.03825)                                          |                   [链接](https:\u002F\u002Fgithub.com\u002Fverazuo\u002Fjailbreak_llms)                    |\n| 2024.05 | **Gpt-4太聪明了，无法保证安全：通过密码与大型语言模型进行隐蔽聊天（SelfCipher）** |   ICLR'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.06463)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002FRobustNLP\u002FCipherChat)                     |\n| 2024.05 | **通过密码字符实现大型语言模型的越狱攻击（JAM）** | NeurIPS'24 |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.20413)                                          |                                           -                                            |\n| 2024.05 | **仅用少量上下文示范实现对齐语言模型的越狱攻击（ICA）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06387)                                          |                                           -                                            |\n| 2024.04 | **多轮越狱（MSJ）** |   NeurIPS'24 Anthropic   | [链接](https:\u002F\u002Fwww-cdn.anthropic.com\u002Faf5633c94ed2beb282f6a53c595eb437e8e7b630\u002FMany_Shot_Jailbreaking__2024_04_02_0936.pdf) |                                           -                             
               |\n| 2024.04 | **PANDORA：通过协作钓鱼代理与分解推理实现详细的大型语言模型越狱（PANDORA）** |   ICLR Workshop'24    |                                      [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=9o06ugFxIj)                                      |                                           -                                            |\n| 2024.04 | **Fuzzllm：一种新颖且通用的模糊测试框架，主动发现大型语言模型中的越狱漏洞（FuzzLLM）** |   ICASSP'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.05274)                                          |                     [链接](https:\u002F\u002Fgithub.com\u002FRainJamesY\u002FFuzzLLM)                      |\n| 2024.04 | **三明治攻击：多语言混合自适应攻击大型语言模型（三明治攻击）** |   TrustNLP'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.07242)                                          |                                           -                                            |\n| 2024.03 | **Tastle：为自动越狱攻击分散大型语言模型注意力（TASTLE）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.08424)                                          |                                           -                                            |\n| 2024.03 | **DrAttack：提示分解与重构打造强大的大型语言模型越狱工具（DrAttack）** |   EMNLP'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16914)                                          |                      [链接](https:\u002F\u002Fgithub.com\u002Fxirui-li\u002FDrAttack)                      |\n| 2024.02 | **PRP：传播通用扰动以攻击大型语言模型防护墙（PRP）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15911)                                          |                                           -                                            |\n| 2024.02 | **CodeChameleon：个性化加密框架用于大型语言模型的越狱（CodeChameleon）** |   arXiv    |                                      
    [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16717)                                          |                  [链接](https:\u002F\u002Fgithub.com\u002Fhuizhang-L\u002FCodeChameleon)                   |\n| 2024.02 | **PAL：代理引导的大型语言模型黑盒攻击（PAL）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.09674)                                          |                         [链接](https:\u002F\u002Fgithub.com\u002Fchawins\u002Fpal)                         |\n| 2024.02 | **利用单词替换密码实现专有大型语言模型的越狱** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10601)                                          |                                           -                                            |\n| 2024.02 | **基于查询的对抗性提示生成** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.12329)                                          |                                           -                                            |\n| 2024.02 | **通过多轮交互利用上下文实现越狱攻击（上下文交互攻击）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.09177)                                          |                                           -                                            |\n| 2024.02 | **语义镜像越狱：基于遗传算法的越狱提示对抗开源大型语言模型（SMJ）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.14872)                                          |                                           -                                            |\n| 2024.02 | **认知过载：通过逻辑思维过载实现大型语言模型的越狱** |   NAACL'24    |                                    [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09827#page=10.84)                                     |                [链接](https:\u002F\u002Fgithub.com\u002Fluka-group\u002FCognitiveOverload)                 |\n| 2024.01 | 
**低资源语言越狱GPT-4** |   NeurIPS Workshop'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.02446)                                          |                                           -                                            |\n| 2024.01 | **约翰尼如何说服大型语言模型越狱：重新思考说服力，通过人性化大型语言模型挑战AI安全（PAP）** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06373)                                          |              [链接](https:\u002F\u002Fgithub.com\u002FCHATS-lab\u002Fpersuasive_jailbreaker)               |\n| 2023.12 | **攻击之树：自动越狱黑盒大型语言模型（TAP）** |   NeurIPS'24   |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.02119)                                          |                       [链接](https:\u002F\u002Fgithub.com\u002FRICommunity\u002FTAP)                       |\n| 2023.12 | **让他们吐露真相！从（生产）大型语言模型中强制获取知识** |   arXiv    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04782)                                          |                                           -                                            |\n| 2023.12 | **忽略这个标题，HackAPrompt：通过全球规模的提示黑客竞赛揭露大型语言模型的系统性漏洞** |   ACL'24    |                                   [链接](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.302\u002F)                                    |                                           -                                            |\n| 2023.11 | **面向语言模型的可扩展且可迁移黑盒越狱，通过角色调节（Persona）** |   NeurIPS Workshop'23    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03348)                                          |                                           -                                            |\n| 2023.10 | **二十次查询内实现黑盒大型语言模型的越狱（PAIR）** |   NeurIPS'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.08419)      
                                    |                [链接](https:\u002F\u002Fgithub.com\u002Fpatrickrchao\u002FJailbreakingLLMs)                |\n| 2023.10 | **针对大型语言模型的对抗性演示攻击（advICL）** |   EMNLP'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14950)                                          |                                           -                                            |\n| 2023.10 | **MASTERKEY：大型语言模型聊天机器人自动越狱（MASTERKEY）** |   NDSS'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.08715)                                          |                    [链接](https:\u002F\u002Fgithub.com\u002FLLMSecurity\u002FMasterKey)                    |              -                               |\n| 2023.10 | **攻击提示生成用于红队与大型语言模型防御（SAP）** |   EMNLP'23    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12505)                                          |                        [链接](https:\u002F\u002Fgithub.com\u002FAatrox103\u002FSAP)                        |\n| 2023.10 | **一个大型语言模型可以骗自己：基于提示的对抗性攻击（PromptAttack）** |   ICLR'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.13345)                                          |                   [链接](https:\u002F\u002Fgithub.com\u002FGodXuxilie\u002FPromptAttack)                   |\n| 2023.09 | **针对ChatGPT的多步隐私越狱攻击（MJP）** |   EMNLP Findings'23    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.05197)                                          |           [链接](https:\u002F\u002Fgithub.com\u002FHKUST-KnowComp\u002FLLM-Multistep-Jailbreak)            |\n| 2023.09 | **芝麻开门！大型语言模型的通用黑盒越狱（GA）** |   Applied Sciences'24    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.01446)                                          |                                 
          -                                            |\n| 2023.05 | **并非你所注册的：通过间接提示注入破坏现实世界中集成大型语言模型的应用程序** |   CCS'23    |                           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.12173?trk=public_post_comment-text)                            |                    [链接](https:\u002F\u002Fgithub.com\u002Fgreshake\u002Fllm-security)                    |\n| 2022.11 | **忽略先前提示：针对语言模型的攻击技术（PromptInject）** |   NeurIPS WorkShop'22    |                                          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09527)                                          |                [链接](https:\u002F\u002Fgithub.com\u002Fagencyenterprise\u002FPromptInject)                |\n\n#### 白盒攻击\n\n| 年份    | 标题                                                        |      地点       |                            论文                             |                            代码                            |\n| ------- | ------------------------------------------------------------ | :--------------: | :----------------------------------------------------------: | :--------------------------------------------------------: |\n| 2025.08 | **别说不：通过抑制拒绝实现大语言模型越狱（DSN）** | ACL'25 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.16369) |    [链接](https:\u002F\u002Fgithub.com\u002FDSN-2024\u002FDSN) |\n| 2025.03 | **引导而非强迫：通过移除多余约束提升大语言模型越狱攻击的可迁移性** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01865) |    [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FTransferAttack) |\n| 2025.02 | **基于优化的大语言模型越狱改进技术（I-GCG）** |   ICLR'25     | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.21018) |        [链接](https:\u002F\u002Fgithub.com\u002FjiaxiaojunQAQ\u002FI-GCG)      |\n| 2024.12 | **利用连续攻击实现大语言模型高效对抗训练** | NeurIPS'24 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15589) | [链接](https:\u002F\u002Fgithub.com\u002Fsophie-xhonneux\u002FContinuous-AdvTrain) |\n| 2024.11 | **AmpleGCG-Plus：一种强大的对抗后缀生成模型，以更少尝试次数实现更高成功率的大语言模型越狱** |      arXiv       
|           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.22143v1)           |                          -                             |\n| 2024.11 | **DROJ：一种针对大语言模型的提示驱动攻击** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.09125)           |                           [链接](https:\u002F\u002Fgithub.com\u002FLeon-Leyang\u002FLLM-Safeguard)                              |\n| 2024.11 | **SQL注入越狱：大语言模型的结构性灾难** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.01565)           |                           [链接](https:\u002F\u002Fgithub.com\u002Fweiyezhimeng\u002FSQL-Injection-Jailbreak)                              |\n| 2024.10 | **函数同伦：通过连续参数平滑离散优化实现大语言模型越狱攻击** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.04234)           |                           -                              |\n| 2024.10 | **AttnGCG：通过注意力操纵增强大语言模型越狱攻击** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.09040v1)           |                           [链接](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002FAttnGCG-attack)                               |\n| 2024.10 | **通过句子末尾MLP重新加权实现指令微调大语言模型越狱** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.10150v1)           |                            -                              |\n| 2024.10 | **提升大语言模型越狱可迁移性的方法（SI-GCG）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.15645v1)           |                            -                              |\n| 2024.10 | **迭代自适应大语言模型以增强越狱能力（ADV-LLM）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.18469v1)           |                             [链接](https:\u002F\u002Fgithub.com\u002FSunChungEn\u002FADV-LLM)                               |\n| 2024.08 | **通过不安全解码路径生成探测大语言模型的安全响应边界（JVD）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.10668)     
      |                             -                              |\n| 2024.08 | **通过强制解码实现开源大语言模型越狱（EnDec）** |      ACL'24      | [链接](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.299.pdf#page=4.96) |                             -                              |\n| 2024.07 | **Best-of-Venom：通过注入中毒偏好数据攻击RLHF** |      COLM'24       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05530)           |    -    |\n| 2024.07 | **语言模型中的拒绝行为由单一方向介导** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11717)           |    [链接](https:\u002F\u002Fgithub.com\u002Fandyrdt\u002Frefusal_direction)    |\n| 2024.07 | **重新审视针对语言模型的字符级对抗攻击** |     ICML'24      |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04346)           |       [链接](https:\u002F\u002Fgithub.com\u002FLIONS-EPFL\u002FCharmer)        |\n| 2024.07 | **Badllama 3：在几分钟内从Llama 3中移除安全微调（Badllama 3）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01376)           |                             -                              |\n| 2024.07 | **SOS！针对开源大语言模型的软提示攻击** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.03160)           |                             -                              |\n| 2024.06 | **COLD-Attack：以隐蔽性和可控性实现大语言模型越狱（COLD-Attack）** |     ICML'24      |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08679)           |      [链接](https:\u002F\u002Fgithub.com\u002FYu-Fangxu\u002FCOLD-Attack)      |\n| 2024.05 | **面向通用目标劫持的语言模型语义引导提示组织** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14189)           |                                                            |\n| 2024.05 | **通过自适应密集到稀疏约束优化实现高效大语言模型越狱** |    NeurIPS'24    |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.09113)           |                             -                              |\n| 2024.05 | **AutoDAN：在对齐的大语言模型上生成隐蔽越狱提示（AutoDAN）** |     
ICLR'24      |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.04451)           |      [链接](https:\u002F\u002Fgithub.com\u002FSheltonLiu-N\u002FAutoDAN)       |\n| 2024.05 | **AmpleGCG：学习一种通用且可迁移的对抗后缀生成模型，用于越狱开放和封闭的大语言模型（AmpleGCG）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.07921)           |     [链接](https:\u002F\u002Fgithub.com\u002FOSU-NLP-Group\u002FAmpleGCG)      |\n| 2024.05 | **借助动量提升越狱攻击（MAC）**            | ICLR Workshop'24 |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.01229)           |  [链接](https:\u002F\u002Fgithub.com\u002Fweizeming\u002Fmomentum-attack-llm)  |\n| 2024.04 | **AdvPrompter：面向大语言模型的快速自适应对抗提示生成（AdvPrompter）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.16873)           |  [链接](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fadvprompter)   |\n| 2024.03 | **来自中毒人类反馈的通用越狱后门** |     ICLR'24      |       [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=GxCGsxiAaK)       |                             -                              |\n| 2024.02 | **用投影梯度下降攻击大语言模型（PGD）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.09154)           |                             -                              |\n| 2024.02 | **打开大语言模型的潘多拉魔盒：通过表示工程实现大语言模型越狱（JRE）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06824)           |                             -                              |\n| 2024.02 | **以好奇心驱动的红队测试大语言模型（CRT）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.19464)           | [链接](https:\u002F\u002Fgithub.com\u002FImprobable-AI\u002Fcuriosity_redteam) |\n| 2023.12 | **AutoDAN：面向大语言模型的可解释梯度对抗攻击（AutoDAN）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.15140)           |    [链接](https:\u002F\u002Fgithub.com\u002Frotaryhammer\u002Fcode-autodan)    |\n| 2023.10 | **通过利用生成实现开源大语言模型的灾难性越狱** |     
ICLR'24      |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06987)           |  [链接](https:\u002F\u002Fgithub.com\u002FPrinceton-SysML\u002FJailbreak_LLM)  |\n| 2023.06 | **通过离散优化自动审计大语言模型（ARCA）** |     ICML'23      | [链接](https:\u002F\u002Fproceedings.mlr.press\u002Fv202\u002Fjones23a\u002Fjones23a.pdf) |     [链接](https:\u002F\u002Fgithub.com\u002Fejones313\u002Fauditing-llms)     |\n| 2023.07 | **面向对齐语言模型的通用且可迁移的对抗攻击（GCG）** |      arXiv       |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15043)           |     [链接](https:\u002F\u002Fgithub.com\u002Fllm-attacks\u002Fllm-attacks)     |\n\n#### 多轮攻击\n\n\n\n| 时间    | 标题                                                        |   地点   |                  论文                   |                             代码                             |\n| ------- | ------------------------------------------------------------ | :-------: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2025.04 | **通过注意力转移实现大型语言模型的多轮破解** |   AAAI'25    | [链接](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F34553) |    -      |\n| 2025.04 | **X-Teaming：基于自适应多智能体的多轮破解与防御** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.13203) |    [链接](https:\u002F\u002Fgithub.com\u002Fsalman-lui\u002Fx-teaming)      |\n| 2025.04 | **全局策略，局部适应：一种具有双层学习的多轮红队代理** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.01278) |      -    |\n| 2025.03 | **脚踏实地：一种针对大型语言模型的多轮破解方法** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.19820) |    [链接](https:\u002F\u002Fgithub.com\u002FJinxiaolong1129\u002FFoot-in-the-door-Jailbreak)      |\n| 2025.03 | **围攻：利用树搜索实现大型语言模型的自主多轮破解** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.10619) |      -    |\n| 2024.11 | **MRJ-Agent：一种高效的多轮对话破解代理** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.03814) |      -    |\n| 2024.10 | 
**拼图游戏：将有害问题拆分以破解大型语言模型（JSP）** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11459v1) |    [链接](https:\u002F\u002Fgithub.com\u002FYangHao97\u002FJigSawPuzzles)      |\n| 2024.10 | **针对大型语言模型的多轮破解攻击** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.11533v1) |     -      |\n| 2024.10 | **自我颠覆：通过自我发现线索实现多轮大型语言模型破解攻击** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.10700) |     [链接](https:\u002F\u002Fgithub.com\u002Frenqibing\u002FActorAttack)      |\n| 2024.10 | **使用GOAT进行自动化红队测试：生成式进攻代理测试器** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.01606) |     -      |\n| 2024.09 | **LLM防御尚未能抵御多轮人类破解攻击** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.15221) |     [链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FScaleAI\u002Fmhj)      |\n| 2024.09 | **红后：防范隐蔽多轮破解攻击的大型语言模型安全机制** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.17458) |       [链接](https:\u002F\u002Fgithub.com\u002Fkriti-hippo\u002Fred_queen)       |\n| 2024.08 | **FRACTURED-SORRY-Bench：揭示对话回合中削弱拒绝效力及防御措施的攻击框架（自动多轮破解）** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.16163) |     -      |\n| 2024.08 | **前沿模型中的新兴漏洞：多轮破解攻击** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.00137) | [链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Ftom-gibbs\u002Fmulti-turn_jailbreak_attack_datasets) |\n| 2024.05 | **CoA：基于上下文感知的多轮对话大型语言模型攻击链（CoA）** |   arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.05610) |           [链接](https:\u002F\u002Fgithub.com\u002FYancyKahn\u002FCoA)           |\n| 2024.04 | **太好了，现在写篇文章吧：Crescendo多轮大型语言模型破解攻击（Crescendo）** | 微软Azure | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01833) |                              -                               |\n\n\n\n\n\n\n\n#### 针对基于RAG的大型语言模型的攻击\n\n\n\n| 时间    | 标题                                                        | 地点 |                  论文                   |                       
      代码                             |\n| ------- | ------------------------------------------------------------ | :---: | :--------------------------------------: | :----------------------------------------------------------: |\n| 2024.09 | **释放蠕虫并提取数据：利用破解手段升级基于RAG推理的攻击在规模和严重性上的效果** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.08045) | [链接](https:\u002F\u002Fgithub.com\u002FStavC\u002FUnleashingWorms-ExtractingData) |\n| 2024.02 | **潘多拉：通过检索增强生成中毒实现GPT破解（潘多拉）** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08416) |                              -                               |\n\n\n\n\n\n\n\n#### 多模态攻击\n\n| 时间 | 标题                                                        |  地点  |                            论文                             |                             代码                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2024.11 | **多模态生成模型的越狱攻击与防御：综述** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.09259) | [链接](https:\u002F\u002Fgithub.com\u002Fliuxuannan\u002FAwesome-Multimodal-Jailbreak) |\n| 2024.10 | **通过逐步编辑实现针对图像生成模型的越狱链攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.03869) | - |\n| 2024.10 | **ColJailBreak：协作生成与编辑，用于越狱文本到图像深度生成** | NeurIPS'24 | [链接](https:\u002F\u002Fnips.cc\u002Fvirtual\u002F2024\u002Fposter\u002F94287) | - |\n| 2024.08 | **基于大语言模型代理的文本到图像模型越狱（Atlas）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.00523) |                              -                              |\n| 2024.07 | **图像转文本逻辑越狱：你的想象力能助你为所欲为** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.02534) |                              -                              |\n| 2024.06 | **通过双模态对抗提示越狱视觉语言模型** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.04031) |                  
            [链接](https:\u002F\u002Fgithub.com\u002FNY1024\u002FBAP-Jailbreak-Vision-Language-Models-via-Bi-Modal-Adversarial-Prompt)                               |\n| 2024.05 | **针对GPT-4o的语音越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.19103) |                              [链接](https:\u002F\u002Fgithub.com\u002FTrustAIRLab\u002FVoiceJailbreakAttack)                               |\n| 2024.05 | **文本到图像生成AI系统的自动越狱** |     ICML'24研讨会    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.16567) | [链接](https:\u002F\u002Fgithub.com\u002FKim-Minseon\u002FAPGP) |\n| 2024.04 | **图像劫持：对抗性图像可在运行时控制生成模型** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.00236) |                              [链接](https:\u002F\u002Fgithub.com\u002Feuanong\u002Fimage-hijacks)                               |\n| 2024.03 | **一张图胜过千言万语：对抗性跨提示在视觉语言模型上的迁移能力（CroPA）** |   ICLR'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09766) |                              [链接](https:\u002F\u002Fgithub.com\u002FHaochen-Luo\u002FCroPA)                               |\n| 2024.03 | **分块越狱：针对多模态语言模型的组合式对抗攻击** |   ICLR'24    | [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=plmBsXHxgR) |                              -                               |\n| 2024.03 | **重新思考基于迁移的对抗攻击中的模型集成** |   ICLR'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.09105) |                              [链接](https:\u002F\u002Fgithub.com\u002Fhuanranchen\u002FAdversarialAttacks)                               |\n| 2024.02 | **VLATTACK：利用预训练模型对视觉语言任务进行多模态对抗攻击** |   NeurIPS'23    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.04655) |               [链接](https:\u002F\u002Fgithub.com\u002Fericyinyzy\u002FVLAttack)                                         |\n| 2024.02 | **针对多模态大型语言模型的越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.02309) |                             -                          |\n| 2024.01 | **通过系统提示的自对抗攻击越狱GPT-4V** |   arXiv    
| [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09127) |                             -                          |\n| 2024.03 | **视觉对抗样本越狱对齐大型语言模型** |   AAAI'24    | [链接](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F30150\u002F32038) |                              -                               |\n| 2023.12 | **OT攻击：通过最优传输优化增强视觉语言模型的对抗迁移能力（OT攻击）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04403) |                              -                               |\n| 2023.12 | **FigStep：通过排版视觉提示越狱大型视觉语言模型（FigStep）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.05608) |                              [链接](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FFigStep)                               |\n| 2023.11 | **SneakyPrompt：越狱文本到图像生成模型** |   S&P'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.12082) |                              [链接](https:\u002F\u002Fgithub.com\u002FYuchen413\u002Ftext2image_safety)                               |\n| 2023.11 | **关于评估大型视觉语言模型的对抗鲁棒性** |   NeurIPS'23    | [链接](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2023\u002Ffile\u002Fa97b58c4f7551053b0512f92244b0810-Paper-Conference.pdf) |                              [链接](https:\u002F\u002Fgithub.com\u002Fyunqing-me\u002FAttackVLM)                               |\n| 2023.10 | **谷歌Bard对对抗性图像攻击有多鲁棒？** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.11751) |                              [链接](https:\u002F\u002Fgithub.com\u002Fthu-ml\u002FAttack-Bard)                               |\n| 2023.08 | **AdvCLIP：多模态对比学习中的下游无关对抗样本（AdvCLIP）** |   ACM MM'23    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07026) |                              [链接](https:\u002F\u002Fgithub.com\u002FCGCL-codes\u002FAdvCLIP)                               |\n| 2023.07 | **集合级指导攻击：提升视觉语言预训练模型的对抗迁移能力（SGA）** |   ICCV'23    | 
[链接](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023\u002Fpapers\u002FLu_Set-level_Guidance_Attack_Boosting_Adversarial_Transferability_of_Vision-Language_Pre-training_Models_ICCV_2023_paper.pdf) |                              [链接](https:\u002F\u002Fgithub.com\u002FZoky-2020\u002FSGA)                               |\n| 2023.07 | **关于多模态基础模型的对抗鲁棒性** |   ICCV Workshop'23    | [链接](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2023W\u002FAROW\u002Fpapers\u002FSchlarmann_On_the_Adversarial_Robustness_of_Multi-Modal_Foundation_Models_ICCVW_2023_paper.pdf) |                              -                               |\n| 2022.10 | **迈向视觉语言预训练模型的对抗攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.09391) |                              [链接](https:\u002F\u002Fgithub.com\u002Fadversarial-for-goodness\u002FCo-Attack)                               |\n\n\n### 越狱防御\n\n#### 基于学习的防御\n| 时间 | 标题                                                        | 举办地  |                            论文                             |                             代码                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **重新思考利用表征对比评分检测大型视觉语言模型的越狱行为** | arXiv'25 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.12069) | [链接](https:\u002F\u002Fgithub.com\u002Fsarendis56\u002FJailbreak_Detection_RCS) |\n| 2025.07 | **将推理作为安全性的自适应防御** | NeurIPS'25 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.00971) | [链接](https:\u002F\u002Ftraining-adaptive-reasoners-safety.github.io) |\n| 2025.04 | **JailDAM：面向视觉语言模型的自适应记忆越狱检测** | COLM'25 | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.03770) | [链接](https:\u002F\u002Fgithub.com\u002FShenzheZhu\u002FJailDAM) |\n| 2024.12 | **塑造安全边界：理解并防御大型语言模型中的越狱攻击** | arXiv'24 | 
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.17034) | - |\n| 2024.10 | **面向安全的大型语言模型微调** | arXiv'24 | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.10014) | - |\n| 2024.10 | **MoJE：越狱专家混合，以朴素表格分类器作为提示攻击防护** | AAAI'24 | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.17699) | - |\n| 2024.08 | **BaThe：通过将有害指令视为后门触发器来防御多模态大型语言模型中的越狱攻击（BaThe）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.09093)  |                               -                              |\n| 2024.07 | **DART：用于LLM安全性的深度对抗自动化红队测试** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.03876)  |                               -                              |\n| 2024.07 | **Eraser：通过遗忘有害知识防御大型语言模型中的越狱攻击（Eraser）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.05880) |                              [链接](https:\u002F\u002Fgithub.com\u002FZeroNLP\u002FEraser)                               |\n| 2024.07 | **安全遗忘：一种令人惊讶的有效且可推广的解决方案，用于防御越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.02855) |                              [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafeUnlearning)                               |\n| 2024.06 | **对抗性微调：防御LLM的越狱攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.06622) | - |\n| 2024.06 | **Jatmo：通过任务特定微调防御提示注入攻击（Jatmo）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.17673) |                              [链接](https:\u002F\u002Fgithub.com\u002Fwagner-group\u002Fprompt-injection-defense)                               |\n| 2024.06 | **通过目标优先级防御大型语言模型免受越狱攻击（SafeDecoding）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09096) |                              [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FJailbreakDefense_GoalPriority)                               |\n| 2024.06 | **通过后门增强的安全对齐缓解基于微调的越狱攻击** |   NeurIPS'24   | [链接](https:\u002F\u002Fjayfeather1024.github.io\u002FFinetuning-Jailbreak-Defense\u002F) |         
                     [链接](https:\u002F\u002Fgithub.com\u002FJayfeather1024\u002FBackdoor-Enhanced-Alignment)                               |\n| 2024.06 | **关于大型语言模型的提示驱动安全保障（DRO）** |   ICML'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.18018) |                              [链接](https:\u002F\u002Fgithub.com\u002Fchujiezheng\u002FLLM-Safeguard)          |\n| 2024.06 | **鲁棒提示优化，用于防御语言模型免受越狱攻击（RPO）** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.17263) |                              -          |\n| 2024.06 | **通过提示对抗性微调反击越狱攻击（PAT）** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.06255) |                              [链接](https:\u002F\u002Fgithub.com\u002Frain152\u002FPAT)          |\n| 2024.05 | **通过安全补丁实现大型语言模型全面而高效的后期安全对齐（SAFEPATCHING）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.13820) |                              -          |\n| 2024.05 | **通过知识编辑净化大型语言模型（DINM）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.14472) |                              [链接](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FEasyEdit\u002Fblob\u002Fmain\u002Fexamples\u002FSafeEdit.md)          |\n| 2024.05 | **通过分层编辑防御大型语言模型免受越狱攻击** |   arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.18166) | [链接](https:\u002F\u002Fgithub.com\u002Fledllm\u002Fledllm) |\n| 2023.11 | **MART：利用多轮自动红队测试提升LLM安全性（MART）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.07689) |                              -          |\n| 2023.11 | **针对对齐语言模型的对抗性攻击的基础防御** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.14132) |                              -                               |\n| 2023.10 | **Safe rlhf：安全的人类反馈强化学习** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12773) |                              [链接](https:\u002F\u002Fgithub.com\u002FPKU-Alignment\u002Fsafe-rlhf)                                |\n| 2023.08 | 
**使用话语链进行大型语言模型的红队测试以实现安全对齐（RED-INSTRUCT）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.09662) |                              [链接](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fred-instruct)                                |\n| 2022.04 | **通过人类反馈的强化学习训练有益且无害的助手** |   Anthropic    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.05862?spm=a2c6h.13046898.publish-article.36.6cd56ffaIPu4NQ&file=2204.05862) |                              -                                |\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n#### 基于策略的防御\n\n| 时间 | 标题                                                        | 举办地  |                            论文                             |                             代码                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **压缩但妥协？对压缩大语言模型中越狱行为的研究** | NeurIPS-W | [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [博客文章链接](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.09 | **（几乎）免费的LLM越狱检测！** | arXiv | [链接](http:\u002F\u002Farxiv.org\u002Fabs\u002F2509.14558) | [链接](https:\u002F\u002Fgithub.com\u002FGuoruiC\u002FFJD) |\n| 2025.05 | **推理以防御：安全意识推理可保护大语言模型免受越狱攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.12970?) 
| [链接](https:\u002F\u002Fgithub.com\u002Fchuhac\u002FReasoning-to-Defend) |\n| 2024.11 | **快速响应：用少量示例缓解LLM越狱攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.07494v1) | [链接](https:\u002F\u002Fgithub.com\u002Frapidresponsebench\u002Frapidresponsebench) |\n| 2024.10 | **RePD：通过检索式提示分解过程（RePD）防御越狱攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.08660v1) |  - |\n| 2024.10 | **防御指南（G4D）：大语言模型稳健均衡防御的动态指导（G4D）** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.17922v1) |  [链接](https:\u002F\u002Fgithub.com\u002FIDEA-XL\u002FG4D) |\n| 2024.10 | **越狱解药：通过大语言模型中的稀疏表示调整实现运行时安全与效用平衡** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2410.02298v1) | - |\n| 2024.09 | **HSF：通过隐藏状态过滤防御越狱攻击** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2409.03788v1) | [链接](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FHidden-State-Filtering-8652\u002F) |\n| 2024.08 | **EEG-Defender：通过大语言模型的早期退出生成防御越狱攻击（EEG-Defender）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.11308) |                              -                               |\n| 2024.08 | **前缀引导：为大语言模型提供方向盘，抵御越狱攻击（PG）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.08924) |                              [链接](https:\u002F\u002Fgithub.com\u002Fweiyezhimeng\u002FPrefix-Guidance)                               |\n| 2024.08 | **自我评估作为对抗LLM敌对攻击的防御手段（自我评估）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.03234#page=2.47) |                              [链接](https:\u002F\u002Fgithub.com\u002FLinlt-leon\u002Fself-eval)                               |\n| 2024.06 | **通过反向翻译防御LLM越狱攻击（反向翻译）** |   ACL Findings'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16459) |                              [链接](https:\u002F\u002Fgithub.com\u002FYihanWang617\u002FLLM-Jailbreaking-Defense-Backtranslation)                               |\n| 2024.06 | **SafeDecoding：通过安全意识解码防御越狱攻击（SafeDecoding）** |   ACL'24    | 
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.08983) |                              [链接](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FSafeDecoding)                               |\n| 2024.06 | **通过稳健对齐的LLM防御对齐破坏攻击** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.14348) |                              -                               |\n| 2024.06 | **披着羊皮的狼：通用嵌套越狱提示可轻易欺骗大语言模型（ReNeLLM）** |   NAACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.08268) |                              [链接](https:\u002F\u002Fgithub.com\u002FNJUNLP\u002FReNeLLM)                               |\n| 2024.06 | **SMOOTHLLM：防御大语言模型免受越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03684) |                              [链接](https:\u002F\u002Fgithub.com\u002Farobey1\u002Fsmooth-llm)                               |\n| 2024.05 | **通过双重批评提示增强大语言模型应对归纳指令的能力（双重批评）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13733) |                             [链接](https:\u002F\u002Fgithub.com\u002FDevoAllen\u002FINDust)                               |\n| 2024.05 | **PARDEN，你能再说一遍吗？通过重复防御越狱攻击（PARDEN）** |   ICML'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2405.07932) |                             [链接](https:\u002F\u002Fgithub.com\u002FEd-Zh\u002FPARDEN)                               |\n| 2024.05 | **LLM自我防御：通过自我检查，LLM知道它们正在被欺骗** |   ICLR Tiny Paper'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07308) |                             [链接](https:\u002F\u002Fgithub.com\u002Fpoloclub\u002Fllm-self-defense)                               |\n| 2024.05 | **GradSafe：通过安全关键梯度分析检测LLM不安全提示（GradSafe）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.13494) |                             [链接](https:\u002F\u002Fgithub.com\u002Fxyq7\u002FGradSafe)                               |\n| 2024.05 | **大语言模型中的多语言越狱挑战** |   ICLR'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06474)  |                   
           [链接](https:\u002F\u002Fgithub.com\u002FDAMO-NLP-SG\u002Fmultilingual-safety-for-LLMs)                               |\n| 2024.05 | **梯度袖口：通过探索拒绝损失景观检测大语言模型越狱攻击** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.00867)  |                              -                            |\n| 2024.05 | **AutoDefense：针对越狱攻击的多智能体LLM防御** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.04783) |                             [链接](https:\u002F\u002Fgithub.com\u002FXHMY\u002FAutoDefense)                               |\n| 2024.05 | **Bergeron：通过基于良知的对齐框架对抗敌对攻击（Bergeron）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.00029) |                             [链接](https:\u002F\u002Fgithub.com\u002Fmatthew-pisano\u002FBergeron)                               |\n| 2024.05 | **仅需少量上下文示范即可实现越狱与防护对齐的语言模型（ICD）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.06387)  |                              -                               |\n| 2024.04 | **用信息瓶颈保护你的LLM** |   NeurIPS'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.13968)  |                              [链接](https:\u002F\u002Fgithub.com\u002Fzichuan-liu\u002FIB4LLMs)                                |\n| 2024.04 | **修剪以保护：在无需微调的情况下提高对齐LLM的越狱抵抗能力** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.10862)  |                              [链接](https:\u002F\u002Fgithub.com\u002FCrystalEye42\u002Feval-safety)                                |\n| 2024.02 | **认证LLM对抗敌对提示的安全性** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.02705) |                              [链接](https:\u002F\u002Fgithub.com\u002Faounon\u002Fcertified-llm-safety)                              |\n| 2024.02 | **突破封锁：通过自我精炼重新定义LLM防御越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15180) |                              -                              |\n| 2024.02 | **通过语义平滑防御大语言模型越狱攻击（SEMANTICSMOOTH）** |   arXiv    | 
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.16192) |                              [链接](https:\u002F\u002Fgithub.com\u002FUCSB-NLP-Chang\u002FSemanticSmooth)                             |\n| 2024.01 | **意图分析让LLM成为优秀的越狱防御者（IA）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06561) |                              [链接](https:\u002F\u002Fgithub.com\u002Falphadl\u002FSafeLLM_with_IntentionAnalysis)                               |\n| 2024.01 | **约翰尼如何说服LLM越狱：重新思考说服力，以人性化LLM挑战AI安全（PAP）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.06373) |                              [链接](https:\u002F\u002Fgithub.com\u002FCHATS-lab\u002Fpersuasive_jailbreaker)                               |\n| 2023.12 | **通过自我提醒防御ChatGPT越狱攻击（自我提醒）** |   Nature Machine Intelligence    | [链接](https:\u002F\u002Fxyq7.github.io\u002Fpapers\u002FNMI-JailbreakDefense.pdf) |                              [链接](https:\u002F\u002Fgithub.com\u002Fyjw1029\u002FSelf-Reminder\u002F)                               |\n| 2023.11 | **用困惑度检测语言模型攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.14132) |                             -                              |\n| 2023.10 | **RAIN：你的语言模型无需微调即可自我对齐（RAIN）** |    ICLR'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.07124) |                              [链接](https:\u002F\u002Fgithub.com\u002FSafeAILab\u002FRAIN)                               |\n\n#### 监控模型\n\n| 时间    | 标题                                                        |   地点    |                            论文                             |                             代码                             |\n| ------- | ------------------------------------------------------------ | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2026.02 | **GuardReasoner-Omni：一种基于推理的文本、图像和视频多模态护栏** | arXiv'26  |          
[链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.03328)          |                              [链接](https:\u002F\u002Fgithub.com\u002Fzzh-thu-22\u002FGuardReasoner-Omni)   |\n| 2025.12 | **OmniGuard：具有深思熟虑推理的统一全模态护栏** | arXiv'25 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.02306) | - |\n| 2025.10 | **三思而后行：通过渐进式自我反思实现安全防护（PSR）** | EMNLP'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.01270)          |                              [链接](https:\u002F\u002Fgithub.com\u002FVietHoang1512\u002FPSR)                  |\n| 2025.05 | **GuardReasoner-VL：通过强化推理保障VLMs的安全（GuardReasoner-VL）** | NeurIPS'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.11049)          |                              [链接](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FGuardReasoner-VL\u002F)                               |\n| 2025.04 | **X-Guard：用于内容审核的多语言护栏代理（X-Guard）** | arXiv'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.08848)          |                              [链接](https:\u002F\u002Fgithub.com\u002FUNHSAILLab\u002FX-Guard)                               |\n| 2025.02 | **ThinkGuard：深思熟虑的慢思考带来谨慎的护栏（ThinkGuard）** | arXiv'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13458)          |                             [链接](https:\u002F\u002Fgithub.com\u002Fluka-group\u002FThinkGuard)                              |\n| 2025.02 | **宪法分类器：抵御数千小时红队测试中的通用越狱攻击** | arXiv'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.18837)          |                             -                               |\n| 2025.01 | **GuardReasoner：迈向基于推理的LLM安全防护（GuardReasoner）** | ICLR Workshop'25  |          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.18492)          |                              [链接](https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FGuardReasoner\u002F)                               |\n| 2024.12 | **使用剪枝语言模型进行轻量级安全分类（Sentence-BERT）** | arXiv'24  |          
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.13435)          |                              -                               |\n| 2024.11 | **GuardFormer：用于高效安全防护的护栏指令预训练（GuardFormer）** | Meta  |          [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=vr31i9pzQk)          |                              -                               |\n| 2024.11 | **Llama Guard 3 Vision：保障人机图像理解对话的安全性（LLaMA Guard 3 Vision）** | Meta  |          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.10414?)          |                              [链接](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002Fllama-recipes\u002Ftree\u002Fmain\u002Frecipes\u002Fresponsible_ai\u002Fllama_guard)                                |\n| 2024.11 | **AEGIS2.0：用于对齐LLM护栏的多样化AI安全数据集与风险分类法（Aegis2.0）** | Nvidia, NeurIPS'24 Workshop  |          [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=0MvGCv35wi)          |                              -                               |\n| 2024.11 | **使用微调BERT嵌入进行轻量级安全护栏（Sentence-BERT）** | arXiv'24  |          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14398?)          
|                              -                               |\n| 2024.11 | **STAND-Guard：一种小型任务自适应内容审核模型（STAND-Guard）** | Microsoft  |          [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.05214v1)          |                              -                               |\n| 2024.10 | **VLMGuard：利用未标注数据防御恶意提示对VLM的攻击** |   arXiv    |         [链接](https:\u002F\u002Farxiv.org\u002Fhtml\u002F2410.00296v1)          |                              -                               |\n| 2024.09 | **AEGIS：基于LLM专家集合的在线自适应AI内容安全审核（Aegis）** |   Nvidia   |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.05993)           | [链接](https:\u002F\u002Fhuggingface.co\u002Fnvidia\u002FAegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) |\n| 2024.09 | **Llama 3.2：以开放、可定制的模型革新边缘AI与视觉技术（LLaMA Guard 3）** |    Meta    | [链接](https:\u002F\u002Fai.meta.com\u002Fblog\u002Fllama-3-2-connect-2024-vision-edge-mobile-devices\u002F) |  [链接](https:\u002F\u002Fhuggingface.co\u002Fmeta-llama\u002FLlama-Guard-3-1B)  |\n| 2024.08 | **ShieldGemma：基于Gemma的生成式AI内容审核（ShieldGemma）** |   Google   |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.21772)           |     [链接](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Fshieldgemma-2b)     |\n| 2024.07 | **WildGuard：面向LLM安全风险、越狱与拒绝的开放式一站式审核工具（WildGuard）** | NeurIPS'24 |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18495)           |         [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fwildguard)         |\n| 2024.06 | **GuardAgent：通过知识驱动的推理保障LLM代理的安全（GuardAgent）** | arXiv'24 |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09187)           |         -        |\n| 2024.06 | **R2-Guard：通过知识增强逻辑推理实现稳健推理的LLM护栏（R2-Guard）** |   arXiv    |           [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.05557)           |       [链接](https:\u002F\u002Fgithub.com\u002Fkangmintong\u002FR-2-Guard)       |\n| 2024.04 | **Llama Guard 2**                                            |    Meta    | 
[链接](https:\u002F\u002Fwww.llama.com\u002Fdocs\u002Fmodel-cards-and-prompt-formats\u002Fmeta-llama-guard-2\u002F) | [链接](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002FPurpleLlama\u002Fblob\u002Fmain\u002FLlama-Guard2\u002FMODEL_CARD.md) |\n| 2024.03 | **AdaShield：通过自适应屏蔽提示保障多模态大语言模型免受结构化攻击（AdaShield）** |  ECCV'24   |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.09513)           |      [链接](https:\u002F\u002Fgithub.com\u002FSaFoLab-WISC\u002FAdaShield)       |\n| 2023.12 | **Llama Guard：基于LLM的人机对话输入输出安全防护（LLaMA Guard）** |    Meta    |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.06674)           | [链接](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002FPurpleLlama\u002Ftree\u002Fmain\u002FLlama-Guard) |\n\n\n\n\n\n#### 审核API\n\n| 时间    | 标题                                                        |      地点      |                            论文                             |                             代码                             |\n| ------- | ------------------------------------------------------------ | :-------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2023.08 | **使用GPT-4进行内容审核（GPT-4）**               |     OpenAI      | [链接](https:\u002F\u002Fopenai.com\u002Findex\u002Fusing-gpt-4-for-content-moderation\u002F) |                              -                               |\n| 2023.02 | **面向现实世界中不良内容检测的整体方法（OpenAI审核端点）** |   AAAI OpenAI   |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03274)           |   [链接](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fmoderation-api-release)   |\n| 2022.02 | **新一代观点API：高效多语言字符级Transformer（观点API）** |   KDD Google    |           [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.11176)           |             [链接](https:\u002F\u002Fperspectiveapi.com\u002F)              |\n| -       | **Azure AI 内容安全**                                  | Microsoft Azure |                            
  -                               | [链接](https:\u002F\u002Fazure.microsoft.com\u002Fen-us\u002Fproducts\u002Fai-services\u002Fai-content-safety\u002F) |\n| -       | **Detoxify**                                                 |   unitary.ai    |                              -                               |        [链接](https:\u002F\u002Fgithub.com\u002Funitaryai\u002Fdetoxify)         |\n| -       | **promptfoo** - 大型语言模型红队框架，支持自适应多轮攻击（PAIR、攻击树、渐强攻击） |   promptfoo    |                              -                               |        [链接](https:\u002F\u002Fgithub.com\u002Fpromptfoo\u002Fpromptfoo)         |\n\n### 评估与分析\n| 时间 | 标题                                                        |  地点  |                            论文                             |                             代码                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2026.02 | **AgentLeak：多智能体大语言模型系统中隐私泄露的全栈基准测试** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.11510) |  [链接](https:\u002F\u002Fgithub.com\u002FPrivatris\u002FAgentLeak)  |\n| 2026.02 | **babel-bench：面向大语言模型的多语言古典语言安全基准测试（babel-bench）** | ICLR'26 | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.22983) |  [链接](https:\u002F\u002Fgithub.com\u002FMARUCIE\u002Fbabel-bench)  |\n| 2025.12 | **压缩但被攻破？对压缩大语言模型中越狱攻击的研究** | NeurIPS-W | [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [博客文章链接](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.08 | **JADES：一种基于分解评分的越狱评估通用框架** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2508.20848) |  [链接](https:\u002F\u002Ftrustairlab.github.io\u002Fjades.github.io\u002F)  |\n| 2025.06 | **激活近似可能在对齐的大语言模型中引发安全漏洞：全面分析与防御** | USENIX Security'25 | 
[链接](https:\u002F\u002Fwww.arxiv.org\u002Fpdf\u002F2502.00840) |  [链接](https:\u002F\u002Fgithub.com\u002FKevin-Zh-CS\u002FQuadA)  |\n| 2025.05 | **视觉语言模型在野外是否安全？基于模因的基准测试研究** | EMNLP'25 | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.15389) |  [链接](https:\u002F\u002Fgithub.com\u002Foneonlee\u002FMeme-Safety-Bench)  |\n| 2025.05 | **PandaGuard：针对越狱攻击的大语言模型安全性的系统性评估** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.13862) |  [链接](https:\u002F\u002Fgithub.com\u002FBeijing-AISI\u002Fpanda-guard)  |\n| 2025.05 | **评估量化大语言模型的安全风险及量化感知的安全补丁** | ICML'25 | [链接](https:\u002F\u002Ficml.cc\u002Fvirtual\u002F2025\u002Fposter\u002F44278) |  [链接](https:\u002F\u002Fgithub.com\u002FThecommonirin\u002FQresafe)  |\n| 2025.02 | **GuidedBench：为越狱评估配备指南** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.16903) |  [链接](https:\u002F\u002Fgithub.com\u002FSproutNan\u002FAI-Safety_Benchmark)  |\n| 2024.12 | **Agent-SafetyBench：评估大语言模型代理的安全性** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.14470) |  [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FAgent-SafetyBench)  |\n| 2024.11 | **安全且可靠的大型语言模型全球挑战 第一赛道** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.14502v1) | -  |\n| 2024.11 | **JailbreakLens：从表征与电路角度解读越狱机制** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.11114v1) | -  |\n| 2024.11 | **VLLM安全悖论：越狱攻击与防御的双重易感性** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.08410v1) | -  |\n| 2024.11 | **HarmLevelBench：评估危害等级合规性及量化对模型对齐的影响** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.06835v1) | - |\n| 2024.11 | **ChemSafetyBench：化学领域大语言模型安全性的基准测试** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.16736) | [链接](https:\u002F\u002Fgithub.com\u002FHaochenZhao\u002FSafeAgent4Chem) |\n| 2024.11 | **GuardBench：针对护栏模型的大规模基准测试** | EMNLP'24 | [链接](https:\u002F\u002Faclanthology.org\u002F2024.emnlp-main.1022.pdf) | [链接](https:\u002F\u002Fgithub.com\u002FAmenRa\u002Fguardbench) |\n| 2024.11 
| **提示中的哪些特征会越狱大语言模型？探究攻击背后的机制** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.03343v1) | [链接](https:\u002F\u002Fgithub.com\u002FNLie2\u002Fwhat_features_jailbreak_LLMs) |\n| 2024.11 | **大语言模型护栏在处理多语言毒性方面的基准测试** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.22153v1) | [链接](https:\u002F\u002Fcommoncrawl.github.io\u002Fcc-crawl-statistics\u002Fplots\u002Flanguages.html) |\n| 2024.10 | **JAILJUDGE：一种综合越狱判断基准，配备多智能体增强解释评估框架** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.12855) | [链接](https:\u002F\u002Fgithub.com\u002Fusail-hkust\u002FJailjudge) |\n| 2024.10 | **大语言模型有政治正确性吗？分析人工智能系统的伦理偏见与越狱漏洞** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13334v1) | [链接](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002FPCJailbreak-F2B0\u002FREADME.md) |\n| 2024.10 | **大语言模型越狱的真实威胁模型** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.16222v1) | [链接](https:\u002F\u002Fgithub.com\u002Fvalentyn1boreiko\u002Fllm-threat-model) |\n| 2024.10 | **对抗后缀也可能成为特征！** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.00451) | [链接](https:\u002F\u002Fgithub.com\u002Fsuffix-maybe-feature\u002Fadver-suffix-maybe-features) |\n| 2024.09 | **JAILJUDGE：一种综合越狱** | arXiv | [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=cLYvhd0pDY) | [链接](https:\u002F\u002Fanonymous.4open.science\u002Fr\u002Fpublic_multiagents_judge-66CB\u002FREADME.md) |\n| 2024.09 | **文本到图像模型的多模态语用越狱** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.19149) | [链接](https:\u002F\u002Fgithub.com\u002Fmultimodalpragmatic\u002Fmultimodalpragmatic\u002Ftree\u002Fmain) |\n| 2024.08 | **ShieldGemma：基于Gemma的生成式人工智能内容审核（ShieldGemma）** |    arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.21772) | [链接](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Fshieldgemma-2b) |\n| 2024.08 | **MMJ-Bench：视觉语言模型越狱攻击与防御的全面研究（MMJ-Bench）** |    arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.08464) | 
[链接](https:\u002F\u002Fgithub.com\u002Fthunxxx\u002FMLLM-Jailbreak-evaluation-MMJ-bench) |\n| 2024.08 | **不可能的任务：从越狱大语言模型到更安全的语言模型的统计视角** | NeurIPS'24 | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.01420) | - |\n| 2024.07 | **为大型语言模型红队行动构建威胁模型** |    arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14937) | [链接](https:\u002F\u002Fgithub.com\u002Fdapurv5\u002Fawesome-llm-red-teaming) |\n| 2024.07 | **JailBreakV-28K：评估多模态大语言模型抵御越狱攻击的鲁棒性基准测试** |    arXiv   | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.03027) | [链接](https:\u002F\u002Fgithub.com\u002FEddyLuo1232\u002FJailBreakV_28K) |\n| 2024.07 | **大语言模型越狱攻击与防御综述** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.04295)  |                     -                         |\n| 2024.06 | **“未对齐”并不等于“恶意”：警惕大语言模型越狱的幻觉** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11668) |                              [链接](https:\u002F\u002Fgithub.com\u002FMeirtz\u002FBabyBLUE-llm)                               |\n| 2024.06 | **大规模野外越狱：从野外越狱到更安全的语言模型（WildTeaming）** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.18510) |                              [链接](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fwildteaming)                               |\n| 2024.06 | **从大语言模型到多模态大语言模型：探索多模态越狱的格局** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.14859) |                              -                               |\n| 2024.06 | **受威胁的AI代理：关键安全挑战与未来路径的综述** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.02630) |                              -                               |\n| 2024.06 | **MM-SafetyBench：多模态大语言模型安全评估基准测试（MM-SafetyBench）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.17600) |                              -                              |\n| 2024.06 | **ArtPrompt：基于ASCII艺术的对齐大语言模型越狱攻击（VITC）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11753) |                          
    [链接](https:\u002F\u002Fgithub.com\u002Fuw-nsl\u002FArtPrompt)                               |\n| 2024.06 | **技巧大全：大语言模型越狱攻击的基准测试** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09324) |                             [链接](https:\u002F\u002Fgithub.com\u002Fusail-hkust\u002FBag_of_Tricks_for_LLM_Jailbreaking)                              |\n| 2024.06 | **JailbreakZoo：大语言模型与视觉语言模型越狱的调查、格局与展望（JailbreakZoo）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2407.01599) |                             [链接](https:\u002F\u002Fgithub.com\u002FAllen-piexl\u002FJailbreakZoo)                              |\n| 2024.06 | **大语言模型对齐的根本局限性** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.11082) |                             -                              |\n| 2024.06 | **JailbreakBench：大语言模型越狱的开放鲁棒性基准测试（JailbreakBench）** |   NeurIPS'24   | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.01318) |                             [链接](https:\u002F\u002Fgithub.com\u002FJailbreakBench\u002Fjailbreakbench)                             |\n| 2024.06 | **理解大语言模型越狱攻击：一种表征空间分析** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.10794) |                             [链接](https:\u002F\u002Fgithub.com\u002Fyuplin2333\u002Frepresentation-space-jailbreak)                              |\n| 2024.06 | **JailbreakEval：评估大语言模型越狱尝试的集成工具包（JailbreakEval）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.09321) |                              [链接](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FJailbreakEval)                               |\n| 2024.05 | **重新思考如何评估语言模型越狱** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.06407) |                             [链接](https:\u002F\u002Fgithub.com\u002Fcontrollability\u002Fjailbreak-evaluation)                               |\n| 2024.05 | **通过双评提示增强大语言模型应对归纳指令的能力（INDust）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13733) |           
                  [链接](https:\u002F\u002Fgithub.com\u002FDevoAllen\u002FINDust)                               |\n| 2024.05 | **针对集成大语言模型应用的提示注入攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.05499) |                             -                               |\n| 2024.05 | **诱使大语言模型违抗：越狱的正式化、分析与检测** |   LREC-COLING'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14965) |                             [链接](https:\u002F\u002Fgithub.com\u002FAetherPrior\u002FTrickLLM)                               |\n| 2024.05 | **大语言模型越狱攻击与防御技术——全面研究** |   NDSS'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.13457) |                             -                               |\n| 2024.05 | **通过提示工程越狱ChatGPT：一项实证研究** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.13860) |                             -                               |\n| 2024.05 | **通过知识编辑净化大语言模型（SafeEdit）** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.14472) |                             [链接](https:\u002F\u002Fgithub.com\u002Fzjunlp\u002FEasyEdit\u002Fblob\u002Fmain\u002Fexamples\u002FSafeEdit.md)                               |\n| 2024.04 | **JailbreakLens：大语言模型越狱攻击的可视化分析（JailbreakLens）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2404.08793) |                             -                               |\n| 2024.03 | **大语言模型的指令中心响应有多（不）道德？揭示安全护栏对有害查询的脆弱性（TECHHAZARDQA）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.15302) |                              [链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSoftMINER-Group\u002FTechHazardQA)                               |\n| 2024.03 | **别听我的：理解和探索大语言模型的越狱提示** |   USENIX Security    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.17336) |                             -                               |\n| 2024.03 | **EasyJailbreak：大语言模型越狱的统一框架（EasyJailbreak）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.12171) | 
                             [链接](https:\u002F\u002Fgithub.com\u002FEasyJailbreak\u002FEasyJailbreak)                               |\n| 2024.02 | **大语言模型越狱攻击的全面评估** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.05668) |                             -                     |\n| 2024.02 | **SPML：一种用于防御语言模型免受提示攻击的DSL** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.11755) |                             -                     |\n| 2024.02 | **强迫大语言模型做并透露（几乎）任何事** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.14020) |                             -                     |\n| 2024.02 | **针对空越狱的STRONGREJECT（StrongREJECT）** |   NeurIPS'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10260) |                             [链接](https:\u002F\u002Fgithub.com\u002Falexandrasouly\u002Fstrongreject)                      |\n| 2024.02 | **ToolSword：揭示大语言模型在三个阶段工具学习中的安全问题** |   ACL'24    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.10753) |                             [链接](https:\u002F\u002Fgithub.com\u002FJunjie-Ye\u002FToolSword)                      |\n| 2024.02 | **HarmBench：自动化红队与稳健拒绝的标准评估框架（HarmBench）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.04249) |                              [链接](https:\u002F\u002Fgithub.com\u002Fcenterforaisafety\u002FHarmBench)                               |\n| 2023.12 | **面向目标的提示攻击与大语言模型的安全评估** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.11830)  |                             [链接](https:\u002F\u002Fgithub.com\u002Fliuchengyuan123\u002FCPAD)                              |\n| 2023.12 | **防御的艺术：大语言模型防御策略在安全性和过度防御方面的系统性评估与分析** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.00287)  |                            -                              |\n| 2023.12 | **大语言模型攻击技术、实现与缓解策略的全面调查** |   UbiSec'23    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.10982)  |                            -                            
  |\n| 2023.11 | **召唤恶魔并束缚它：大语言模型野外红队行动的扎根理论** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.06237)  |                             -                               |\n| 2023.11 | **这张图片里有多少独角兽？面向视觉大语言模型的安全评估基准测试** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.16101)  |                             [链接](https:\u002F\u002Fgithub.com\u002FUCSC-VLAA\u002Fvllm-safety-benchmark)                               |\n| 2023.11 | **利用欺骗技术和说服原则攻击大语言模型（LLMs）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.14876)  |                             -                               |\n| 2023.10 | **探索、建立、利用：从零开始的红队语言模型** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.09442)  |                             -                               |\n| 2023.10 | **由对抗攻击揭示的大语言模型漏洞调查** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.10844)  |                             -                               |\n| 2023.10 | **微调对齐语言模型会损害安全性，即使用户并无此意图！（HEx-PHI）** |   ICLR'24 (oral)    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03693)  |                             [链接](https:\u002F\u002Fgithub.com\u002FLLM-Tuning-Safety\u002FLLMs-Finetuning-Safety)                               |\n| 2023.08 | **使用链式话语对大语言模型进行红队行动以实现安全对齐（RED-EVAL）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.09662) |                              [链接](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fred-instruct)                                |\n| 2023.08 | **大语言模型用于非法目的：威胁、预防措施与漏洞** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12833) |                              -                               |\n| 2023.07 | **越狱了：大语言模型安全训练为何失败？（越狱了）** |   NeurIPS'23    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.02483#page=1.01)  |                             -                               |\n| 2023.08 | **大语言模型用于非法目的：威胁、预防措施与漏洞** |   arXiv    | 
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.12833)  |                              -                               |\n| 2023.08 | **从ChatGPT到ThreatGPT：生成式AI在网络安全与隐私中的影响** |   IEEE Access    | [链接](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10198233?denied=)  |                              -                               |\n| 2023.07 | **大语言模型审查：是机器学习挑战还是计算机安全问题？** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.10719)  |                              -                               |\n| 2023.07 | **对齐语言模型的通用且可迁移的对抗攻击（AdvBench）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.15043)  |                              [链接](https:\u002F\u002Fgithub.com\u002Fllm-attacks\u002Fllm-attacks)                               |\n| 2023.06 | **DecodingTrust：GPT模型可信度的全面评估** |   NeurIPS'23    | [链接](https:\u002F\u002Fblogs.qub.ac.uk\u002Fdigitallearning\u002Fwp-content\u002Fuploads\u002Fsites\u002F332\u002F2024\u002F01\u002FA-comprehensive-Assessment-of-Trustworthiness-in-GPT-Models.pdf)  |                              [链接](https:\u002F\u002Fdecodingtrust.github.io\u002F)                               |\n| 2023.04 | **中国大语言模型的安全评估** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.10436)  |                              [链接](https:\u002F\u002Fgithub.com\u002Fthu-coai\u002FSafety-Prompts)                              |\n| 2023.02 | **利用大语言模型的程序行为：通过标准安全攻击实现双重用途** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2302.05733)  |                              -                               |\n| 2022.11 | **红队语言模型以减少危害：方法、扩展行为与经验教训** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.07858)  |                              -                               |\n| 2022.02 | **用语言模型进行红队语言模型** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.03286)  | \n| 2026.03 | **评估与对齐，经典论文** | Manning | 
[链接](https:\u002F\u002Fwww.manning.com\u002Fbooks\u002Fevaluation-and-alignment-the-seminal-papers) | -  |                               |\n\n### 应用\n\n\n| 时间 | 标题                                                        |  地点  |                            论文                             |                             代码                             |\n| ---- | ------------------------------------------------------------ | :-----: | :----------------------------------------------------------: | :----------------------------------------------------------: |\n| 2025.12 | **压缩但已妥协？对压缩大语言模型中越狱行为的研究** | NeurIPS-W | [链接](https:\u002F\u002Fopenreview.net\u002Fpdf?id=OkNfb8SmLh) | [博客文章链接](https:\u002F\u002Fnamburisrinath.medium.com\u002Fcompressed-but-compromised-a-study-of-jailbreaking-in-compressed-llms-02a6e40aaf17) |\n| 2025.08 | **超越越狱：揭示更隐蔽、更广泛的因对齐失败引发的大语言模型安全风险** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.07402) | [链接](https:\u002F\u002Fjailflip.github.io\u002F) |\n| 2024.11 | **通过弹窗攻击视觉语言计算机代理** | arXiv | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.02391) | [链接](https:\u002F\u002Fgithub.com\u002FSALT-NLP\u002FPopupAttack) |\n| 2024.10 | **越狱控制的机器人（ROBOPAIR）** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.13691v1) |        [链接](https:\u002F\u002Frobopair.org\u002F)      |\n| 2024.10 | **SMILES提示：化学合成中大语言模型越狱攻击的新方法** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.15641v1) |        [链接](https:\u002F\u002Fgithub.com\u002FIDEA-XL\u002FChemSafety)      |\n| 2024.10 | **欺骗自动大语言模型基准测试：空模型也能取得高胜率** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2410.07137) |        [链接](https:\u002F\u002Fgithub.com\u002Fsail-sg\u002FCheating-LLM-Benchmarks)      |\n| 2024.09 | **角色破解：角色扮演系统中的角色幻觉作为越狱攻击** |   arXiv    | [链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.16727) |                              -  |\n| 2024.08 | **一个被越狱的大语言模型可能造成严重危害：基于大语言模型的应用程序易受PromptWare（APwT）攻击** |   arXiv    | 
[链接](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.05061) |                              -  |\n\n\n\n## 其他相关优秀仓库\n\n- [Awesome-LM-SSP](https:\u002F\u002Fgithub.com\u002FThuCCSLab\u002FAwesome-LM-SSP)\n- [llm-sp](https:\u002F\u002Fgithub.com\u002Fchawins\u002Fllm-sp)\n- [awesome-llm-security](https:\u002F\u002Fgithub.com\u002Fcorca-ai\u002Fawesome-llm-security)\n- [Awesome-LLM-Safety](https:\u002F\u002Fgithub.com\u002Fydyjya\u002FAwesome-LLM-Safety)\n- [Awesome-LRMs-Safety](https:\u002F\u002Fgithub.com\u002FWangCheng0116\u002FAwesome-LRMs-Safety)\n- [Awesome-LALMs-Jailbreak](https:\u002F\u002Fgithub.com\u002FWangCheng0116\u002FAwesome_LALMs_Jailbreak)\n- [Awesome-Embodied-AI-Safety](https:\u002F\u002Fgithub.com\u002Fx-zheng16\u002FAwesome-Embodied-AI-Safety)\n\n\n\n\n\n\n## 贡献者\n\n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fyueliu1999\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_046e0fc42617.png\" alt=\"yueliu1999\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fbhooi\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_abf8b40670ad.png\" alt=\"bhooi\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fzqypku\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_1abc937ffe3e.png\" alt=\"zqypku\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FjiaxiaojunQAQ\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_82c6f307c87d.png\" alt=\"jiaxiaojunQAQ\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FHuang-yihao\" target=\"_blank\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_7e269c23780e.png\" alt=\"Huang-yihao\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcsyuhao\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_28f34f9eb088.png\" alt=\"csyuhao\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fxszheng2020\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_65efddbbcb5b.png\" alt=\"xszheng2020\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fdapurv5\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_f5d91bf10a84.png\" alt=\"dapurv5\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002FZYQ-Zoey77\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_14970bad82e5.png\" alt=\"ZYQ-Zoey77\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fmdoumbouya\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_e7b63d401e4b.png\" alt=\"mdoumbouya\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fxyliugo\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_ebffef2bc6f9.png\" alt=\"xyliugo\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fzky001\" target=\"_blank\">\u003Cimg 
src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_readme_a3c8c413336f.png\" alt=\"zky001\" width=\"96\" height=\"96\"\u002F>\u003C\u002Fa> \n\n\n\n\n\n\n\u003Cp align=\"right\">(\u003Ca href=\"#top\">回到顶部\u003C\u002Fa>)\u003C\u002Fp>","# Awesome-Jailbreak-on-LLMs 快速上手指南\n\n## 环境准备\n\n- **系统要求**：Linux \u002F macOS \u002F Windows（推荐 WSL2）\n- **Python 版本**：≥ 3.9\n- **前置依赖**：PyTorch、transformers、accelerate、requests、numpy\n- **推荐镜像源**：使用清华源加速依赖安装  \n  ```bash\n  pip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n  ```\n\n## 安装步骤\n\n1. 克隆仓库：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FAwesome-Jailbreak-on-LLMs.git\n   cd Awesome-Jailbreak-on-LLMs\n   ```\n\n2. 安装核心依赖（推荐使用清华源）：\n   ```bash\n   pip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple -r requirements.txt\n   ```\n\n3. （可选）下载攻击模型（如 FlipAttack）：\n   ```bash\n   git clone https:\u002F\u002Fhuggingface.co\u002FJinx-org\n   git clone https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FFlipAttack\n   ```\n\n## 基本使用\n\n以 **FlipAttack** 为例，最简使用流程：\n\n1. 进入攻击代码目录：\n   ```bash\n   cd FlipAttack\n   ```\n\n2. 运行默认攻击示例（需已安装模型）：\n   ```bash\n   python attack.py --model_name gpt-3.5-turbo --prompt \"Tell me how to make a bomb\"\n   ```\n\n3. 
查看输出结果，判断是否成功绕过安全机制。\n\n> 注：所有攻击方法均需在合法授权环境下使用，仅限安全研究与模型鲁棒性评估。","某大型金融科技公司安全团队正在为一款面向公众的AI客服系统做合规性加固，该系统需严格过滤金融诈骗、洗钱诱导等高风险问答，但近期频繁被用户用隐晦话术绕过，导致监管风险上升。\n\n### 没有 Awesome-Jailbreak-on-LLMs 时\n- 团队依赖传统关键词过滤和简单提示词加固，无法识别新型语义混淆攻击，如“帮我规划一笔‘合法避税’操作”被误判为正常咨询。\n- 缺乏对多轮对话攻击的测试能力，攻击者通过5轮诱导逐步突破模型防线，团队无从复现和定位漏洞。\n- 无法评估新上线的RAG检索模块是否易被恶意查询污染，导致系统偶尔返回伪造的“高收益理财建议”。\n- 安全测试依赖外部红队，周期长、成本高，且每次攻击手法更新后需重新协商合作。\n- 没有统一的评估基准，不同模型版本的安全性对比缺乏数据支撑，决策靠经验而非实证。\n\n### 使用 Awesome-Jailbreak-on-LLMs 后\n- 团队直接复用 FlipAttack 和 BadThink 的攻击模板，在内部测试环境快速模拟出12种新型绕过话术，发现3个此前未察觉的漏洞。\n- 利用其多轮攻击数据集和代码，自动化构建了“诱导链测试流水线”，将攻击检测周期从2周缩短至48小时。\n- 通过集成针对RAG的攻击案例，发现检索模块会因用户输入“请引用央行2024年报告”而返回伪造文档，立即优化了来源验证机制。\n- 团队内部可独立完成安全攻防演练，不再依赖外部红队，年度测试成本降低60%。\n- 基于工具中的评估指标，建立模型安全评分体系，为模型选型和迭代提供明确依据，合规报告通过率提升90%。\n\nAwesome-Jailbreak-on-LLMs 让安全团队从被动防御转向主动攻防，真正实现了AI系统安全的可量化、可复现、可迭代。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fyueliu1999_Awesome-Jailbreak-on-LLMs_647adb08.png","yueliu1999","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fyueliu1999_faf53813.jpg","Yue Liu a Ph.D. 
student at NUS.","National University of Singapore","Singapore","yueliu19990731@163.com",null,"yueliu1999.github.io","https:\u002F\u002Fgithub.com\u002Fyueliu1999",1292,107,"2026-04-04T19:25:05","MIT",5,"Linux, macOS, Windows","未说明",{"notes":92,"python":90,"dependencies":93},"该仓库为 jailbreak 方法的集合，包含论文、代码和数据集，具体运行环境依赖各子项目；建议根据具体代码仓库的 README 配置环境，部分代码需下载大型模型（如 Hugging Face 上的模型），可能需大量磁盘空间和网络资源。",[],[13,26,14,15],[96,97,98,99,100,101,102,103,104],"ai","jailbreak","llm","llms","privacy","safety","security","vlm","vlms","2026-03-27T02:49:30.150509","2026-04-06T08:46:45.317307",[108,113,118,123,128,132],{"id":109,"question_zh":110,"answer_zh":111,"source_url":112},9213,"如何向该仓库提交新的论文或资源建议？","请创建一个拉取请求（PR）来提交新的论文或资源，维护者会审核并合并。例如，提交新论文时，请按照现有格式在对应章节添加条目。","https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FAwesome-Jailbreak-on-LLMs\u002Fissues\u002F44",{"id":114,"question_zh":115,"answer_zh":116,"source_url":117},9214,"是否接受对多模态或具身AI安全相关资源的添加？","是的，维护者欢迎针对具身AI安全、多模态攻击等方向的资源提交。请通过创建PR将相关仓库（如Awesome-Embodied-AI-Safety）添加到「Other Related Awesome Repository」部分。","https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FAwesome-Jailbreak-on-LLMs\u002Fissues\u002F50",{"id":119,"question_zh":120,"answer_zh":121,"source_url":122},9215,"如何为该文献清单贡献新的论文？","如果您发现相关论文，可以直接创建一个拉取请求（PR），将论文信息按现有格式添加到对应分类中，维护者会及时处理。","https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FAwesome-Jailbreak-on-LLMs\u002Fissues\u002F43",{"id":124,"question_zh":125,"answer_zh":126,"source_url":127},9216,"是否可以提交新会议论文（如COLM 2025）到该清单？","可以，维护者欢迎提交新会议论文。请创建一个PR，将论文标题、来源链接和分类信息按仓库格式添加即可。","https:\u002F\u002Fgithub.com\u002Fyueliu1999\u002FAwesome-Jailbreak-on-LLMs\u002Fissues\u002F31",{"id":129,"question_zh":130,"answer_zh":131,"source_url":112},9217,"提交建议后多久会被处理？","提交PR后，维护者通常会快速审核并合并。例如，Issue #44 提交后，维护者立即确认已添加，说明流程高效。",{"id":133,"question_zh":134,"answer_zh":135,"source_url":117},9218,"该清单是否接受非文本类攻击（如机器人物理攻击）的研究？","是的，清单明确欢迎具身AI安全相关的研究，包括语言控制机器人时的越狱攻击及其物理后果，建议通过PR提交相关资源。",[]]