[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-cedrickchee--awesome-transformer-nlp":3,"tool-cedrickchee--awesome-transformer-nlp":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
# Awesome Transformer & Transfer Learning in NLP [![Awesome](https://awesome.re/badge.svg)](https://awesome.re)

This repository contains a hand-curated list of great machine (deep) learning resources for Natural Language Processing (NLP) with a focus on Generative Pre-trained Transformer (GPT), Bidirectional Encoder Representations from Transformers (BERT), attention mechanism, Transformer architectures/networks, ChatGPT, and transfer learning in NLP.

<p align="center" width="100%">
  <img src="https://oss.gittoolsai.com/images/cedrickchee_awesome-transformer-nlp_readme_c1660d940453.png" width="40%" alt="Transformer (BERT encoder)" />
</p>
<p align="center" width="100%">
  <img src="https://oss.gittoolsai.com/images/cedrickchee_awesome-transformer-nlp_readme_6f6f0127591f.png" width="40%" alt="Transformer (BERT encoder)" />
</p>
<p align="center" width="100%">
  <sup>Transformer (<a href="https://web.archive.org/web/20201217063603/https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/blocks/bert-encoder">Source</a>)</sup>
</p>
href=\"https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20201217063603\u002Fhttps:\u002F\u002Fpeltarion.com\u002Fknowledge-center\u002Fdocumentation\u002Fmodeling-view\u002Fbuild-an-ai-model\u002Fblocks\u002Fbert-encoder\">Source\u003C\u002Fa>)\u003C\u002Fsup>\n\u003C\u002Fp>\n\n# Table of Contents\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand Table of Contents\u003C\u002Fb>\u003C\u002Fsummary>\n\n- [Papers](#papers)\n- [Articles](#articles)\n  - [BERT and Transformer](#bert-and-transformer)\n  - [Attention Mechanism](#attention-mechanism)\n  - [Transformer Architecture](#transformer-architecture)\n  - [Generative Pre-Training Transformer (GPT)](#generative-pre-training-transformer-gpt)\n    - [ChatGPT](#chatgpt)\n  - [Large Language Model (LLM)](#large-language-model-llm)\n  - [Transformer Reinforcement Learning](#transformer-reinforcement-learning)\n  - [Additional Reading](#additional-reading)\n- [Educational](#educational)\n  - [Tutorials](#tutorials)\n- [AI Safety](#ai-safety)\n- [Videos](#videos)\n  - [BERTology](#bertology)\n  - [Attention and Transformer Networks](#attention-and-transformer-networks)\n- [Official BERT Implementations](#official-bert-implementations)\n- [Transformer Implementations By Communities](#transformer-implementations-by-communities)\n  - [PyTorch and TensorFlow](#pytorch-and-tensorflow)\n  - [PyTorch](#pytorch)\n  - [Keras](#keras)\n  - [TensorFlow](#tensorflow)\n  - [Chainer](#chainer)\n  - [Other](#other)\n- [Transfer Learning in NLP](#transfer-learning-in-nlp)\n- [Books](#books)\n- [Other Resources](#other-resources)\n- [Tools](#tools)\n- [Tasks](#tasks)\n  - [Named-Entity Recognition (NER)](#named-entity-recognition-ner)\n  - [Classification](#classification)\n  - [Text Generation](#text-generation)\n  - [Question Answering (QA)](#question-answering-qa)\n  - [Knowledge Graph](#knowledge-graph)\n\u003C\u002Fdetails>\n\n---\n\n## Papers\n\n1. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.\n2. [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, William W. Cohen, Jaime Carbonell, Quoc V. Le and Ruslan Salakhutdinov.\n  - Uses smart caching to improve the learning of long-term dependency in Transformer. Key results: state-of-art on 5 language modeling benchmarks, including ppl of 21.8 on One Billion Word (LM1B) and 0.99 on enwiki8. The authors claim that the method is more flexible, faster during evaluation (1874 times speedup), generalizes well on small datasets, and is effective at modeling short and long sequences.\n2. [Conditional BERT Contextual Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06705) by Xing Wu, Shangwen Lv, Liangjun Zang, Jizhong Han and Songlin Hu.\n3. [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03593) by Chenguang Zhu, Michael Zeng and Xuedong Huang.\n4. [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.\n5. [The Evolved Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11117) by David R. So, Chen Liang and Quoc V. 
Le.\n  - They used architecture search to improve Transformer architecture. Key is to use evolution and seed initial population with Transformer itself. The architecture is better and more efficient, especially for small size models.\n6. [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.\n  - A new pretraining method for NLP that significantly improves upon BERT on 20 tasks (e.g., SQuAD, GLUE, RACE).\n  - \"Transformer-XL is a shifted model (each hyper-column ends with next token) while XLNet is a direct model (each hyper-column ends with contextual representation of same token).\" — [Thomas Wolf](https:\u002F\u002Ftwitter.com\u002FThom_Wolf\u002Fstatus\u002F1141803437719506944?s=20).\n  - [Comments from HN](https:\u002F\u002Fnews.ycombinator.com\u002Fitem?id=20229145):\n    \u003Cdetails>\n  \n    \u003Csummary>A clever dual masking-and-caching algorithm.\u003C\u002Fsummary>\n\n    - This is NOT \"just throwing more compute\" at the problem.\n    - The authors have devised a clever dual-masking-plus-caching mechanism to induce an attention-based model to learn to predict tokens from all possible permutations of the factorization order of all other tokens in the same input sequence.\n    - In expectation, the model learns to gather information from all positions on both sides of each token in order to predict the token.\n      - For example, if the input sequence has four tokens, [\"The\", \"cat\", \"is\", \"furry\"], in one training step the model will try to predict \"is\" after seeing \"The\", then \"cat\", then \"furry\".\n      - In another training step, the model might see \"furry\" first, then \"The\", then \"cat\".\n      - Note that the original sequence order is always retained, e.g., the model always knows that \"furry\" is the fourth token.\n    - The masking-and-caching algorithm that accomplishes this does not seem trivial to me.\n    - The improvements to SOTA performance in a range of tasks are significant -- see tables 2, 3, 4, 5, and 6 in the paper.\n    \u003C\u002Fdetails>\n7. [CTRL: Conditional Transformer Language Model for Controllable Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05858) by Nitish Shirish Keskar, Richard Socher et al. [[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fctrl)].\n8. [PLMpapers](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPLMpapers) - BERT (Transformer, transfer learning) has catalyzed research in pretrained language models (PLMs) and has sparked many extensions. This repo contains a list of papers on PLMs.\n9. [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) by Google Brain.\n- The group perform a systematic study of transfer learning for NLP using a unified Text-to-Text Transfer Transformer (T5) model and push the limits to achieve SoTA on SuperGLUE (approaching human baseline), SQuAD, and CNN\u002FDM benchmark. [[Code](https:\u002F\u002Fgit.io\u002FJe0cZ)].\n10. [Reformer: The Efficient Transformer](https:\u002F\u002Fopenreview.net\u002Fforum?id=rkgNKkHtvB) by Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya.\n- \"They present techniques to reduce the time and memory complexity of Transformer, allowing batches of very long sequences (64K) to fit on one GPU. Should pave way for Transformer to be really impactful beyond NLP domain.\" — @hardmaru\n11. 
[Supervised Multimodal Bitransformers for Classifying Images and Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02950) (MMBT) by Facebook AI.\n11. [A Primer in BERTology: What we know about how BERT works](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12327) by Anna Rogers et al.\n- \"Have you been drowning in BERT papers?\". The group survey over 40 papers on BERT's linguistic knowledge, architecture tweaks, compression, multilinguality, and so on.\n12. [tomohideshibata\u002FBERT-related papers](https:\u002F\u002Fgithub.com\u002Ftomohideshibata\u002FBERT-related-papers)\n13. [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03961) by Google Brain. [[Code]](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fmesh\u002Fblob\u002Fmaster\u002Fmesh_tensorflow\u002Ftransformer\u002Fmoe.py) | [[Blog post (unofficial)]](https:\u002F\u002Fsyncedreview.com\u002F2021\u002F01\u002F14\u002Fgoogle-brains-switch-transformer-language-model-packs-1-6-trillion-parameters\u002F)\n- Key idea: the architecture use a subset of parameters on every training step and on each example. Upside: model train much faster. Downside: super large model that won't fit in a lot of environments.\n14. [An Attention Free Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) by Apple.\n15. [A Survey of Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.04554) by Tianyang Lin et al.\n16. [Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) by OpenAI.\n- Codex, a GPT language model that powers GitHub Copilot.\n- They investigate their model limitations (and strengths).\n- They discuss the potential broader impacts of deploying powerful code generation techs, covering safety, security, and economics.\n17. [Training language models to follow instructions with human feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155) by OpenAI. They call the resulting models [InstructGPT](https:\u002F\u002Fopenai.com\u002Fblog\u002Finstruction-following\u002F). [ChatGPT](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F) is a sibling model to InstructGPT.\n18. [LaMDA: Language Models for Dialog Applications](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.08239) by Google.\n19. [Training Compute-Optimal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.15556) by Hoffmann et al. at DeepMind. TLDR: introduces a new 70B LM called \"Chinchilla\" that outperforms much bigger LMs (GPT-3, Gopher). DeepMind has found the secret to cheaply scale large language models — to be compute-optimal, model size and training data must be scaled equally. It shows that most LLMs are severely starved of data and under-trained. Given the [new scaling law](https:\u002F\u002Fwww.alignmentforum.org\u002Fposts\u002F6Fpvch8RR29qLEWNH\u002Fchinchilla-s-wild-implications), even if you pump a quadrillion parameters into a model (GPT-4 urban myth), the gains will not compensate for 4x more training tokens.\n20. [Improving language models by retrieving from trillions of tokens](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04426) by Borgeaud et al. at DeepMind - The group explore an alternate path for efficient training with Internet-scale retrieval. The method is known as RETRO, for \"Retrieval Enhanced TRansfOrmers\". With RETRO **the model is not limited to the data seen during training – it has access to the entire training dataset through the retrieval mechanism. 
This results in significant performance gains compared to a standard Transformer with the same number of parameters**. RETRO obtains comparable performance to GPT-3 on the Pile dataset, despite using 25 times fewer parameters. They show that language modeling improves continuously as they increase the size of the retrieval database. [[blog post](https:\u002F\u002Fwww.deepmind.com\u002Fblog\u002Fimproving-language-models-by-retrieving-from-trillions-of-tokens)]\n21. [Scaling Instruction-Finetuned Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11416) by Google - They find that instruction finetuning with the above aspects dramatically improves performance on a variety of model classes (PaLM, T5, U-PaLM), prompting setups (zero-shot, few-shot, CoT), and evaluation benchmarks. Flan-PaLM 540B achieves SoTA performance on several benchmarks. They also publicly release [Flan-T5 checkpoints](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ft5x\u002Fblob\u002Fmain\u002Fdocs\u002Fmodels.md#flan-t5-checkpoints), which achieve strong few-shot performance even compared to much larger models, such as PaLM 62B.\n22. [Emergent Abilities of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07682) by Google Research, Stanford University, DeepMind, and UNC Chapel Hill.\n23. [Nonparametric Masked (NPM) Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01349) by Meta AI et al. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FNPM)] - Nonparametric models with **500x fewer parameters outperform GPT-3 on zero-shot tasks.**\n    > It, crucially, does not have a softmax over a fixed output vocabulary, but instead has a fully nonparametric distribution over phrases. This\nis in contrast to a recent (2022) body of work that incorporates nonparametric components in a parametric model.\n    >\n    > Results show that NPM is significantly more parameter-efficient, outperforming up to 500x larger parametric models and up to 37x larger retrieve-and-generate models.\n24. [Transformer models: an introduction and catalog](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07730) by Xavier Amatriain, 2023 - The goal of this paper is to offer a somewhat comprehensive but simple catalog and classification of the most popular Transformer models. The paper also includes an introduction to the most important aspects and innovation in Transformer models.\n25. [Foundation Models for Decision Making: Problems, Methods, and Opportunities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04129) by Google Research et al., 2023 - A report of recent approaches (i.e., conditional generative modeling, RL, prompting) that ground pre-trained models (i.e., LMs) in practical decision making agents. Models can serve world dynamics or steer decisions.\n26. [GPT-4 Technical Report](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fgpt-4.pdf) by OpenAI, 2023.\n27. [The Llama 3 Herd of Models](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fthe-llama-3-herd-of-models\u002F) by Llama Team, AI @ Meta, Jul 2024 - The paper, a oft-overlooked component of the project, proved to be just as vital, if not more so, than the model itself, and its significance came as a complete surprise. A masterpiece in its own right, the paper presented a treasure trove of detailed information on the model's pre-training and post-training processes, offering insights that were both profound and practical. 
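The permutation language modeling idea summarized in the XLNet notes above can be made concrete with a tiny pure-Python sketch. This is a toy illustration only (not the paper's two-stream attention or masking-and-caching implementation): for each sampled factorization order, a token is predicted from the tokens that precede it in that order, so in expectation every position gets context from both sides while positional information is retained.

```python
# Toy illustration of an XLNet-style permutation language modeling objective.
# Not the real two-stream attention / masking-and-caching implementation, just
# the idea: for a sampled factorization order, each target token may only "see"
# the tokens that come before it in that order (position info is kept).
import random

tokens = ["The", "cat", "is", "furry"]

def visible_context(order, step):
    """Positions visible when predicting the token at order[step]."""
    return order[:step]

random.seed(0)
for _ in range(2):  # two different "training steps"
    order = random.sample(range(len(tokens)), len(tokens))  # factorization order
    print("factorization order:", [tokens[i] for i in order])
    for step, target in enumerate(order):
        ctx = visible_context(order, step)
        print(f"  predict {tokens[target]!r} (position {target}) "
              f"given {[(tokens[i], i) for i in ctx]}")
```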
## Articles

### BERT and Transformer

1. [Open Sourcing BERT: State-of-the-Art Pre-training for Natural Language Processing](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) from Google AI.
2. [The Illustrated BERT, ELMo, and co. (How NLP Cracked Transfer Learning)](https://jalammar.github.io/illustrated-bert/).
3. [Dissecting BERT](https://medium.com/dissecting-bert) by Miguel Romero and Francisco Ingham - Understand BERT in depth with an intuitive, straightforward explanation of the relevant concepts.
4. [A Light Introduction to Transformer-XL](https://medium.com/dair-ai/a-light-introduction-to-transformer-xl-be5737feb13).
5. [Generalized Language Models](https://lilianweng.github.io/lil-log/2019/01/31/generalized-language-models.html) by Lilian Weng, Research Scientist at OpenAI.
6. [What is XLNet and why it outperforms BERT](https://towardsdatascience.com/what-is-xlnet-and-why-it-outperforms-bert-8d8fce710335)
  - The Permutation Language Modeling objective is the core of XLNet.
7. [DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation) (from HuggingFace), released together with the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5). (A minimal usage sketch appears after this list.)
8. [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations paper](https://arxiv.org/abs/1909.11942v3) from Google Research and Toyota Technological Institute. — Improvements for more efficient parameter usage: factorized embedding parameterization, cross-layer parameter sharing, and a Sentence Order Prediction (SOP) loss to model inter-sentence coherence. [[Blog post](https://ai.googleblog.com/2019/12/albert-lite-bert-for-self-supervised.html) | [Code](https://github.com/google-research/ALBERT)]
9. [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, and Christopher D. Manning - A BERT variant, like ALBERT, that costs less to train. They trained a model that outperforms GPT using only one GPU and matches the performance of RoBERTa using 1/4 of the computation. It uses a new pre-training approach, called replaced token detection (RTD), that trains a bidirectional model while learning from all input positions. [[Blog post](https://ai.googleblog.com/2020/03/more-efficient-nlp-model-pre-training.html) | [Code](https://github.com/google-research/electra)]
10. [Visual Paper Summary: ALBERT (A Lite BERT)](https://amitness.com/2020/02/albert-visual-summary/)
11. [Cramming: Training a Language Model on a Single GPU in One Day (paper)](https://arxiv.org/abs/2212.14034) (2022) - While most in the community are asking how to push the limits of extreme computation, we ask the opposite question: How far can we get with a single GPU in just one day? ... Through the lens of scaling laws, we categorize a range of recent improvements to training and architecture and discuss their merit and practical applicability (or lack thereof) for the limited compute setting.
12. [What happened to BERT & T5? On Transformer Encoders, PrefixLM and Denoising Objectives](https://www.yitay.net/blog/model-architecture-blogpost-encoders-prefixlm-denoising) by Yi Tay, Jul 2024
    > to recap, we don't see any scaled up xBERTs running around: BERT models got deprecated in favor of more flexible forms of denoising (autoregressive) T5 models. This is largely due to paradigm unification where people would like to perform any task with a general purpose model (as opposed to task specific model). Meanwhile, autoregressive denoising gets sometimes folded as side objectives to causal language models.
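To try the distilled model from the DistilBERT entry above, a minimal sketch with the Hugging Face `transformers` fill-mask pipeline might look like the following (an assumption here: `transformers` plus a backend such as PyTorch are installed, and the `distilbert-base-uncased` checkpoint can be downloaded).

```python
# Minimal sketch: masked-token prediction with DistilBERT via the Hugging Face
# fill-mask pipeline (assumes `pip install transformers torch` and network access
# to download the distilbert-base-uncased checkpoint).
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilbert-base-uncased")

# DistilBERT uses the same [MASK] token convention as BERT.
for pred in fill_mask("The goal of transfer learning is to [MASK] knowledge."):
    print(f"{pred['token_str']:>12}  score={pred['score']:.3f}")
```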
### Attention Mechanism

[![Visualizing Attention, a Transformer's Heart](https://oss.gittoolsai.com/images/cedrickchee_awesome-transformer-nlp_readme_ed8c97aef1c1.jpg)](https://www.youtube.com/watch?v=eMlx5fFNoYc)

<sup>Visualizing Attention, a Transformer's Heart</sup>

1. [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473v1) by Dzmitry Bahdanau, KyungHyun Cho, and Yoshua Bengio, 2014 - [Bahdanau invented the content-based neural attention that is now a core tool in deep-learning-based NLP (language models)](https://archive.is/JxMmF#selection-99.0-103.76). A disadvantage of the fixed-length context vector design is its inability to remember long sentences. The attention mechanism was born to resolve this problem: it helps the model memorize long input sentences in language translation. [[Bahdanau deserves the praise](https://archive.is/3DwY5)]
2. [The Annotated Transformer by Harvard NLP Group](http://nlp.seas.harvard.edu/2018/04/03/attention.html) - Further reading to understand the "Attention is all you need" paper.
3. [Attention? Attention!](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html) - Attention guide by Lilian Weng from OpenAI.
4. [Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention)](https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/) by Jay Alammar, an Instructor from Udacity ML Engineer Nanodegree.
5. [Making Transformer networks simpler and more efficient](https://ai.facebook.com/blog/making-transformer-networks-simpler-and-more-efficient/) - FAIR released an all-attention layer to simplify the Transformer model and an adaptive attention span method to make it more efficient (reducing computation time and memory footprint).
6. [What Does BERT Look At? An Analysis of BERT’s Attention paper](https://arxiv.org/abs/1906.04341) by Stanford NLP Group.
7. [Fast Transformer Decoding: One Write-Head is All You Need (paper)](https://arxiv.org/abs/1911.02150) by Noam Shazeer, Google (2019) - They proposed a variant of attention called **multi-query attention** (MQA). The plain multi-head attention mechanism has one query, key, and value per head; multi-query instead **shares one key and value across all of the different attention "heads"**. In practice, training time remains the same, but decoding at inference time is **much faster**. MQA significantly improves language model inference performance and efficiency: users can get ~10x better throughput and ~30% lower latency. However, MQA can lead to quality degradation, and moreover it may not be desirable to train a separate model just for faster inference. In 2022, PaLM, a decoder-style model, adopted MQA, an interesting architectural improvement over GPT. Recent models that use MQA include [TII's Falcon](https://falconllm.tii.ae/) (2023).
8. [GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints](https://arxiv.org/abs/2305.13245) by Google Research, 2023 - They (1) propose a **technique for uptraining existing multi-head attention (MHA) models into models with multi-query attention (MQA)** using 5% of the original pre-training compute, and (2) introduce **grouped-query attention (GQA)**, a generalization of MQA which uses an intermediate number of key-value heads (more than one, fewer than the number of query heads). GQA achieves **benefits close to MHA** with **comparable inference speed to MQA** through a reduced number of key-value heads. Models that use GQA include Meta's Llama 2 (2023). (A small shape-level sketch of MHA vs. MQA vs. GQA appears after this list.) [[Some Tweets](https://twitter.com/_philschmid/status/1673335690912825347?s=20)]
9. [Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/abs/2310.01889) by UC Berkeley, 2023 - Ring Attention is a system-level optimization technique that leverages specific hardware architecture to make exact attention computation more efficient.
10. [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143) by Google, 2024 - Infini-attention adds a compressive memory with linear attention for processing infinitely long contexts. A 1B parameter Transformer model fine-tuned on passkey instances of up to 5K sequence length solves the 1M-token input length problem. The Infini-attention mechanism presents an efficient and powerful approach for Transformer language models to process very long contexts without prohibitive increases in memory or computation.
11. [Retrieval Head Mechanistically Explains Long-Context Factuality](https://arxiv.org/abs/2404.15574) by Wenhao Wu, Yao Fu et al., 2024 - The paper explains how LLMs actually deal with context windows. The finding: LLMs have unexpectedly developed retrieval heads that were not explicitly coded for by their creators. [Code: [An algorithm that statistically calculates the retrieval score of attention heads in a transformer model](https://github.com/nightdessert/Retrieval_Head)]
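A toy, shape-level NumPy sketch of the MHA/GQA/MQA distinction discussed in the entries above (hypothetical sizes, not any production implementation): the only thing that changes is how many key-value heads exist and how they are shared across query heads.

```python
# Toy sketch of attention with a configurable number of key-value heads.
# n_kv_heads == n_q_heads -> multi-head attention (MHA)
# 1 < n_kv_heads < n_q_heads -> grouped-query attention (GQA)
# n_kv_heads == 1 -> multi-query attention (MQA)
# Sizes are made up for illustration.
import numpy as np

seq, d_head, n_q_heads = 8, 4, 8

def attention_with_kv_heads(n_kv_heads: int) -> np.ndarray:
    rng = np.random.default_rng(0)
    q = rng.normal(size=(n_q_heads, seq, d_head))
    k = rng.normal(size=(n_kv_heads, seq, d_head))
    v = rng.normal(size=(n_kv_heads, seq, d_head))
    group = n_q_heads // n_kv_heads               # query heads sharing one KV head
    k = np.repeat(k, group, axis=0)               # broadcast KV heads to query heads
    v = np.repeat(v, group, axis=0)
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d_head)
    scores -= scores.max(axis=-1, keepdims=True)  # numerically stable softmax
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v                            # (n_q_heads, seq, d_head)

print(attention_with_kv_heads(8).shape)  # MHA: one KV head per query head
print(attention_with_kv_heads(2).shape)  # GQA: 2 KV heads, each shared by 4 query heads
print(attention_with_kv_heads(1).shape)  # MQA: a single KV head shared by all query heads
```

Fewer KV heads mean a smaller KV cache to read at every decoding step, which is where the throughput and latency gains reported above come from.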
### Transformer Architecture

1. [The Transformer blog post](https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html).
2. [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/) by Jay Alammar, an Instructor from Udacity ML Engineer Nanodegree.
3. Watch [Łukasz Kaiser’s talk](https://www.youtube.com/watch?v=rBCqOTEfxvg) walking through the model and its details.
4. [Transformer-XL: Unleashing the Potential of Attention Models](https://ai.googleblog.com/2019/01/transformer-xl-unleashing-potential-of.html) by Google Brain.
5. [Generative Modeling with Sparse Transformers](https://openai.com/blog/sparse-transformer/) by OpenAI - an algorithmic improvement of the attention mechanism to extract patterns from sequences 30x longer than previously possible.
6. [Stabilizing Transformers for Reinforcement Learning](https://arxiv.org/abs/1910.06764) paper by DeepMind and CMU - they propose architectural modifications to the original Transformer and its XL variant; moving the layer norm and adding gating creates the Gated Transformer-XL (GTrXL). It substantially improves stability and learning speed (integrating experience through time) in RL.
7. [The Transformer Family](https://lilianweng.github.io/lil-log/2020/04/07/the-transformer-family.html) by Lilian Weng - since the paper "Attention Is All You Need", many new things have happened to improve the Transformer model. This post is about that.
8. [DETR (**DE**tection **TR**ansformer): End-to-End Object Detection with Transformers](https://ai.facebook.com/blog/end-to-end-object-detection-with-transformers/) by FAIR - :fire: Computer vision has not yet been swept up by the Transformer revolution. DETR completely changes the architecture compared with previous object detection systems. ([PyTorch Code and pretrained models](https://github.com/facebookresearch/detr)). "A solid swing at (non-autoregressive) end-to-end detection. Anchor boxes + Non-Max Suppression (NMS) is a mess. I was hoping detection would go end-to-end back in ~2013)" — Andrej Karpathy
9. [Transformers for software engineers](https://blog.nelhage.com/post/transformers-for-software-engineers/) - This post will be helpful to software engineers who are interested in learning ML models, especially anyone interested in Transformer interpretability. The post walks through a (mostly) complete implementation of a GPT-style Transformer, but the goal is not running code; instead, it uses the language of software engineering and programming to explain how these models work and to articulate some of the perspectives the authors bring to interpretability work.
10. [Pathways Language Model (PaLM): Scaling to 540 Billion Parameters for Breakthrough Performance](https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html) - PaLM is a dense decoder-only Transformer model trained with the Pathways system, which enabled Google to efficiently train a single model across multiple TPU v4 Pods. The example explaining a joke is remarkable. This shows that it can generate explicit explanations for scenarios that require a complex combination of multi-step logical inference, world knowledge, and deep language understanding.
11. [Efficient Long Sequence Modeling via State Space Augmented Transformer (paper)](https://arxiv.org/abs/2212.08136) by Georgia Institute of Technology and Microsoft - The quadratic computational cost of the attention mechanism limits its practicality for long sequences. There are existing attention variants that improve computational efficiency, but they have limited ability to effectively compute global information. In parallel to Transformer models, state space models (SSMs) are tailored for long sequences, but they are not flexible enough to capture complicated local information. They propose SPADE, short for State sPace AugmenteD TransformEr, which outperforms various baselines, including Mega, on the Long Range Arena benchmark and various LM tasks. This is an interesting direction; SSMs and Transformers were combined a while back.
12. [DeepNet: Scaling Transformers to 1,000 Layers (paper)](https://arxiv.org/abs/2203.00555) by Microsoft Research (2022) - The group introduced a **new normalization function (DEEPNORM)** to modify the residual connection in the Transformer and showed that model updates can be bounded in a **stable way**. This improves the training stability of deep Transformers and scales the model depth by an order of magnitude (10x) compared to GPipe (pipeline parallelism) by Google Brain (2019). (Who remembers what ResNet (2015) did to ConvNets?)
13. [A Length-Extrapolatable Transformer (paper)](https://arxiv.org/abs/2212.10554) by Microsoft (2022) [[TorchScale code](https://github.com/microsoft/torchscale)] - This improves the **modeling capability** of scaling Transformers.
14. [Hungry Hungry Hippos (H3): Towards Language Modeling with State Space Models (SSMs) (paper)](https://arxiv.org/abs/2212.14052) by Stanford AI Lab (2022) - A new language modeling architecture. It **scales nearly linearly with context size instead of quadratically**. No more fixed context windows, long context for everyone. Despite that, SSMs are still slower than Transformers due to poor hardware utilization. So, a Transformer successor? [[Tweet](https://twitter.com/realDanFu/status/1617605971395891201)]
15. [Accelerating Large Language Model Decoding with Speculative Sampling (paper)](https://arxiv.org/abs/2302.01318) by DeepMind (2023) - The speculative sampling algorithm enables the generation of multiple tokens from each transformer call. It achieves a 2–2.5x decoding speedup with Chinchilla in a distributed setup, without compromising sample quality or making modifications to the model itself.
16. [A Survey on Efficient Training of Transformers (paper)](https://arxiv.org/abs/2302.01107) by Monash University et al., 2023 - The first systematic overview, covering 1) computation efficiency: optimization (i.e., sparse training) and data selection (i.e., token masking), 2) memory efficiency (i.e., data/model parallelism, offloading/use of external memory), and 3) hardware/algorithm co-design (i.e., efficient attention, hardware-aware low-precision).
17. [Deep Transformers without Shortcuts: Modifying Self-attention for Faithful Signal Propagation (paper)](https://arxiv.org/abs/2302.10322) by DeepMind et al., 2023
18. [Hyena Hierarchy: Towards Larger Convolutional Language Models (paper)](https://arxiv.org/abs/2302.10866) by Stanford U et al., 2023 - Attention is great. Hyena is an alternative to attention that can learn on sequences **10x longer**, up to **100x faster** than optimized attention, by using implicit long convolutions and gating. [[Tweet](https://twitter.com/MichaelPoli6/status/1633167040130453505)]
19. [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness (paper)](https://arxiv.org/abs/2205.14135) by Stanford University et al., 2022 - Transformers have grown deeper and wider, but training them on long sequences remains difficult. The attention layer at their heart is the compute and memory bottleneck: doubling the sequence length would quadruple the runtime and memory requirements. FlashAttention is a new algorithm to speed up attention and reduce its memory footprint—without any approximation. It enables training LLMs with longer context. [[code](https://github.com/HazyResearch/flash-attention)]
20. [Jump to Conclusions: Short-Cutting Transformers With Linear Transformations (paper)](https://arxiv.org/abs/2303.09435v1) by Google Research et al., 2023. [[Tweet](https://twitter.com/LChoshen/status/1637799047430905856)]
21. [CoLT5: Faster Long-Range Transformers with Conditional Computation (paper)](https://arxiv.org/abs/2303.09752) by Google Research, 2023 - 64K context size for language models! This approach enables faster training and inference while maintaining or improving performance compared to LongT5. The main components of CoLT5 include routing modules, conditional feedforward layers, and conditional attention layers. Routing modules select important tokens for each input and component; light branches process all tokens with lower-capacity operations, while heavy branches apply higher-capacity operations only on selected important tokens. Additionally, CoLT5 incorporates multi-query cross-attention for faster inference speed as well as the UL2 pre-training objective for improved in-context learning capabilities over long inputs. [[Tweet](https://twitter.com/miolini/status/1637677536921657344)]
22. [google-research/meliad](https://github.com/google-research/meliad) - The Meliad library is a collection of models which are being developed as part of ongoing Google research into various architectural improvements in deep learning. The library currently consists of several transformer variations, which explore ways in which the popular transformer architecture can be extended to better support language modeling over long sequences. The variations include the Memorizing Transformer, Transformer with sliding window, Block-Recurrent Transformer, and more.
23. [LongNet: Scaling Transformers to 1,000,000,000 Tokens (paper)](https://arxiv.org/abs/2307.02486) by Microsoft Research, 2023.
24. [vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention](https://vllm.ai/) by UC Berkeley et al., 2023 - The improved throughput comes from VRAM savings on an otherwise close to fully utilized GPU.
25. [The Secret Sauce behind 100K context window in LLMs: all tricks in one place](https://blog.gopenai.com/how-to-speed-up-llms-and-use-100k-context-window-all-tricks-in-one-place-ffd40577b4c)
26. [Unlimiformer: Long-Range Transformers with Unlimited Length Input (paper)](https://arxiv.org/abs/2305.01625) by CMU, 2023.
27. [PaLM 2 Technical Report (PDF)](https://ai.google/static/documents/palm2techreport.pdf) by Google, 2023.
28. [Mixture-of-Depths (MoD): Dynamically allocating compute in transformer-based language models](https://arxiv.org/abs/2404.02258) by Google DeepMind et al., 2024 - The MoD method scales in the depth dimension while keeping FLOPs constant (similarly to how Mixture of Experts (MoE) does it in width). A MoD model can learn to route more complex tokens through more layers (similarly to how experts in MoE can specialize in certain domains). The group explores how to optimize the compute budget and improve efficiency without sacrificing performance. Results: MoD matches baseline performance with 66% faster training. Now, the question is, can it scale above 1B tokens? They tested on 500M tokens. [ELI5 version: [Mixture of Depths Meets Mixture of Experts](https://lifeinthesingularity.com/p/googles-breakthroughs-in-ai-design)]
29. [Extending Context Window of Large Language Models via Positional Interpolation](https://arxiv.org/abs/2306.15595) by Meta Platforms, 2023 - Position Interpolation (PI) is an effective and efficient way to stably extend the context window of RoPE-based pretrained large language models such as LLaMA to much longer lengths (up to 32768) with minimal fine-tuning (within a thousand steps) while maintaining performance.
30. [PoSE: Efficient Context Window Extension of LLMs via Positional Skip-wise Training](https://arxiv.org/abs/2309.10400) by Dawei Zhu et al., ICLR 2024 - PoSE simulates longer input sequences during training by manipulating the position indices within a fixed context window, rather than training on the full target length. This allows decoupling of the training length from the target context length, greatly reducing memory and computational requirements compared to full-length fine-tuning. PoSE successfully extended LLaMA-1 to support context lengths up to 128k tokens using only a 2k training window, with minimal performance degradation. [This model](https://huggingface.co/winglian/Llama-3-8b-64k-PoSE) uses PoSE to extend Llama-3 8B context length from 8k to 64k. PoSE has the potential to scale context lengths even further, limited only by inference memory, as efficient inference techniques continue improving. [Code: [PoSE](https://github.com/dwzhu-pku/PoSE)]
31. [Better & Faster Large Language Models via Multi-token Prediction](https://arxiv.org/abs/2404.19737) by FAIR at Meta, Apr 2024 - What happens if we make language models predict several tokens ahead instead of only the next one? They show that replacing next-token prediction tasks with multiple-token prediction can result in substantially better code generation performance **with the exact same training budget and data — while also increasing inference performance by 3x**. While similar approaches have previously been used in fine-tuning to improve inference speed, **this research expands it to pre-training for large models, showing notable behaviors and results at these scales**.
32. [nGPT: Normalized Transformer with Representation Learning on the Hypersphere](https://arxiv.org/abs/2410.01131) by NVIDIA, Oct 2024 - A novel Transformer architecture where all vectors (embeddings, MLP, attention matrices, hidden states) are normalized to unit norm and operate on a hypersphere. Achieves 4-20x faster convergence during training compared to standard Transformers. Eliminates the need for weight decay by enforcing normalization. Normalization approach: matrix-vector multiplications become dot products bounded in [-1,1]. Architecture changes: 1) Attention mechanism - normalizes QKV projection matrices, introduces trainable scaling factors for Q-K dot products, 2) Layer structure - introduces learnable "eigen learning rates" (α) for attention and MLP blocks. Theoretical: can be interpreted in the context of Riemannian optimization. Advantages: more stable training, improved performance on downstream tasks, simplified architecture.
33. [Differential Transformer](https://arxiv.org/abs/2410.05258) by Microsoft Research et al., Oct 2024 - It presents **significant improvements over standard Transformers** in multiple dimensions, with particular emphasis on attention efficiency and practical applications in LM tasks. A new architecture that improves attention mechanisms by reducing attention to irrelevant context. Achieves better performance while requiring fewer parameters and training tokens compared to standard Transformers. Solution: introduces a "differential attention" mechanism that calculates attention scores as the difference between two separate softmax attention maps. This subtraction cancels out noise. It can be implemented efficiently using existing FlashAttention. Scaling efficiency: **needs only ~65% of parameters or training tokens to match standard Transformer performance**. Improvements: 1) Better performance on long sequences up to 64K tokens. 2) Better at finding key information embedded in documents. 3) ICL: more robust to prompt order permutations. 4) Reduces attention misallocation, a primary cause of hallucinations. Technical details: includes headwise normalization to handle sparse attention patterns, etc. Future: development of efficient low-bit attention kernels, potential for compressing KV caches due to sparser attention patterns, etc. (A toy sketch of the differential attention idea appears after this list.) [Listen to the [NotebookLM podcast](https://notebooklm.google.com/notebook/8e4c0907-8b12-4bc9-9d29-26c03daab71d/audio)]
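A toy, single-head NumPy sketch of the differential attention idea described in the Differential Transformer entry above. Everything here is made up for illustration (shapes, weights, a fixed λ); the actual model learns λ per head, applies headwise normalization, and runs on FlashAttention kernels.

```python
# Toy single-head sketch of "differential attention": attention weights are the
# difference of two softmax maps, which cancels common-mode noise.
# All shapes, weights, and lambda are hypothetical, for illustration only.
import numpy as np

def softmax(x):
    x = x - x.max(-1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(-1, keepdims=True)

def diff_attention(x, Wq1, Wk1, Wq2, Wk2, Wv, lam=0.5):
    d = Wq1.shape[1]
    a1 = softmax((x @ Wq1) @ (x @ Wk1).T / np.sqrt(d))  # first attention map
    a2 = softmax((x @ Wq2) @ (x @ Wk2).T / np.sqrt(d))  # second attention map
    return (a1 - lam * a2) @ (x @ Wv)                    # subtract, then mix values

rng = np.random.default_rng(0)
seq, d_model, d_head = 6, 16, 8
x = rng.normal(size=(seq, d_model))
Wq1, Wk1, Wq2, Wk2, Wv = (rng.normal(size=(d_model, d_head)) * 0.1 for _ in range(5))
print(diff_attention(x, Wq1, Wk1, Wq2, Wk2, Wv).shape)  # (6, 8)
```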
- \"We train GPT-3, an autoregressive language model with 175 billion parameters :scream:, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting.\"\n9. [elyase\u002Fawesome-gpt3](https:\u002F\u002Fgithub.com\u002Felyase\u002Fawesome-gpt3) - A collection of demos and articles about the OpenAI GPT-3 API.\n10. [How GPT3 Works - Visualizations and Animations](https:\u002F\u002Fjalammar.github.io\u002Fhow-gpt3-works-visualizations-animations\u002F) by Jay Alammar.\n11. [GPT-Neo](https:\u002F\u002Fwww.eleuther.ai\u002Fprojects\u002Fgpt-neo\u002F) - Replicate a GPT-3 sized model and open source it for free. GPT-Neo is \"an implementation of model parallel GPT2 & GPT3-like models, with the ability to scale up to full GPT3 sizes (and possibly more!), using the mesh-tensorflow library.\" [[Code](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo)].\n12. [GitHub Copilot](https:\u002F\u002Fcopilot.github.com\u002F), powered by OpenAI Codex - Codex is a descendant of GPT-3. Codex translates natural language into code.\n13. [GPT-4 Rumors From Silicon Valley](https:\u002F\u002Fthealgorithmicbridge.substack.com\u002Fp\u002Fgpt-4-rumors-from-silicon-valley) - GPT-4 is almost ready. GPT-4 would be multimodal, accepting text, audio, image, and possibly video inputs. Release window: Dec - Feb. #hype\n14. [New GPT-3 model: text-Davinci-003](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fmodels\u002Fdavinci) - Improvements:\n  - Handle more complex intents — you can get even more creative with how you make use of its capabilities now.\n  - Higher quality writing — clearer, more engaging, and more compelling content.\n  - Better at longer form content generation.\n15. [GPT-4 research](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4) landing page.\n16. [A Comprehensive Capability Analysis of GPT-3 and GPT-3.5 Series Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10420) by Fudan University et al., 2023.\n17. [Sparks of Artificial General Intelligence: Early experiments with GPT-4](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12712) by Microsoft Research, 2023 - There are completely mind-blowing examples in the paper.\n\n#### ChatGPT\n\n[What is ChatGPT?](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F)\n\n**TL;DR:** ChatGPT is a conversational web interface, backed by OpenAI's newest language model fine-tuned from a model in the [GPT-3.5 series](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fmodel-index-for-researchers) (which finished training in early 2022), optimized for dialogue. It is trained using Reinforcement Learning from Human Feedback (RLHF); human AI trainers provide supervised fine-tuning by playing both sides of the conversation.\n\nIt's evidently better than GPT-3 at following user instructions and context. [People have noticed](https:\u002F\u002Farchive.ph\u002Fm6AOQ) ChatGPT's output quality seems to represent a notable improvement over previous GPT-3 models.\n\nFor more, please take a look at [ChatGPT Universe](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fchatgpt-universe). This is my fleeting notes on everything I understand about ChatGPT and stores a collection of interesting things about ChatGPT.\n\n### Large Language Model (LLM)\n\n![ChatGPT among the LLMs](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcedrickchee_awesome-transformer-nlp_readme_83e681c3ee5a.jpeg)\n\nChatGPT among the LLMs [^1]\n\n1. 
#### ChatGPT

[What is ChatGPT?](https://openai.com/blog/chatgpt/)

**TL;DR:** ChatGPT is a conversational web interface, backed by OpenAI's newest language model fine-tuned from a model in the [GPT-3.5 series](https://beta.openai.com/docs/model-index-for-researchers) (which finished training in early 2022), optimized for dialogue. It is trained using Reinforcement Learning from Human Feedback (RLHF); human AI trainers provide supervised fine-tuning by playing both sides of the conversation.
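The RLHF recipe mentioned in the TL;DR rests on a reward model trained from human preference comparisons (as in the InstructGPT paper listed above). A minimal sketch of that pairwise reward-modeling loss, with made-up reward scores standing in for a real reward model's outputs, might look like this:

```python
# Minimal sketch of the pairwise preference loss used to train an RLHF reward
# model: the reward of the human-preferred response should exceed the reward of
# the rejected one. The tensors below are stand-ins for the scalar rewards a
# real reward model would produce; this is not OpenAI's actual training code.
import torch
import torch.nn.functional as F

r_chosen = torch.tensor([1.3, 0.2, 0.7])     # rewards for preferred responses
r_rejected = torch.tensor([0.4, 0.5, -0.1])  # rewards for rejected responses

# loss = -log(sigmoid(r_chosen - r_rejected)), averaged over comparisons
loss = -F.logsigmoid(r_chosen - r_rejected).mean()
print(float(loss))
```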
Finally, we leverage **a unique scaling property of GLM-130B to reach INT4 quantization, without quantization aware training and with almost no performance loss**, making it the first among 100B-scale models. **More importantly, the property allows its effective inference on 4×RTX 3090 (24G) or 8×RTX 2080 Ti (11G) GPUs, the most ever affordable GPUs required for using 100B-scale models**.\n11. [Teaching Small Language Models to Reason (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08410) - They finetune a student model on the chain of thought (CoT) outputs generated by a larger teacher model. For example, the **accuracy of T5 XXL on GSM8K improves from 8.11% to 21.99%** when finetuned on PaLM-540B generated chains of thought.\n12. [ALERT: Adapting Language Models to Reasoning Tasks (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08286) by Meta AI - They introduce ALERT, a benchmark and suite of analyses for assessing language models' reasoning ability comparing pre-trained and finetuned models on complex tasks that require reasoning skills to solve. It covers 10 different reasoning skills including logistic, causal, common-sense, abductive, spatial, analogical, argument and deductive reasoning as well as textual entailment, and mathematics.\n13. [Evaluating Human-Language Model Interaction (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09746) by Stanford University and Imperial College London - They find that non-interactive performance does not always result in better human-LM interaction and that first-person and third-party metrics can diverge, suggesting the importance of examining the nuances of human-LM interaction.\n14. [Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689) by Meta AI [[data](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)] - Fine-tuning a T5 on a large dataset collected with virtually no human labor leads to a model that surpassing the performance of models such as T0++ and Tk-Instruct across various benchmarks. These results demonstrate the potential of model-generated data as a **cost-effective alternative to crowdsourcing for dataset expansion and diversification**.\n15. [OPT-IML (OPT + Instruction Meta-Learning) (paper)](https:\u002F\u002Fraw.githubusercontent.com\u002Ffacebookresearch\u002Fmetaseq\u002Fmain\u002Fprojects\u002FOPT-IML\u002Foptimal_paper_v1.pdf) by Meta AI - OPT-IML is a set of instruction-tuned versions of OPT, on a collection of ~2000 NLP tasks — for research use cases. It boosts the performance of the original OPT-175B model using instruction tuning to improve zero-shot and few-shot generalization abilities — allowing it to adapt for more diverse language applications (i.e., answering Q’s, summarizing text). This improves the model's ability to better process natural instruction style prompts. Ultimately, humans should be able to \"talk\" to models as naturally and fluidly as possible. [[code (available soon), weights released](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT-IML)]\n16. [jeffhj\u002FLM-reasoning](https:\u002F\u002Fgithub.com\u002Fjeffhj\u002FLM-reasoning) - This repository contains a collection of papers and resources on reasoning in Large Language Models.\n17. 
[Rethinking with Retrieval: Faithful Large Language Model Inference (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.00303) by University of Pennsylvania et al., 2022 - They show the potential of enhancing LLMs by retrieving relevant external knowledge based on decomposed reasoning steps obtained through chain-of-thought (CoT) prompting. I predict we're going to see many of these types of retrieval-enhanced LLMs in 2023.\n18. [REPLUG: Retrieval-Augmented Black-Box Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12652) by Meta AI et al., 2023 - TL;DR: Enhancing GPT-3 with world knowledge — a retrieval-augmented LM framework that combines a frozen LM with a frozen\u002Ftunable retriever. It improves GPT-3 in language modeling and downstream tasks by prepending retrieved documents to LM inputs. [[Tweet](https:\u002F\u002Ftwitter.com\u002FWeijiaShi2\u002Fstatus\u002F1620497381962977281)]\n19. [Progressive Prompts: Continual Learning for Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12314) by Meta AI et al., 2023 - Current LLMs have a hard time with catastrophic forgetting and leveraging past experiences. The approach learns a prompt for a new task and concatenates it with frozen, previously learned prompts. This efficiently transfers knowledge to future tasks. [[code](https:\u002F\u002Fgithub.com\u002Farazd\u002FProgressivePrompts)]\n20. [Large Language Models Can Be Easily Distracted by Irrelevant Context (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00093) by Google Research et al., 2023 - Adding the instruction \"Feel free to ignore irrelevant information given in the questions.\" consistently improves robustness to irrelevant context.\n21. [Toolformer: Language Models Can Teach Themselves to Use Tools (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761) by Meta AI, 2023 - A smaller model trained to translate human intention into actions (i.e. decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction).\n22. [ERNIE 3.0 Titan: Exploring Larger-scale Knowledge Enhanced Pre-training for Language Understanding and Generation (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12731) by Baidu et al., 2021 - ERNIE 3.0 Titan is the latest addition to Baidu's ERNIE (Enhanced Representation through kNowledge IntEgration) family. It's inspired by the masking strategy of Google's BERT. ERNIE is also a unified framework. They also proposed a controllable learning algorithm and a credible learning algorithm. They apply an online distillation technique to compress their model. To their knowledge, it is the largest (260B parameters) Chinese dense pre-trained model so far. [[article](http:\u002F\u002Fresearch.baidu.com\u002FBlog\u002Findex-view?id=165)]\n23. [Characterizing Attribution and Fluency Tradeoffs for Retrieval-Augmented Large Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05578) by Google Research, 2023 - Despite recent progress, it has been difficult to prevent semantic hallucinations in generative LLMs. One common solution to this is augmenting LLMs with a retrieval system and making sure that the generated output is attributable to the retrieved information.\n24. 
[Augmented Language Models (ALMs): a Survey (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07842) by Meta AI, 2023 - Augmenting language models with reasoning skills and the ability to use various non-parametric external modules for context processing lets them outperform traditional LMs on several benchmarks. This new research direction has the potential to address interpretability, consistency and scalability issues.\n25. [A Comprehensive Survey on Pretrained Foundation Models: A History from BERT to ChatGPT (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.09419) by MSU et al., 2023 - My remarks: this paper raises a lot of questions around the term \"foundation models\", i.e., what is the bare minimum number of parameters for a model to qualify as a foundation model? It sounds to me like foundation models are an \"invented\" concept that doesn't have good validity.\n26. [Multimodal Chain-of-Thought Reasoning in Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00923) by Amazon Web Services et al., 2023 - The model outperforms GPT-3.5 by 16% on the ScienceQA benchmark. This work is the first to study CoT reasoning in different modalities, language (text) and vision (images). Unfortunately, they never provide an ablation study on how much of that performance gain was caused by the new modalities. [[code](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fmm-cot)]\n27. [RECITE: Recitation-Augmented Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01296) by Google Research et al., ICLR 2023 - How can ChatGPT-like models achieve greater factual accuracy without relying on an external retrieval search engine? This paper shows that recitation can help LLMs generate accurate factual knowledge by reciting relevant passages from their own memory (by sampling) before producing final answers. The core idea is motivated by the intuition that a recite step which recollects relevant knowledge pieces helps the answer step (generation) produce better output. That's a recite-answer paradigm: first ask the LLM to generate the support paragraphs that contain the answer (knowledge-recitation) and then use them as an additional prompt, along with the question, to ask the LLM to generate the answer. They verify the effectiveness on four LLMs. They also show that recitation can be more effective than retrieval. This is important since having a retriever may lead to unpredictable behavior (i.e., Bing\u002FSydney). [[code](https:\u002F\u002Fgithub.com\u002FEdward-Sun\u002FRECITE)]\n28. [LLaMA: Open and Efficient Foundation Language Models (paper)](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F) by Meta AI, 2023 - A collection of language models ranging from 7B to 65B parameters. LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. This shows that **smaller models trained with more data can outperform larger models**. This is **not contradictory to anything in the Chinchilla paper**, because it's not compute-optimally trained. GPU hours for training: 7B model=82,432, 65B model=1,022,362 :scream:. Total time spent for all models: 2048 A100-80GB GPUs for a period of approximately 5 months. The 65B model cost something in the range of ~$1-4M. Access to the model will be granted on a case-by-case basis though. People interested can apply for access. 
(Mar 2: [they just approved access to the models](https:\u002F\u002Ftwitter.com\u002Fcedric_chee\u002Fstatus\u002F1631182890418712578), llama-7B works in Colab [cedrickchee\u002Fllama](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fllama\u002Fblob\u002Fmain\u002Fnotebooks\u002Fvi_LLaMA_alpha.ipynb)) [Takeaways: [Tweet](https:\u002F\u002Fthreadreaderapp.com\u002Fthread\u002F1629496763148017665.html)]\n29. [Language Is Not All You Need: Aligning Perception with Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045) by Microsoft, 2023 - They introduce KOSMOS-1, a Multimodal Large Language Model (MLLM) that can perceive general modalities, learn in context (i.e., few-shot), and follow instructions (i.e., zero-shot). The total number of parameters is about 1.6B.\n30. [Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813) by Microsoft Research et al - LLM-Augmenter significantly reduces ChatGPT's hallucinations without sacrificing the fluency and informativeness of its responses. The architecture and data flow: 1) Retrieve evidence from external knowledge. 2) Context and reasoning chains. 3) Give to LLM (i.e., ChatGPT). 4) Verify hallucinations. 5) If hallucinate, give feedback and revise.\n31. [UL2: Unifying Language Learning Paradigms (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131) by Google Brain, 2022 - UL2 is a unified framework for pretraining models that are universally effective across datasets and setups. _Takeaways: Objective matters way more than architecture. Mixture-of-Denoisers (MoD) is effective if you care about doing well on more than one type of tasks\u002Fsettings._ UL2 frames different objective functions for training language models as denoising tasks, where the model has to recover missing sub-sequences of a given input. During pre-training it uses Mixture-of-Denoisers (MoD) that samples from a varied set of such objectives, each with different configurations. MoD combines diverse pre-training paradigms together. They demonstrated that models trained using the UL2 framework perform well in a variety of language domains, including prompt-based few-shot learning and models fine-tuned for down-stream tasks. They open sourced UL2 20B model and checkpoints back in 2022. In 2023, they open sourced Flan-UL2 20B and released the weights. Check out: [[blog post](https:\u002F\u002Farchive.is\u002F20230303191656\u002Fhttps:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fflan-ul2-20b), [Tweet](https:\u002F\u002Ftwitter.com\u002FYiTayML\u002Fstatus\u002F1631359474421366784)]. I'm excited to see what the community does with this new model.\n32. [Larger language models do in-context learning (ICL) differently (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03846) by Google Research, 2023 - Overriding semantic priors when presented with enough flipped labels is an emergent ability of scale. LLMs learn better mappings when ICL labels are semantically unrelated to inputs (i.e., apple\u002Forange, negative\u002Fpositive). Fine-tuning to follow instruction helps both. [[Tweet](https:\u002F\u002Ftwitter.com\u002FJerryWeiAI\u002Fstatus\u002F1633548780619571200)]\n33. 
[The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03915) by Hugging Face et al., 2023 - Documents the data creation and curation efforts of the Responsible Open-science Open-collaboration Text Source (ROOTS) corpus, a dataset used to train BLOOM. [[Tweet](https:\u002F\u002Ftwitter.com\u002Farankomatsuzaki\u002Fstatus\u002F1633282997020672000)]\n34. [PanGu-Σ: Towards Trillion Parameter Language Model with Sparse Heterogeneous Computing (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10845) by Huawei Technologies, 2023 - They develop a system that trained a trillion-parameter language model on a cluster of Ascend 910 AI processors and the MindSpore framework. This resulted in a 6.3x increase in training throughput through heterogeneous computing.\n35. [Context-faithful Prompting for Large Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11315) by USC et al., 2023\n36. [Llama 2: Open Foundation and Fine-Tuned Chat Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288) by Meta AI, 2023 - Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1. Its fine-tuned models have been trained on over 1 million human annotations. It outperforms other open source language models on many benchmarks. License: The model and weights are available for free for research and commercial use. It is not an open source model, rather an open approach model — for commercial use, your product cannot have more than 700 million monthly active users and requires a form to get access. Llama-2-chat is the new addition and is created using supervised fine-tuning and then iteratively refined using RLHF. [[Nathan Lambert's summary of the paper](https:\u002F\u002Fwww.interconnects.ai\u002Fp\u002Fllama-2-from-meta)]\n37. [Code Llama: Open Foundation Models for Code (paper)](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fcode-llama-open-foundation-models-for-code\u002F) by Meta AI, 2023 - Code Llama is a family of LLMs for code based on Llama 2 providing SoTA performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. It is capable of generating code, and natural language about code, from both code and natural language prompts. It's available in three models: foundation models (Code Llama), Python specializations (Code Llama - Python), and instruction-following models (Code Llama - Instruct) with 7B, 13B and 34B parameters each. It outperformed publicly available LLMs on code benchmarks. They release it under the same permissive license (community license) as Llama 2.\n38. [Introducing Llama 3: The most capable openly available LLM to date (article)](https:\u002F\u002Fai.meta.com\u002Fblog\u002Fmeta-llama-3\u002F) by Meta AI, 2024 - In the coming months, they’ll share the Llama 3 research paper. [[Code](https:\u002F\u002Fgithub.com\u002Fmeta-llama\u002Fllama3)]\n39. [Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context](https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fgemini\u002Fgemini_v1_5_report.pdf) by Google, May 2024 - The Gemini 1.5 model family technical report. 
Highlights: Gemini 1.5 Pro is now Google's most capable model (surpassing 1.0 Ultra), Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improvement in next-token prediction and near-perfect retrieval (>99%) up to at least 10M tokens.\n\n### Transformer Reinforcement Learning\n\nTransformer Reinforcement Learning from Human Feedback (RLHF).\n\n- [Illustrating Reinforcement Learning from Human Feedback](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Frlhf) - Recent advances with language models (ChatGPT for example) have been powered by RLHF.\n- [Training a Helpful and Harmless Assistant with RLHF (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05862) by Anthropic. [[code and red teaming data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf), [tweet](https:\u002F\u002Ftwitter.com\u002Fanthropicai\u002Fstatus\u002F1514277273070825476)]\n- [The Wisdom of Hindsight Makes Language Models Better Instruction Followers (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05206) by UC Berkeley, 2023 - The underlying RLHF algo is complex and requires an additional training pipeline for reward and value networks. They consider an alternative approach, Hindsight Instruction Relabeling (HIR): converting feedback to instruction by relabeling the original one and training the model for better alignment.\n- [From r to Q∗: Your Language Model is Secretly a Q-Function](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12358) by Stanford University, Apr 2024 - The paper bridges the gap between two approaches to RLHF - the standard RLHF setup and Direct Preference Optimization (DPO) - by deriving DPO as a general inverse Q-learning algorithm in a token-level MDP (Markov Decision Process). The authors provide empirical insights into the benefits of DPO, including its ability to perform credit assignment, and demonstrate improvements over the base DPO policy using simple beam search, with **potential applications in multi-turn dialogue, reasoning, and agentic systems**.\n- [Iterative Reasoning Preference Optimization (IRPO)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.19733) by Jason Weston (Meta) et al., May 2024 - Llama-2-70B-Chat improves **from 55.6% to 81.6% on GSM8K** with this method. They apply iterative preference optimization to improve reasoning: generate chain-of-thought candidates with LLM, construct preference pairs based on if answer is correct or not, train with DPO + NLL, and repeat. For example, imagine a group of people trying to decide how to allocate a limited budget. Each person has their own priorities and preferences for how the money should be spent. Using the IRPO approach, the group would engage in a back-and-forth discussion, with each person adjusting their preferences based on the arguments and compromises made by the others. Over time, the group would converge on a set of preferences that everyone can accept, even if it's not exactly what any one person wanted initially.\n\n#### Tools for RLHF\n\n- [lvwerra\u002FTRL](https:\u002F\u002Fgithub.com\u002Flvwerra\u002Ftrl) - Train transformer language models with reinforcement learning.\n\nOpen source effort towards ChatGPT:\n\n- [CarperAI\u002FTRLX](https:\u002F\u002Fgithub.com\u002FCarperAI\u002Ftrlx) - Originated as a fork of TRL. It allows you to fine-tune Hugging Face language models (GPT2, GPT-NeoX based) up to 20B parameters using Reinforcement Learning. Brought to you by CarperAI (born at EleutherAI, an org part of StabilityAI family). 
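\n\n  To make the preference-tuning objective that the DPO and IRPO entries above describe a bit more concrete, here is a minimal sketch of the DPO loss in plain PyTorch (an illustrative sketch only, not code from TRL or trlX; it assumes you have already computed the summed token log-probabilities of each chosen and rejected response under the trainable policy and a frozen reference model):\n\n  ```python\n  import torch\n  import torch.nn.functional as F\n\n  def dpo_loss(policy_chosen_logps, policy_rejected_logps,\n               ref_chosen_logps, ref_rejected_logps, beta=0.1):\n      # Implicit rewards: how far the policy has moved away from the\n      # reference model on each response, scaled by beta. Shapes: (batch,).\n      chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)\n      rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)\n      # Push the preferred response's implicit reward above the rejected one's.\n      return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()\n\n  # Toy batch of two preference pairs with made-up log-probabilities.\n  loss = dpo_loss(torch.tensor([-12.0, -9.5]), torch.tensor([-14.0, -11.0]),\n                  torch.tensor([-12.5, -9.8]), torch.tensor([-13.5, -10.5]))\n  print(loss)\n  ```\n\n  Libraries like TRL and trlX package objectives of this kind, or full PPO-based RLHF loops, together with the rollout, reward, and optimization plumbing. 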
CarperAI is developing production ready open-source RLHF tools. They have [announced plans for the first open-source \"instruction-tuned\" LM](https:\u002F\u002Fcarper.ai\u002Finstruct-gpt-announcement\u002F).\n- [allenai\u002FRL4LMs](https:\u002F\u002Fgithub.com\u002Fallenai\u002FRL4LMs) - RL for language models (RL4LMs) by Allen AI. It's a modular RL library to fine-tune language models to human preferences.\n\n### Additional Reading\n\n1. [How to Build OpenAI's GPT-2: \"The AI That's Too Dangerous to Release\"](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FMachineLearning\u002Fcomments\u002Fbj0dsa\u002Fd_how_to_build_openais_gpt2_the_ai_thats_too\u002F).\n2. [OpenAI’s GPT2 - Food to Media hype or Wake Up Call?](https:\u002F\u002Fwww.skynettoday.com\u002Fbriefs\u002Fgpt2)\n3. [How the Transformers broke NLP leaderboards](https:\u002F\u002Fhackingsemantics.xyz\u002F2019\u002Fleaderboards\u002F) by Anna Rogers. :fire::fire::fire:\n- A well put summary post on problems with large models that dominate NLP these days.\n- Larger models + more data = progress in Machine Learning research :question:\n4. [Transformers From Scratch](http:\u002F\u002Fwww.peterbloem.nl\u002Fblog\u002Ftransformers) tutorial by Peter Bloem.\n5. [Real-time Natural Language Understanding with BERT using NVIDIA TensorRT](https:\u002F\u002Fdevblogs.nvidia.com\u002Fnlu-with-tensorrt-bert\u002F) on Google Cloud T4 GPUs achieves 2.2 ms latency for inference. Optimizations are open source on GitHub.\n6. [NLP's Clever Hans Moment has Arrived](https:\u002F\u002Fthegradient.pub\u002Fnlps-clever-hans-moment-has-arrived\u002F) by The Gradient.\n7. [Language, trees, and geometry in neural networks](https:\u002F\u002Fpair-code.github.io\u002Finterpretability\u002Fbert-tree\u002F) - a series of expository notes accompanying the paper, \"Visualizing and Measuring the Geometry of BERT\" by Google's People + AI Research (PAIR) team.\n8. [Benchmarking Transformers: PyTorch and TensorFlow](https:\u002F\u002Fmedium.com\u002Fhuggingface\u002Fbenchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) by Hugging Face - a comparison of inference time (on CPU and GPU) and memory usage for a wide range of transformer architectures.\n9. [Evolution of representations in the Transformer](https:\u002F\u002Flena-voita.github.io\u002Fposts\u002Femnlp19_evolution.html) - An accessible article that presents the insights of their EMNLP 2019 paper. They look at how the representations of individual tokens in Transformers trained with different objectives change.\n10. [The dark secrets of BERT](https:\u002F\u002Ftext-machine-lab.github.io\u002Fblog\u002F2020\u002Fbert-secrets\u002F) - This post probes fine-tuned BERT models for linguistic knowledge. In particular, the authors analyse how many self-attention patterns with some linguistic interpretation are actually used to solve downstream tasks. TL;DR: They are unable to find evidence that linguistically interpretable self-attention maps are crucial for downstream performance.\n11. [A Visual Guide to Using BERT for the First Time](https:\u002F\u002Fjalammar.github.io\u002Fa-visual-guide-to-using-bert-for-the-first-time\u002F) - Tutorial on using BERT in practice, such as for sentiment analysis on movie reviews by Jay Alammar.\n12. [Turing-NLG: A 17-billion-parameter language model](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fturing-nlg-a-17-billion-parameter-language-model-by-microsoft\u002F) by Microsoft that outperforms the state of the art on many downstream NLP tasks. 
This work would not be possible without breakthroughs produced by the [DeepSpeed library](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed) (compatible with PyTorch) and [ZeRO optimizer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02054), which can be explored more in this accompanying [blog post](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fzero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters).\n13. [MUM (Multitask Unified Model): A new AI milestone for understanding information](https:\u002F\u002Fblog.google\u002Fproducts\u002Fsearch\u002Fintroducing-mum\u002F) by Google.\n- Based on the transformer architecture but more powerful.\n- Multitask means it supports text and images, transfers knowledge between 75 languages, understands context to go deeper into a topic, and generates content.\n14. [GPT-3 is No Longer the Only Game in Town](https:\u002F\u002Flastweekin.ai\u002Fp\u002Fgpt-3-is-no-longer-the-only-game) - GPT-3 was by far the largest AI model of its kind last year (2020). Now? Not so much.\n15. [OpenAI's API Now Available with No Waitlist](https:\u002F\u002Fopenai.com\u002Fblog\u002Fapi-no-waitlist\u002F) - GPT-3 access without the wait. However, apps must be approved before [going live](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fgoing-live). This release also allows them to review applications, monitor for misuse, and better understand the effects of this tech.\n16. [The Inherent Limitations of GPT-3](https:\u002F\u002Flastweekin.ai\u002Fp\u002Fthe-inherent-limitations-of-gpt-3) - One thing missing from the article, if you've read [Gwern's GPT-3 Creative Fiction article](https:\u002F\u002Fwww.gwern.net\u002FGPT-3#repetitiondivergence-sampling) before, is the mystery known as \"Repetition\u002FDivergence Sampling\":\n    > when you generate free-form completions, they have a tendency to eventually fall into repetitive loops of gibberish.\n\n    For those using Copilot, you may have experienced this weirdness where it generates the same line or block of code over and over again.\n17. [Language Modelling at Scale: Gopher, Ethical considerations, and Retrieval](https:\u002F\u002Fdeepmind.com\u002Fblog\u002Farticle\u002Flanguage-modelling-at-scale) by DeepMind - The paper presents an analysis of Transformer-based language model performance across a wide range of model scales — from models with tens of millions of parameters up to a 280 billion parameter model called Gopher.\n18. [Competitive programming with AlphaCode](https:\u002F\u002Fdeepmind.com\u002Fblog\u002Farticle\u002FCompetitive-programming-with-AlphaCode) by DeepMind - AlphaCode uses transformer-based language models to generate code that can create novel solutions to programming problems which require an understanding of algorithms.\n19. [Building games and apps entirely through natural language using OpenAI's code-davinci model](https:\u002F\u002Fandrewmayneblog.wordpress.com\u002F2022\u002F03\u002F17\u002Fbuilding-games-and-apps-entirely-through-natural-language-using-openais-davinci-code-model\u002F) - The author built several small games and apps without touching a single line of code, simply by telling the model what they want.\n20. [Open AI gets GPT-3 to work by hiring an army of humans to fix GPT’s bad answers](https:\u002F\u002Fstatmodeling.stat.columbia.edu\u002F2022\u002F03\u002F28\u002Fis-open-ai-cooking-the-books-on-gpt-3\u002F)\n21. 
[GPT-3 can run code](https:\u002F\u002Fmayt.substack.com\u002Fp\u002Fgpt-3-can-run-code) - You provide an input text and a command and GPT-3 will transform them into an expected output. It works well for tasks like changing coding style, translating between programming languages, refactoring, and adding documentation. For example, it converts JSON into YAML, translates Python code to JavaScript, or improves the runtime complexity of a function.\n22. [Using GPT-3 to explain how code works](https:\u002F\u002Fsimonwillison.net\u002F2022\u002FJul\u002F9\u002Fgpt-3-explain-code\u002F) by Simon Willison.\n23. [Character AI announces they're building a full stack AGI company](https:\u002F\u002Fblog.character.ai\u002Fintroducing-character\u002F) so you could create your own AI to help you with anything, using conversational AI research. The co-founders are Noam Shazeer (co-invented Transformers, scaled them to supercomputers for the first time, and pioneered large-scale pretraining) and Daniel de Freitas (led the development of LaMDA), whose work is foundational to recent AI progress.\n24. [How Much Better is OpenAI’s Newest GPT-3 Model?](https:\u002F\u002Fscale.com\u002Fblog\u002Fgpt-3-davinci-003-comparison) - In addition to ChatGPT, OpenAI releases text-davinci-003, a Reinforcement Learning-tuned model that performs better at long-form writing. For example, it can explain code in the style of Eminem. 😀\n25. [OpenAI rival Cohere launches language model API](https:\u002F\u002Fventurebeat.com\u002Funcategorized\u002Fopenai-rival-cohere-launches-language-model-api\u002F) - Backed by AI experts, they aim to bring Google-quality predictive language to the masses. Aidan Gomez co-wrote a seminal 2017 paper at Google Brain that invented a concept known as \"Transformers\".\n26. [Startups competing with OpenAI's GPT-3 all need to solve the same problems](https:\u002F\u002Fwww.theregister.com\u002F2022\u002F03\u002F03\u002Flanguage_model_gpt3\u002F) - Last year, two startups released their own proprietary text-generation APIs. AI21 Labs launched its 178-billion-parameter Jurassic-1 in Aug 2021, and Cohere released a range of models. Cohere hasn't disclosed how many parameters its models contain. ... There are other up-and-coming startups looking to solve the same issues. One is Anthropic, the AI safety and research company started by a group of ex-OpenAI employees. Several researchers have left Google Brain to join two new ventures started by their colleagues. One outfit is named Character.ai, and the other Persimmon Labs.\n27. [Cohere Wants to Build the Definitive NLP Platform](https:\u002F\u002Falbertoromgar.medium.com\u002Fcohere-wants-to-build-the-definitive-nlp-platform-7d090c0de9ca) - Beyond generative models like GPT-3.\n28. [Transformer Inference Arithmetic](https:\u002F\u002Fkipp.ly\u002Fblog\u002Ftransformer-inference-arithmetic\u002F) technical write-up from Carol Chen, ML Ops at Cohere. This article presents detailed few-principles reasoning about LLM inference performance, with no experiments or difficult math.\n29. [State of AI Report 2022](https:\u002F\u002Fwww.stateof.ai\u002F2022-report-launch.html) - Key takeaways:\n  - New independent research labs are rapidly open sourcing the closed source output of major labs.\n  - AI safety is attracting more talent... yet remains extremely neglected.\n  - OpenAI's Codex, which drives GitHub Copilot, has impressed the computer science community with its ability to complete code on multiple lines or directly from natural language instructions. 
This success spurred more research in this space.\n  - DeepMind revisited LM scaling laws and found that current LMs are significantly undertrained: they’re not trained on enough data given their large size. They train Chinchilla, a 4x smaller version of their Gopher, on 4.6x more data, and find that Chinchilla outperforms Gopher and other large models on BIG-bench.\n  - Reinforcement Learning from Human Feedback (RLHF) has emerged as a key method to finetune LLMs and align them with human values. This involves humans ranking language model outputs sampled for a given input, using these rankings to learn a reward model of human preferences, and then using this as a reward signal to finetune the language model using RL.\n30. [The Scaling Hypothesis](https:\u002F\u002Fwww.gwern.net\u002FScaling-hypothesis) by Gwern - On GPT-3: meta-learning, scaling, implications, and deep theory.\n31. [AI And The Limits Of Language — An AI system trained on words and sentences alone will never approximate human understanding](https:\u002F\u002Fwww.noemamag.com\u002Fai-and-the-limits-of-language\u002F) by Jacob Browning and Yann LeCun - What LLMs like ChatGPT can and cannot do, and why AGI is not here yet.\n32. [Use GPT-3 foundational models incorrectly: reduce costs 40x and increase speed by 5x](https:\u002F\u002Fwww.buildt.ai\u002Fblog\u002Fincorrectusage) - When fine-tuning a model, it is important to keep a few things in mind. There's still a lot to learn about working with these models at scale. We need a better guide.\n33. [The Next Generation Of Large Language Models](https:\u002F\u002Farchive.vn\u002FWFZnG) - It highlights 3 emerging areas: 1) models that can generate their own training data to improve themselves, 2) models that can fact-check themselves, and 3) massive sparse expert models.\n34. [GPT-4 analysis and predictions](https:\u002F\u002Fwww.lesswrong.com\u002Fposts\u002FqdStMFDMrWAnTqNWL\u002Fgpt-4-predictions) - Somewhat related: in the [\"Bing Chat is blatantly, aggressively misaligned\"](https:\u002F\u002Fwww.lesswrong.com\u002Fposts\u002FjtoPawEhLNXNxvgTT\u002Fbing-chat-is-blatantly-aggressively-misaligned) post, Gwern thinks about how Bing Chat\u002FSydney can be so different from ChatGPT, and his hypothesis is: \"Sydney is not a RLHF trained GPT-3 model but a GPT-4 model developed in a hurry\". Some have also argued that Sydney performs better on reasoning tasks than ChatGPT\u002FGPT-3.5 and may be GPT-4.\n35. [Mosaic LLMs (Part 2): GPT-3 quality for \u003C$500k (2022)](https:\u002F\u002Farchive.is\u002Fgu2li) - They claimed their [Composer PyTorch framework](https:\u002F\u002Fgithub.com\u002Fmosaicml\u002Fcomposer) eases model training. Now, with the Colossal-AI framework, I wonder how good their solution is. Until their users train it, I guess everything is purely hypothetical.\n36. [I made a transformer by hand (no training!)](https:\u002F\u002Fvgel.me\u002Fposts\u002Fhandmade-transformer\u002F) (2023) - Make a transformer to predict a simple sequence manually — not by training one, or using pretrained weights, but instead by **assigning each weight, by hand**, over an evening.\n37. 
[Finetune Llama 3.1 on GCP for production use cases](https:\u002F\u002Fwww.zenml.io\u002Fblog\u002Fhow-to-finetune-llama-3-1-with-zenml)\n\n## Educational\n\n\u003Cdiv style=\"width: 200px;\">\n\u003Ctable>\n\u003Ctr>\n    \u003Ctd>\u003C\u002Ftd>\n    \u003Ctd colspan=\"2\">\u003Cvideo width=\"100%\" src='https:\u002F\u002Fgithub.com\u002Fpoloclub\u002Ftransformer-explainer\u002Fassets\u002F5067740\u002F5c2d6a9d-2cbf-4b01-9ce1-bdf8e190dc42'>\u003C\u002Ftd>\n    \u003Ctd>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>    \n    \u003Ctd colspan=\"2\" align=\"right\">\u003Ca href=\"http:\u002F\u002Fpoloclub.github.io\u002Ftransformer-explainer\">Live Demo\u003C\u002Fa>\u003C\u002Ftd>\n    \u003Ctd colspan=\"2\">\u003Ca href=\"https:\u002F\u002Fyoutu.be\u002FECR4oAwocjs\">Demo Video\u003C\u002Fa>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003C\u002Ftable>\n\u003C\u002Fdiv>\n\n- [minGPT](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002FminGPT) by Andrej Karpathy - A PyTorch re-implementation of GPT, both training and inference. minGPT tries to be small, clean, interpretable and educational, as most of the currently available GPT model implementations can be a bit sprawling. GPT is not a complicated model and this implementation is appropriately about 300 lines of code.\n  - [nanoGPT](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002FnanoGPT) - It's a re-write of minGPT. Still under active development. The associated and ongoing video lecture series, _[Neural Networks: Zero to Hero](https:\u002F\u002Fkarpathy.ai\u002Fzero-to-hero.html)_, builds GPT from scratch, in code, and aspires to spell everything out. Note that Karpathy's bottom-up approach and fast.ai teaching style work well together. (FYI, fast.ai has both a top-down (\"part 1\") and a bottom-up (\"part 2\") approach.)\n- [A visual intro to large language models (LLMs) by Jay Alammar\u002FCohere](https:\u002F\u002Fjalammar.github.io\u002Fapplying-large-language-models-cohere\u002F) - A high-level look at LLMs and some of their applications for language processing. It covers text generation models (like GPT) and representation models (like BERT).\n- [Interfaces for Explaining Transformer Language Models](https:\u002F\u002Fjalammar.github.io\u002Fexplaining-transformers\u002F) by Jay Alammar - A gentle visual introduction to Transformer models by looking at input saliency and neuron activation inside neural networks. **Our understanding of why these models work so well, however, still lags behind these developments**.\n- [The GPT-3 Architecture, on a Napkin](https:\u002F\u002Fdugas.ch\u002Fartificial_curiosity\u002FGPT_architecture.html)\n- [PicoGPT: GPT in 60 Lines of NumPy](https:\u002F\u002Fjaykmody.com\u002Fblog\u002Fgpt-from-scratch\u002F)\n- [Video explainer about the core of transformer architecture](https:\u002F\u002Fyoutu.be\u002FkWLed8o5M2Y?si=YOMpWS1gfWMADzTX) (2023) - Read The Illustrated Transformer, but still didn't feel like you had an intuitive understanding of what the various pieces of attention were doing? 
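\n\n  If it helps to see the core computation written out first, here is plain scaled dot-product attention in a few lines of NumPy (a generic sketch in the spirit of the minGPT and PicoGPT entries above, not code taken from any of them):\n\n  ```python\n  import numpy as np\n\n  def softmax(x, axis=-1):\n      x = x - x.max(axis=axis, keepdims=True)  # subtract the max for numerical stability\n      e = np.exp(x)\n      return e \u002F e.sum(axis=axis, keepdims=True)\n\n  def attention(Q, K, V):\n      # Scaled dot-product attention: softmax(Q K^T \u002F sqrt(d_k)) V\n      d_k = Q.shape[-1]\n      scores = Q @ K.T \u002F np.sqrt(d_k)      # (n_queries, n_keys) similarity scores\n      weights = softmax(scores, axis=-1)   # each query's weights sum to 1\n      return weights @ V                   # weighted average of the value vectors\n\n  # Toy example: 3 query tokens attending over 4 key\u002Fvalue tokens, d_k = 8.\n  rng = np.random.default_rng(0)\n  Q, K, V = rng.normal(size=(3, 8)), rng.normal(size=(4, 8)), rng.normal(size=(4, 8))\n  print(attention(Q, K, V).shape)  # (3, 8)\n  ```\n\n  Multi-head attention, masking, and the MLP blocks are layered on top of this same primitive. 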
In this video, a more constructive approach to explaining the transformer and attention can help you understand it better: starting from a simple convolutional neural network (CNN), the author will step you through all of the changes that need to be made to a CNN to become a transformer.\n- [A Hackers' Guide to Language Models (video)](https:\u002F\u002Fyoutu.be\u002FjkrNMKz9pWU?si=4qsEubcueqp45geo) (2023) by Jeremy Howard, fast.ai - A quick run through all the basic ideas of language models, how to use them (both open models and OpenAI-based models) using code as much as possible.\n\n### Tutorials\n\n1. [How to train a new language model from scratch using Transformers and Tokenizers](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fhow-to-train) tutorial by Hugging Face. :fire:\n\n## AI Safety\n\nInterpretability research and AI alignment research.\n\n- [Transformer Circuits Thread](https:\u002F\u002Ftransformer-circuits.pub\u002F) project by Anthropic - Can we reverse engineer transformer language models into human-understandable computer programs? Interpretability research benefits a lot from interactive articles. As part of their effort, they've created several other resources besides their paper like \"A Mathematical Framework for Transformer Circuits\" and [\"toy models of superposition\"](https:\u002F\u002Fthreadreaderapp.com\u002Fthread\u002F1570087876053942272.html).\n- [Discovering Language Model Behaviors with Model-Written Evaluations (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09251) by Anthropic et al. - They automatically generate evaluations with LMs. They discover new cases of inverse scaling where LMs get worse with size. They also find some of the first examples of inverse scaling in RLHF, where more RLHF makes LMs worse.\n- [Transformers learn in-context by gradient descent (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07677) by J von Oswald et al. [[AI Alignment Forum](https:\u002F\u002Fwww.alignmentforum.org\u002Fposts\u002FfirtXAWGdvzXYAh9B\u002Fpaper-transformers-learn-in-context-by-gradient-descent)]\n- [Why Can GPT Learn In-Context? Language Models Secretly Perform Gradient Descent as Meta-Optimizers (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10559v2) by Microsoft Research.\n- [Cognitive Biases in Large Language Models](https:\u002F\u002Funiversalprior.substack.com\u002Fp\u002Fcognitive-biases-in-large-language)\n- [Tracr: Compiled Transformers as a Laboratory for Interpretability (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.05062) (2023) by DeepMind - TRACR (TRAnsformer Compiler for RASP) is a compiler for converting RASP programs (DSL for Transformers) into weights of a GPT-like model. Usually, we train Transformers to encode algorithms in their weights. With TRACR, we go in the reverse direction; compile weights **directly** from explicit code. Why do this? Accelerate interpretability research. Think of it like formal methods (from software eng.) on Transformers. It can be difficult to check if the explanation an interpretability tool provides is correct. 
[[Tweet](https:\u002F\u002Ftwitter.com\u002Fdavlindner\u002Fstatus\u002F1613900577804525573), [code](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Ftracr)]\n- [Yann LeCun's unwavering opinion on current (auto-regressive) LLMs (Tweet)](https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20230213173604\u002Fhttps:\u002F\u002Ftwitter.com\u002Fylecun\u002Fstatus\u002F1625118108082995203)\n- [Core Views on AI Safety: When, Why, What, and How](https:\u002F\u002Fwww.anthropic.com\u002Findex\u002Fcore-views-on-ai-safety) by Anthropic, 2023.\n- [Personalisation within bounds: A risk taxonomy and policy framework for the alignment of large language models with personalised feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05453) by University of Oxford, 2023. [[Tweet](https:\u002F\u002Ftwitter.com\u002Fhannahrosekirk\u002Fstatus\u002F1634228684893700096)]\n- [GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10130) by OpenAI et al., 2023 - The paper argues GPT is a General Purpose Technology.\n\n## Videos\n\n### [BERTology](https:\u002F\u002Fhuggingface.co\u002Ftransformers\u002Fbertology.html)\n\n1. [XLNet Explained](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=naOuE9gLbZo) by NLP Breakfasts.\n  - Clear explanation. Also covers the two-stream self-attention idea.\n2. [The Future of NLP](https:\u002F\u002Fyoutu.be\u002FG5lmya6eKtc) by 🤗\n  - Dense overview of what is going on in transfer learning in NLP currently, limits, and future directions.\n3. [The Transformer neural network architecture explained](https:\u002F\u002Fyoutu.be\u002FFWFA4DGuzSc) by AI Coffee Break with Letitia Parcalabescu.\n  - High-level explanation, best suited when unfamiliar with Transformers.\n\n### Attention and Transformer Networks\n\n1. [Sequence to Sequence Learning Animated (Inside Transformer Neural Networks and Attention Mechanisms)](https:\u002F\u002Fyoutu.be\u002FGTVgJhSlHEk) by learningcurve.\n\n### General\n\n- [Trials and tribulations of OPT-175B training by Susan Zhang at Meta](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=p9IxoSkvZ-M) - In this talk, they walk through the development lifecycle of OPT-175B, covering infrastructure and training convergence challenges faced at scale, along with methods of addressing these issues going forward. Amazing that they managed to pull off such a feat. Key takeaway: data matters a lot! It takes a super deep understanding of neural network nuts & bolts (LR, SGD, etc.) and engineering, even more time than usual spent staring at the loss curves, and an understanding of Chinchilla's scaling law and how new architectures\u002Falgorithms behave as you scale up. [[LLM training log](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Fblob\u002Fmain\u002Fprojects\u002FOPT\u002Fchronicles\u002FOPT175B_Logbook.pdf)]\n\n## Official BERT Implementations\n\n1. [google-research\u002Fbert](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert) - TensorFlow code and pre-trained models for BERT.\n\n## Transformer Implementations By Communities\n\nGPT and\u002For BERT implementations.\n\n### PyTorch and TensorFlow\n\n1. 
[🤗 Hugging Face Transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) (formerly known as [pytorch-transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fpytorch-transformers) and [pytorch-pretrained-bert](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fpytorch-pretrained-BERT)) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03771)]\n2. [spacy-transformers](https:\u002F\u002Fgithub.com\u002Fexplosion\u002Fspacy-transformers) - a library that wraps Hugging Face's Transformers in order to extract features to power NLP pipelines. It also calculates an alignment so the Transformer features can be related back to actual words instead of just wordpieces.\n3. [FasterTransformer](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer) - Transformer-related optimization, including BERT and GPT. This repo provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA.\n\n### PyTorch\n\n1. [codertimo\u002FBERT-pytorch](https:\u002F\u002Fgithub.com\u002Fcodertimo\u002FBERT-pytorch) - Google AI 2018 BERT pytorch implementation.\n2. [innodatalabs\u002Ftbert](https:\u002F\u002Fgithub.com\u002Finnodatalabs\u002Ftbert) - PyTorch port of BERT ML model.\n3. [kimiyoung\u002Ftransformer-xl](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl) - Code repository associated with the Transformer-XL paper.\n4. [dreamgonfly\u002FBERT-pytorch](https:\u002F\u002Fgithub.com\u002Fdreamgonfly\u002FBERT-pytorch) - A PyTorch implementation of BERT in \"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\".\n5. [dhlee347\u002Fpytorchic-bert](https:\u002F\u002Fgithub.com\u002Fdhlee347\u002Fpytorchic-bert) - A Pytorch implementation of Google BERT.\n6. [pingpong-ai\u002Fxlnet-pytorch](https:\u002F\u002Fgithub.com\u002Fpingpong-ai\u002Fxlnet-pytorch) - A Pytorch implementation of Google Brain XLNet.\n7. [facebook\u002Ffairseq](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Fblob\u002Fmaster\u002Fexamples\u002Froberta\u002FREADME.md) - RoBERTa: A Robustly Optimized BERT Pretraining Approach by Facebook AI Research. SoTA results on GLUE, SQuAD and RACE.\n8. [NVIDIA\u002FMegatron-LM](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM) - Ongoing research training transformer language models at scale, including: BERT.\n9. [deepset-ai\u002FFARM](https:\u002F\u002Fgithub.com\u002Fdeepset-ai\u002FFARM) - Simple & flexible transfer learning for the industry.\n10. [NervanaSystems\u002Fnlp-architect](https:\u002F\u002Fwww.intel.ai\u002Fnlp-transformer-models\u002F) - NLP Architect by Intel AI. Among other libraries, it provides a quantized version of Transformer models and an efficient training method.\n11. [kaushaltrivedi\u002Ffast-bert](https:\u002F\u002Fgithub.com\u002Fkaushaltrivedi\u002Ffast-bert) - Super easy library for BERT based NLP models. Built on 🤗 Transformers and inspired by fast.ai.\n12. [NVIDIA\u002FNeMo](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FNeMo) - Neural Modules is a toolkit for conversational AI by NVIDIA. 
They are trying to [improve speech recognition with BERT post-processing](https:\u002F\u002Fnvidia.github.io\u002FNeMo\u002Fnlp\u002Fintro.html#improving-speech-recognition-with-bertx2-post-processing-model).\n13. [facebook\u002FMMBT](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmmbt\u002F) from Facebook AI - Multimodal transformers model that can accept a transformer model and a computer vision model for classifying image and text.\n14. [dbiir\u002FUER-py](https:\u002F\u002Fgithub.com\u002Fdbiir\u002FUER-py) from Tencent and RUC - Open Source Pre-training Model Framework in PyTorch & Pre-trained Model Zoo (with more focus on Chinese).\n15. [lucidrains\u002Fx-transformers](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fx-transformers) - A simple but complete full-attention transformer with a set of promising experimental features from various papers (good for learning purposes). There is a 2021 paper rounding up Transformer modifications, [_Do Transformer Modifications Transfer Across Implementations and Applications?_](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11972).\n\n### Keras\n\n1. [Separius\u002FBERT-keras](https:\u002F\u002Fgithub.com\u002FSeparius\u002FBERT-keras) - Keras implementation of BERT with pre-trained weights.\n2. [CyberZHG\u002Fkeras-bert](https:\u002F\u002Fgithub.com\u002FCyberZHG\u002Fkeras-bert) - Implementation of BERT that could load official pre-trained models for feature extraction and prediction.\n3. [bojone\u002Fbert4keras](https:\u002F\u002Fgithub.com\u002Fbojone\u002Fbert4keras) - Light reimplement of BERT for Keras.\n\n### TensorFlow\n\n1. [guotong1988\u002FBERT-tensorflow](https:\u002F\u002Fgithub.com\u002Fguotong1988\u002FBERT-tensorflow) - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.\n2. [kimiyoung\u002Ftransformer-xl](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl) - Code repository associated with the Transformer-XL paper.\n3. [zihangdai\u002Fxlnet](https:\u002F\u002Fgithub.com\u002Fzihangdai\u002Fxlnet) - Code repository associated with the XLNet paper.\n\n### Chainer\n\n1. [soskek\u002Fbert-chainer](https:\u002F\u002Fgithub.com\u002Fsoskek\u002Fbert-chainer) - Chainer implementation of \"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\".\n\n### Other\n\n- [llama.cpp](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) - Port of Facebook's LLaMA model in C\u002FC++.\n- [Cformers](https:\u002F\u002Fgithub.com\u002FNolanoOrg\u002Fcformers) - SoTA Transformers with C-backend for fast inference on your CPU.\n- [Transformers.js](https:\u002F\u002Fgithub.com\u002Fxenova\u002Ftransformers.js) - Run 🤗 Transformers in your browser.\n- [Alpaca.cpp](https:\u002F\u002Fgithub.com\u002Fantimatter15\u002Falpaca.cpp) - Run a fast ChatGPT-like model locally on your device.\n- [LLaMA compatible port](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fllama#llama-compatible-port)\n- [Apple Neural Engine (ANE) Transformers](https:\u002F\u002Fgithub.com\u002Fapple\u002Fml-ane-transformers) - Transformer architecture optimized for Apple Silicon.\n\n## Transfer Learning in NLP\n\n\u003Cdetails>\n\n\u003Csummary>NLP finally had a way to do transfer learning probably as well as Computer Vision could.\u003C\u002Fsummary>\nAs Jay Alammar put it:\n\n> The year 2018 has been an inflection point for machine learning models handling text (or more accurately, Natural Language Processing or NLP for short). 
Our conceptual understanding of how best to represent words and sentences in a way that best captures underlying meanings and relationships is rapidly evolving. Moreover, the NLP community has been putting forward incredibly powerful components that you can freely download and use in your own models and pipelines (It's been referred to as [NLP's ImageNet moment](http:\u002F\u002Fruder.io\u002Fnlp-imagenet\u002F), referencing how years ago similar developments accelerated the development of machine learning in Computer Vision tasks).\n>\n> One of the latest milestones in this development is the [release](https:\u002F\u002Fai.googleblog.com\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html) of [BERT](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert), an event [described](https:\u002F\u002Ftwitter.com\u002Flmthang\u002Fstatus\u002F1050543868041555969) as marking the beginning of a new era in NLP. BERT is a model that broke several records for how well models can handle language-based tasks. Soon after the release of the paper describing the model, the team also open-sourced the code of the model, and made available for download versions of the model that were already pre-trained on massive datasets. This is a momentous development since it enables anyone building a machine learning model involving language processing to use this powerhouse as a readily-available component – saving the time, energy, knowledge, and resources that would have gone to training a language-processing model from scratch.\n>\n> BERT builds on top of a number of clever ideas that have been bubbling up in the NLP community recently – including but not limited to [Semi-supervised Sequence Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1511.01432) (by [Andrew Dai](https:\u002F\u002Ftwitter.com\u002Fiamandrewdai) and [Quoc Le](https:\u002F\u002Ftwitter.com\u002Fquocleix)), [ELMo](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05365) (by Matthew Peters and researchers from [AI2](https:\u002F\u002Fallenai.org\u002F) and [UW CSE](https:\u002F\u002Fwww.engr.washington.edu\u002Fabout\u002Fbldgs\u002Fcse)), [ULMFiT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.06146) (by [fast.ai](https:\u002F\u002Ffast.ai) founder [Jeremy Howard](https:\u002F\u002Ftwitter.com\u002Fjeremyphoward) and [Sebastian Ruder](https:\u002F\u002Ftwitter.com\u002Fseb_ruder)), the [OpenAI transformer](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf) (by OpenAI researchers [Radford](https:\u002F\u002Ftwitter.com\u002Falecrad), [Narasimhan](https:\u002F\u002Ftwitter.com\u002Fkarthik_r_n), [Salimans](https:\u002F\u002Ftwitter.com\u002Ftimsalimans), and [Sutskever](https:\u002F\u002Ftwitter.com\u002Filyasut)), and the Transformer ([Vaswani et al](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)).\n>\n> **ULMFiT: Nailing down Transfer Learning in NLP**\n>\n> [ULMFiT introduced methods to effectively utilize a lot of what the model learns during pre-training](http:\u002F\u002Fnlp.fast.ai\u002Fclassification\u002F2018\u002F05\u002F15\u002Fintroducting-ulmfit.html) – more than just embeddings, and more than contextualized embeddings. 
ULMFiT introduced a language model and a process to effectively fine-tune that language model for various tasks.\n>\n> NLP finally had a way to do transfer learning probably as well as Computer Vision could.\n\n\u003C\u002Fdetails>\n\n[MultiFiT: Efficient Multi-lingual Language Model Fine-tuning](http:\u002F\u002Fnlp.fast.ai\u002Fclassification\u002F2019\u002F09\u002F10\u002Fmultifit.html) by Sebastian Ruder et al. MultiFiT extends ULMFiT to make it more efficient and more suitable for language modelling beyond English. ([EMNLP 2019 paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04761))\n\n## Books\n\n1. [Transfer Learning for Natural Language Processing](https:\u002F\u002Fwww.manning.com\u002Fbooks\u002Ftransfer-learning-for-natural-language-processing) - A book that is a practical primer to transfer learning techniques capable of delivering huge improvements to your NLP models.\n2. [Natural Language Processing with Transformers](https:\u002F\u002Ftransformersbook.com\u002F) by Lewis Tunstall, Leandro von Werra, and Thomas Wolf - This practical book shows you how to train and scale these large models using Hugging Face Transformers. The authors use a hands-on approach to teach you how transformers work and how to integrate them in your applications.\n\n## Other Resources\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand Other Resources\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [hanxiao\u002Fbert-as-service](https:\u002F\u002Fgithub.com\u002Fhanxiao\u002Fbert-as-service) - Mapping a variable-length sentence to a fixed-length vector using pretrained BERT model.\n2. [brightmart\u002Fbert_language_understanding](https:\u002F\u002Fgithub.com\u002Fbrightmart\u002Fbert_language_understanding) - Pre-training of Deep Bidirectional Transformers for Language Understanding: pre-train TextCNN.\n3. [algteam\u002Fbert-examples](https:\u002F\u002Fgithub.com\u002Falgteam\u002Fbert-examples) - BERT examples.\n4. [JayYip\u002Fbert-multiple-gpu](https:\u002F\u002Fgithub.com\u002FJayYip\u002Fbert-multiple-gpu) - A multiple GPU support version of BERT.\n5. [HighCWu\u002Fkeras-bert-tpu](https:\u002F\u002Fgithub.com\u002FHighCWu\u002Fkeras-bert-tpu) - Implementation of BERT that could load official pre-trained models for feature extraction and prediction on TPU.\n6. [whqwill\u002Fseq2seq-keyphrase-bert](https:\u002F\u002Fgithub.com\u002Fwhqwill\u002Fseq2seq-keyphrase-bert) - Add BERT to encoder part for https:\u002F\u002Fgithub.com\u002Fmemray\u002Fseq2seq-keyphrase-pytorch\n7. [xu-song\u002Fbert_as_language_model](https:\u002F\u002Fgithub.com\u002Fxu-song\u002Fbert_as_language_model) - BERT as language model, a fork from Google official BERT implementation.\n8. [Y1ran\u002FNLP-BERT--Chinese version](https:\u002F\u002Fgithub.com\u002FY1ran\u002FNLP-BERT--ChineseVersion)\n9. [yuanxiaosc\u002FDeep_dynamic_word_representation](https:\u002F\u002Fgithub.com\u002Fyuanxiaosc\u002FDeep_dynamic_word_representation) - TensorFlow code and pre-trained models for deep dynamic word representation (DDWR). It combines the BERT model and ELMo's deep context word representation.\n10. [yangbisheng2009\u002Fcn-bert](https:\u002F\u002Fgithub.com\u002Fyangbisheng2009\u002Fcn-bert)\n11. [Willyoung2017\u002FBert_Attempt](https:\u002F\u002Fgithub.com\u002FWillyoung2017\u002FBert_Attempt)\n12. [Pydataman\u002Fbert_examples](https:\u002F\u002Fgithub.com\u002FPydataman\u002Fbert_examples) - Some examples of BERT. `run_classifier.py` based on Google BERT for Kaggle Quora Insincere Questions Classification challenge. 
`run_ner.py` is based on the first season of the Ruijin Hospital AI contest and an NER model built with BERT.\n13. [guotong1988\u002FBERT-chinese](https:\u002F\u002Fgithub.com\u002Fguotong1988\u002FBERT-chinese) - Pre-training of deep bidirectional transformers for Chinese language understanding.\n14. [zhongyunuestc\u002Fbert_multitask](https:\u002F\u002Fgithub.com\u002Fzhongyunuestc\u002Fbert_multitask) - Multi-task.\n15. [Microsoft\u002FAzureML-BERT](https:\u002F\u002Fgithub.com\u002FMicrosoft\u002FAzureML-BERT) - End-to-end walkthrough for fine-tuning BERT using Azure Machine Learning.\n16. [bigboNed3\u002Fbert_serving](https:\u002F\u002Fgithub.com\u002FbigboNed3\u002Fbert_serving) - Export BERT model for serving.\n17. [yoheikikuta\u002Fbert-japanese](https:\u002F\u002Fgithub.com\u002Fyoheikikuta\u002Fbert-japanese) - BERT with SentencePiece for Japanese text.\n18. [nickwalton\u002FAIDungeon](https:\u002F\u002Fgithub.com\u002Fnickwalton\u002FAIDungeon) - AI Dungeon 2 is a completely AI-generated text adventure built with OpenAI's largest 1.5B param GPT-2 model. It's a first-of-its-kind game that allows you to enter, and will react to, any action you can imagine.\n19. [turtlesoupy\u002Fthis-word-does-not-exist](https:\u002F\u002Fgithub.com\u002Fturtlesoupy\u002Fthis-word-does-not-exist) - \"This Word Does Not Exist\" is a project that allows people to train a variant of GPT-2 that makes up words, definitions and examples from scratch. We've never seen fake text so real.\n\u003C\u002Fdetails>\n\n## Tools \n\n1. [jessevig\u002Fbertviz](https:\u002F\u002Fgithub.com\u002Fjessevig\u002Fbertviz) - Tool for visualizing attention in the Transformer model.\n2. [FastBert](https:\u002F\u002Fgithub.com\u002Fkaushaltrivedi\u002Ffast-bert) - A simple deep learning library that allows developers and data scientists to train and deploy BERT-based models for NLP tasks, beginning with text classification. The work on FastBert is inspired by fast.ai.\n3. [gpt2tc](https:\u002F\u002Fbellard.org\u002Flibnc\u002Fgpt2tc.html) - A small program using the GPT-2 LM to complete and compress texts. It has no external dependency, requires no GPU and is quite fast. The smallest model (117M parameters) is provided. Larger models can be downloaded as well (no waitlist, no sign up required).\n\n## Tasks\n\n### Named-Entity Recognition (NER)\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand NER\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [kyzhouhzau\u002FBERT-NER](https:\u002F\u002Fgithub.com\u002Fkyzhouhzau\u002FBERT-NER) - Use Google BERT to do CoNLL-2003 NER.\n2. [zhpmatrix\u002Fbert-sequence-tagging](https:\u002F\u002Fgithub.com\u002Fzhpmatrix\u002Fbert-sequence-tagging) - Chinese sequence labeling.\n3. [JamesGu14\u002FBERT-NER-CLI](https:\u002F\u002Fgithub.com\u002FJamesGu14\u002FBERT-NER-CLI) - Bert NER command line tester with step-by-step setup guide.\n4. [sberbank-ai\u002Fner-bert](https:\u002F\u002Fgithub.com\u002Fsberbank-ai\u002Fner-bert)\n5. [mhcao916\u002FNER_Based_on_BERT](https:\u002F\u002Fgithub.com\u002Fmhcao916\u002FNER_Based_on_BERT) - A Chinese NER project based on the Google BERT model.\n6. [macanv\u002FBERT-BiLSMT-CRF-NER](https:\u002F\u002Fgithub.com\u002Fmacanv\u002FBERT-BiLSMT-CRF-NER) - TensorFlow solution of NER task using Bi-LSTM-CRF model with Google BERT fine-tuning.\n7. [ProHiryu\u002Fbert-chinese-ner](https:\u002F\u002Fgithub.com\u002FProHiryu\u002Fbert-chinese-ner) - Use the pre-trained language model BERT to do Chinese NER.\n8. 
[FuYanzhe2\u002FName-Entity-Recognition](https:\u002F\u002Fgithub.com\u002FFuYanzhe2\u002FName-Entity-Recognition) - Lstm-CRF, Lattice-CRF, recent NER related papers.\n9. [king-menin\u002Fner-bert](https:\u002F\u002Fgithub.com\u002Fking-menin\u002Fner-bert) - NER task solution (BERT-Bi-LSTM-CRF) with Google BERT https:\u002F\u002Fgithub.com\u002Fgoogle-research.\n\u003C\u002Fdetails>\n\n### Classification\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand Classification\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [brightmart\u002Fsentiment_analysis_fine_grain](https:\u002F\u002Fgithub.com\u002Fbrightmart\u002Fsentiment_analysis_fine_grain) - Multi-label classification with BERT; Fine Grained Sentiment Analysis from AI challenger.\n2. [zhpmatrix\u002FKaggle-Quora-Insincere-Questions-Classification](https:\u002F\u002Fgithub.com\u002Fzhpmatrix\u002FKaggle-Quora-Insincere-Questions-Classification) - Kaggle baseline—fine-tuning BERT and tensor2tensor based Transformer encoder solution.\n3. [maksna\u002Fbert-fine-tuning-for-chinese-multiclass-classification](https:\u002F\u002Fgithub.com\u002Fmaksna\u002Fbert-fine-tuning-for-chinese-multiclass-classification) - Use Google pre-training model BERT to fine-tune for the Chinese multiclass classification.\n4. [NLPScott\u002Fbert-Chinese-classification-task](https:\u002F\u002Fgithub.com\u002FNLPScott\u002Fbert-Chinese-classification-task) - BERT Chinese classification practice.\n5. [fooSynaptic\u002FBERT_classifer_trial](https:\u002F\u002Fgithub.com\u002FfooSynaptic\u002FBERT_classifer_trial) - BERT trial for Chinese corpus classfication.\n6. [xiaopingzhong\u002Fbert-finetune-for-classfier](https:\u002F\u002Fgithub.com\u002Fxiaopingzhong\u002Fbert-finetune-for-classfier) - Fine-tuning the BERT model while building your own dataset for classification.\n7. [Socialbird-AILab\u002FBERT-Classification-Tutorial](https:\u002F\u002Fgithub.com\u002FSocialbird-AILab\u002FBERT-Classification-Tutorial) - Tutorial.\n8. [malteos\u002Fpytorch-bert-document-classification](https:\u002F\u002Fgithub.com\u002Fmalteos\u002Fpytorch-bert-document-classification\u002F) - Enriching BERT with Knowledge Graph Embedding for Document Classification (PyTorch)\n\u003C\u002Fdetails>\n\n### Text Generation\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand Text Generation\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [asyml\u002Ftexar](https:\u002F\u002Fgithub.com\u002Fasyml\u002Ftexar) - Toolkit for Text Generation and Beyond. [Texar](https:\u002F\u002Ftexar.io) is a general-purpose text generation toolkit, has also implemented BERT here for classification, and text generation applications by combining with Texar's other modules.\n2. [Plug and Play Language Models: a Simple Approach to Controlled Text Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02164) (PPLM) paper by Uber AI.\n\u003C\u002Fdetails>\n\n### Question Answering (QA)\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand QA\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [matthew-z\u002FR-net](https:\u002F\u002Fgithub.com\u002Fmatthew-z\u002FR-net) - R-net in PyTorch, with BERT and ELMo.\n2. [vliu15\u002FBERT](https:\u002F\u002Fgithub.com\u002Fvliu15\u002FBERT) - TensorFlow implementation of BERT for QA.\n3. [benywon\u002FChineseBert](https:\u002F\u002Fgithub.com\u002Fbenywon\u002FChineseBert) - This is a Chinese BERT model specific for question answering.\n4. [xzp27\u002FBERT-for-Chinese-Question-Answering](https:\u002F\u002Fgithub.com\u002Fxzp27\u002FBERT-for-Chinese-Question-Answering)\n5. 
[facebookresearch\u002FSpanBERT](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT) - Question Answering on SQuAD; improving pre-training by representing and predicting spans.\n\u003C\u002Fdetails>\n\n### Knowledge Graph\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand Knowledge Graph\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [sakuranew\u002FBERT-AttributeExtraction](https:\u002F\u002Fgithub.com\u002Fsakuranew\u002FBERT-AttributeExtraction) - Using BERT for attribute extraction in knowledge graph. Fine-tuning and feature extraction. The BERT-based fine-tuning and feature extraction methods are used to extract knowledge attributes of Baidu Encyclopedia characters.\n2. [lvjianxin\u002FKnowledge-extraction](https:\u002F\u002Fgithub.com\u002Flvjianxin\u002FKnowledge-extraction) - Chinese knowledge-based extraction. Baseline: bi-LSTM+CRF upgrade: BERT pre-training.\n\u003C\u002Fdetails>\n\n## License\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>Expand License\u003C\u002Fb>\u003C\u002Fsummary>\n\nThis repository contains a variety of content; some developed by Cedric Chee, and some from third-parties. The third-party content is distributed under the license provided by those parties.\n\n*I am providing code and resources in this repository to you under an open source license.  Because this is my personal repository, the license you receive to my code and resources is from me and not my employer.*\n\nThe content developed by Cedric Chee is distributed under the following license:\n\n### Code\n\nThe code in this repository, including all code samples in the notebooks listed above, is released under the [MIT license](LICENSE). Read more at the [Open Source Initiative](https:\u002F\u002Fopensource.org\u002Flicenses\u002FMIT).\n\n### Text\n\nThe text content is released under the CC-BY-SA 4.0 license. 
Read more at [Creative Commons](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F).\n\u003C\u002Fdetails>\n\n[^1]: Infographic by AIM.\n","# 自然语言处理中精选的 Transformer 与迁移学习 [![Awesome](https:\u002F\u002Fawesome.re\u002Fbadge.svg)](https:\u002F\u002Fawesome.re)\n\n本仓库收录了一份精心挑选的自然语言处理（NLP）机器学习（深度学习）资源列表，重点涵盖生成式预训练 Transformer（GPT）、基于 Transformer 的双向编码器表示（BERT）、注意力机制、Transformer 架构\u002F网络、ChatGPT 以及 NLP 中的迁移学习。\n\n\u003Cp align=\"center\" width=\"100%\">\n  \u003Cimg src=\"https:\u002F\u002Fuser.githubusercontent.com\u002F145605\u002F206787465-bdfae6e0-c850-46fc-808d-a51c97644a9e.png#gh-dark-mode-only\" width=\"40%\" alt=\"Transformer (BERT 编码器)\" \u002F>\n\u003C\u002Fp>\n\u003Cp align=\"center\" width=\"100%\">\n  \u003Cimg src=\"https:\u002F\u002Fuser.githubusercontent.com\u002F145605\u002F79639176-9ca33d80-81bc-11ea-8cde-f7ff68ee2042.png#gh-light-mode-only\" width=\"40%\" alt=\"Transformer (BERT 编码器)\" \u002F>\n\u003C\u002Fp>\n\u003Cp align=\"center\" width=\"100%\">\n  \u003Csup>Transformer (\u003Ca href=\"https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20201217063603\u002Fhttps:\u002F\u002Fpeltarion.com\u002Fknowledge-center\u002Fdocumentation\u002Fmodeling-view\u002Fbuild-an-ai-model\u002Fblocks\u002Fbert-encoder\">来源\u003C\u002Fa>)\u003C\u002Fsup>\n\u003C\u002Fp>\n\n# 目录\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开目录\u003C\u002Fb>\u003C\u002Fsummary>\n\n- [论文](#papers)\n- [文章](#articles)\n  - [BERT 和 Transformer](#bert-and-transformer)\n  - [注意力机制](#attention-mechanism)\n  - [Transformer 架构](#transformer-architecture)\n  - [生成式预训练 Transformer (GPT)](#generative-pre-training-transformer-gpt)\n    - [ChatGPT](#chatgpt)\n  - [大语言模型 (LLM)](#large-language-model-llm)\n  - [Transformer 强化学习](#transformer-reinforcement-learning)\n  - [补充阅读](#additional-reading)\n- [教育资料](#educational)\n  - [教程](#tutorials)\n- [AI 安全](#ai-safety)\n- [视频](#videos)\n  - [BERTology](#bertology)\n  - [注意力和 Transformer 网络](#attention-and-transformer-networks)\n- [官方 BERT 实现](#official-bert-implementations)\n- [社区实现的 Transformer](#transformer-implementations-by-communities)\n  - [PyTorch 和 TensorFlow](#pytorch-and-tensorflow)\n  - [PyTorch](#pytorch)\n  - [Keras](#keras)\n  - [TensorFlow](#tensorflow)\n  - [Chainer](#chainer)\n  - [其他](#other)\n- [NLP 中的迁移学习](#transfer-learning-in-nlp)\n- [书籍](#books)\n- [其他资源](#other-resources)\n- [工具](#tools)\n- [任务](#tasks)\n  - [命名实体识别 (NER)](#named-entity-recognition-ner)\n  - [分类](#classification)\n  - [文本生成](#text-generation)\n  - [问答 (QA)](#question-answering-qa)\n  - [知识图谱](#knowledge-graph)\n\u003C\u002Fdetails>\n\n---\n\n## 论文\n\n1. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.\n2. [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, William W. Cohen, Jaime Carbonell, Quoc V. Le and Ruslan Salakhutdinov.\n  - 使用智能缓存来改进 Transformer 中长期依赖的学习。关键结果：在 5 个语言建模基准测试上达到最先进水平，包括 One Billion Word (LM1B) 上的 ppl 为 21.8 和 enwiki8 上的 0.99。作者声称该方法更灵活，评估期间更快（加速 1874 倍），在小数据集上泛化良好，且对短序列和长序列建模有效。\n2. [Conditional BERT Contextual Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06705) by Xing Wu, Shangwen Lv, Liangjun Zang, Jizhong Han and Songlin Hu.\n3. 
[SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03593) by Chenguang Zhu, Michael Zeng and Xuedong Huang.\n4. [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf) by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.\n5. [The Evolved Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11117) by David R. So, Chen Liang and Quoc V. Le.\n  - 他们使用架构搜索来改进 Transformer 架构。关键在于使用进化算法并以 Transformer 本身作为初始种群的种子。该架构更好且更高效，特别是对于小尺寸模型。\n6. [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08237) by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.\n  - 一种新的 NLP 预训练方法，在 20 个任务（如 SQuAD, GLUE, RACE）上显著优于 BERT。\n  - “Transformer-XL 是一个移位模型（每个超列以下一个 token 结尾），而 XLNet 是一个直接模型（每个超列以相同 token 的上下文表示结尾）。” — [Thomas Wolf](https:\u002F\u002Ftwitter.com\u002FThom_Wolf\u002Fstatus\u002F1141803437719506944?s=20)。\n  - [HN 评论](https:\u002F\u002Fnews.ycombinator.com\u002Fitem?id=20229145):\n    \u003Cdetails>\n  \n    \u003Csummary>一种巧妙的双重掩码和缓存算法。\u003C\u002Fsummary>\n\n- 这并非仅仅是向问题“投入更多算力 (compute)\"。\n    - 作者们设计了一种巧妙的双重掩码加缓存机制 (dual-masking-plus-caching mechanism)，以促使基于注意力的模型 (attention-based model) 学习从同一输入序列中所有其他词元 (tokens) 的因子分解顺序的所有可能排列中预测词元。\n    - 在期望上，模型学习收集每个词元两侧所有位置的信息来预测该词元。\n      - 例如，如果输入序列有四个词元 [\"The\", \"cat\", \"is\", \"furry\"]，在一个训练步骤中，模型将在看到 \"The\" 后尝试预测 \"is\"，然后是 \"cat\"，然后是 \"furry\"。\n      - 在另一个训练步骤中，模型可能会先看到 \"furry\"，然后是 \"The\"，然后是 \"cat\"。\n      - 注意，原始序列顺序始终保留，例如，模型始终知道 \"furry\" 是第四个词元。\n    - 实现这一目标的掩码和缓存算法在我看来并非易事。\n    - 在各种任务中对 SOTA（State-of-the-Art，最先进）性能的改进是显著的 -- 参见论文中的表 2、3、4、5 和 6。\n    \u003C\u002Fdetails>\n7. [CTRL: Conditional Transformer Language Model for Controllable Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05858) by Nitish Shirish Keskar, Richard Socher et al. [[Code](https:\u002F\u002Fgithub.com\u002Fsalesforce\u002Fctrl)].\n8. [PLMpapers](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPLMpapers) - BERT (Transformer, transfer learning) 已经催化了预训练语言模型 (PLMs) 的研究并引发了许多扩展。此仓库包含一份关于 PLMs 的论文列表。\n9. [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) by Google Brain.\n- 该团队使用统一的文本到文本转移 Transformer (T5) 模型对 NLP（自然语言处理）的迁移学习进行了系统研究，并推动其达到 SuperGLUE（接近人类基线）、SQuAD 和 CNN\u002FDM 基准的 SOTA（最先进水平）。[[Code](https:\u002F\u002Fgit.io\u002FJe0cZ)].\n10. [Reformer: The Efficient Transformer](https:\u002F\u002Fopenreview.net\u002Fforum?id=rkgNKkHtvB) by Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya.\n- “他们提出了降低 Transformer 时间和内存复杂度的技术，允许非常长的序列（64K）批次适配到一个 GPU 上。应为 Transformer 在 NLP 领域之外产生真正的影响力铺平道路。” — @hardmaru\n11. [Supervised Multimodal Bitransformers for Classifying Images and Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02950) (MMBT) by Facebook AI.\n11. [A Primer in BERTology: What we know about how BERT works](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12327) by Anna Rogers et al.\n- “你是否正淹没在 BERT 论文中？”。该团队调查了超过 40 篇关于 BERT 的语言学知识、架构调整、压缩、多语言能力等的论文。\n12. [tomohideshibata\u002FBERT-related papers](https:\u002F\u002Fgithub.com\u002Ftomohideshibata\u002FBERT-related-papers)\n13. 
[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03961) by Google Brain. [[Code]](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fmesh\u002Fblob\u002Fmaster\u002Fmesh_tensorflow\u002Ftransformer\u002Fmoe.py) | [[Blog post (unofficial)]](https:\u002F\u002Fsyncedreview.com\u002F2021\u002F01\u002F14\u002Fgoogle-brains-switch-transformer-language-model-packs-1-6-trillion-parameters\u002F)\n- 核心思想：架构在每个训练步骤和每个样本上使用一部分参数。优点：模型训练快得多。缺点：超大规模模型无法适应很多环境。\n14. [An Attention Free Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) by Apple.\n15. [A Survey of Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.04554) by Tianyang Lin et al.\n16. [Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) by OpenAI.\n- Codex，一个驱动 GitHub Copilot 的 GPT 语言模型。\n- 他们调查了模型的局限性（及优势）。\n- 他们讨论了部署强大代码生成技术的潜在更广泛影响，涵盖安全、安全和经济方面。\n17. [Training language models to follow instructions with human feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155) by OpenAI. 他们将生成的模型称为 [InstructGPT](https:\u002F\u002Fopenai.com\u002Fblog\u002Finstruction-following\u002F)。[ChatGPT](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F) 是 InstructGPT 的兄弟模型。\n18. [LaMDA: Language Models for Dialog Applications](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.08239) by Google.\n19. [Training Compute-Optimal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.15556) by Hoffmann et al. at DeepMind. TLDR（简而言之）：介绍了一种新的 70B LM（语言模型），名为 \"Chinchilla\"，其表现优于更大的 LLMs（GPT-3, Gopher）。DeepMind 发现了廉价扩展大型语言模型的秘诀——为了达到计算最优，模型大小和训练数据必须同等扩展。这表明大多数 LLMs 严重缺乏数据且训练不足。鉴于 [新的缩放定律](https:\u002F\u002Fwww.alignmentforum.org\u002Fposts\u002F6Fpvch8RR29qLEWNH\u002Fchinchilla-s-wild-implications)，即使你将千万亿参数注入模型（GPT-4 都市传说），收益也无法弥补 4 倍更多的训练 token。\n20. [Improving language models by retrieving from trillions of tokens](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04426) by Borgeaud et al. at DeepMind - 该团队探索了一条通过互联网规模检索进行高效训练的替代路径。该方法被称为 RETRO，即“检索增强 Transformer (Retrieval Enhanced TRansfOrmers)\"。使用 RETRO **模型不受训练期间所见数据的限制——它可以通过检索机制访问整个训练数据集。与具有相同参数数量的标准 Transformer 相比，这带来了显著的性能提升**。尽管使用了 25 倍更少的参数，RETRO 在 Pile 数据集上获得了与 GPT-3 相当的性能。他们表明，随着检索数据库规模的增加，语言建模性能持续提升。[[blog post](https:\u002F\u002Fwww.deepmind.com\u002Fblog\u002Fimproving-language-models-by-retrieving-from-trillions-of-tokens)]\n21. [Scaling Instruction-Finetuned Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.11416) by Google - 他们发现，上述方面的指令微调极大地提高了各种模型类别（PaLM, T5, U-PaLM）、提示设置（零样本、少样本、思维链 (CoT)）和评估基准上的性能。Flan-PaLM 540B 在多个基准上实现了 SOTA 性能。他们还公开发布了 [Flan-T5 检查点](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ft5x\u002Fblob\u002Fmain\u002Fdocs\u002Fmodels.md#flan-t5-checkpoints)，即使在比 PaLM 62B 大得多的模型面前，也能实现强大的少样本性能。\n22. [Emergent Abilities of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07682) by Google Research, Stanford University, DeepMind, and UNC Chapel Hill.\n23. [Nonparametric Masked (NPM) Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01349) by Meta AI et al. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FNPM)] - 非参数模型拥有 **500 倍更少的参数却在零样本任务上超越了 GPT-3。**\n    > 至关重要的是，它没有针对固定输出词汇表的 softmax，而是拥有短语上的完全非参数分布。这与最近（2022 年）将非参数组件纳入参数化模型的一系列工作形成对比。\n    >\n    > 结果显示，NPM 在参数效率方面显著更高，性能优于高达 500 倍的更大参数化模型以及高达 37 倍的更大检索生成模型。\n24. 
[Transformer models: an introduction and catalog](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07730) by Xavier Amatriain, 2023 - 本文的目标是提供一份相当全面但简单的最流行 Transformer 模型的目录和分类。论文还介绍了 Transformer 模型最重要的方面和创新。\n25. [Foundation Models for Decision Making: Problems, Methods, and Opportunities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.04129) by Google Research et al., 2023 - 一份关于近期方法（即条件生成建模、RL（强化学习）、提示）的报告，这些方法将预训练模型（即 LMs）应用于实际决策智能体。模型可以服务于世界动态或引导决策。\n26. [GPT-4 Technical Report](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fgpt-4.pdf) by OpenAI, 2023.\n27. [The Llama 3 Herd of Models](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fthe-llama-3-herd-of-models\u002F) by Llama Team, AI @ Meta, Jul 2024 - 这篇论文作为项目的常被忽视的组成部分，被证明同样重要，甚至更为关键，其重要性来得完全出乎意料。这篇论文本身也是一部杰作，提供了关于模型预训练和后训练流程的详细信息的宝库，提供了既深刻又实用的见解。[[Discussion](https:\u002F\u002Fold.reddit.com\u002F\u002Fr\u002FLocalLLaMA\u002Fcomments\u002F1eabf4l)]\n\n## 文章\n\n### BERT 与 Transformer\n\n1. [开源 BERT：自然语言处理 (NLP) 的先进预训练技术](https:\u002F\u002Fai.googleblog.com\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html) 来自 Google AI。\n2. [图解 BERT、ELMo 及其同类（NLP 如何攻克迁移学习）](https:\u002F\u002Fjalammar.github.io\u002Fillustrated-bert\u002F)。\n3. [拆解 BERT](https:\u002F\u002Fmedium.com\u002Fdissecting-bert) 由 Miguel Romero 和 Francisco Ingham 撰写 —— 通过直观、简洁的相关概念解释，深入理解 BERT。\n4. [Transformer-XL 轻量入门](https:\u002F\u002Fmedium.com\u002Fdair-ai\u002Fa-light-introduction-to-transformer-xl-be5737feb13)。\n5. [通用语言模型](https:\u002F\u002Flilianweng.github.io\u002Flil-log\u002F2019\u002F01\u002F31\u002Fgeneralized-language-models.html) 由 OpenAI 研究科学家 Lilian Weng 撰写。\n6. [XLNet 是什么以及为何它优于 BERT](https:\u002F\u002Ftowardsdatascience.com\u002Fwhat-is-xlnet-and-why-it-outperforms-bert-8d8fce710335)\n  - 排列语言建模 (Permutation Language Modeling) 目标是 XLNet 的核心。\n7. [DistilBERT](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fpytorch-transformers\u002Ftree\u002Fmaster\u002Fexamples\u002Fdistillation)（来自 HuggingFace），随博客文章 [更小、更快、更便宜、更轻：推出 DistilBERT，一种 BERT 的蒸馏版本](https:\u002F\u002Fmedium.com\u002Fhuggingface\u002Fdistilbert-8cf3380435b5) 一同发布。\n8. [ALBERT：用于语言表示自监督学习的轻量级 BERT 论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942v3) 来自 Google Research 和丰田技术研究所。—— 提高参数使用效率的改进：因子化嵌入参数化、跨层参数共享以及用于建模句间一致性的句子顺序预测 (SOP) 损失。[[博客文章](https:\u002F\u002Fai.googleblog.com\u002F2019\u002F12\u002Falbert-lite-bert-for-self-supervised.html) | [代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002FALBERT)]\n9. [ELECTRA：将文本编码器作为判别器而非生成器进行预训练](https:\u002F\u002Fopenreview.net\u002Fforum?id=r1xMH1BtvB) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, 和 Christopher D. Manning 撰写 —— 一种类似 ALBERT 的 BERT 变体，且训练成本更低。他们仅使用一个 GPU 就训练出了超越 GPT 的模型；使用 1\u002F4 的计算量即可达到 RoBERTa 的性能。它使用了一种名为替换令牌检测 (Replaced Token Detection, RTD) 的新预训练方法，该方法在从所有输入位置学习的同时训练双向模型。[[博客文章](https:\u002F\u002Fai.googleblog.com\u002F2020\u002F03\u002Fmore-efficient-nlp-model-pre-training.html) | [代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Felectra)]\n10. [ALBERT (轻量级 BERT) 可视化论文总结](https:\u002F\u002Famitness.com\u002F2020\u002F02\u002Falbert-visual-summary\u002F)\n11. [Cramming：一天内在单个 GPU 上训练语言模型（论文）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14034) (2022) —— 当社区大多数人都在询问如何将极端计算的极限推向何处时，我们提出了相反的问题：在仅仅一天的时间内，单个 GPU 能走多远？……通过缩放定律 (Scaling laws) 的视角，我们对近期的一系列训练和架构改进进行了分类，并讨论了它们在有限计算设置下的优点和实际适用性（或缺乏适用性）。\n12. 
[BERT 和 T5 发生了什么？关于 Transformer 编码器、PrefixLM 和去噪目标](https:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fmodel-architecture-blogpost-encoders-prefixlm-denoising) 作者 Yi Tay，2024 年 7 月\n    > 简而言之，我们看不到任何扩展版的 xBERT 在运行：BERT 模型已被弃用，转而采用更灵活的去噪（自回归）T5 模型形式。这主要是由于范式统一，人们希望用一个通用模型执行任何任务（而不是特定任务模型）。同时，自回归去噪有时被折叠为因果语言模型 (Causal Language Models) 的辅助目标。\n\n### 注意力机制\n\n[![Visualizing Attention, a Transformer's Heart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcedrickchee_awesome-transformer-nlp_readme_ed8c97aef1c1.jpg)](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=eMlx5fFNoYc)\n\n\u003Csup>可视化注意力：Transformer 的心脏\u003C\u002Fsup>\n\n1. [神经机器翻译：联合学习对齐与翻译](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.0473v1) by Dzmitry Bahdanau, KyungHyun Cho, and Yoshua Bengio, 2014 - [Bahdanau 发明了基于内容的神经网络注意力，这现在是基于深度学习的自然语言处理 (NLP)（语言模型）中的核心工具](https:\u002F\u002Farchive.is\u002FJxMmF#selection-99.0-103.76)。固定长度上下文向量设计的一个缺点是无法记住长句子。注意力机制 (Attention Mechanism) 应运而生以解决此问题。它旨在帮助在语言翻译中记忆长输入句子。[[Bahdanau 值得称赞](https:\u002F\u002Farchive.is\u002F3DwY5)]\n2. [Harvard NLP 小组注解版 Transformer](http:\u002F\u002Fnlp.seas.harvard.edu\u002F2018\u002F04\u002F03\u002Fattention.html) - 进一步阅读以理解“Attention is all you need\"论文。\n3. [Attention？Attention!](https:\u002F\u002Flilianweng.github.io\u002Flil-log\u002F2018\u002F06\u002F24\u002Fattention-attention.html) - OpenAI 的 Lilian Weng 撰写的注意力指南。\n4. [可视化神经机器翻译模型（带注意力的 Seq2seq 模型原理）](https:\u002F\u002Fjalammar.github.io\u002Fvisualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention\u002F) by Jay Alammar, Udacity ML Engineer Nanodegree 讲师。\n5. [使 Transformer 网络更简单、更高效](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Fmaking-transformer-networks-simpler-and-more-efficient\u002F) - FAIR 发布了一个全注意力层以简化 Transformer 模型，并采用自适应注意力跨度方法以提高效率（减少计算时间和内存占用）。\n6. [BERT 关注什么？对 BERT 注意力的分析论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04341) by Stanford NLP 小组。\n7. [快速 Transformer 解码：只需一个写入头 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02150) by Noam Shazeer, Google (2019) - 他们提出了一种称为 **多查询注意力 (MQA)** 的注意力类型变体。普通多头注意力机制每个头有一个查询、键和值；而多查询注意力则**在所有不同的注意力“头”之间共享一个键和值**。实际上，训练时间保持不变，但**推理时的解码速度更快**。MQA 显著提高了语言模型的性能和效率。用户可以获得约 10 倍的更好吞吐量，以及推理时约 30% 的更低延迟。然而，MQA 可能导致质量下降，而且仅为了更快的推理而训练单独的模型可能并不理想。2022 年，PaLM 是一种解码器风格的模型，其使用 MQA 是对 GPT 的一种有趣的架构改进。最近使用 MQA 的模型包括 [TII 的 Falcon](https:\u002F\u002Ffalconllm.tii.ae\u002F) (2023)。\n8. [GQA：从多头检查点训练通用多查询 Transformer 模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13245) by Google Research, 2023 - 他们 (1) 提出了一种**利用原始预训练算力的 5%，将现有的多头注意力 (MHA) 模型升级训练为具有多查询注意力 (MQA) 的模型的技术**，和 (2) 引入**分组查询注意力 (GQA)**，这是 MQA 的一种泛化形式，使用中间数量（多于一个，少于查询头数量）的键值头。GQA 通过减少键值头的数量，实现了**接近 MHA 的收益**，同时拥有**与 MQA 相当的推理速度**。使用 MQA 的模型包括 Meta 的 Llama 2 (2023)。 [[一些推文](https:\u002F\u002Ftwitter.com\u002F_philschmid\u002Fstatus\u002F1673335690912825347?s=20)]\n9. [用于近无限上下文的块状 Transformer 环注意力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01889) by UC Berkeley, 2023 - Ring Attention 是一种系统级优化技术，通过利用特定的硬件架构使精确的注意力计算更高效。\n10. [不留任何上下文：使用 Infini-attention 的高效无限上下文 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.07143) by Google, 2024 - Infini-attention 具有额外的压缩记忆和线性注意力，用于处理无限长的上下文。他们训练了一个 1B 参数的 Transformer 模型，该模型在高达 5K 序列长度的密码实例上进行了微调，从而解决了 1M Token 输入长度的问题。Infini-attention 机制为 Transformer 语言模型提供了一种高效且强大的方法，用于处理非常长的上下文，而无需大幅增加内存或计算量。\n11. 
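作为对上文第 7、8 条所述 MQA 与 GQA 的一个直观补充，下面给出一个极简的 PyTorch 草图（纯属示意性的假设实现，仅演示“键值头少于查询头、按组共享”这一思路，并非任何论文的官方代码）：\n\n    ```python\n    import torch\n\n    batch, seq, d_head = 2, 16, 64\n    n_q_heads, n_kv_heads = 8, 2        # GQA：8 个查询头共享 2 组键值头；n_kv_heads=1 时即退化为 MQA\n    group, _ = divmod(n_q_heads, n_kv_heads)\n\n    q = torch.randn(batch, n_q_heads, seq, d_head)\n    k = torch.randn(batch, n_kv_heads, seq, d_head)   # 推理时 KV cache 只需保存 n_kv_heads 份键值\n    v = torch.randn(batch, n_kv_heads, seq, d_head)\n\n    # 把每个键值头复制给同组内的查询头，然后照常做缩放点积注意力\n    k = k.repeat_interleave(group, dim=1)\n    v = v.repeat_interleave(group, dim=1)\n    attn = torch.softmax(q @ k.transpose(-2, -1) * d_head ** -0.5, dim=-1)\n    out = attn @ v\n    print(out.shape)                    # torch.Size([2, 8, 16, 64])\n    ```\n\n    这一共享方式也直观解释了为何 GQA 的推理速度接近 MQA，而质量更接近标准 MHA。\n11. 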
[检索头从机制上解释长上下文事实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.15574) by Wenhao Wu, Yao Fu 等，2024 - 该论文解释了 LLM（大语言模型）实际上如何处理上下文窗口。研究发现：他们发现 LLM 意外地开发了检索头，这并非创作者明确编码的。[代码：[一种统计计算 Transformer 模型中注意力头检索分数的算法](https:\u002F\u002Fgithub.com\u002Fnightdessert\u002FRetrieval_Head)]\n\n### Transformer 架构\n\n1. [Transformer（转换器）博客文章](https:\u002F\u002Fai.googleblog.com\u002F2017\u002F08\u002Ftransformer-novel-neural-network.html)。\n2. [图解 Transformer](https:\u002F\u002Fjalammar.github.io\u002Fillustrated-transformer\u002F) by Jay Alammar，Udacity 机器学习工程师纳米学位讲师。\n3. 观看 [Łukasz Kaiser 的演讲](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=rBCqOTEfxvg)，了解该模型及其细节。\n4. [Transformer-XL：释放注意力模型的潜力](https:\u002F\u002Fai.googleblog.com\u002F2019\u002F01\u002Ftransformer-xl-unleashing-potential-of.html) by Google Brain。\n5. [使用稀疏 Transformer 进行生成建模](https:\u002F\u002Fopenai.com\u002Fblog\u002Fsparse-transformer\u002F) by OpenAI - 一种对注意力机制 (attention mechanism) 的算法改进，可从比之前可能长 30 倍的序列中提取模式。\n6. [为强化学习稳定 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06764) 论文 by DeepMind 和 CMU - 他们提出了对原始 Transformer 和 XL 变体的架构修改，通过移动层归一化 (layer-norm) 并添加门控 (gating) 创建了门控 Transformer-XL (Gated Transformer-XL, GTrXL)。这大大提高了强化学习 (RL) 中的稳定性和学习速度（通过时间整合经验）。\n7. [Transformer 家族](https:\u002F\u002Flilianweng.github.io\u002Flil-log\u002F2020\u002F04\u002F07\u002Fthe-transformer-family.html) by Lilian Weng - 自从《Attention Is All You Need》论文发表以来，为了改进 Transformer 模型发生了许多新变化。本文就是关于这方面的内容。\n8. [DETR (**DE**tection **TR**ansformer)：基于 Transformer 的端到端目标检测](https:\u002F\u002Fai.facebook.com\u002Fblog\u002Fend-to-end-object detection-with-transformers\u002F) by FAIR - :fire: 计算机视觉尚未被 Transformer 革命席卷。与之前的目标检测系统相比，DETR 彻底改变了架构。（[PyTorch 代码和预训练模型](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fdetr)）。“对（非自回归）端到端检测的一次扎实尝试。锚框 + 非极大值抑制 (NMS) 是一团糟。我希望早在 ~2013 年检测就能实现端到端。” —— Andrej Karpathy\n9. [面向软件工程师的 Transformer](https:\u002F\u002Fblog.nelhage.com\u002Fpost\u002Ftransformers-for-software-engineers\u002F) - 这篇文章将对有兴趣学习机器学习模型 (ML models) 的软件工程师有帮助，特别是任何对 Transformer 可解释性 (interpretability) 感兴趣的人。这篇文章将逐步介绍一个（大部分）完整的 GPT 风格 Transformer 实现，但目标不是运行代码；相反，他们使用软件工程编程语言的语言来解释这些模型的工作原理，并阐述他们在进行可解释性工作时的某些观点。\n10. [Pathways 语言模型 (PaLM)：扩展至 5400 亿参数以实现突破性性能](https:\u002F\u002Fai.googleblog.com\u002F2022\u002F04\u002Fpathways-language-model-palm-scaling-to.html) - PaLM 是一个使用 Pathways 系统训练的密集仅解码器 (dense decoder-only) Transformer 模型，该系统使 Google 能够高效地在多个 TPU v4 Pods 上训练单个模型。解释笑话的例子令人瞩目。这表明它可以为需要复杂的多步逻辑推理、世界知识和深层语言理解组合的场景生成明确的解释。\n11. [通过状态空间增强 Transformer 实现高效的长序列建模 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08136) by 佐治亚理工学院和微软 - 注意力机制的二次计算成本限制了其在长序列中的实用性。现有的注意力变体提高了计算效率，但它们有效计算全局信息的能力有限。与 Transformer 模型并行，状态空间模型 (State Space Models, SSMs) 专为长序列量身定制，但不够灵活以捕捉复杂的局部信息。他们提出了 SPADE（状态空间增强 Transformer 的缩写），在长程领域 (Long Range Arena) 基准和各种语言模型 (LM) 任务上执行了各种基线，包括 Mega。这是一个有趣的方向。SSMs 和 Transformer 此前已被结合。\n12. [DeepNet：将 Transformer 扩展至 1000 层 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.00555) by 微软研究院 (2022) - 该团队引入了一种**新的归一化函数 (DEEPNORM)** 来修改 Transformer 中的残差连接 (residual connection)，并表明模型更新可以以**稳定的方式**进行约束。这提高了深层 Transformer 的训练稳定性，并将模型深度与 Google Brain (2019) 的 Gpipe（管道并行，pipeline parallelism）相比扩大了数量级（10 倍）。（谁还记得 ResNet (2015) 对卷积网络 (ConvNet) 做了什么？）\n13. [一种长度外推的 Transformer (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10554) by 微软 (2022) [[TorchScale 代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Ftorchscale)] - 这提高了扩展 Transformer 的**建模能力**。\n14. 
[饥饿的河马 (H3)：迈向使用状态空间模型 (SSMs) 的语言建模 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.14052) by 斯坦福大学 AI 实验室 (2022) - 一种新的语言建模架构。它**随上下文大小 (context size) 近乎线性扩展而非二次方**。不再有固定的上下文窗口，人人皆可拥有长上下文。尽管如此，由于硬件利用率低，SSMs 仍然比 Transformer 慢。那么，它是 Transformer 的继任者吗？[[推文](https:\u002F\u002Ftwitter.com\u002FrealDanFu\u002Fstatus\u002F1617605971395891201)]\n15. [使用投机采样加速大语言模型解码 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01318) by DeepMind (2023) - 投机采样 (Speculative sampling) 算法使得每次 Transformer 调用都能生成多个 token。在分布式设置下，使用 Chinchilla 实现了 2–2.5 倍的解码加速，同时不损害样本质量或对模型本身进行修改。\n16. [Transformer 高效训练综述 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01107) by 莫纳什大学等，2023 - 第一个系统概述，涵盖 1) 计算效率；优化（即稀疏训练）和数据选择（即 Token 掩码），2) 内存效率（即数据\u002F模型并行，卸载\u002F使用外部内存）和 3) 硬件\u002F算法协同设计（即高效注意力，硬件感知的低精度）。\n17. [无需捷径的深度 Transformer：修改自注意力以实现忠实信号传播 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10322) by DeepMind 等，2023\n18. [Hyena 层次结构：迈向更大的卷积语言模型 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.10866) by 斯坦福大学等，2023 - 注意力机制很棒。Hyena 是注意力的替代方案，通过使用隐式长卷积和门控，可以在**10 倍更长**的序列上学习，速度比优化的注意力快**100 倍**。[[推文](https:\u002F\u002Ftwitter.com\u002FMichaelPoli6\u002Fstatus\u002F1633167040130453505)]\n19. [FlashAttention：具有 IO 感知能力的快速且内存高效的精确注意力 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14135) by 斯坦福大学等，2022 - Transformer 变得更深更宽，但在长序列上训练它们仍然很困难。其核心的注意力层是计算和内存瓶颈：序列长度翻倍会导致运行时间和内存需求翻四倍。FlashAttention 是一种加速注意力并减少其内存占用的新算法——没有任何近似。它使得训练具有更长上下文的 LLMs 成为可能。[[代码](https:\u002F\u002Fgithub.com\u002FHazyResearch\u002Fflash-attention)]\n20. [得出结论：用线性变换捷径 Transformer (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09435v1) by Google Research 等，2023。[[推文](https:\u002F\u002Ftwitter.com\u002FLChoshen\u002Fstatus\u002F1637799047430905856)]\n21. [CoLT5：使用条件计算更快的长程 Transformer (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09752) by Google Research，2023 - 语言模型的 64K 上下文大小！这种方法在保持或提高与 LONGT5 相比的性能的同时，实现了更快的训练和推理。COLT5 的主要组件包括路由模块、条件前馈层和条件注意力层。路由模块为每个输入和组件选择重要的 token，而轻量分支以较低容量的操作处理所有 token，重分支仅对选定的重要 token 应用较高容量的操作。此外，COLT5 结合了多查询交叉注意力以加快推理速度，以及 UL2 预训练目标以改善长输入的上下文内学习能力。[[推文](https:\u002F\u002Ftwitter.com\u002Fmiolini\u002Fstatus\u002F1637677536921657344)]\n22. [google-research\u002Fmeliad](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fmeliad) - Meliad 库是正在开发的模型集合，作为 Google 持续进行的深度学习架构改进研究的一部分。该库目前包含几种 Transformer 变体，探索如何扩展流行的 Transformer 架构以更好地支持长序列上的语言建模。变体包括记忆 Transformer、带滑动窗口的 Transformer、块循环 Transformer 等。\n23. [LongNet：将 Transformer 扩展至 10 亿 tokens (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02486) by 微软研究院，2023。\n24. [vLLM：使用分页注意力轻松、快速且廉价地服务 LLM](https:\u002F\u002Fvllm.ai\u002F) by UC Berkeley 等，2023 - 吞吐量的提高来自于在几乎完全利用的 GPU 上节省显存 (VRAM)。\n25. [LLM 中 10 万上下文窗口背后的秘密酱料：所有技巧汇集一处](https:\u002F\u002Fblog.gopenai.com\u002Fhow-to-speed-up-llms-and-use-100k-context-window-all-tricks-in-one-place-ffd40577b4c)\n26. [Unlimiformer：具有无限长度输入的长程 Transformer (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01625) by CMU，2023。\n27. [PaLM 2 技术报告 (PDF)](https:\u002F\u002Fai.google\u002Fstatic\u002Fdocuments\u002Fpalm2techreport.pdf) by Google，2023。\n28. 
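作为对上文第 15 条投机采样的补充，其核心的先起草、再验证、按概率接受的循环大致如下（仅为示意草图，假设各位置上起草模型与目标模型对被起草 token 的概率已经算好；拒绝后由目标模型从残差分布 max(0, p - q) 重新采样的步骤此处从略，并非 DeepMind 论文的官方实现）：\n\n    ```python\n    import torch\n\n    def accept_draft_tokens(draft_tokens, q_probs, p_probs):\n        # draft_tokens: 小模型一次起草的若干个 token\n        # q_probs[i], p_probs[i]: 起草模型与目标模型赋予第 i 个被起草 token 的概率\n        # 逐个以 min(1, p 与 q 之比) 的概率接受；一旦拒绝就停止（rand * q < p 与该接受概率等价）\n        accepted = []\n        for tok, q, p in zip(draft_tokens, q_probs, p_probs):\n            if torch.rand(()) * q < p:\n                accepted.append(tok)\n            else:\n                break\n        return accepted\n\n    draft = ['_the', '_cat', '_sat']\n    q = torch.tensor([0.60, 0.30, 0.10])   # 起草（小）模型给出的概率\n    p = torch.tensor([0.70, 0.20, 0.40])   # 目标（大）模型给出的概率\n    print(accept_draft_tokens(draft, q, p))\n    ```\n\n    由于目标模型的一次前向就能验证多个起草 token，解码所需的串行步数随之减少，这正是上述加速的来源。\n28. 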
[深度混合 (MoD)：动态分配基于 Transformer 的语言模型的计算](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.02258) by Google DeepMind 等，2024 - MoD 方法在深度维度上扩展，同时保持浮点运算次数 (FLOPs) 不变（类似于专家混合 (Mixture of Experts, MoE) 在宽度维度上所做的方式）。MoD 模型可以学习将更复杂的 token 路由到更多层（类似于 MoE 中的专家可以专门针对某些领域）。该团队探讨了如何在不牺牲性能的情况下优化计算预算并提高效率。结果：MoD 以 66% 更快的训练速度匹配基线性能。现在的问题是，它能否扩展到超过 1B tokens。他们在 500M tokens 上进行了测试。[ELI5 版本：[深度混合遇上专家混合](https:\u002F\u002Flifeinthesingularity.com\u002Fp\u002Fgoogles-breakthroughs-in-ai-design)]\n29. [通过位置插值扩展大语言模型的上下文窗口](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.15595) by Meta Platforms，2023 - 位置插值 (Positional Interpolation, PI) 是一种有效且高效的方法，可以稳定地将基于 RoPE 的预训练大语言模型（如 LLaMA）的上下文窗口扩展到更长的长度（最高达 32768），同时保持最小微调（在千步以内）并保持性能。\n30. [PoSE：通过位置跳过训练高效扩展 LLM 的上下文窗口](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.10400) by Dawei Zhu 等，ICLR 2024 - PoSE 通过在固定上下文窗口内操纵位置索引来模拟训练期间的更长输入序列，而不是在完整的目标长度上进行训练。这使得训练长度与目标上下文长度解耦，与全长微调相比大大减少了内存和计算需求。PoSE 成功扩展了 LLaMA-1 以支持长达 128k tokens 的上下文长度，仅使用 2k 训练窗口，性能下降极小。[此模型](https:\u002F\u002Fhuggingface.co\u002Fwinglian\u002FLlama-3-8b-64k-PoSE) 使用 PoSE 将 Llama-3 8B 的上下文长度从 8k 扩展到 64k。PoSE 有潜力进一步扩展上下文长度，仅受限于推理内存，随着高效推理技术的不断改进。[代码：[PoSE](https:\u002F\u002Fgithub.com\u002Fdwzhu-pku\u002FPoSE)]\n31. [通过多 token 预测实现更好更快的大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.19737) by Meta 的 FAIR，2024 年 4 月 - 如果我们让语言模型预测几个 token 而不是仅仅下一个 token 会发生什么？他们表明，用多 token 预测任务替换下一个 token 预测任务可以导致**代码生成性能显著提高，使用完全相同的训练预算和数据——同时将推理性能提高 3 倍**。虽然类似的方法此前已用于微调以提高推理速度，但**这项研究扩展到大型模型的预训练，展示了这些规模下的显著行为和结果**。\n32. [nGPT：在超球面上进行表示学习的归一化 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.01131) by NVIDIA，2024 年 10 月 - 一种新颖的 Transformer 架构，其中所有向量（嵌入、MLP、注意力矩阵、隐藏状态）都归一化为单位范数并在超球面上运行。与标准 Transformer 相比，训练期间的收敛速度快 4-20 倍。通过强制归一化消除了对权重衰减的需求。归一化方法：矩阵向量乘法变为限制在 [-1,1] 的点积。架构更改：1) 注意力机制 - 归一化 QKV 投影矩阵，为 Q-K 点积引入可训练缩放因子，2) 层结构：为注意力块和 MLP 块引入可学习的“特征学习率”(α)。理论：可以在黎曼优化的背景下解释。优势：训练更稳定，下游任务性能提升，架构简化。\n33. [差分 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.05258) by 微软研究院等，2024 年 10 月 - 他们在多个维度上呈现了**相对于标准 Transformer 的显著改进**，特别强调注意力效率和 LM 任务的实际应用。一种新的架构，通过减少对无关上下文的注意力来改进注意力机制。与标准 Transformer 相比，在需要更少参数和训练 token 的情况下实现了更好的性能。解决方案：引入了“差分注意力”机制，将注意力分数计算为两个独立 softmax 注意力的差异。这种减法抵消了噪声。可以使用现有的 FlashAttention 高效实现。扩展效率：**仅需约 65% 的参数或训练 token 即可达到标准 Transformer 的性能**。改进：1) 在长达 64K tokens 的长序列上表现更好。2) 更擅长发现文档中嵌入的关键信息。3) ICL：对提示词顺序排列更具鲁棒性。4) 减少注意力误分配，这是幻觉的主要原因。技术细节：包括逐头归一化以处理稀疏注意力模式等。未来：开发高效的低比特注意力内核，由于更稀疏的注意力模式，压缩 KV 缓存的潜力等。[收听 [NotebookLM 播客](https:\u002F\u002Fnotebooklm.google.com\u002Fnotebook\u002F8e4c0907-8b12-4bc9-9d29-26c03daab71d\u002Faudio)]\n\n### 生成式预训练 Transformer (GPT)\n\n[![LLM 可视化](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcedrickchee_awesome-transformer-nlp_readme_0b68f8c13e24.png)](https:\u002F\u002Fbbycroft.net\u002Fllm)\n\n\u003Csup>GPT 可视化\u003C\u002Fsup>\n\n1. [更好的语言模型及其影响](https:\u002F\u002Fopenai.com\u002Fblog\u002Fbetter-language-models\u002F)。\n2. [利用无监督学习改进语言理解](https:\u002F\u002Fblog.openai.com\u002Flanguage-unsupervised\u002F) - 这是对原始 OpenAI GPT 模型的概述。\n3. [🦄 如何使用迁移学习构建最先进的对话 AI](https:\u002F\u002Fconvai.huggingface.co\u002F) by Hugging Face。\n4. [图解 GPT-2（可视化 Transformer 语言模型）](https:\u002F\u002Fjalammar.github.io\u002Fillustrated-gpt2\u002F) by Jay Alammar。\n5. [MegatronLM：使用 GPU 模型并行训练数十亿参数语言模型](https:\u002F\u002Fnv-adlr.github.io\u002FMegatronLM) by NVIDIA ADLR。\n6. [OpenGPT-2：我们复现了 GPT-2，因为你也可以](https:\u002F\u002Fmedium.com\u002F@vanya_cohen\u002Fopengpt-2-we-replicated-gpt-2-because-you-can-too-45e34e6d36dc) - 作者在类似规模的文本数据集上训练了一个 15 亿参数的 GPT-2 模型，并报告了可与原始模型进行比较的结果。\n7. 
[MSBuild 演示 OpenAI 生成式文本模型生成 Python 代码](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=fZSFNUT6iY8) [视频] - 该模型是在 GitHub 开源软件 (OSS) 仓库上训练的。该模型使用英文代码注释或仅函数签名来生成整个 Python 函数。很酷！\n8. [GPT-3：语言模型是少样本学习者（论文）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14165) by Tom B. Brown (OpenAI) 等 - “我们训练了 GPT-3，这是一个拥有 1750 亿参数的自回归语言模型 :scream:，比任何之前的非稀疏语言模型多 10 倍，并在少样本设置下测试了其性能。”\n9. [elyase\u002Fawesome-gpt3](https:\u002F\u002Fgithub.com\u002Felyase\u002Fawesome-gpt3) - 关于 OpenAI GPT-3 API 的一系列演示和文章集合。\n10. [GPT3 工作原理 - 可视化与动画](https:\u002F\u002Fjalammar.github.io\u002Fhow-gpt3-works-visualizations-animations\u002F) by Jay Alammar。\n11. [GPT-Neo](https:\u002F\u002Fwww.eleuther.ai\u002Fprojects\u002Fgpt-neo\u002F) - 复制一个 GPT-3 规模的模型并免费开源。GPT-Neo 是“一种模型并行 GPT2 及类 GPT3 模型的实现，能够使用 mesh-tensorflow 库扩展到完整的 GPT3 规模（甚至更大！）”。[[代码](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neo)]。\n12. [GitHub Copilot](https:\u002F\u002Fcopilot.github.com\u002F)，由 OpenAI Codex 提供支持 - Codex 是 GPT-3 的衍生模型。Codex 将自然语言转换为代码。\n13. [GPT-4 来自硅谷的传闻](https:\u002F\u002Fthealgorithmicbridge.substack.com\u002Fp\u002Fgpt-4-rumors-from-silicon-valley) - GPT-4 几乎准备好了。GPT-4 将是多模态的，接受文本、音频、图像以及可能的视频输入。发布时间窗口：12 月 - 2 月。#热议\n14. [新 GPT-3 模型：text-Davinci-003](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fmodels\u002Fdavinci) - 改进点：\n  - 处理更复杂的意图 — 现在您可以更有创意地利用其功能。\n  - 更高质量的写作 — 更清晰、更具吸引力且更引人入胜的内容。\n  - 更长篇幅内容生成的能力更强。\n15. [GPT-4 研究](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4) 落地页。\n16. [GPT-3 和 GPT-3.5 系列模型的综合能力分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10420) by Fudan University 等，2023。\n17. [通用人工智能的火花：GPT-4 的早期实验](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12712) by Microsoft Research，2023 - 论文中有完全令人惊叹的例子。\n\n#### ChatGPT\n\n[什么是 ChatGPT？](https:\u002F\u002Fopenai.com\u002Fblog\u002Fchatgpt\u002F)\n\n**简而言之：** ChatGPT 是一个对话式 Web 界面，由 OpenAI 最新的语言模型支持，该模型基于 [GPT-3.5 系列](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fmodel-index-for-researchers)（于 2022 年初完成训练）进行微调，针对对话进行了优化。它使用人类反馈强化学习 (RLHF) 进行训练；人类 AI 训练师通过扮演对话双方提供监督微调。\n\n显然，它在遵循用户指令和上下文方面优于 GPT-3。[人们已经注意到](https:\u002F\u002Farchive.ph\u002Fm6AOQ) ChatGPT 的输出质量似乎代表了相对于之前 GPT-3 模型的显著改进。\n\n更多信息，请查看 [ChatGPT 宇宙](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fchatgpt-universe)。这是我关于 ChatGPT 的所有理解的零散笔记，并存储了有关 ChatGPT 的一些有趣事物。\n\n### 大型语言模型 (LLM)\n\n![ChatGPT 与各类大语言模型](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcedrickchee_awesome-transformer-nlp_readme_83e681c3ee5a.jpeg)\n\nChatGPT 与各类大语言模型 [^1]\n\n1. [GPT-J-6B](https:\u002F\u002Ftowardsdatascience.com\u002Fcant-access-gpt-3-here-s-gpt-j-its-open-source-cousin-8af86a638b11) - 无法访问 GPT-3？这是 GPT-J —— 它的开源表亲。\n2. [Fun and Dystopia With AI-Based Code Generation Using GPT-J-6B](https:\u002F\u002Fminimaxir.com\u002F2021\u002F06\u002Fgpt-j-6b\u002F) - 使用基于 AI 的代码生成进行娱乐与反乌托邦：在 GitHub Copilot 技术预览发布之前，数据科学家 Max Woolf 测试了 GPT-J-6B 的“编写”代码能力。\n3. [GPT-Code-Clippy (GPT-CC)](https:\u002F\u002Fgithub.com\u002FCodedotAl\u002Fgpt-code-clippy) - GitHub Copilot 的开源版本。GPT-CC 模型是 GPT-2 和 GPT-Neo 的微调版本。\n4. [GPT-NeoX-20B](https:\u002F\u002Fblog.eleuther.ai\u002Fannouncing-20b\u002F) - 一个拥有 200 亿参数的模型，使用 EleutherAI 的 [GPT-NeoX](https:\u002F\u002Fgithub.com\u002FEleutherAI\u002Fgpt-neox) 框架训练。他们期望它在许多任务上表现良好。你可以在 [GooseAI](https:\u002F\u002Fgoose.ai\u002F) 沙盒上试用该模型。\n5. [Metaseq](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq) - 用于处理 [Open Pre-trained Transformers (OPT)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01068)（开放预训练 Transformer）的代码库。\n6. 
[YaLM 100B](https:\u002F\u002Fgithub.com\u002Fyandex\u002FYaLM-100B) by Yandex is a GPT-like pretrained language model with 100B parameters for generating and processing text. It can be used **freely** by developers and researchers from all over the world. - 由 Yandex 开发的 YaLM 100B 是一个类似 GPT 的预训练语言模型，拥有 1000 亿参数，用于生成和处理文本。它可被来自世界各地的开发者和研究人员**免费**使用。\n7. [BigScience's BLOOM-176B](https:\u002F\u002Fhuggingface.co\u002Fbigscience\u002Fbloom) from the Hugging Face repository [[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.15424), [blog post](https:\u002F\u002Fbigscience.huggingface.co\u002Fblog\u002Fbloom)] - BLOOM 是一个拥有 1750 亿参数的语言处理模型，能够像 GPT-3 和 OPT-175B 一样生成文本。它被开发为多语言模型，故意在包含 46 种自然语言和 13 种编程语言的语料集上进行训练。\n8. [bitsandbytes-Int8 inference for Hugging Face models](https:\u002F\u002Fdocs.google.com\u002Fdocument\u002Fd\u002F1JxSo4lQgMDBdnd19VBEoaG-mMfQupQ3XvOrgmRAVtpU\u002Fedit) - 你可以轻松地在单台机器上运行 BLOOM-176B\u002FOPT-175B，而不会降低性能。如果属实，这可能是一个游戏规则改变者，使大型科技公司以外的人也能使用这些大语言模型（LLM）。\n9. [WeLM: A Well-Read Pre-trained Language Model for Chinese (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.10372) by WeChat. [[online demo](https:\u002F\u002Fwelm.weixin.qq.com\u002Fdocs\u002Fplayground\u002F)] - 由微信开发的 WeLM：一个阅读广泛的中文预训练语言模型（论文）[[在线演示](https:\u002F\u002Fwelm.weixin.qq.com\u002Fdocs\u002Fplayground\u002F)]\n10. [GLM-130B: An Open Bilingual (Chinese and English) Pre-Trained Model (code and paper)](https:\u002F\u002Fgithub.com\u002FTHUDM\u002FGLM-130B) by Tsinghua University, China [[article](https:\u002F\u002Fkeg.cs.tsinghua.edu.cn\u002Fglm-130b\u002Fposts\u002Fglm-130b\u002F)] - One of the major contributions is making LLMs cost affordable using int4 quantization so it can run in limited compute environments.\n    > The resultant GLM-130B model offers **significant outperformance over GPT-3 175B** on a wide range of popular English benchmarks while the performance advantage is not observed in OPT-175B and BLOOM-176B. It also consistently and significantly outperforms ERNIE TITAN 3.0 260B -- the largest Chinese language model -- across related benchmarks. Finally, we leverage **a unique scaling property of GLM-130B to reach INT4 quantization, without quantization aware training and with almost no performance loss**, making it the first among 100B-scale models. **More importantly, the property allows its effective inference on 4×RTX 3090 (24G) or 8×RTX 2080 Ti (11G) GPUs, the most ever affordable GPUs required for using 100B-scale models**.\n    - 由中国清华大学开发的 GLM-130B：一个开放的 bilingual（中英双语）预训练模型（代码和论文）[[文章](https:\u002F\u002Fkeg.cs.tsinghua.edu.cn\u002Fglm-130b\u002Fposts\u002Fglm-130b\u002F)] - 主要贡献之一是使用 int4 量化使大语言模型（LLM）成本可控，以便在有限的计算环境中运行。\n    > 生成的 GLM-130B 模型在广泛流行的英文基准测试中表现出**显著优于 GPT-3 175B 的性能**，而在 OPT-175B 和 BLOOM-176B 中未观察到这种性能优势。它还一致且显著地优于 ERNIE TITAN 3.0 260B——最大的中文语言模型——在相关基准测试中。最后，我们利用**GLM-130B 的独特缩放特性实现了 INT4 量化，无需量化感知训练且几乎没有性能损失**，使其成为 1000 亿规模模型中的第一个。**更重要的是，该特性允许其在 4×RTX 3090 (24G) 或 8×RTX 2080 Ti (11G) GPU 上进行有效推理，这是使用 1000 亿规模模型所需的最经济的 GPU**。\n11. [Teaching Small Language Models to Reason (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08410) - They finetune a student model on the chain of thought (CoT) outputs generated by a larger teacher model. For example, the **accuracy of T5 XXL on GSM8K improves from 8.11% to 21.99%** when finetuned on PaLM-540B generated chains of thought.\n    - 教导小语言模型进行推理（论文） - 他们在一个更大的教师模型生成的思维链（CoT）输出上微调学生模型。例如，当在 PaLM-540B 生成的思维链上微调时，**T5 XXL 在 GSM8K 上的准确率从 8.11% 提高到 21.99%**。\n12. 
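A toy sketch of the recipe in the previous entry, i.e. finetuning a small student on teacher-written chains of thought; the teacher rationale below is a hardcoded stand-in and `t5-small` is only an assumed example checkpoint, so this is illustrative rather than the paper's code:\n\n    ```python\n    # Assumes: pip install torch transformers\n    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n\n    tokenizer = AutoTokenizer.from_pretrained('t5-small')\n    student = AutoModelForSeq2SeqLM.from_pretrained('t5-small')\n\n    # Each target is a (pretend) teacher-generated chain of thought followed by the final answer.\n    examples = [\n        ('Q: Tom has 3 apples and buys 2 more. How many apples does he have?',\n         'Tom starts with 3 apples. He buys 2 more, so 3 + 2 = 5. The answer is 5.'),\n    ]\n\n    inputs = tokenizer([q for q, _ in examples], return_tensors='pt', padding=True)\n    targets = tokenizer([t for _, t in examples], return_tensors='pt', padding=True)\n    loss = student(**inputs, labels=targets.input_ids).loss   # ordinary seq2seq loss on the CoT target\n    loss.backward()\n    print(float(loss))\n    ```\n12. 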
[ALERT: Adapting Language Models to Reasoning Tasks (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08286) by Meta AI - They introduce ALERT, a benchmark and suite of analyses for assessing language models' reasoning ability comparing pre-trained and finetuned models on complex tasks that require reasoning skills to solve. It covers 10 different reasoning skills including logistic, causal, common-sense, abductive, spatial, analogical, argument and deductive reasoning as well as textual entailment, and mathematics.\n    - Meta AI 发布的 ALERT：将语言模型适应于推理任务（论文） - 他们引入了 ALERT，这是一个基准和分析套件，用于评估语言模型的推理能力，比较预训练和微调模型在需要推理技能解决的复杂任务上的表现。它涵盖了 10 种不同的推理技能，包括逻辑、因果、常识、溯因、空间、类比、论证和演绎推理，以及文本蕴含和数学。\n13. [Evaluating Human-Language Model Interaction (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09746) by Stanford University and Imperial College London - They find that non-interactive performance does not always result in better human-LM interaction and that first-person and third-party metrics can diverge, suggesting the importance of examining the nuances of human-LM interaction.\n    - 评估人类与语言模型交互（论文） - 斯坦福大学和伦敦帝国理工学院的研究人员发现，非交互式性能并不总是导致更好的人类-LM 交互，并且第一人称和第三方指标可能会产生分歧，这表明检查人类-LM 交互细微差别的重要性。\n14. [Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09689) by Meta AI [[data](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)] - Fine-tuning a T5 on a large dataset collected with virtually no human labor leads to a model that surpassing the performance of models such as T0++ and Tk-Instruct across various benchmarks. These results demonstrate the potential of model-generated data as a **cost-effective alternative to crowdsourcing for dataset expansion and diversification**.\n    - Meta AI 发布的 Unnatural Instructions：用（几乎）零人力微调语言模型（论文）[[数据](https:\u002F\u002Fgithub.com\u002Forhonovich\u002Funnatural-instructions)] - 在通过几乎零人力收集的大型数据集上微调 T5，会导致一个在各种基准测试中超越 T0++ 和 Tk-Instruct 等模型性能的模型。这些结果证明了模型生成数据作为**数据集扩展和多样化众包（crowdsourcing）的经济替代方案**的潜力。\n15. [OPT-IML (OPT + Instruction Meta-Learning) (paper)](https:\u002F\u002Fraw.githubusercontent.com\u002Ffacebookresearch\u002Fmetaseq\u002Fmain\u002Fprojects\u002FOPT-IML\u002Foptimal_paper_v1.pdf) by Meta AI - OPT-IML is a set of instruction-tuned versions of OPT, on a collection of ~2000 NLP tasks — for research use cases. It boosts the performance of the original OPT-175B model using instruction tuning to improve zero-shot and few-shot generalization abilities — allowing it to adapt for more diverse language applications (i.e., answering Q’s, summarizing text). This improves the model's ability to better process natural instruction style prompts. Ultimately, humans should be able to \"talk\" to models as naturally and fluidly as possible. [[code (available soon), weights released](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT-IML)]\n    - Meta AI 发布的 OPT-IML (OPT + Instruction Meta-Learning) (论文) - OPT-IML 是一组针对约 2000 个自然语言处理（NLP）任务的指令微调版本的 OPT，用于研究用例。它使用指令微调来提升原始 OPT-175B 模型的性能，以改进 zero-shot 和 few-shot（少样本\u002F零样本）泛化能力——使其能够适应更多样化的语言应用（即，回答问题、总结文本）。这提高了模型更好地处理自然指令风格提示的能力。最终，人类应该能够尽可能自然流畅地与模型“交谈”。[[代码（即将发布），权重已发布](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Ftree\u002Fmain\u002Fprojects\u002FOPT-IML)]\n16. 
[jeffhj\u002FLM-reasoning](https:\u002F\u002Fgithub.com\u002Fjeffhj\u002FLM-reasoning) - This repository contains a collection of papers and resources on reasoning in Large Language Models.\n    - 此存储库包含关于大语言模型（LLM）中推理的一系列论文和资源。\n17. [Rethinking with Retrieval: Faithful Large Language Model Inference (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.00303) by University of Pennsylvania et al., 2022 - They shows the potential of enhancing LLMs by retrieving relevant external knowledge based on decomposed reasoning steps obtained through chain-of-thought (CoT) prompting. I predict we're going to see many of these types of retrieval-enhanced LLMs in 2023.\n    - 宾夕法尼亚大学等人在 2022 年发表的 Rethinking with Retrieval: Faithful Large Language Model Inference（论文） - 他们展示了通过基于思维链（CoT）提示获得的分解推理步骤检索相关外部知识来增强大语言模型（LLM）的潜力。我预测我们在 2023 年将看到许多这类检索增强的 LLM。\n18. [REPLUG: Retrieval-Augmented Black-Box Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12652) by Meta AI et al., 2023 - TL;DR: Enhancing GPT-3 with world knowledge — a retrieval-augmented LM framework that combines a frozen LM with a frozen\u002Ftunable retriever. It improves GPT-3 in language modeling and downstream tasks by prepending retrieved documents to LM inputs. [[Tweet](https:\u002F\u002Ftwitter.com\u002FWeijiaShi2\u002Fstatus\u002F1620497381962977281)]\n    - Meta AI 等人于 2023 年发表的 REPLUG: Retrieval-Augmented Black-Box Language Models（论文） - TL;DR: 用世界知识增强 GPT-3 —— 一种检索增强 LM 框架，结合冻结的 LM 与冻结\u002F可调的检索器。它通过将检索到的文档前置到 LM 输入中来改进 GPT-3 的语言建模和下游任务。[[推文](https:\u002F\u002Ftwitter.com\u002FWeijiaShi2\u002Fstatus\u002F1620497381962977281)]\n19. [Progressive Prompts: Continual Learning for Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12314) by Meta AI et al., 2023 - Current LLMs have hard time with catastrophic forgetting and leveraging past experiences. The approach learns a prompt for new task and concatenates with frozen previously learned prompts. This efficiently transfers knowledge to future tasks. [[code](https:\u002F\u002Fgithub.com\u002Farazd\u002FProgressivePrompts)]\n    - Meta AI 等人于 2023 年发表的 Progressive Prompts: Continual Learning for Language Models（论文） - 当前的 LLM 难以应对灾难性遗忘和利用过去的经验。该方法学习新任务的提示，并将其与之前学习的冻结提示连接起来。这有效地将知识转移到未来任务。[[代码](https:\u002F\u002Fgithub.com\u002Farazd\u002FProgressivePrompts)]\n20. [Large Language Models Can Be Easily Distracted by Irrelevant Context (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00093) by Google Research et al., 2023 - Adding the instruction \"Feel free to ignore irrelevant information given in the questions.\" consistently improves robustness to irrelevant context.\n    - Google Research 等人于 2023 年发表的大语言模型容易被无关上下文分心（论文） - 添加指令“请随意忽略问题中给出的无关信息。”持续提高对无关上下文的鲁棒性。\n21. [Toolformer: Language Models Can Teach Themselves to Use Tools (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.04761) by Meta AI, 2023 - A smaller model trained to translate human intention into actions (i.e. decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction).\n    - Meta AI 于 2023 年发表的 Toolformer: Language Models Can Teach Themselves to Use Tools（论文） - 一个较小的模型，经过训练将人类意图转化为行动（即决定调用哪些 API、何时调用、传递什么参数，以及如何最好地将结果纳入未来的 token 预测中）。\n22. 
[ERNIE 3.0 Titan: Exploring Larger-scale Knowledge Enhanced Pre-training for Language Understanding and Generation (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12731) by Baidu et al., 2021 - ERNIE 3.0 Titan is the latest addition to Baidu's ERNIE (Enhanced Representation through kNowledge IntEgration) family. It's inspired by the masking strategy of Google's BERT. ERNIE is also a unified framework. They also proposed a controllable learning algorithm and a credible learning algorithm. They apply online distillation technique to compress their model. To their knowledge, it is the largest (260B parameters) Chinese dense pre-trained model so far. [[article](http:\u002F\u002Fresearch.baidu.com\u002FBlog\u002Findex-view?id=165)]\n    - 百度等人于 2021 年发表的 ERNIE 3.0 Titan：探索更大规模的增强知识预训练以进行语言理解和生成（论文） - ERNIE 3.0 Titan 是百度 ERNIE（Enhanced Representation through kNowledge IntEgration）系列的最新成员。它受到 Google BERT 的掩码策略启发。ERNIE 也是一个统一框架。他们还提出了一种可控学习算法和一种可信学习算法。他们应用在线蒸馏技术来压缩他们的模型。据他们所知，这是迄今为止最大的（2600 亿参数）中文稠密预训练模型。[[文章](http:\u002F\u002Fresearch.baidu.com\u002FBlog\u002Findex-view?id=165)]\n23. [Characterizing Attribution and Fluency Tradeoffs for Retrieval-Augmented Large Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05578) by Google Research, 2023 - Despite recent progress, it has been difficult to prevent semantic hallucinations in generative LLMs. One common solution to this is augmenting LLMs with a retrieval system and making sure that the generated output is attributable to the retrieved information.\n    - Google Research 于 2023 年发表的表征检索增强大语言模型的归因与流畅度权衡（论文） - 尽管最近取得了进展，但很难防止生成式 LLM 中的语义幻觉。解决这个问题的一个常见方法是用检索系统增强 LLM，并确保生成的输出可归因于检索到的信息。\n24. [Augmented Language Models (ALMs): a Survey (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07842) by Meta AI, 2023 - Augmenting language models with reasoning skills and the ability to use various, non-parametric external modules for context processing and outperform traditional LMs on several benchmarks. This new research direction has the potential to address interpretability, consistency and scalability issues.\n    - Meta AI 于 2023 年发表的增强语言模型（ALMs）：综述（论文） - 用推理技能和利用各种非参数外部模块进行上下文处理的能力增强语言模型，并在多个基准测试中超越传统 LM。这一新的研究方向有望解决可解释性、一致性和可扩展性问题。\n25. [A Comprehensive Survey on Pretrained Foundation Models: A History from BERT to ChatGPT (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.09419) by MSU et al., 2023 - My remarks: this paper raises a lot of questions around the term \"foundation models\", i.e., what's the model bare minimum number of parameters to qualify as foundation? It sounds to me foundation models are an \"invented\" concept that doesn't have good validity.\n    - MSU 等人于 2023 年发表的预训练基础模型全面调查：从 BERT 到 ChatGPT 的历史（论文） - 我的评论：这篇论文引发了很多关于“基础模型”一词的问题，即，作为基础资格的最小参数数量是多少？在我看来，基础模型是一个“发明”的概念，没有很好的有效性。\n26. [Multimodal Chain-of-Thought Reasoning in Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00923) by Amazon Web Service et al., 2023 - The model outperform GPT-3.5 by 16% on the ScienceQA benchmark. This work is the first to study CoT reasoning in different modalities, language (text) and vision (images). Unfortunately, they never provide ablation study on how much of that performance gain was caused by the new modalities. 
[[code](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fmm-cot)]\n    - Amazon Web Service 等人于 2023 年发表的多模态思维链推理（论文） - 该模型在 ScienceQA 基准测试中比 GPT-3.5 高出 16%。这项工作首次研究了不同模态（语言\u002F文本和视觉\u002F图像）中的 CoT 推理。遗憾的是，他们从未提供消融实验来说明多少性能提升是由新模态引起的。[[代码](https:\u002F\u002Fgithub.com\u002Famazon-science\u002Fmm-cot)]\n27. [RECITE: Recitation-Augmented Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01296) by Google Research et al., ICLR 2023 - How can ChatGPT-like models achieve greater factual accuracy without relying on an external retrieval search engine? This paper shows that recitation can help LLMs generate accurate factual knowledge by reciting relevant passages from their own memory (by sampling) before producing final answers. The core idea is motivated by the intuition: recite-step that recollects relevant knowledge pieces helps answer-step (generation) better output. That's a recite-answer paradigm: first ask the LLM to generate the support paragraphs that contain the answer (knowledge-recitation) and then use it as additional prompt, along with the question to ask the LLM to generate the answer. They verify the effectiveness on four LLMs. They also show that recitation can be more effective than retrieval. This is important since having a retriever may lead to unpredictable behavior (i.e., Bing\u002FSydney). [[code](https:\u002F\u002Fgithub.com\u002FEdward-Sun\u002FRECITE)]\n    - Google Research 等人于 ICLR 2023 年发表的 RECITE：背诵增强语言模型（论文） - 像 ChatGPT 这样的模型如何在不依赖外部检索搜索引擎的情况下实现更高的事实准确性？这篇论文表明，背诵可以帮助 LLM 通过从自己的记忆（通过采样）中背诵相关段落来生成准确的事实知识，然后再产生最终答案。核心思想源于直觉：回顾步骤（recollect relevant knowledge pieces）有助于回答步骤（generation）更好地输出。这是一种背诵 - 回答范式：首先要求 LLM 生成包含答案的支持段落（知识背诵），然后将其作为额外提示，连同问题一起询问 LLM 生成答案。他们在四个 LLM 上验证了其有效性。他们还表明，背诵可能比检索更有效。这很重要，因为拥有检索器可能会导致不可预测的行为（即 Bing\u002FSydney）。[[代码](https:\u002F\u002Fgithub.com\u002FEdward-Sun\u002FRECITE)]\n28. [LLaMA: Open and Efficient Foundation Language Models (paper)](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F) by Meta AI, 2023 - A collection of language models ranging from 7B to 65B parameters. LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla70B and PaLM-540B. This shows that **smaller models trained with more data can outperform larger models**. This is **not contradictory to anything in the Chinchilla paper**, because it's not compute-optimally trained. GPU hours for training 7B model=82,432, 65B model=1,022,362 :scream:. Total time spent for all models: 2048 A100-80GB GPU for a period of approximately 5 months. The 65B model cost something in the range of ~$1-4M. Access to the model will be granted on a case-by-case basis though. People interested can apply for access. 
(Mar 2: [they just approved access to the models](https:\u002F\u002Ftwitter.com\u002Fcedric_chee\u002Fstatus\u002F1631182890418712578), llama-7B works in Colab [cedrickchee\u002Fllama](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fllama\u002Fblob\u002Fmain\u002Fnotebooks\u002Fvi_LLaMA_alpha.ipynb)) [Takeaways: [Tweet](https:\u002F\u002Fthreadreaderapp.com\u002Fthread\u002F1629496763148017665.html)]\n    - Meta AI 于 2023 年发表的 LLaMA：开放和高效的基础语言模型（论文） - 一系列从 70 亿到 650 亿参数的语言模型。LLaMA-13B 在大多数基准测试中优于 GPT-3 (175B)，LLaMA-65B 与最佳模型 Chinchilla70B 和 PaLM-540B 具有竞争力。这表明**使用更多数据训练的较小模型可以胜过较大模型**。这与 Chinchilla 论文中的任何内容**都不矛盾**，因为它不是按计算最优训练的。训练 7B 模型的 GPU 小时数=82,432，65B 模型=1,022,362 :scream:。所有模型花费的总时间：2048 A100-80GB GPU 大约 5 个月。65B 模型的成本大约在~$1-4M 范围内。不过，模型访问将根据具体情况授予。感兴趣的人可以申请访问。（3 月 2 日：[他们刚刚批准了模型访问权限](https:\u002F\u002Ftwitter.com\u002Fcedric_chee\u002Fstatus\u002F1631182890418712578)，llama-7B 可在 Colab 上使用 [cedrickchee\u002Fllama](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fllama\u002Fblob\u002Fmain\u002Fnotebooks\u002Fvi_LLaMA_alpha.ipynb)）[要点：[推文](https:\u002F\u002Fthreadreaderapp.com\u002Fthread\u002F1629496763148017665.html)]\n29. [Language Is Not All You Need: Aligning Perception with Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045) by Microsoft, 2023 - They introduce KOSMOS-1, a Multimodal Large Language Model (MLLM) that can perceive general modalities, learn in context (i.e., few-shot), and follow instructions (i.e., zero-shot). The total number of parameters is about 1.6B.\n    - Microsoft 于 2023 年发表的语言并非你所需的一切：将感知与语言模型对齐（论文） - 他们介绍了 KOSMOS-1，一种多模态大语言模型（MLLM），可以感知通用模态，在上下文中学习（即 few-shot），并遵循指令（即 zero-shot）。参数总数约为 16 亿。\n30. [Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813) by Microsoft Research et al - LLM-Augmenter significantly reduces ChatGPT's hallucinations without sacrificing the fluency and informativeness of its responses. The architecture and data flow: 1) Retrieve evidence from external knowledge. 2) Context and reasoning chains. 3) Give to LLM (i.e., ChatGPT). 4) Verify hallucinations. 5) If hallucinate, give feedback and revise.\n    - Microsoft Research 等人发表的核实事实并重试：利用外部知识和自动反馈改进大语言模型 - LLM-Augmenter 显著减少了 ChatGPT 的幻觉，同时不牺牲其响应的流畅度和信息量。架构和数据流：1) 从外部知识检索证据。2) 上下文和推理链。3) 交给 LLM（即 ChatGPT）。4) 验证幻觉。5) 如果幻觉，给予反馈并修订。\n31. [UL2: Unifying Language Learning Paradigms (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131) by Google Brain, 2022 - UL2 is a unified framework for pretraining models that are universally effective across datasets and setups. _Takeaways: Objective matters way more than architecture. Mixture-of-Denoisers (MoD) is effective if you care about doing well on more than one type of tasks\u002Fsettings._ UL2 frames different objective functions for training language models as denoising tasks, where the model has to recover missing sub-sequences of a given input. During pre-training it uses Mixture-of-Denoisers (MoD) that samples from a varied set of such objectives, each with different configurations. MoD combines diverse pre-training paradigms together. They demonstrated that models trained using the UL2 framework perform well in a variety of language domains, including prompt-based few-shot learning and models fine-tuned for down-stream tasks. They open sourced UL2 20B model and checkpoints back in 2022. In 2023, they open sourced Flan-UL2 20B and released the weights. 
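(As a rough mental model of the Mixture-of-Denoisers idea, the hedged sketch below corrupts a token sequence with spans whose length and frequency come from one of several assumed denoiser configurations, borrowing T5-style sentinel tokens for the blanks; it is illustrative only, not the UL2 implementation.)\n\n    ```python\n    import random\n\n    # Assumed configs in the spirit of R-, S- and X-denoisers: (name, mean span length, span-start rate).\n    DENOISERS = [('R', 3, 0.15), ('S', 32, 0.25), ('X', 12, 0.50)]\n\n    def mixture_of_denoisers(tokens, rng=random):\n        name, mean_span, rate = rng.choice(DENOISERS)   # sample one denoising objective per example\n        inputs, targets, i, sid = [], [], 0, 0\n        while i < len(tokens):\n            if rng.random() < rate:                     # crude proxy for the corruption rate\n                span = rng.randint(1, 2 * mean_span)    # span length, roughly mean_span on average\n                inputs.append(f'<extra_id_{sid}>')      # sentinel marks the sub-sequence to recover\n                targets.append(f'<extra_id_{sid}>')\n                targets.extend(tokens[i:i + span])\n                sid += 1\n                i += span\n            else:\n                inputs.append(tokens[i])\n                i += 1\n        return name, inputs, targets\n\n    toks = 'the quick brown fox jumps over the lazy dog'.split()\n    print(mixture_of_denoisers(toks))\n    ```\n\n    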
Check out: [[blog post](https:\u002F\u002Farchive.is\u002F20230303191656\u002Fhttps:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fflan-ul2-20b), [Tweet](https:\u002F\u002Ftwitter.com\u002FYiTayML\u002Fstatus\u002F1631359474421366784)]. I'm excited to see what the community does with this new model.\n    - Google Brain 于 2022 年发表的 UL2：统一语言学习范式（论文） - UL2 是一个统一的框架，用于预训练跨数据集和设置普遍有效的模型。_要点：目标比架构重要得多。如果你关心在多种类型的任务\u002F设置中表现出色，Denoisers 混合（MoD）是有效的。_ UL2 将训练语言模型的不同目标函数框架化为去噪任务，其中模型必须恢复给定输入的缺失子序列。在预训练期间，它使用 Mixture-of-Denoisers (MoD)，从这样的一组多样化的目标中进行采样，每个都有不同的配置。MoD 结合了不同的预训练范式。他们证明，使用 UL2 框架训练的模型在各种语言领域表现良好，包括基于提示的 few-shot 学习和为下游任务微调的模型。他们在 2022 年开源了 UL2 20B 模型和检查点。2023 年，他们开源了 Flan-UL2 20B 并发布了权重。查看：[[博客文章](https:\u002F\u002Farchive.is\u002F20230303191656\u002Fhttps:\u002F\u002Fwww.yitay.net\u002Fblog\u002Fflan-ul2-20b), [推文](https:\u002F\u002Ftwitter.com\u002FYiTayML\u002Fstatus\u002F1631359474421366784)]。我很期待社区如何使用这个新模型。\n32. [Larger language models do in-context learning (ICL) differently (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03846) by Google Research, 2023 - Overriding semantic priors when presented with enough flipped labels is an emergent ability of scale. LLMs learn better mappings when ICL labels are semantically unrelated to inputs (i.e., apple\u002Forange, negative\u002Fpositive). Fine-tuning to follow instruction helps both. [[Tweet](https:\u002F\u002Ftwitter.com\u002FJerryWeiAI\u002Fstatus\u002F1633548780619571200)]\n    - Google Research 于 2023 年发表的大语言模型以不同方式执行上下文学习（ICL）（论文） - 当呈现足够的翻转标签时，覆盖语义先验是规模的一种涌现能力。当 ICL 标签与输入在语义上不相关时（即，苹果\u002F橙子，负面\u002F正面），LLM 学习更好的映射。微调以遵循指令有助于两者。[[推文](https:\u002F\u002Ftwitter.com\u002FJerryWeiAI\u002Fstatus\u002F1633548780619571200)]\n33. [The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03915) by Hugging Face et al., 2023 - Documents the data creation and curation efforts of Responsible Open-science Open-collaboration Text Source (ROOTS) corpus, a dataset used to train BLOOM. [[Tweet](https:\u002F\u002Ftwitter.com\u002Farankomatsuzaki\u002Fstatus\u002F1633282997020672000)]\n    - Hugging Face 等人于 2023 年发表的 BigScience ROOTS 语料库：一个 1.6TB 复合多语言数据集（论文） - 记录了负责任开源科学开放协作文本源（ROOTS）语料库的数据创建和策展工作，该语料库用于训练 BLOOM。[[推文](https:\u002F\u002Ftwitter.com\u002Farankomatsuzaki\u002Fstatus\u002F1633282997020672000)]\n34. [PanGu-Σ: Towards Trillion Parameter Language Model with Sparse Heterogeneous Computing (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10845) by Huawei Technologies, 2023 - They develop a system that trained a trillion-parameter language model on a cluster of Ascend 910 AI processors and MindSpore framework. This resulted in a 6.3x increase in training throughput through heterogeneous computing.\n    - 华为技术有限公司于 2023 年发表的 PanGu-Σ：迈向具有稀疏异构计算的万亿参数语言模型（论文） - 他们开发了一个系统，在 Ascend 910 AI 处理器集群和 MindSpore 框架上训练了万亿参数语言模型。这通过异构计算使训练吞吐量增加了 6.3 倍。\n35. [Context-faithful Prompting for Large Language Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11315) by USC et al., 2023\n    - USC 等人于 2023 年发表的上下文忠实提示为大语言模型（论文）\n36. [Llama 2: Open Foundation and Fine-Tuned Chat Models (paper)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09288) by Meta AI, 2023 - Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length than Llama 1. Its fine-tuned models have been trained on over 1 million human annotations. It outperforms other open source language models on many benchmarks. 
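(For orientation, a minimal and purely hypothetical quick-start with the Hugging Face `transformers` API is sketched below; the checkpoint id and the bare `[INST]` wrapper, which omits the system prompt, are assumptions for illustration, and the weights themselves are gated as noted next.)\n\n    ```python\n    # Assumes: pip install torch transformers accelerate, plus approved access to the gated weights.\n    from transformers import AutoModelForCausalLM, AutoTokenizer\n\n    model_id = 'meta-llama\u002FLlama-2-7b-chat-hf'   # assumed example checkpoint id\n    tokenizer = AutoTokenizer.from_pretrained(model_id)\n    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')\n\n    prompt = '[INST] Explain RLHF in one sentence. [\u002FINST]'   # simplified Llama-2-chat prompt format\n    ids = tokenizer(prompt, return_tensors='pt').to(model.device)\n    out = model.generate(**ids, max_new_tokens=64)\n    print(tokenizer.decode(out[0], skip_special_tokens=True))\n    ```\n\n    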
License: The model and weights are available for free for research and commercial use. It is not an open source model, rather an open approach model — for commercial use, your product cannot have more than 700 million monthly active users and requires a form to get access. Llama-2-chat is the new addition and is created through using supervised fine-tuning and then iteratively refined using RLHF. [[Nathan Lambert's summary of the paper](https:\u002F\u002Fwww.interconnects.ai\u002Fp\u002Fllama-2-from-meta)]\n    - Meta AI 于 2023 年发表的 Llama 2：开放基础和微调聊天模型（论文） - Llama 2 预训练模型在 2 万亿 token 上训练，上下文长度是 Llama 1 的两倍。其微调模型已在超过 100 万个人类注释上训练。它在许多基准测试中优于其他开源语言模型。许可证：模型和权重可免费用于研究和商业用途。它不是一个开源模型，而是一个开放方法模型——对于商业用途，您的产品每月活跃用户不能超过 7 亿，并且需要填写表格才能获得访问权限。Llama-2-chat 是新加入的，是通过监督微调创建的，然后使用 RLHF（人类反馈强化学习）迭代优化。[[Nathan Lambert 的论文摘要](https:\u002F\u002Fwww.interconnects.ai\u002Fp\u002Fllama-2-from-meta)]\n37. [Code Llama: Open Foundation Models for Code (paper)](https:\u002F\u002Fai.meta.com\u002Fresearch\u002Fpublications\u002Fcode-llama-open-foundation-models-for-code\u002F) by Meta AI, 2023 - Code Llama is a family of LLMs for code based on Llama 2 providing SoTA performance among open models, infilling capabilities, support for large input contexts\n\n### Transformer 强化学习\n\n基于人类反馈的强化学习（RLHF）。\n\n- [图解基于人类反馈的强化学习](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Frlhf) - 语言模型的最新进展（例如 ChatGPT）均得益于 RLHF 技术。\n- [使用 RLHF 训练乐于助人且无害的助手（论文）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05862) 来自 Anthropic。[[代码和红队测试数据](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FAnthropic\u002Fhh-rlhf), [推文](https:\u002F\u002Ftwitter.com\u002Fanthropicai\u002Fstatus\u002F1514277273070825476)]\n- [后见之明使语言模型成为更好的指令遵循者（论文）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.05206) 来自加州大学伯克利分校，2023 年 - 底层 RLHF 算法较为复杂，需要额外的训练流程来训练奖励和价值网络。他们考虑了一种替代方法：后见之明指令重标记（HIR）：通过对原始反馈进行重新标记将其转换为指令，并训练模型以实现更好的对齐。\n- [从 r 到 Q*：你的语言模型秘密是一个 Q 函数](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.12358) 来自斯坦福大学，2024 年 4 月 - 该论文弥合了两种 RLHF 方法之间的差距——标准 RLHF 设置和直接偏好优化（DPO）——通过将 DPO 推导为词元级 MDP（马尔可夫决策过程）中的一种通用逆 Q 学习算法。作者提供了关于 DPO 益处的实证见解，包括其执行信用分配的能力，并使用简单的束搜索展示了相对于基础 DPO 策略的改进，具有**在多轮对话、推理和智能体系统中的潜在应用**。\n- [迭代推理偏好优化（IRPO）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.19733) 来自 Jason Weston (Meta) 等人，2024 年 5 月 - Llama-2-70B-Chat 使用此方法在 **GSM8K 上从 55.6% 提升至 81.6%**。他们应用迭代偏好优化来改进推理：利用 LLM 生成思维链候选项，根据答案是否正确构建偏好对，使用 DPO + NLL（负对数似然）进行训练，然后重复。例如，想象一群人试图决定如何分配有限的预算。每个人对于资金如何使用都有自己的优先事项和偏好。使用 IRPO 方法，群体会进行往复讨论，每个人根据他人的论点和妥协调整自己的偏好。随着时间的推移，群体会收敛于一组大家都能接受的偏好，即使它最初并不是任何一个人想要的。\n\n#### RLHF 工具\n\n- [lvwerra\u002FTRL](https:\u002F\u002Fgithub.com\u002Flvwerra\u002Ftrl) - 使用强化学习训练 Transformer 语言模型。\n\n面向 ChatGPT 的开源努力：\n\n- [CarperAI\u002FTRLX](https:\u002F\u002Fgithub.com\u002FCarperAI\u002Ftrlx) - 起源于 TRL 的一个分支。它允许你使用强化学习微调 Hugging Face 语言模型（基于 GPT2, GPT-NeoX），参数量高达 20B。由 CarperAI 带来（诞生于 EleutherAI，是 StabilityAI 家族的一部分组织）。CarperAI 正在开发生产就绪的开源 RLHF 工具。他们已经 [宣布了首个开源“指令微调”语言模型的计划](https:\u002F\u002Fcarper.ai\u002Finstruct-gpt-announcement\u002F)。\n- [allenai\u002FRL4LMs](https:\u002F\u002Fgithub.com\u002Fallenai\u002FRL4LMs) - Allen AI 的语言模型强化学习（RL4LMs）。它是一个模块化 RL 库，用于将语言模型微调至人类偏好。\n\n### 延伸阅读\n\n1. [如何构建 OpenAI 的 GPT-2：“过于危险而无法发布的 AI\"](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FMachineLearning\u002Fcomments\u002Fbj0dsa\u002Fd_how_to_build_openais_gpt2_the_ai_thats_too\u002F).\n2. [OpenAI 的 GPT2——是媒体的炒作还是警钟？](https:\u002F\u002Fwww.skynettoday.com\u002Fbriefs\u002Fgpt2)\n3. 
[Transformer（转换器）如何打破 NLP（自然语言处理）排行榜](https:\u002F\u002Fhackingsemantics.xyz\u002F2019\u002Fleaderboards\u002F) 作者 Anna Rogers. :fire::fire::fire:\n- 一篇关于当前主导 NLP 的大模型问题的精辟总结文章。\n- 更大的模型 + 更多数据 = 机器学习研究中的进步 :question:\n4. [从零开始构建 Transformer](http:\u002F\u002Fwww.peterbloem.nl\u002Fblog\u002Ftransformers) 教程 作者 Peter Bloem.\n5. [使用 NVIDIA TensorRT 和 BERT（双向编码器表示）进行实时自然语言理解](https:\u002F\u002Fdevblogs.nvidia.com\u002Fnlu-with-tensorrt-bert\u002F) 在 Google Cloud T4 GPU（图形处理器）上实现了 2.2 毫秒的推理延迟。优化方案已在 GitHub 上开源。\n6. [NLP 的“聪明汉斯”时刻已到来](https:\u002F\u002Fthegradient.pub\u002Fnlps-clever-hans-moment-has-arrived\u002F) 作者 The Gradient.\n7. [神经网络中的语言、树与几何](https:\u002F\u002Fpair-code.github.io\u002Finterpretability\u002Fbert-tree\u002F) —— 伴随论文《可视化与测量 BERT 的几何结构》的一系列解释性笔记，由 Google 的 People + AI Research (PAIR) 团队撰写。\n8. [Transformer 基准测试：PyTorch 与 TensorFlow](https:\u002F\u002Fmedium.com\u002Fhuggingface\u002Fbenchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 作者 Hugging Face —— 针对广泛 Transformer 架构的推理时间（CPU 和 GPU 上）及内存使用情况的对比。\n9. [Transformer 中表示的演变](https:\u002F\u002Flena-voita.github.io\u002Fposts\u002Femnlp19_evolution.html) —— 一篇通俗易懂的文章，展示了他们 EMNLP 2019 论文的见解。他们研究了在不同目标下训练的 Transformer 中，各个词元（Token）的表示是如何变化的。\n10. [BERT 的黑暗秘密](https:\u002F\u002Ftext-machine-lab.github.io\u002Fblog\u002F2020\u002Fbert-secrets\u002F) —— 这篇文章探究了微调（Fine-tuned）后的 BERT 模型中的语言学知识。特别是，作者分析了具有某种语言学解释的自注意力（Self-attention）模式实际上有多少被用于解决下游任务。TL;DR（简而言之）：他们未能找到证据表明具有语言学可解释性的自注意力图对下游性能至关重要。\n11. [首次使用 BERT 的视觉指南](https:\u002F\u002Fjalammar.github.io\u002Fa-visual-guide-to-using-bert-for-the-first-time\u002F) —— Jay Alammar 撰写的关于在实践中使用 BERT 的教程，例如用于电影评论的情感分析。\n12. [Turing-NLG：一个拥有 170 亿参数的语言模型](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fturing-nlg-a-17-billion-parameter-language-model-by-microsoft\u002F) 由 Microsoft 开发，在许多下游 NLP 任务上超越了最先进水平。如果没有 [DeepSpeed 库](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeepSpeed)（兼容 PyTorch）和 [ZeRO 优化器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02054) 取得的突破，这项工作将无法实现，更多关于此的内容可参考随附的 [博客文章](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002Fzero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters)。\n13. [MUM（多任务统一模型）：理解信息的新 AI 里程碑](https:\u002F\u002Fblog.google\u002Fproducts\u002Fsearch\u002Fintroducing-mum\u002F) 由 Google 发布。\n- 基于 Transformer 架构但更强大。\n- 多任务意味着：支持文本和图像，75 种语言间的知识迁移，理解上下文并深入探讨主题，以及生成内容。\n14. [GPT-3 不再是唯一的热门选择](https:\u002F\u002Flastweekin.ai\u002Fp\u002Fgpt-3-is-no-longer-the-only-game) —— GPT-3 去年（2020 年）迄今为止是该类最大的 AI 模型。现在？情况并非如此。\n15. [OpenAI 的 API（应用程序编程接口）现已可用，无需排队](https:\u002F\u002Fopenai.com\u002Fblog\u002Fapi-no-waitlist\u002F) —— 无需等待即可访问 GPT-3。然而，应用必须在 [上线](https:\u002F\u002Fbeta.openai.com\u002Fdocs\u002Fgoing-live) 之前获得批准。此次发布还允许其审查申请、监控滥用情况，并更好地理解该技术的影响。\n16. [GPT-3 的固有局限性](https:\u002F\u002Flastweekin.ai\u002Fp\u002Fthe-inherent-limitations-of-gpt-3) —— 如果你之前读过 [Gwern 的 GPT-3 创意小说文章](https:\u002F\u002Fwww.gwern.net\u002FGPT-3#repetitiondivergence-sampling)，那么文章中缺少的一件事就是被称为“重复\u002F发散采样”的谜团：\n    > 当你生成分散式补全时，它们最终往往会陷入无意义内容的重复循环中。\n\n对于使用 Copilot（GitHub Copilot）的用户来说，你们应该都经历过这种奇怪的现象：它一遍又一遍地生成相同的代码行或代码块。\n17. [大规模语言建模：Gopher、伦理考量与检索](https:\u002F\u002Fdeepmind.com\u002Fblog\u002Farticle\u002Flanguage-modelling-at-scale) by DeepMind - 该论文分析了基于 Transformer（一种深度学习架构）的语言模型在不同规模下的性能表现——从拥有数千万参数的模型到名为 Gopher 的拥有 2800 亿参数的模型。\n18. 
[使用 AlphaCode 进行编程竞赛](https:\u002F\u002Fdeepmind.com\u002Fblog\u002Farticle\u002FCompetitive-programming-with-AlphaCode) by DeepMind - AlphaCode 使用基于 Transformer 的语言模型来生成代码，这些代码能够针对需要理解算法的编程问题创造出新颖的解决方案。\n19. [完全通过自然语言构建游戏和应用，使用 OpenAI 的 code-davinci 模型](https:\u002F\u002Fandrewmayneblog.wordpress.com\u002F2022\u002F03\u002F17\u002Fbuilding-games-and-apps-entirely-through-natural-language-using-openais-davinci-code-model\u002F) - 作者仅通过告诉模型他们想要什么，就构建了几个小游戏和应用，完全没有触碰一行代码。\n20. [OpenAI 雇佣大量人类来修复 GPT 的错误答案以使 GPT-3 正常工作](https:\u002F\u002Fstatmodeling.stat.columbia.edu\u002F2022\u002F03\u002F28\u002Fis-open-ai-cooking-the-books-on-gpt-3\u002F)\n21. [GPT-3 可以运行代码](https:\u002F\u002Fmayt.substack.com\u002Fp\u002Fgpt-3-can-run-code) - 你提供输入文本和指令，GPT-3 会将它们转换为预期的输出。它在更改编码风格、在编程语言之间翻译、重构和添加文档等任务上表现良好。例如，将 JSON 转换为 YAML，将 Python 代码翻译成 JavaScript，改进函数的运行时复杂度。\n22. [使用 GPT-3 解释代码工作原理](https:\u002F\u002Fsimonwillison.net\u002F2022\u002FJul\u002F9\u002Fgpt-3-explain-code\u002F) by Simon Willison.\n23. [Character AI 宣布正在建立全栈 AGI（通用人工智能）公司](https:\u002F\u002Fblog.character.ai\u002Fintroducing-character\u002F) 这样你就可以利用对话式 AI 研究创建自己的 AI 来帮助你做任何事情。联合创始人 Noam Shazeer（共同发明了 Transformer，首次将其扩展到超级计算机，并开创了大规模预训练）和 Daniel de Freitas（领导了 LaMDA 的开发），所有这些都对最近的 AI 进步至关重要。\n24. [OpenAI 最新的 GPT-3 模型有多好？](https:\u002F\u002Fscale.com\u002Fblog\u002Fgpt-3-davinci-003-comparison) - 除了 ChatGPT，OpenAI 还发布了 text-davinci-003，这是一个经过强化学习微调的模型，在长文写作方面表现更好。例如，它可以以 Eminem 的风格解释代码。😀\n25. [OpenAI 竞争对手 Cohere 推出语言模型 API](https:\u002F\u002Fventurebeat.com\u002Funcategorized\u002Fopenai-rival-cohere-launches-language-model-api\u002F) - 得到 AI 专家支持，他们的目标是将谷歌质量的预测语言带给大众。Aidan Gomez 在 Google Brain 联合撰写了一篇具有开创性的 2017 年论文，发明了一种被称为“Transformer\"的概念。\n26. [与 OpenAI 的 GPT-3 竞争的所有初创公司都需要解决相同的问题](https:\u002F\u002Fwww.theregister.com\u002F2022\u002F03\u002F03\u002Flanguage_model_gpt3\u002F) - 去年，两家初创公司发布了各自专有的文本生成 API。AI21 Labs 于 2021 年 8 月推出了其拥有 1780 亿参数的 Jurassic-1，Cohere 则发布了一系列模型。Cohere 尚未披露其模型包含多少参数。……还有其他新兴初创公司正致力于解决同样的问题。Anthropic 是一家由前 OpenAI 员工组成的团队创立的 AI 安全和研究公司。几位研究人员离开 Google Brain 加入了由同事创办的两家新企业。一家名为 Character.ai，另一家名为 Persimmon Labs。\n27. [Cohere 希望构建终极 NLP（自然语言处理）平台](https:\u002F\u002Falbertoromgar.medium.com\u002Fcohere-wants-to-build-the-definitive-nlp-platform-7d090c0de9ca) - 超越像 GPT-3 这样的生成式模型。\n28. [Transformer 推理算术](https:\u002F\u002Fkipp.ly\u002Fblog\u002Ftransformer-inference-arithmetic\u002F) Cohere 的 ML Ops（机器学习运维）Carol Chen 的技术文章。本文详细阐述了关于 LLM（大语言模型）推理性能的少数原则推理，没有实验或复杂的数学。\n29. [2022 年 AI 状态报告](https:\u002F\u002Fwww.stateof.ai\u002F2022-report-launch.html) - 关键要点：\n  - 新的独立研究机构正在迅速开源主要机构闭源的成果。\n  - AI 安全正吸引更多人才……但仍极度被忽视。\n  - 驱动 GitHub Copilot 的 OpenAI 的 Codex，凭借其在多行代码或直接通过自然语言指令完成代码的能力，给计算机科学界留下了深刻印象。这一成功激发了该领域更多的研究。\n  - DeepMind 重新审视了 LM（语言模型）缩放定律，发现当前的 LMs（语言模型）显著训练不足：鉴于其庞大的规模，它们使用的训练数据不够。他们使用 4.6 倍的数据训练了 Chinchilla（Gopher 的 4 倍小版本），发现 Chinchilla 在 BIG-bench 上优于 Gopher 和其他大型模型。\n  - 来自人类反馈的强化学习（RLHF）已成为微调 LLM（大语言模型）并将其与人类价值观对齐的关键方法。这涉及人类对给定输入采样的语言模型输出进行排名，利用这些排名学习人类偏好的奖励模型，然后使用此作为奖励信号，利用 RL（强化学习）微调语言模型。\n30. [缩放假设](https:\u002F\u002Fwww.gwern.net\u002FScaling-hypothesis) by Gwern - 关于 GPT-3：元学习 (meta-learning)、缩放、影响和深层理论。\n31. [AI 与语言的局限——仅用单词和句子训练的 AI 系统永远无法近似人类的理解](https:\u002F\u002Fwww.noemamag.com\u002Fai-and-the-limits-of-language\u002F) by Jacob Browning and Yann LeCun - 像 ChatGPT 这样的 LLM（大语言模型）能做什么和不能做什么，以及为什么 AGI（通用人工智能）尚未到来。\n32. [错误地使用 GPT-3 基础模型（Foundational Models）：降低成本 40 倍，提高速度 5 倍](https:\u002F\u002Fwww.buildt.ai\u002Fblog\u002Fincorrectusage) - 当微调模型时，需要注意几点。关于如何大规模使用这些模型，我们仍有很多要学。我们需要更好的指南。\n33. 
[下一代大型语言模型](https:\u002F\u002Farchive.vn\u002FWFZnG) - 它突出了 3 个新兴领域：1) 能够生成自己的训练数据以改进自身的模型，2) 能够自我事实核查的模型，以及 3) 大规模稀疏专家模型 (massive sparse expert models)。\n34. [GPT-4 分析与预测](https:\u002F\u002Fwww.lesswrong.com\u002Fposts\u002FqdStMFDMrWAnTqNWL\u002Fgpt-4-predictions) - 有些关联，在 [\"Bing Chat 明显且激进地未对齐\"](https:\u002F\u002Fwww.lesswrong.com\u002Fposts\u002FjtoPawEhLNXNxvgTT\u002Fbing-chat-is-blatantly-aggressively-misaligned) 帖子中，Gwern 思考 Bing Chat\u002FSydney 为何与 ChatGPT 如此不同，他的假设是：\"Sydney 不是经过 RLHF（来自人类反馈的强化学习）训练的 GPT-3 模型，而是匆忙开发的 GPT-4 模型”。也有人认为 Sydney 在推理任务上的表现优于 ChatGPT\u002FGPT-3.5，它可能是 GPT-4。\n35. [Mosaic LLMs（第 2 部分）：低于 50 万美元获得 GPT-3 质量（2022）](https:\u002F\u002Farchive.is\u002Fgu2li) - 他们声称他们的 [Composer PyTorch 框架](https:\u002F\u002Fgithub.com\u002Fmosaicml\u002Fcomposer) 简化了模型训练。现在有了 Colossal-AI 框架，我想知道他们的解决方案有多好。直到他们的用户实际训练它，我想一切都是纯假设。\n36. [我手工制作了一个 Transformer（无需训练！）](https:\u002F\u002Fvgel.me\u002Fposts\u002Fhandmade-transformer\u002F) (2023) - 手动制作一个 Transformer 来预测简单序列——不是通过训练一个，或使用预训练权重，而是在一个晚上内**手工分配每个权重**。\n37. [在 GCP（Google Cloud Platform）上为生产用例微调 Llama 3.1](https:\u002F\u002Fwww.zenml.io\u002Fblog\u002Fhow-to-finetune-llama-3-1-with-zenml)\n\n## 教育类\n\n\u003Cdiv style=\"width: 200px;\">\n\u003Ctable>\n\u003Ctr>\n    \u003Ctd>\u003C\u002Ftd>\n    \u003Ctd colspan=\"2\">\u003Cvideo width=\"100%\" src='https:\u002F\u002Fgithub.com\u002Fpoloclub\u002Ftransformer-explainer\u002Fassets\u002F5067740\u002F5c2d6a9d-2cbf-4b01-9ce1-bdf8e190dc42'>\u003C\u002Ftd>\n    \u003Ctd>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003Ctr>    \n    \u003Ctd colspan=\"2\" align=\"right\">\u003Ca href=\"http:\u002F\u002Fpoloclub.github.io\u002Ftransformer-explainer\">在线演示\u003C\u002Fa>\u003C\u002Ftd>\n    \u003Ctd colspan=\"2\">\u003Ca href=\"https:\u002F\u002Fyoutu.be\u002FECR4oAwocjs\">演示视频\u003C\u002Fa>\u003C\u002Ftd>\n\u003C\u002Ftr>\n\u003C\u002Ftable>\n\u003C\u002Fdiv>\n\n- [minGPT](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002FminGPT) by Andrej Karpathy - 一个基于 PyTorch 的 GPT 重新实现，涵盖训练和推理。minGPT 力求小巧、简洁、可解释且具教育意义，因为目前可用的大多数 GPT 模型实现都略显繁杂。GPT 并非复杂的模型，该实现代码量约为 300 行。\n  - [nanoGPT](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002FnanoGPT) - 这是对 minGPT 的重写版本。仍在积极开发中。相关的持续视频讲座系列 _[Neural Networks: Zero to Hero](https:\u002F\u002Fkarpathy.ai\u002Fzero-to-hero.html)_，从头开始用代码构建 GPT，并致力于详尽说明一切。注意 Karpathy 的自下而上方法与 fast.ai 的教学风格相得益彰。（供参考，fast.ai 既有自上而下（“第一部分”）也有自下而上（“第二部分”）的方法。）\n- [大型语言模型 (LLMs) 的视觉介绍 by Jay Alammar\u002FCohere](https:\u002F\u002Fjalammar.github.io\u002Fapplying-large-language-models-cohere\u002F) - 对 LLMs 及其在语言处理中的一些应用的高层概览。涵盖了文本生成模型（如 GPT）和表示模型（如 BERT）。\n- [解释 Transformer 语言模型的界面](https:\u002F\u002Fjalammar.github.io\u002Fexplaining-transformers\u002F) by Jay Alammar - 通过查看神经网络内部的输入显著性和神经元激活来温和地介绍 Transformer 模型 (Transformer)。**然而，我们对这些模型为何如此有效的工作原理的理解仍然落后于这些发展**。\n- [餐巾纸上的 GPT-3 架构](https:\u002F\u002Fdugas.ch\u002Fartificial_curiosity\u002FGPT_architecture.html)\n- [PicoGPT: 用 60 行 NumPy 代码实现的 GPT](https:\u002F\u002Fjaykmody.com\u002Fblog\u002Fgpt-from-scratch\u002F)\n- [关于 Transformer 架构核心的视频解释](https:\u002F\u002Fyoutu.be\u002FkWLed8o5M2Y?si=YOMpWS1gfWMADzTX) (2023) - 阅读了《图解 Transformer》，但仍觉得没有直观理解各种注意力组件的作用吗？在这个视频中，一种更具建设性的解释 Transformer 和注意力的方法可以帮助你更好地理解它：从简单的卷积神经网络 (CNN) 开始，作者将逐步带你了解将 CNN 转变为 Transformer 所需进行的所有更改。\n- [黑客指南：语言模型 (视频)](https:\u002F\u002Fyoutu.be\u002FjkrNMKz9pWU?si=4qsEubcueqp45geo) (2023) by Jeremy Howard, fast.ai - 快速浏览语言模型的所有基本思想，尽可能多地使用代码演示如何使用它们（包括开源模型和基于 OpenAI 的模型）。\n\n### 教程\n\n1. 
[如何使用 Transformers 和 Tokenizers 从头开始训练新的语言模型](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fhow-to-train) 教程 by Hugging Face。 :fire:\n\n## AI 安全\n\n可解释性 (Interpretability) 研究和 AI 对齐 (AI Alignment) 研究。\n\n- [Transformer Circuits Thread](https:\u002F\u002Ftransformer-circuits.pub\u002F) project by Anthropic - 我们能否将 Transformer 语言模型逆向工程为人类可理解的计算机程序？可解释性研究非常受益于交互式文章。作为其努力的一部分，除了论文外，他们还创建了其他几个资源，例如\"Transformer 电路的数学框架”和 [\"叠加态的玩具模型\"](https:\u002F\u002Fthreadreaderapp.com\u002Fthread\u002F1570087876053942272.html)。\n- [利用模型编写的评估发现语言模型行为 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09251) by Anthropic et al. - 他们使用 LLMs 自动生成评估。他们发现了 LLMs 随规模增大而变差的反向缩放新案例。他们还找到了人类反馈强化学习 (RLHF) 中反向缩放的最早示例之一，其中更多的 RLHF 会使 LLMs 变差。\n- [Transformer 通过梯度下降 (Gradient Descent) 学习上下文内 (In-context) 知识 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07677) by J von Oswald et al. [[AI Alignment Forum](https:\u002F\u002Fwww.alignmentforum.org\u002Fposts\u002FfirtXAWGdvzXYAh9B\u002Fpaper-transformers-learn-in-context-by-gradient-descent)]\n- [为什么 GPT 能进行上下文内学习？语言模型秘密执行梯度下降作为元优化器 (Meta-Optimizers) (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10559v2) by Microsoft Research.\n- [大型语言模型中的认知偏差](https:\u002F\u002Funiversalprior.substack.com\u002Fp\u002Fcognitive-biases-in-large-language)\n- [Tracr: 编译后的 Transformer 作为可解释性的实验室 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.05062) (2023) by DeepMind - TRACR (TRAnsformer Compiler for RASP) 是一个编译器，用于将 RASP 程序（Transformer 的领域特定语言 (DSL)）转换为类似 GPT 模型的权重。通常，我们训练 Transformer 将其算法编码到权重中。有了 TRACR，我们走相反的方向；直接从显式代码编译权重。**直接**这样做是为了什么？加速可解释性研究。可以把它看作 Transformer 上的形式化方法（来自软件工程）。检查可解释性工具提供的解释是否正确可能很困难。[[Tweet](https:\u002F\u002Ftwitter.com\u002Fdavlindner\u002Fstatus\u002F1613900577804525573), [code](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Ftracr)]\n- [Yann LeCun 对当前（自回归）LLMs 的坚定看法 (推文)](https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20230213173604\u002Fhttps:\u002F\u002Ftwitter.com\u002Fylecun\u002Fstatus\u002F1625118108082995203)\n- [AI 安全的核心观点：何时、为何、何事及如何](https:\u002F\u002Fwww.anthropic.com\u002Findex\u002Fcore-views-on-ai-safety) by Anthropic, 2023.\n- [界限内的个性化：大型语言模型与个性化反馈对齐的风险分类和政策框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05453) by University of Oxford, 2023. [[Tweet](https:\u002F\u002Ftwitter.com\u002Fhannahrosekirk\u002Fstatus\u002F1634228684893700096)]\n- [GPTs 就是 GPTs：早期审视大型语言模型对劳动力市场的潜在影响 (论文)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10130) by OpenAI et al., 2023 - 该论文认为 GPT 是一种通用技术。\n\n## 视频\n\n### [BERTology](https:\u002F\u002Fhuggingface.co\u002Ftransformers\u002Fbertology.html)\n\n1. [XLNet 详解](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=naOuE9gLbZo) by NLP Breakfasts.\n  - 清晰的解释。还涵盖了双流自注意力概念。\n2. [NLP 的未来](https:\u002F\u002Fyoutu.be\u002FG5lmya6eKtc) by 🤗\n  - 关于当前自然语言处理 (NLP) 迁移学习中正在发生的事情、限制和未来方向的密集概览。\n3. [Transformer 神经网络架构详解](https:\u002F\u002Fyoutu.be\u002FFWFA4DGuzSc) by AI Coffee Break with Letitia Parcalabescu.\n  - 高层解释，最适合不熟悉 Transformer 的人。\n\n### 注意力 (Attention) 和 Transformer 网络\n\n1. 
[Sequence to Sequence Learning Animated (Inside Transformer Neural Networks and Attention Mechanisms)](https:\u002F\u002Fyoutu.be\u002FGTVgJhSlHEk) by learningcurve.\n\n### 通用\n\n- [Trials and tribulations of OPT-175B training by Susan Zhang at Meta](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=p9IxoSkvZ-M) - 在这次演讲中，他们回顾了 OPT-175B 的开发生命周期，涵盖了大规模部署时面临的**基础设施**和**训练收敛**挑战，以及未来解决这些问题的方法。令人惊叹的是他们竟然完成了如此壮举。关键要点：**数据**至关重要！对神经网络的**基本原理**（LR (学习率), SGD (随机梯度下降) 等）和**工程实践**有着超深的理解。甚至花费了比平时更多的时间盯着**损失曲线**。理解 Chinchilla 的**缩放定律**，即随着规模扩大，新架构\u002F算法是如何工作的。 [[LLM (大型语言模型) training log](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmetaseq\u002Fblob\u002Fmain\u002Fprojects\u002FOPT\u002Fchronicles\u002FOPT175B_Logbook.pdf)]\n\n## 官方 BERT 实现\n\n1. [google-research\u002Fbert](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert) - BERT 的 TensorFlow 代码和预训练模型。\n\n## 社区提供的 Transformer 实现\n\nGPT 和\u002F或 BERT 的实现。\n\n### PyTorch 和 TensorFlow\n\n1. [🤗 Hugging Face Transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) (formerly known as [pytorch-transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fpytorch-transformers) and [pytorch-pretrained-bert](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Fpytorch-pretrained-BERT)) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) (自然语言理解) and Natural Language Generation (NLG) (自然语言生成) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch. [[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03771)]\n2. [spacy-transformers](https:\u002F\u002Fgithub.com\u002Fexplosion\u002Fspacy-transformers) - 一个封装 Hugging Face 的 Transformers 的库，用于提取特征以驱动 NLP 管道。它还计算一个**对齐**，以便 Transformer 特征可以关联回实际单词，而不仅仅是 wordpieces (词元)。\n3. [FasterTransformer](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FFasterTransformer) - Transformer 相关优化，包括 BERT 和 GPT。此仓库提供了一个脚本和配方来运行高度优化的基于 Transformer 的 encoder (编码器) 和 decoder (解码器) 组件，并由 NVIDIA 进行测试和维护。\n\n### PyTorch\n\n1. [codertimo\u002FBERT-pytorch](https:\u002F\u002Fgithub.com\u002Fcodertimo\u002FBERT-pytorch) - Google AI 2018 BERT PyTorch 实现。\n2. [innodatalabs\u002Ftbert](https:\u002F\u002Fgithub.com\u002Finnodatalabs\u002Ftbert) - BERT 机器学习模型的 PyTorch 移植版。\n3. [kimiyoung\u002Ftransformer-xl](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl) - Transformer-XL 论文关联的代码仓库。\n4. [dreamgonfly\u002FBERT-pytorch](https:\u002F\u002Fgithub.com\u002Fdreamgonfly\u002FBERT-pytorch) - “BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding”中的 BERT PyTorch 实现。\n5. [dhlee347\u002Fpytorchic-bert](https:\u002F\u002Fgithub.com\u002Fdhlee347\u002Fpytorchic-bert) - Google BERT 的 PyTorch 实现。\n6. [pingpong-ai\u002Fxlnet-pytorch](https:\u002F\u002Fgithub.com\u002Fpingpong-ai\u002Fxlnet-pytorch) - Google Brain XLNet 的 PyTorch 实现。\n7. [facebook\u002Ffairseq](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Fblob\u002Fmaster\u002Fexamples\u002Froberta\u002FREADME.md) - RoBERTa：Facebook AI Research 提出的鲁棒优化的 BERT 预训练方法。在 GLUE, SQuAD 和 RACE 上取得 SoTA (最先进) 结果。\n8. [NVIDIA\u002FMegatron-LM](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FMegatron-LM) - 持续研究大规模训练 Transformer 语言模型，包括：BERT。\n9. [deepset-ai\u002FFARM](https:\u002F\u002Fgithub.com\u002Fdeepset-ai\u002FFARM) - 面向行业的简单且灵活的 transfer learning (迁移学习)。\n10. 
[NervanaSystems\u002Fnlp-architect](https:\u002F\u002Fwww.intel.ai\u002Fnlp-transformer-models\u002F) - Intel AI 的 NLP Architect。除其他库外，它提供了 Transformer 模型的 quantized (量化) 版本和高效的训练方法。\n11. [kaushaltrivedi\u002Ffast-bert](https:\u002F\u002Fgithub.com\u002Fkaushaltrivedi\u002Ffast-bert) - 基于 BERT 的 NLP 模型的超级简易库。基于 🤗 Transformers 构建并受 fast.ai 启发。\n12. [NVIDIA\u002FNeMo](https:\u002F\u002Fgithub.com\u002FNVIDIA\u002FNeMo) - Neural Modules 是 NVIDIA 用于对话式 AI 的工具包。他们正尝试 [improve speech recognition with BERT post-processing (使用 BERT post-processing (后处理) 改进语音识别)](https:\u002F\u002Fnvidia.github.io\u002FNeMo\u002Fnlp\u002Fintro.html#improving-speech-recognition-with-bertx2-post-processing-model)。\n13. [facebook\u002FMMBT](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmmbt\u002F) from Facebook AI - Multimodal (多模态) transformers 模型，可以接受一个 Transformer 模型和一个 computer vision (计算机视觉) 模型用于分类图像和文本。\n14. [dbiir\u002FUER-py](https:\u002F\u002Fgithub.com\u002Fdbiir\u002FUER-py) from Tencent and RUC - PyTorch 开源预训练模型框架及预训练模型库（更侧重于中文）。\n15. [lucidrains\u002Fx-transformers](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fx-transformers) - 一个简单但完整的 full-attention (全注意力) Transformer，包含来自各种论文的有前景的实验特性（适合学习目的）。有一篇 2021 年的论文总结了 Transformer 修改，[_Do Transformer Modifications Transfer Across Implementations and Applications?_](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11972)。\n\n### Keras\n\n1. [Separius\u002FBERT-keras](https:\u002F\u002Fgithub.com\u002FSeparius\u002FBERT-keras) - 带有预训练权重的 BERT Keras 实现。\n2. [CyberZHG\u002Fkeras-bert](https:\u002F\u002Fgithub.com\u002FCyberZHG\u002Fkeras-bert) - 可加载官方预训练模型以进行特征提取和预测的 BERT 实现。\n3. [bojone\u002Fbert4keras](https:\u002F\u002Fgithub.com\u002Fbojone\u002Fbert4keras) - 为 Keras 设计的轻量级 BERT 重新实现。\n\n### TensorFlow\n\n1. [guotong1988\u002FBERT-tensorflow](https:\u002F\u002Fgithub.com\u002Fguotong1988\u002FBERT-tensorflow) - BERT：用于语言理解的深度双向 Transformer 预训练。\n2. [kimiyoung\u002Ftransformer-xl](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl) - Transformer-XL 论文关联的代码仓库。\n3. [zihangdai\u002Fxlnet](https:\u002F\u002Fgithub.com\u002Fzihangdai\u002Fxlnet) - XLNet 论文关联的代码仓库。\n\n### Chainer\n\n1. 
[soskek\u002Fbert-chainer](https:\u002F\u002Fgithub.com\u002Fsoskek\u002Fbert-chainer) - “BERT：用于语言理解的深度双向 Transformer 预训练”的 Chainer 实现。\n\n### 其他\n\n- [llama.cpp](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) - Facebook 的 LLaMA 模型的 C\u002FC++ 移植版。\n- [Cformers](https:\u002F\u002Fgithub.com\u002FNolanoOrg\u002Fcformers) - 具有 C 后端的高速 CPU inference (推理) SoTA Transformer。\n- [Transformers.js](https:\u002F\u002Fgithub.com\u002Fxenova\u002Ftransformers.js) - 在浏览器中运行 🤗 Transformers。\n- [Alpaca.cpp](https:\u002F\u002Fgithub.com\u002Fantimatter15\u002Falpaca.cpp) - 在本地设备上运行快速的类 ChatGPT 模型。\n- [LLaMA compatible port](https:\u002F\u002Fgithub.com\u002Fcedrickchee\u002Fllama#llama-compatible-port)\n- [Apple Neural Engine (ANE) Transformers](https:\u002F\u002Fgithub.com\u002Fapple\u002Fml-ane-transformers) - 针对 Apple Silicon (苹果硅芯片) 优化的 Transformer 架构。\n\n## NLP（自然语言处理）中的迁移学习（Transfer Learning）\n\n\u003Cdetails>\n\n\u003Csummary>NLP 终于有了进行迁移学习的方法，其效果可能堪比计算机视觉（Computer Vision）。\u003C\u002Fsummary>\n正如 Jay Alammar 所说：\n\n> 2018 年是处理文本的 Machine Learning（机器学习）模型的一个转折点（更准确地说，是 Natural Language Processing（自然语言处理）或简称 NLP）。我们对于如何最好地表示单词和句子以捕捉潜在含义和关系的概念理解正在迅速演变。此外，NLP 社区一直在提出极其强大的组件，你可以免费下载并在自己的模型和流程（pipelines）中使用它们（这被称为 [NLP 的 ImageNet 时刻](http:\u002F\u002Fruder.io\u002Fnlp-imagenet\u002F)，引用了多年前类似的发展如何加速了计算机视觉任务中机器学习的发展）。\n>\n> 这一发展的最新里程碑之一是 [BERT](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert) 的 [发布](https:\u002F\u002Fai.googleblog.com\u002F2018\u002F11\u002Fopen-sourcing-bert-state-of-art-pre.html)，该事件被 [描述](https:\u002F\u002Ftwitter.com\u002Flmthang\u002Fstatus\u002F1050543868041555969) 为标志着 NLP 新时代的开始。BERT 是一个打破了多项记录的语言处理能力模型。在描述该模型的论文发布后不久，团队也开源了该模型的代码，并提供了已在大规模数据集上 Pre-trained（预训练）好的模型版本可供下载。这是一个重大的发展，因为它使得任何构建涉及语言处理的机器学习模型的人都可以使用这个强大的组件作为现成的部分——节省了从头开始训练语言处理模型所需的时间、精力、知识和资源。\n>\n> BERT 建立在 NLP 社区最近涌现的许多巧妙想法之上——包括但不限于 [Semi-supervised Sequence Learning（半监督序列学习）](https:\u002F\u002Farxiv.org\u002Fabs\u002F1511.01432)（由 [Andrew Dai](https:\u002F\u002Ftwitter.com\u002Fiamandrewdai) 和 [Quoc Le](https:\u002F\u002Ftwitter.com\u002Fquocleix) 提出），[ELMo](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05365)（由 Matthew Peters 以及来自 [AI2](https:\u002F\u002Fallenai.org\u002F) 和 [UW CSE](https:\u002F\u002Fwww.engr.washington.edu\u002Fabout\u002Fbldgs\u002Fcse) 的研究人员提出），[ULMFiT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.06146)（由 [fast.ai](https:\u002F\u002Ffast.ai) 创始人 [Jeremy Howard](https:\u002F\u002Ftwitter.com\u002Fjeremyphoward) 和 [Sebastian Ruder](https:\u002F\u002Ftwitter.com\u002Fseb_ruder) 提出），[OpenAI transformer](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)（由 OpenAI 研究人员 [Radford](https:\u002F\u002Ftwitter.com\u002Falecrad), [Narasimhan](https:\u002F\u002Ftwitter.com\u002Fkarthik_r_n), [Salimans](https:\u002F\u002Ftwitter.com\u002Ftimsalimans), 和 [Sutskever](https:\u002F\u002Ftwitter.com\u002Filyasut) 提出），以及 Transformer（[Vaswani et al](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)）。\n>\n> **ULMFiT：确立 NLP 中的迁移学习**\n>\n> [ULMFiT 引入了方法，有效地利用模型在预训练期间学到的大量内容] —— 不仅仅是 Embeddings（嵌入），也不仅仅是 Contextualized Embeddings（上下文嵌入）。ULMFiT 引入了一种语言模型和一个过程，可以有效地针对各种任务 Fine-tune（微调）该语言模型。\n>\n> NLP 终于有了进行迁移学习的方法，其效果可能堪比计算机视觉。\n\n\u003C\u002Fdetails>\n\n[MultiFiT：高效的多语言语言模型微调](http:\u002F\u002Fnlp.fast.ai\u002Fclassification\u002F2019\u002F09\u002F10\u002Fmultifit.html) 由 Sebastian Ruder 等人撰写。MultiFiT 扩展了 ULMFiT，使其更高效且更适合英语之外的语言建模。（[EMNLP 2019 
论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04761)）\n\n## 书籍\n\n1. [Transfer Learning for Natural Language Processing](https:\u002F\u002Fwww.manning.com\u002Fbooks\u002Ftransfer-learning-for-natural-language-processing) - 一本关于迁移学习技术的实用入门书，能够为您带来巨大的 NLP 模型改进。\n2. [Natural Language Processing with Transformers](https:\u002F\u002Ftransformersbook.com\u002F) 作者：Lewis Tunstall, Leandro von Werra, 和 Thomas Wolf - 这本实用书籍展示了如何使用 Hugging Face Transformers 来训练和扩展这些大型模型。作者采用动手实践的方法，教你了解 Transformers 的工作原理以及如何将它们集成到你的应用中。\n\n## 其他资源\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开其他资源\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [hanxiao\u002Fbert-as-service](https:\u002F\u002Fgithub.com\u002Fhanxiao\u002Fbert-as-service) - 使用预训练的 BERT（双向编码器表示）模型将变长句子映射为固定长度向量。\n2. [brightmart\u002Fbert_language_understanding](https:\u002F\u002Fgithub.com\u002Fbrightmart\u002Fbert_language_understanding) - 用于语言理解的深度双向 Transformer 预训练：预训练 TextCNN。\n3. [algteam\u002Fbert-examples](https:\u002F\u002Fgithub.com\u002Falgteam\u002Fbert-examples) - BERT 示例。\n4. [JayYip\u002Fbert-multiple-gpu](https:\u002F\u002Fgithub.com\u002FJayYip\u002Fbert-multiple-gpu) - 支持多 GPU（图形处理器）的 BERT 版本。\n5. [HighCWu\u002Fkeras-bert-tpu](https:\u002F\u002Fgithub.com\u002FHighCWu\u002Fkeras-bert-tpu) - BERT 实现，可在 TPU（张量处理单元）上加载官方预训练模型进行特征提取和预测。\n6. [whqwill\u002Fseq2seq-keyphrase-bert](https:\u002F\u002Fgithub.com\u002Fwhqwill\u002Fseq2seq-keyphrase-bert) - 为 https:\u002F\u002Fgithub.com\u002Fmemray\u002Fseq2seq-keyphrase-pytorch 的编码器部分添加 BERT。\n7. [xu-song\u002Fbert_as_language_model](https:\u002F\u002Fgithub.com\u002Fxu-song\u002Fbert_as_language_model) - 作为语言模型的 BERT，源自 Google 官方 BERT 实现的分支。\n8. [Y1ran\u002FNLP-BERT--Chinese version](https:\u002F\u002Fgithub.com\u002FY1ran\u002FNLP-BERT--ChineseVersion)\n9. [yuanxiaosc\u002FDeep_dynamic_word_representation](https:\u002F\u002Fgithub.com\u002Fyuanxiaosc\u002FDeep_dynamic_word_representation) - 用于深度动态词表示 (DDWR) 的 TensorFlow 代码和预训练模型。它结合了 BERT 模型和 ELMo 的深度上下文词表示。\n10. [yangbisheng2009\u002Fcn-bert](https:\u002F\u002Fgithub.com\u002Fyangbisheng2009\u002Fcn-bert)\n11. [Willyoung2017\u002FBert_Attempt](https:\u002F\u002Fgithub.com\u002FWillyoung2017\u002FBert_Attempt)\n12. [Pydataman\u002Fbert_examples](https:\u002F\u002Fgithub.com\u002FPydataman\u002Fbert_examples) - 一些 BERT 示例。`run_classifier.py` 基于 Google BERT 用于 Kaggle Quora 不真诚问题分类挑战。`run_ner.py` 基于瑞金医院 AI 竞赛第一季以及由 BERT 编写的 NER。\n13. [guotong1988\u002FBERT-chinese](https:\u002F\u002Fgithub.com\u002Fguotong1988\u002FBERT-chinese) - 用于中文语言理解的深度双向 Transformer 预训练。\n14. [zhongyunuestc\u002Fbert_multitask](https:\u002F\u002Fgithub.com\u002Fzhongyunuestc\u002Fbert_multitask) - 多任务。\n15. [Microsoft\u002FAzureML-BERT](https:\u002F\u002Fgithub.com\u002FMicrosoft\u002FAzureML-BERT) - 使用 Azure Machine Learning 对 BERT 进行 fine-tuning（微调）的端到端指南。\n16. [bigboNed3\u002Fbert_serving](https:\u002F\u002Fgithub.com\u002FbigboNed3\u002Fbert_serving) - 导出 BERT 模型用于服务部署。\n17. [yoheikikuta\u002Fbert-japanese](https:\u002F\u002Fgithub.com\u002Fyoheikikuta\u002Fbert-japanese) - 用于日语文本的带有 SentencePiece 的 BERT。\n18. [nickwalton\u002FAIDungeon](https:\u002F\u002Fgithub.com\u002Fnickwalton\u002FAIDungeon) - AI Dungeon 2 是一个完全由 AI 生成的文字冒险游戏，使用 OpenAI 最大的 15 亿参数 GPT-2 模型构建。这是一款前所未有的游戏，允许你输入并对你能想象到的任何行动做出反应。\n19. [turtlesoupy\u002Fthis-word-does-not-exist](https:\u002F\u002Fgithub.com\u002Fturtlesoupy\u002Fthis-word-does-not-exist) - “这个词不存在”是一个项目，允许人们从头开始训练一种 GPT-2 变体来编造单词、定义和示例。我们从未见过如此真实的伪造文本。\n\u003C\u002Fdetails>\n\n## 工具 \n\n1. 
[jessevig\u002Fbertviz](https:\u002F\u002Fgithub.com\u002Fjessevig\u002Fbertviz) - 用于可视化 Transformer（变换器）模型中注意力机制的工具。\n2. [FastBert](https:\u002F\u002Fgithub.com\u002Fkaushaltrivedi\u002Ffast-bert) - 一个简单的深度学习库，允许开发者和数据科学家训练和部署基于 BERT 的 NLP（自然语言处理）任务模型，从文本分类开始。FastBert 的工作灵感来自 fast.ai。\n3. [gpt2tc](https:\u002F\u002Fbellard.org\u002Flibnc\u002Fgpt2tc.html) - 一个使用 GPT-2 语言模型完成和压缩文本的小程序。它没有外部依赖，不需要 GPU 且速度很快。提供了最小的模型（1.17 亿参数）。也可以下载更大的模型。（无需等待名单，无需注册）。\n\n## 任务\n\n### 命名实体识别 (NER)\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开 NER\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [kyzhouhzau\u002FBERT-NER](https:\u002F\u002Fgithub.com\u002Fkyzhouhzau\u002FBERT-NER) - 使用 Google BERT 进行 CoNLL-2003 NER。\n2. [zhpmatrix\u002Fbert-sequence-tagging](https:\u002F\u002Fgithub.com\u002Fzhpmatrix\u002Fbert-sequence-tagging) - 中文序列标注。\n3. [JamesGu14\u002FBERT-NER-CLI](https:\u002F\u002Fgithub.com\u002FJamesGu14\u002FBERT-NER-CLI) - 带逐步设置指南的 BERT NER 命令行测试器。\n4. [sberbank-ai\u002Fner-bert](https:\u002F\u002Fgithub.com\u002Fsberbank-ai\u002Fner-bert)\n5. [mhcao916\u002FNER_Based_on_BERT](https:\u002F\u002Fgithub.com\u002Fmhcao916\u002FNER_Based_on_BERT) - 该项目基于 Google BERT 模型，这是一个中文 NER。\n6. [macanv\u002FBERT-BiLSMT-CRF-NER](https:\u002F\u002Fgithub.com\u002Fmacanv\u002FBERT-BiLSMT-CRF-NER) - 使用 Bi-LSTM-CRF 模型配合 Google BERT 微调的 NER 任务 TensorFlow 解决方案。\n7. [ProHiryu\u002Fbert-chinese-ner](https:\u002F\u002Fgithub.com\u002FProHiryu\u002Fbert-chinese-ner) - 使用预训练语言模型 BERT 进行中文 NER。\n8. [FuYanzhe2\u002FName-Entity-Recognition](https:\u002F\u002Fgithub.com\u002FFuYanzhe2\u002FName-Entity-Recognition) - Lstm-CRF, 网格-CRF, 近期 NER 相关论文。\n9. [king-menin\u002Fner-bert](https:\u002F\u002Fgithub.com\u002Fking-menin\u002Fner-bert) - NER 任务解决方案 (BERT-Bi-LSTM-CRF)，基于 Google BERT https:\u002F\u002Fgithub.com\u002Fgoogle-research。\n\u003C\u002Fdetails>\n\n### 分类\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开分类\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [brightmart\u002Fsentiment_analysis_fine_grain](https:\u002F\u002Fgithub.com\u002Fbrightmart\u002Fsentiment_analysis_fine_grain) - 使用 BERT 进行多标签分类；来自 AI Challenger 的细粒度情感分析。\n2. [zhpmatrix\u002FKaggle-Quora-Insincere-Questions-Classification](https:\u002F\u002Fgithub.com\u002Fzhpmatrix\u002FKaggle-Quora-Insincere-Questions-Classification) - Kaggle 基线——微调 BERT 和基于 tensor2tensor 的 Transformer 编码器解决方案。\n3. [maksna\u002Fbert-fine-tuning-for-chinese-multiclass-classification](https:\u002F\u002Fgithub.com\u002Fmaksna\u002Fbert-fine-tuning-for-chinese-multiclass-classification) - 使用 Google 预训练模型 BERT 微调以进行中文多类分类。\n4. [NLPScott\u002Fbert-Chinese-classification-task](https:\u002F\u002Fgithub.com\u002FNLPScott\u002Fbert-Chinese-classification-task) - BERT 中文分类实践。\n5. [fooSynaptic\u002FBERT_classifer_trial](https:\u002F\u002Fgithub.com\u002FfooSynaptic\u002FBERT_classifer_trial) - 中文语料库分类的 BERT 试验。\n6. [xiaopingzhong\u002Fbert-finetune-for-classfier](https:\u002F\u002Fgithub.com\u002Fxiaopingzhong\u002Fbert-finetune-for-classfier) - 在构建自己的分类数据集的同时微调 BERT 模型。\n7. [Socialbird-AILab\u002FBERT-Classification-Tutorial](https:\u002F\u002Fgithub.com\u002FSocialbird-AILab\u002FBERT-Classification-Tutorial) - 教程。\n8. [malteos\u002Fpytorch-bert-document-classification](https:\u002F\u002Fgithub.com\u002Fmalteos\u002Fpytorch-bert-document-classification\u002F) - 使用知识图谱嵌入丰富 BERT 以进行文档分类 (PyTorch)。\n\u003C\u002Fdetails>\n\n### 文本生成\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开文本生成\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. 
[asyml\u002Ftexar](https:\u002F\u002Fgithub.com\u002Fasyml\u002Ftexar) - 文本生成及更多领域的工具包。[Texar](https:\u002F\u002Ftexar.io) 是一个通用文本生成工具包，在此也实现了 BERT（双向编码器表示）用于分类，并通过结合 Texar 的其他模块实现文本生成应用。\n2. [即插即用语言模型：一种受控文本生成的简单方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02164) (PPLM) 论文，由 Uber AI 发布。\n\u003C\u002Fdetails>\n\n### 问答 (QA)\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开问答\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [matthew-z\u002FR-net](https:\u002F\u002Fgithub.com\u002Fmatthew-z\u002FR-net) - 基于 PyTorch（深度学习框架）的 R-net，包含 BERT 和 ELMo（上下文嵌入）。\n2. [vliu15\u002FBERT](https:\u002F\u002Fgithub.com\u002Fvliu15\u002FBERT) - 用于问答的 BERT 的 TensorFlow（机器学习库）实现。\n3. [benywon\u002FChineseBert](https:\u002F\u002Fgithub.com\u002Fbenywon\u002FChineseBert) - 这是一个专门用于问答的中文 BERT 模型。\n4. [xzp27\u002FBERT-for-Chinese-Question-Answering](https:\u002F\u002Fgithub.com\u002Fxzp27\u002FBERT-for-Chinese-Question-Answering)\n5. [facebookresearch\u002FSpanBERT](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT) - 在 SQuAD（斯坦福问答数据集）上进行问答；通过表示和预测跨度来改进预训练。\n\u003C\u002Fdetails>\n\n### 知识图谱\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开知识图谱\u003C\u002Fb>\u003C\u002Fsummary>\n\n1. [sakuranew\u002FBERT-AttributeExtraction](https:\u002F\u002Fgithub.com\u002Fsakuranew\u002FBERT-AttributeExtraction) - 使用 BERT 进行知识图谱中的属性提取。微调和特征提取。基于 BERT 的微调和特征提取方法用于提取百度百科条目的知识属性。\n2. [lvjianxin\u002FKnowledge-extraction](https:\u002F\u002Fgithub.com\u002Flvjianxin\u002FKnowledge-extraction) - 基于中文知识的抽取。基线：bi-LSTM（双向长短期记忆网络）+CRF（条件随机场）；升级：BERT 预训练。\n\u003C\u002Fdetails>\n\n## 许可证\n\n\u003Cdetails>\n\n\u003Csummary>\u003Cb>展开许可证\u003C\u002Fb>\u003C\u002Fsummary>\n\n此仓库包含多种内容；部分由 Cedric Chee 开发，部分来自第三方。第三方内容按照这些方提供的许可证分发。\n\n*I am providing code and resources in this repository to you under an open source license. Because this is my personal repository, the license you receive to my code and resources is from me and not my employer.*\n\nThe content developed by Cedric Chee is distributed under the following license:\n\n### 代码\n\n此仓库中的代码，包括上述笔记本中的所有代码示例，均在 [MIT 许可证](LICENSE) 下发布。更多信息请访问 [开放源代码促进会](https:\u002F\u002Fopensource.org\u002Flicenses\u002FMIT)。\n\n### 文本\n\n文本内容根据 CC-BY-SA 4.0 许可证发布。更多信息请访问 [知识共享](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby-sa\u002F4.0\u002F)。\n\u003C\u002Fdetails>\n\n[^1]: AIM 制作的信息图。","# Awesome Transformer & Transfer Learning in NLP 快速上手指南\n\n## 简介\n本仓库是一个精心策划的机器学习（深度学习）资源列表，专注于自然语言处理（NLP）。内容涵盖生成式预训练 Transformer (GPT)、BERT、注意力机制、Transformer 架构、ChatGPT 及 NLP 中的迁移学习等主题。它不是可安装的软件包，而是通往相关论文、教程、代码实现和工具的导航站。\n\n## 环境准备\n- **操作系统**: Linux \u002F macOS \u002F Windows\n- **必备工具**: Git (用于克隆仓库)\n- **开发环境**: Python (推荐 3.7+)，用于运行部分列出的代码示例或依赖库\n- **网络要求**: 需能访问 GitHub 及相关学术资源网站 (如 arXiv)。国内用户建议使用网络加速工具或镜像源。\n\n## 安装步骤\n由于本项目为资源索引库，无需通过 pip 安装，直接克隆至本地即可浏览。\n\n1. 打开终端，执行以下命令克隆仓库：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fharvardnlp\u002Fawesome-transformer-nlp.git\n```\n\n2. 进入项目目录：\n```bash\ncd awesome-transformer-nlp\n```\n\n3. (可选) 查看最新文档：\n```bash\ncat README.md\n```\n\n## 基本使用\n本项目主要通过阅读 `README.md` 文件来利用其资源。\n\n1. **浏览分类目录**\n   在 `README.md` 中，资源被分为多个板块，包括：\n   - [Papers](#papers): 核心论文列表\n   - [Articles](#articles): 技术文章与教程\n   - [Educational](#educational): 教育视频与课程\n   - [Implementations](#implementations): 官方及社区实现的代码库 (PyTorch, TensorFlow 等)\n   - [Tools](#tools) & [Tasks](#tasks): 特定任务 (NER, QA 等) 的工具\n\n2. **获取具体资源**\n   点击目录中的超链接即可跳转至原始资源页面。例如：\n   - 查阅经典论文：点击 Papers 部分的 arXiv 链接。\n   - 下载代码实现：前往 Implementations 部分找到对应框架（如 Hugging Face Transformers）的仓库链接。\n\n3. 
**结合实践**\n   根据列出的资源，选择适合您任务的模型（如 BERT, GPT-3, T5），并参考对应的 Implementation 链接进行本地部署或微调。","某电商公司的 NLP 工程师小张需要在一个月内为商品评论构建情感分析模型，急需基于 Transformer 架构的高效解决方案。\n\n### 没有 awesome-transformer-nlp 时\n- 在搜索引擎中分散查找 BERT 原始论文和最新变体，耗时且极易遗漏关键文献\n- 面对 PyTorch 和 TensorFlow 多种实现方案，难以快速筛选适合当前业务规模的代码库\n- 缺乏系统的迁移学习教程，导致微调过程中反复调试超参数，模型收敛效率低下\n- 对 Attention 机制等底层原理理解不深，无法针对长文本场景有效优化模型结构\n\n### 使用 awesome-transformer-nlp 后\n- 通过精选 Papers 板块直接定位到 BERT 核心论文及 Transformer-XL 等进阶研究，明确技术选型\n- 在 Official Implementations 章节一键获取官方推荐的代码仓库，显著减少环境配置与复现时间\n- 利用 Transfer Learning in NLP 专栏快速掌握领域适配技巧，大幅缩短模型从开发到落地的周期\n- 结合 Educational 中的 Tutorials 视频课程，深入理解注意力机制以优化特定任务如命名实体识别的表现\n\n核心价值：awesome-transformer-nlp 将碎片化的前沿知识整合成结构化导航，大幅降低技术调研成本并加速项目落地进程。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcedrickchee_awesome-transformer-nlp_5017ff70.png","cedrickchee","Cedric Chee","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fcedrickchee_586e70c2.png","Lead Software Engineer | LLMs | Go\u002FJS, backend | product dev @ startups | 🧑‍🎓 CompSci | alumni: fast.ai, Antler.co","InvictusByte","PID 1",null,"cedric_chee","https:\u002F\u002Fcedricchee.com","https:\u002F\u002Fgithub.com\u002Fcedrickchee",1134,135,"2026-03-18T22:33:22","MIT",1,"","未说明",{"notes":93,"python":91,"dependencies":94},"此仓库为资源聚合列表（Awesome List），非独立可运行软件项目。内容主要包含论文、教程、实现库和工具的链接整理，本身不包含可执行代码或安装脚本。具体的运行环境需求（如 GPU、Python 版本、依赖库等）取决于您实际使用的下游项目（如 BERT、GPT 等具体实现），需参考各子项目的官方文档。",[],[13,26],[97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113],"nlp","natural-language-processing","bert","transformer","language-model","transfer-learning","attention-mechanism","awesome-list","gpt-2","xlnet","neural-networks","pre-trained-language-models","gpt-3","awesome","chatgpt","gpt-4","llama","2026-03-27T02:49:30.150509","2026-04-06T07:14:02.075100",[],[]]