[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-changwookjun--nlp-paper":3,"tool-changwookjun--nlp-paper":65},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,3,"2026-04-06T11:19:32",[15,26,14,13],"图像",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":10,"last_commit_at":33,"category_tags":34,"status":16},8553,"spec-kit","github\u002Fspec-kit","Spec Kit 是一款专为提升软件开发效率而设计的开源工具包，旨在帮助团队快速落地“规格驱动开发”（Spec-Driven Development）模式。传统开发中，需求文档往往与代码实现脱节，导致沟通成本高且结果不可控；而 Spec Kit 通过将规格说明书转化为可执行的指令，让 AI 直接依据明确的业务场景生成高质量代码，从而减少从零开始的随意编码，确保产出结果的可预测性。\n\n该工具特别适合希望利用 AI 辅助编程的开发者、技术负责人及初创团队。无论是启动全新项目还是在现有工程中引入规范化流程，用户只需通过简单的命令行操作，即可初始化项目并集成主流的 AI 编程助手。其核心技术亮点在于“规格即代码”的理念，支持社区扩展与预设模板，允许用户根据特定技术栈定制开发流程。此外，Spec Kit 强调官方维护的安全性，提供稳定的版本管理，帮助开发者在享受 AI 红利的同时，依然牢牢掌握架构设计的主动权，真正实现从“凭感觉写代码”到“按规格建系统”的转变。",88749,"2026-04-17T09:48:14",[15,26,14,13],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":10,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85267,"2026-04-18T11:00:28",[26,51,52,53,14,54,15,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":62,"last_commit_at":63,"category_tags":64,"status":16},5784,"funNLP","fighting41love\u002FfunNLP","funNLP 是一个专为中文自然语言处理（NLP）打造的超级资源库，被誉为“NLP 民工的乐园”。它并非单一的软件工具，而是一个汇集了海量开源项目、数据集、预训练模型和实用代码的综合性平台。\n\n面对中文 NLP 领域资源分散、入门门槛高以及特定场景数据匮乏的痛点，funNLP 提供了“一站式”解决方案。这里不仅涵盖了分词、命名实体识别、情感分析、文本摘要等基础任务的标准工具，还独特地收录了丰富的垂直领域资源，如法律、医疗、金融行业的专用词库与数据集，甚至包含古诗词生成、歌词创作等趣味应用。其核心亮点在于极高的全面性与实用性，从基础的字典词典到前沿的 BERT、GPT-2 模型代码，再到高质量的标注数据和竞赛方案，应有尽有。\n\n无论是刚刚踏入 NLP 领域的学生、需要快速验证想法的算法工程师，还是从事人工智能研究的学者，都能在这里找到急需的“武器弹药”。对于开发者而言，它能大幅减少寻找数据和复现模型的时间；对于研究者，它提供了丰富的基准测试资源和前沿技术参考。funNLP 以开放共享的精神，极大地降低了中文自然语言处理的开发与研究成本，是中文 AI 社区不可或缺的宝藏仓库。",79857,1,"2026-04-08T20:11:31",[15,51,54],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":81,"owner_email":82,"owner_twitter":83,"owner_website":84,"owner_url":85,"languages":83,"stars":86,"forks":87,"last_commit_at":88,"license":83,"difficulty_score":62,"env_os":89,"env_gpu":90,"env_ram":90,"env_deps":91,"category_tags":94,"github_topics":95,"view_count":10,"oss_zip_url":83,"oss_zip_packed_at":83,"status":16,"created_at":104,"updated_at":105,"faqs":106,"releases":107},9332,"changwookjun\u002Fnlp-paper","nlp-paper","NLP Paper","nlp-paper 是一个专注于自然语言处理（NLP）领域的开源论文清单项目，旨在为研究者和开发者提供一份系统化、分类清晰的学术资源导航。面对 NLP 领域论文爆发式增长、检索难度大的痛点，nlp-paper 将海量文献按主题精细划分为 BERT 系列、Transformer 架构、迁移学习、文本摘要、情感分析、机器翻译及大语言模型（LLM）等二十余个核心板块。\n\n该项目不仅收录了如 BERT、RoBERTa、ALBERT 等奠基性经典论文，还涵盖了针对注意力机制分析、模型压缩、多模态融合及特定下游任务的前沿研究成果。其独特亮点在于结构化的知识整理方式，帮助用户快速定位从基础理论到具体应用（如问答系统、命名实体识别）的关键文献，极大提升了文献调研效率。\n\nnlp-paper 特别适合高校科研人员、算法工程师以及希望深入理解 NLP 技术演进的学生使用。无论是为了追踪最新学术动态，还是为工程项目寻找理论支撑，这份清单都能成为得力的助手。它以开放共享的精神，降低了获取高质量学术信息的门槛，是进入自然语言处理世界不可或缺的参考指南。","# NLP Paper\n![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)\n\nnatural language processing paper list\n\n## Contents\n* [Bert Series](#Bert-Series)   \n* [Transformer Series](#Transformer-Series)  \n* [Transfer Learning](#Transfer-Learning)  \n* [Text Summarization](#Text-Summarization)  \n* [Sentiment Analysis](#Sentiment-Analysis)  \n* [Question Answering](#Question-Answering)  \n* [Machine Translation](#Machine-Translation)\n* [Survey paper](#survey-paper)  \n* [Downstream task](#downstream-task) \n   * [QA MC Dialogue](#QA-MC-Dialogue) \n   * [Slot filling](#Slot-filling)    \n   * [Analysis](#Analysis) \n   * [Word segmentation parsing NER](#Word-segmentation-parsing-NER)    \n   * [Pronoun coreference resolution](#Pronoun-coreference-resolution) \n   * [Word sense disambiguation](#Word-sense-disambiguation) \n   * [Sentiment analysis](#Sentiment-analysis) \n   * [Relation extraction](#Relation-extraction)    \n   * [Knowledge base](#Knowledge-base)     \n   * [Text classification](#Text-classification)         \n   * [WSC WNLI NLI](#WSC-WNLI-NLI) \n   * [Commonsense](#Commonsense) \n   * 
[Extractive summarization](#Extractive-summarization)\n   * [IR](#IR)   \n* [Generation](#generation) \n* [Quality evaluator](#quality-evaluator) \n* [Modification (multi-task, masking strategy, etc.)](#modification-multi-task-masking-strategy-etc) \n* [Probe](#probe) \n* [Multi-lingual](#multi-lingual) \n* [Other than English models](#other-than-english-models) \n* [Domain specific](#domain-specific) \n* [Multi-modal](#multi-modal) \n* [Model compression](#model-compression)\n* [LLM](#LLM) \n* [Misc](#misc) \n\n### Bert Series\n* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding  - NAACL 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)  \n* [ERNIE 2.0: A Continual Pre-training Framework for Language Understanding - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12412)  \n* [StructBERT: Incorporating Language Structures into Pre-training for Deep Language Understanding - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04577)  \n* [RoBERTa: A Robustly Optimized BERT Pretraining Approach  - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)  \n* [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations  - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942)  \n* [Multi-Task Deep Neural Networks for Natural Language Understanding  - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11504)  \n* [What does BERT learn about the structure of language?](https:\u002F\u002Fhal.inria.fr\u002Fhal-02131630\u002Fdocument) (ACL2019)\n* [Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09418) (ACL2019) [[github](https:\u002F\u002Fgithub.com\u002Flena-voita\u002Fthe-story-of-heads)]\n* [Open Sesame: Getting Inside BERT's Linguistic Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01698) (ACL2019 WS)\n* [Analyzing the Structure of Attention in a Transformer Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04284) (ACL2019 WS)\n* [What Does BERT Look At? An Analysis of BERT's Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04341) (ACL2019 WS)\n* [Do Attention Heads in BERT Track Syntactic Dependencies?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12246)\n* [Blackbox meets blackbox: Representational Similarity and Stability Analysis of Neural Language Models and Brains](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01539) (ACL2019 WS)\n* [Inducing Syntactic Trees from BERT Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11511) (ACL2019 WS)\n* [A Multiscale Visualization of Attention in the Transformer Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05714) (ACL2019 Demo)\n* [Visualizing and Measuring the Geometry of BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02715)\n* [How Contextual are Contextualized Word Representations? 
Comparing the Geometry of BERT, ELMo, and GPT-2 Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00512) (EMNLP2019) \n* [Are Sixteen Heads Really Better than One?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.10650) (NeurIPS2019)\n* [On the Validity of Self-Attention as Explanation in Transformer Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04211)\n* [Visualizing and Understanding the Effectiveness of BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05620) (EMNLP2019)\n* [Attention Interpretability Across NLP Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11218)\n* [Revealing the Dark Secrets of BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08593) (EMNLP2019)\n* [Investigating BERT's Knowledge of Language: Five Analysis Methods with NPIs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02597) (EMNLP2019)\n* [The Bottom-up Evolution of Representations in the Transformer: A Study with Machine Translation and Language Modeling Objectives](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01380) (EMNLP2019) \n* [A Primer in BERTology: What we know about how BERT works](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12327)\n* [Do NLP Models Know Numbers? Probing Numeracy in Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07940) (EMNLP2019)\n* [How Does BERT Answer Questions? A Layer-Wise Analysis of Transformer Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04925) (CIKM2019)\n* [Whatcha lookin' at? DeepLIFTing BERT's Attention in Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06431)\n* [What does BERT Learn from Multiple-Choice Reading Comprehension Datasets?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12391)\n* [Calibration of Pre-trained Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07892)\n* [exBERT: A Visual Analysis Tool to Explore Learned Representations in Transformers Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05276) [[github](https:\u002F\u002Fgithub.com\u002Fbhoov\u002Fexbert)]  \n* [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02984.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fmobilebert)]   \n* [Measuring and Reducing Gendered Correlations in Pre-trained Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.06032.pdf)\n* [DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.03654) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeBERTa)] (ACL2021)   \n* [STRUCTBERT: INCORPORATING LANGUAGE STRUCTURES INTO PRE-TRAINING FOR DEEP LANGUAGE UNDERSTANDING](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.04577) (ACL2021)\n* [SpanBERT: Improving Pre-training by Representing and Predicting Spans](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1907.10529) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT)] (ACL2021)   \n\n\n### Transformer Series\n* [Attention Is All You Need - arXiv 2017)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)  \n* [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860)  \n* [Universal Transformers - ICLR 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03819) \n* [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) \n* [Reformer: The 
Efficient Transformer - ICLR 2020)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04451) \n* [Adaptive Attention Span in Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07799) (ACL2019)\n* [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860) (ACL2019) [[github](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl)]\n* [Generating Long Sequences with Sparse Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10509)\n* [Adaptively Sparse Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00015) (EMNLP2019)\n* [Compressive Transformers for Long-Range Sequence Modelling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05507)\n* [The Evolved Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11117) (ICML2019)\n* [Reformer: The Efficient Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04451) (ICLR2020) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Ftrax\u002Ftree\u002Fmaster\u002Ftrax\u002Fmodels\u002Freformer)]\n* [GRET: Global Representation Enhanced Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10101) (AAAI2020)\n* [Transformer on a Diet](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06170) [[github](https:\u002F\u002Fgithub.com\u002Fcgraywang\u002Ftransformer-on-diet)]\n* [Efficient Content-Based Sparse Attention with Routing Transformers](https:\u002F\u002Fopenreview.net\u002Fforum?id=B1gjs6EtDr)\n* [BP-Transformer: Modelling Long-Range Context via Binary Partitioning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04070)  \n* [Recipes for building an open-domain chatbot](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13637.pdf)   \n* [Longformer: The Long-Document Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.05150.pdf)  \n* [UnifiedQA: Crossing Format Boundaries With a Single QA System](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00700.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Funifiedqa)]  \n* [Big Bird: Transformers for Longer Sequences](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.14062.pdf) \n* [Longformer: The Long-Document Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.05150) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Flongformer)] (ACL2021)\n* [REFORMER: THE EFFICIENT TRANSFORMER](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.04451) (ACL2021)\n* [Linformer: Self-Attention with Linear Complexity](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.04768) (ACL2021)\n* [RETHINKING ATTENTION WITH PERFORMERS](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.14794) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Fblob\u002Fmaster\u002Fperformer\u002Ffast_attention\u002Ftensorflow\u002Ffast_attention.py)] (ICLR2021)\n* [Big Bird: Transformers for Longer Sequences](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.14062) (ACL2021)\n\n\n\n\n### Transfer Learning\n* [Deep contextualized word representations - NAACL 2018)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05365)  \n* [Universal Language Model Fine-tuning for Text Classification  - ACL 2018)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.06146)  \n* [Improving Language Understanding by Generative Pre-Training  - Alec Radford)](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)  \n* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding  - NAACL 
2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)  \n* [Cloze-driven Pretraining of Self-attention Networks - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07785)  \n* [Unified Language Model Pre-training for Natural Language Understanding and Generation - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.03197)  \n* [MASS: Masked Sequence to Sequence Pre-training for Language Generation - ICML 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02450)  \n* [MPNet: Masked and Permuted Pre-training for Language Understanding)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.09297.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMPNet)]    \n* [UNILMv2: Pseudo-Masked Language Models for Unified Language Model Pre-Training)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.12804.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)]    \n* [AdapterFusion:Non-Destructive Task Composition for Transfer Learning)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00247) (ACL2022)  \n* [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00190) (ACL2022)\n* [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.09685) (ACL2022)\n\n\n\n\n### Text Summarization\n* [Positional Encoding to Control Output Sequence Length - Sho Takase(2019)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07418.pdf)  \n* [Fine-tune BERT for Extractive Summarization - Yang Liu(2019)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.10318.pdf)  \n* [Language Models are Unsupervised Multitask Learners - Alec Radford(2019)](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)   \n* [A Unified Model for Extractive and Abstractive Summarization using Inconsistency Loss - Wan-Ting Hsu(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1805.06266.pdf)   \n* [A Discourse-Aware Attention Model for Abstractive Summarization of Long Documents - Arman Cohan(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1804.05685.pdf)   \n* [GENERATING WIKIPEDIA BY SUMMARIZING LONG SEQUENCES - Peter J. Liu(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.10198.pdf)   \n* [Get To The Point: Summarization with Pointer-Generator Networks - Abigail See(2017)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.04368.pdf)\n* [A Neural Attention Model for Sentence Summarization - Alexander M. 
Rush(2015)](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD15-1044)   \n* [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.08777) (ACL2021)\n* [Abstractive Text Summarization Using BART](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9972639) (ACL2021)  \n* [CTRLSUM: TOWARDS GENERIC CONTROLLABLE TEXT SUMMARIZATION](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.04281) (ACL2021)  \n\n### Sentiment Analysis\n* [Multi-Task Deep Neural Networks for Natural Language Understanding - Xiaodong Liu(2019)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.11504.pdf)  \n* [Aspect-level Sentiment Analysis using AS-Capsules - Yequan Wang(2019)](http:\u002F\u002Fcoai.cs.tsinghua.edu.cn\u002Fhml\u002Fmedia\u002Ffiles\u002FWWW19WangY.pdf) \n* [On the Role of Text Preprocessing in Neural Network Architectures:\nAn Evaluation Study on Text Categorization and Sentiment Analysis - Jose Camacho-Collados(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01444.pdf) \n* [Learned in Translation: Contextualized Word Vectors - Bryan McCann(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1708.00107.pdf) \n* [Universal Language Model Fine-tuning for Text Classification - Jeremy Howard(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.06146.pdf) \n* [Convolutional Neural Networks with Recurrent Neural Filters - Yi Yang(2018)](https:\u002F\u002Faclweb.org\u002Fanthology\u002FD18-1109) \n* [Information Aggregation via Dynamic Routing for Sequence Encoding - Jingjing Gong(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.01501.pdf) \n* [Learning to Generate Reviews and Discovering Sentiment - Alec Radford(2017)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01444.pdf) \n* [A Structured Self-attentive Sentence Embedding - Zhouhan Lin(2017)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1703.03130.pdf) \n\n### Question Answering  \n* [Language Models are Unsupervised Multitask Learners - Alec Radford(2019)](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)  \n* [Improving Language Understanding by Generative Pre-Training - Alec Radford(2018)](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf) \n* [Bidirectional Attention Flow for Machine Comprehension - Minjoon Seo(2018)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.01603.pdf) \n* [Reinforced Mnemonic Reader for Machine Reading Comprehension - Minghao Hu(2017)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.02798.pdf)  \n* [Neural Variational Inference for Text Processing - Yishu Miao(2015)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06038.pdf)  \n* [UnifiedQA: Crossing Format Boundaries with a Single QA System](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00700) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Funifiedqa)] (ACL2021)\n* [REALM: Retrieval-Augmented Language Model Pre-Training](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08909) (ACL2021)\n* \n\n### Machine Translation    \n* [The Evolved Transformer - David R. 
So(2019)](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.11117.pdf)  \n\n### Survey paper    \n* [Evolution of transfer learning in natural language processing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07370)\n* [Pre-trained Models for Natural Language Processing: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08271)\n* [A Survey on Contextual Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07278)\n\n### Downstream task\n#### QA MC Dialogue\n* [A BERT Baseline for the Natural Questions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08634)\n* [MultiQA: An Empirical Investigation of Generalization and Transfer in Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13453) (ACL2019)\n* [Unsupervised Domain Adaptation on Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06137)\n* [BERTQA -- Attention on Steroids](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10435)\n* [A Multi-Type Multi-Span Network for Reading Comprehension that Requires Discrete Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05514) (EMNLP2019)\n* [SDNet: Contextualized Attention-based Deep Network for Conversational Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03593)\n* [Multi-hop Question Answering via Reasoning Chains](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02610)\n* [Select, Answer and Explain: Interpretable Multi-hop Reading Comprehension over Multiple Documents](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00484)\n* [Multi-step Entity-centric Information Retrieval for Multi-Hop Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07598) (EMNLP2019 WS)\n* [End-to-End Open-Domain Question Answering with BERTserini](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.01718) (NAACL2019)\n* [Latent Retrieval for Weakly Supervised Open Domain Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.00300) (ACL2019)\n* [Multi-passage BERT: A Globally Normalized BERT Model for Open-domain Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08167) (EMNLP2019)\n* [Learning to Retrieve Reasoning Paths over Wikipedia Graph for Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10470) (ICLR2020)\n* [Learning to Ask Unanswerable Questions for Machine Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06045) (ACL2019)\n* [Unsupervised Question Answering by Cloze Translation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04980) (ACL2019)\n* [Reinforcement Learning Based Graph-to-Sequence Model for Natural Question Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04942)\n* [A Recurrent BERT-based Model for Question Generation](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-5821\u002F) (EMNLP2019 WS)\n* [Learning to Answer by Learning to Ask: Getting the Best of GPT-2 and BERT Worlds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02365)\n* [Enhancing Pre-Trained Language Representations with Rich Knowledge for Machine Reading Comprehension](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FP\u002FP19\u002FP19-1226\u002F) (ACL2019)\n* [Incorporating Relation Knowledge into Commonsense Reading Comprehension with Multi-task Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04530) (CIKM2019)\n* [SG-Net: Syntax-Guided Machine Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05147)\n* [MMM: Multi-stage Multi-task Learning for Multi-choice Reading 
Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00458)\n* [Cosmos QA: Machine Reading Comprehension with Contextual Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00277) (EMNLP2019)\n* [ReClor: A Reading Comprehension Dataset Requiring Logical Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.04326) (ICLR2020)\n* [Robust Reading Comprehension with Linguistic Constraints via Posterior Regularization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06948)\n* [BAS: An Answer Selection Method Using BERT Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01528)\n* [Beat the AI: Investigating Adversarial Human Annotations for Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00293)\n* [A Simple but Effective Method to Incorporate Multi-turn Context with BERT for Conversational Machine Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.12848) (ACL2019 WS)\n* [FlowDelta: Modeling Flow Information Gain in Reasoning for Conversational Machine Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05117) (ACL2019 WS)\n* [BERT with History Answer Embedding for Conversational Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05412) (SIGIR2019)\n* [GraphFlow: Exploiting Conversation Flow with Graph Neural Networks for Conversational Machine Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00059) (ICML2019 WS)\n* [Beyond English-only Reading Comprehension: Experiments in Zero-Shot Multilingual Transfer for Bulgarian](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01519) (RANLP2019)\n* [XQA: A Cross-lingual Open-domain Question Answering Dataset](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1227\u002F) (ACL2019)\n* [Cross-Lingual Machine Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00361) (EMNLP2019)\n* [Zero-shot Reading Comprehension by Cross-lingual Transfer Learning with Multi-lingual Language Representation Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09587)\n* [Multilingual Question Answering from Formatted Text applied to Conversational Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.04659)\n* [BiPaR: A Bilingual Parallel Dataset for Multilingual and Cross-lingual Reading Comprehension on Novels](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05040) (EMNLP2019)\n* [MLQA: Evaluating Cross-lingual Extractive Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07475)\n* [Investigating Prior Knowledge for Challenging Chinese Machine Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09679) (TACL)\n* [SberQuAD - Russian Reading Comprehension Dataset: Description and Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09723)\n* [Giving BERT a Calculator: Finding Operations and Arguments with Reading Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00109) (EMNLP2019)\n* [BERT-DST: Scalable End-to-End Dialogue State Tracking with Bidirectional Encoder Representations from Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03040) (Interspeech2019)\n* [Dialog State Tracking: A Neural Reading Comprehension Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01946) \n* [A Simple but Effective BERT Model for Dialog State Tracking on Resource-Limited Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12995) (ICASSP2020)\n* [Fine-Tuning BERT for Schema-Guided Zero-Shot Dialogue State Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00181)\n* [Goal-Oriented 
Multi-Task BERT-Based Dialogue State Tracker](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.02450)\n* [Domain Adaptive Training BERT for Response Selection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04812)\n* [BERT Goes to Law School: Quantifying the Competitive Advantage of Access to Large Legal Corpora in Contract Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00473)   \n* [A BERT Baseline for the Natural Questions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00473) \n* [Wizard of Wikipedia](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.01241.pdf)\n* [BlenderBot 3: a deployed conversational agent that continually∗ learns to responsibly engage](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03188) (ACL2022)\n\n#### Slot filling\n* [BERT for Joint Intent Classification and Slot Filling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.10909)\n* [Multi-lingual Intent Detection and Slot Filling in a Joint BERT-based Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.02884)\n* [A Comparison of Deep Learning Methods for Language Understanding](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2019\u002Fabstracts\u002F1262.html) (Interspeech2019)\n\n#### Analysis\n* [Fine-grained Information Status Classification Using Discourse Context-Aware Self-Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04755)\n* [Neural Aspect and Opinion Term Extraction with Mined Rules as Weak Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03750) (ACL2019) \n* [BERT-based Lexical Substitution](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1328) (ACL2019) \n* [Assessing BERT’s Syntactic Abilities](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.05287)\n* [Does BERT agree? Evaluating knowledge of structure dependence through agreement relations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09892)\n* [Simple BERT Models for Relation Extraction and Semantic Role Labeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.05255)\n* [LIMIT-BERT : Linguistic Informed Multi-Task BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14296)\n* [A Simple BERT-Based Approach for Lexical Simplification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.06226)\n* [Multi-headed Architecture Based on BERT for Grammatical Errors Correction](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-4426\u002F) (ACL2019 WS) \n* [Towards Minimal Supervision BERT-based Grammar Error Correction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03521)\n* [BERT-Based Arabic Social Media Author Profiling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04181)\n* [Sentence-Level BERT and Multi-Task Learning of Age and Gender in Social Media](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00637)\n* [Evaluating the Factual Consistency of Abstractive Text Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12840)\n* [NegBERT: A Transfer Learning Approach for Negation Detection and Scope Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04211)\n* [xSLUE: A Benchmark and Analysis Platform for Cross-Style Language Understanding and Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03663)\n* [TabFact: A Large-scale Dataset for Table-based Fact Verification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02164)\n* [Rapid Adaptation of BERT for Information Extraction on Domain-Specific Business Documents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01861)\n* [LAMBERT: Layout-Aware language Modeling using BERT for information 
extraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08087)\n* [Keyphrase Extraction from Scholarly Articles as Sequence Labeling using Contextualized Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.08840) (ECIR2020) [[github](https:\u002F\u002Fgithub.com\u002Fmidas-research\u002Fkeyphrase-extraction-as-sequence-labeling-data)]\n* [Keyphrase Extraction with Span-based Feature Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05407)\n* [What do you mean, BERT? Assessing BERT as a Distributional Semantics Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05758)\n\n#### Word segmentation parsing NER  \n* [BERT Meets Chinese Word Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09292)\n* [Toward Fast and Accurate Neural Chinese Word Segmentation with Multi-Criteria Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.04190)\n* [Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04943)\n* [Evaluating Contextualized Embeddings on 54 Languages in POS Tagging, Lemmatization and Dependency Parsing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07448) \n* [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00204)\n* [Deep Contextualized Word Embeddings in Transition-Based and Graph-Based Dependency Parsing -- A Tale of Two Parsers Revisited](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07397) (EMNLP2019)\n* [Is POS Tagging Necessary or Even Helpful for Neural Dependency Parsing?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03204)\n* [Parsing as Pretraining](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01685) (AAAI2020)\n* [Cross-Lingual BERT Transformation for Zero-Shot Dependency Parsing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06775)\n* [Recursive Non-Autoregressive Graph-to-Graph Transformer for Dependency Parsing with Iterative Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13118)\n* [Named Entity Recognition -- Is there a glass ceiling?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02403) (CoNLL2019)\n* [A Unified MRC Framework for Named Entity Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11476)\n* [Training Compact Models for Low Resource Entity Tagging using Pre-trained Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06294)\n* [Robust Named Entity Recognition with Truecasing Pretraining](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07095) (AAAI2020)\n* [LTP: A New Active Learning Strategy for Bert-CRF Based Named Entity Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02524)\n* [MT-BioNER: Multi-task Learning for Biomedical Named Entity Recognition using Deep Bidirectional Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08904)\n* [Portuguese Named Entity Recognition using BERT-CRF](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10649)\n* [Towards Lingua Franca Named Entity Recognition with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01389)\n\n#### Pronoun coreference resolution\n* [Resolving Gendered Ambiguous Pronouns with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01161) (ACL2019 WS)\n* [Anonymized BERT: An Augmentation Approach to the Gendered Pronoun Resolution Challenge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01780) (ACL2019 WS)\n* [Gendered Pronoun Resolution using BERT and an extractive question answering 
formulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03695) (ACL2019 WS)\n* [MSnet: A BERT-based Network for Gendered Pronoun Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00308) (ACL2019 WS)\n* [Fill the GAP: Exploiting BERT for Pronoun Resolution](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-3815\u002F) (ACL2019 WS)\n* [On GAP Coreference Resolution Shared Task: Insights from the 3rd Place Solution](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FW19-3816\u002F) (ACL2019 WS)\n* [Look Again at the Syntax: Relational Graph Convolutional Network for Gendered Ambiguous Pronoun Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08868) (ACL2019 WS)\n* [BERT Masked Language Modeling for Co-reference Resolution](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-3811\u002F) (ACL2019 WS)\n* [Coreference Resolution with Entity Equalization](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1066\u002F) (ACL2019)\n* [BERT for Coreference Resolution: Baselines and Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09091) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002Fmandarjoshi90\u002Fcoref)]\n* [WikiCREM: A Large Unsupervised Corpus for Coreference Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08025) (EMNLP2019)\n* [Ellipsis and Coreference Resolution as Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11141)\n* [Coreference Resolution as Query-based Span Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01746)\n* [Multi-task Learning Based Neural Bridging Reference Resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03666)\n\n\n\n\n#### Word sense disambiguation  \n* [GlossBERT: BERT for Word Sense Disambiguation with Gloss Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07245) (EMNLP2019)\n* [Improved Word Sense Disambiguation Using Pre-Trained Contextualized Word Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00194)  (EMNLP2019)\n* [Using BERT for Word Sense Disambiguation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08358)\n* [Language Modelling Makes Sense: Propagating Representations through WordNet for Full-Coverage Word Sense Disambiguation](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1569.pdf) (ACL2019)\n* [Does BERT Make Any Sense? 
Interpretable Word Sense Disambiguation with Contextualized Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10430) (KONVENS2019)\n\n#### Sentiment analysis  \n* [Utilizing BERT for Aspect-Based Sentiment Analysis via Constructing Auxiliary Sentence](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.09588) (NAACL2019)\n* [BERT Post-Training for Review Reading Comprehension and Aspect-based Sentiment Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02232) (NAACL2019)\n* [Exploiting BERT for End-to-End Aspect-based Sentiment Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00883) (EMNLP2019 WS)\n* [Adapt or Get Left Behind: Domain Adaptation through BERT Language Model Finetuning for Aspect-Target Sentiment Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11860) \n* [An Investigation of Transfer Learning-Based Sentiment Analysis in Japanese](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09642) (ACL2019)\n* [\"Mask and Infill\" : Applying Masked Language Model to Sentiment Transfer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08039)\n* [Adversarial Training for Aspect-Based Sentiment Analysis with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.11316)\n* [Utilizing BERT Intermediate Layers for Aspect Based Sentiment Analysis and Natural Language Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.04815)\n\n#### Relation extraction\n* [Matching the Blanks: Distributional Similarity for Relation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03158) (ACL2019)\n* [BERT-Based Multi-Head Selection for Joint Entity-Relation Extraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05908) (NLPCC2019)\n* [Enriching Pre-trained Language Model with Entity Information for Relation Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08284)\n* [Span-based Joint Entity and Relation Extraction with Transformer Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07755)\n* [Fine-tune Bert for DocRED with Two-step Process](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11898)\n* [Entity, Relation, and Event Extraction with Contextualized Span Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03546) (EMNLP2019)\n\n#### Knowledge base\n* [KG-BERT: BERT for Knowledge Graph Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03193)\n* [Language Models as Knowledge Bases?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01066) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FLAMA)]\n* [BERT is Not a Knowledge Base (Yet): Factual Knowledge vs. 
Name-Based Reasoning in Unsupervised QA](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03681)\n* [Inducing Relational Knowledge from BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12753) (AAAI2020)\n* [Latent Relation Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07690) (AAAI2020)\n* [Pretrained Encyclopedia: Weakly Supervised Knowledge-Pretrained Language Model](https:\u002F\u002Fopenreview.net\u002Fforum?id=BJlzm64tDH) (ICLR2020)\n* [Zero-shot Entity Linking with Dense Entity Retrieval](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03814)\n* [Investigating Entity Knowledge in BERT with Simple Neural End-To-End Entity Linking](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FK19-1063\u002F) (CoNLL2019)\n* [Improving Entity Linking by Modeling Latent Entity Type Information](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.01447) (AAAI2020)\n* [PEL-BERT: A Joint Model for Protocol Entity Linking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00744)\n* [How Can We Know What Language Models Know?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12543)\n* [REALM: Retrieval-Augmented Language Model Pre-Training](https:\u002F\u002Fkentonl.com\u002Fpub\u002Fgltpc.2020.pdf)\n\n\n#### Text classification\n* [How to Fine-Tune BERT for Text Classification?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05583)\n* [X-BERT: eXtreme Multi-label Text Classification with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02331)\n* [DocBERT: BERT for Document Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08398)\n* [Enriching BERT with Knowledge Graph Embeddings for Document Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08402)\n* [Classification and Clustering of Arguments with Contextualized Word Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.09821) (ACL2019)\n* [BERT for Evidence Retrieval and Claim Verification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02655)\n* [Stacked DeBERT: All Attention in Incomplete Data for Text Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00137)\n* [Cost-Sensitive BERT for Generalisable Sentence Classification with Imbalanced Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11563)\n\n#### WSC WNLI NLI\n* [Exploring Unsupervised Pretraining and Sentence Structure Modelling for Winograd Schema Challenge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09705)\n* [A Surprisingly Robust Trick for the Winograd Schema Challenge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06290)\n* [WinoGrande: An Adversarial Winograd Schema Challenge at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10641) (AAAI2020)\n* [Improving Natural Language Inference with a Pretrained Parser](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08217)\n* [Adversarial NLI: A New Benchmark for Natural Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14599)\n* [Adversarial Analysis of Natural Language Inference Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03441) (ICSC2020)\n* [HypoNLI: Exploring the Artificial Patterns of Hypothesis-only Bias in Natural Language Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02756) (LREC2020)\n* [Evaluating BERT for natural language inference: A case study on the CommitmentBank](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1630\u002F) (EMNLP2019)\n\n#### Commonsense\n* [CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00937) (NAACL2019)\n* 
[HellaSwag: Can a Machine Really Finish Your Sentence?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07830) (ACL2019) [[website](https:\u002F\u002Frowanzellers.com\u002Fhellaswag\u002F)]\n* [Story Ending Prediction by Transferable BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07504) (IJCAI2019)\n* [Explain Yourself! Leveraging Language Models for Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02361) (ACL2019)\n* [Align, Mask and Select: A Simple Method for Incorporating Commonsense Knowledge into Language Representation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06725)\n* [Informing Unsupervised Pretraining with External Linguistic Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02339)\n* [Commonsense Knowledge + BERT for Level 2 Reading Comprehension Ability Test](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03415)\n* [BIG MOOD: Relating Transformers to Explicit Commonsense Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07713)\n* [Commonsense Knowledge Mining from Pretrained Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00505) (EMNLP2019)\n* [KagNet: Knowledge-Aware Graph Networks for Commonsense Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02151) (EMNLP2019)\n* [Cracking the Contextual Commonsense Code: Understanding Commonsense Reasoning Aptitude of Deep Contextual Representations](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-6001\u002F) (EMNLP2019 WS)\n* [Do Massively Pretrained Language Models Make Better Storytellers?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10705) (CoNLL2019)\n* [PIQA: Reasoning about Physical Commonsense in Natural Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11641v1) (AAAI2020)\n* [Evaluating Commonsense in Pre-trained Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11931) (AAAI2020)\n* [Why Do Masked Neural Language Models Still Need Common Sense Knowledge?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03024)\n* [Do Neural Language Representations Learn Physical Commonsense?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.02899) (CogSci2019)\n\n#### Extractive summarization\n* [HIBERT: Document Level Pre-training of Hierarchical Bidirectional Transformers for Document Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06566) (ACL2019)\n* [Deleter: Leveraging BERT to Perform Unsupervised Successive Text Compression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03223)\n* [Discourse-Aware Neural Extractive Model for Text Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14142)  \n* [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.08777.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fpegasus)]    \n* [Discourse-Aware Neural Extractive Text Summarization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.14142.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fjiacheng-xu\u002FDiscoBERT)]      \n\n\n#### IR\n* [Passage Re-ranking with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.04085)\n* [Investigating the Successes and Failures of BERT for Passage Re-Ranking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01758)\n* [Understanding the Behaviors of BERT in Ranking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07531)\n* [Document Expansion by Query Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08375)\n* [CEDR: Contextualized Embeddings for Document 
Ranking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07094) (SIGIR2019)\n* [Deeper Text Understanding for IR with Contextual Neural Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09217) (SIGIR2019)\n* [FAQ Retrieval using Query-Question Similarity and BERT-Based Query-Answer Relevance](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02851) (SIGIR2019)\n* [Multi-Stage Document Ranking with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14424)       \n* [REALM: Retrieval-Augmented Language Model Pre-Training](https:\u002F\u002Fkentonl.com\u002Fpub\u002Fgltpc.2020.pdf)       \n* [How Much Knowledge Can You Pack Into the Parameters of a Language Model?](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08910.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Ft5_closed_book_qa)]         \n* [Dense Passage Retrieval for Open-Domain Question Answering](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.04906.pdf) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDPR)]         \n\n\n### Generation    \n* [BERT has a Mouth, and It Must Speak: BERT as a Markov Random Field Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.04094) (NAACL2019 WS)\n* [Pretraining-Based Natural Language Generation for Text Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09243)\n* [Text Summarization with Pretrained Encoders](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08345) (EMNLP2019) [[github (original)](https:\u002F\u002Fgithub.com\u002Fnlpyang\u002FPreSumm)] [[github (huggingface)](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Ftree\u002Fmaster\u002Fexamples\u002Fsummarization)]\n* [Multi-stage Pretraining for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10599)\n* [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08777)\n* [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02450) (ICML2019) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMASS)], [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMASS\u002Ftree\u002Fmaster\u002FMASS-fairseq)]\n* [Unified Language Model Pre-training for Natural Language Understanding and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.03197) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)] (NeurIPS2019)\n* [UniLMv2: Pseudo-Masked Language Models for Unified Language Model Pre-Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12804) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)]\n* [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04063)\n* [Towards Making the Most of BERT in Neural Machine Translation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05672)\n* [Improving Neural Machine Translation with Pre-trained Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07688)\n* [On the use of BERT for Neural Machine Translation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12744) (EMNLP2019 WS)\n* [Incorporating BERT into Neural Machine Translation](https:\u002F\u002Fopenreview.net\u002Fforum?id=Hyl7ygStwB) (ICLR2020)\n* [Recycling a Pre-trained BERT Encoder for Neural Machine Translation](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-5603\u002F)\n* [Leveraging Pre-trained Checkpoints for Sequence 
Generation Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12461)\n* [Mask-Predict: Parallel Decoding of Conditional Masked Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09324) (EMNLP2019)\n* [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.13461)\n* [ERNIE-GEN: An Enhanced Multi-Flow Pre-training and Fine-tuning Framework for Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.11314)\n* [Cross-Lingual Natural Language Generation via Pre-Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10481) (AAAI2020) [[github](https:\u002F\u002Fgithub.com\u002FCZWin32768\u002FXNLG)]\n* [Multilingual Denoising Pre-training for Neural Machine Translation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08210)\n* [PLATO: Pre-trained Dialogue Generation Model with Discrete Latent Variable](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07931)\n* [Unsupervised Pre-training for Natural Language Generation: A Literature Review](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06171)  \n* [Improving Language Understanding by Generative Pre-Training](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)  \n* [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)  \n* [Language Models are Few-Shot Learners](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.14165.pdf)  \n\n### Quality evaluator   \n* [BERTScore: Evaluating Text Generation with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09675) (ICLR2020)\n* [Machine Translation Evaluation with BERT Regressor](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12679)\n* [SumQE: a BERT-based Summary Quality Estimation Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00578) (EMNLP2019)\n* [MoverScore: Text Generation Evaluating with Contextualized Embeddings and Earth Mover Distance](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02622) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002FAIPHES\u002Femnlp19-moverscore)]\n* [BERT as a Teacher: Contextual Embeddings for Sequence-Level Reward](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02738)\n\n### Modification (multi-task, masking strategy, etc.)    
\n* [Multi-Task Deep Neural Networks for Natural Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11504) (ACL2019)\n* [The Microsoft Toolkit of Multi-Task Deep Neural Networks for Natural Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.07972)\n* [BERT and PALs: Projected Attention Layers for Efficient Adaptation in Multi-Task Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.02671) (ICML2019)\n* [Unifying Question Answering and Text Classification via Span Extraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09286)\n* [ERNIE: Enhanced Language Representation with Informative Entities](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07129) (ACL2019)\n* [ERNIE: Enhanced Representation through Knowledge Integration](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09223)\n* [ERNIE 2.0: A Continual Pre-training Framework for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12412) (AAAI2020)\n* [Pre-Training with Whole Word Masking for Chinese BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08101)\n* [SpanBERT: Improving Pre-training by Representing and Predicting Spans](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10529) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT)]\n* [Blank Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03079)\n* [Efficient Training of BERT by Progressively Stacking](http:\u002F\u002Fproceedings.mlr.press\u002Fv97\u002Fgong19a.html) (ICML2019) [[github](https:\u002F\u002Fgithub.com\u002Fgonglinyuan\u002FStackingBERT)]\n* [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692) [[github](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Ftree\u002Fmaster\u002Fexamples\u002Froberta)]\n* [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942) (ICLR2020)\n* [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https:\u002F\u002Fopenreview.net\u002Fforum?id=r1xMH1BtvB) (ICLR2020) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Felectra)] [[blog](https:\u002F\u002Fai.googleblog.com\u002F2020\u002F03\u002Fmore-efficient-nlp-model-pre-training.html)]\n* [FreeLB: Enhanced Adversarial Training for Language Understanding](https:\u002F\u002Fopenreview.net\u002Fforum?id=BygzbyHFvB) (ICLR2020)\n* [KERMIT: Generative Insertion-Based Modeling for Sequences](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01604)\n* [DisSent: Sentence Representation Learning from Explicit Discourse Relations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.04334) (ACL2019)\n* [StructBERT: Incorporating Language Structures into Pre-training for Deep Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04577) (ICLR2020)\n* [Syntax-Infused Transformer and BERT models for Machine Translation and Natural Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06156)\n* [SenseBERT: Driving Some Sense into BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05646)\n* [Semantics-aware BERT for Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02209) (AAAI2020)\n* [K-BERT: Enabling Language Representation with Knowledge Graph](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07606)\n* [Knowledge Enhanced Contextual Word Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04164) (EMNLP2019)\n* [KEPLER: A Unified Model for Knowledge Embedding and 
Pre-trained Language Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06136)\n* [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10084) (EMNLP2019)\n* [SBERT-WK: A Sentence Embedding Method By Dissecting BERT-based Word Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06652)\n* [Universal Text Representation from BERT: An Empirical Study](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07973)\n* [Symmetric Regularization based BERT for Pair-wise Semantic Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03405)\n* [Transfer Fine-Tuning: A BERT Case Study](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00931) (EMNLP2019)\n* [Improving Pre-Trained Multilingual Models with Vocabulary Expansion](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12440) (CoNLL2019)\n* [SesameBERT: Attention for Anywhere](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03176)\n* [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftext-to-text-transfer-transformer)]\n* [SMART: Robust and Efficient Fine-Tuning for Pre-trained Natural Language Models through Principled Regularized Optimization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03437)\n\n### Probe   \n* [A Structural Probe for Finding Syntax in Word Representations](https:\u002F\u002Faclweb.org\u002Fanthology\u002Fpapers\u002FN\u002FN19\u002FN19-1419\u002F) (NAACL2019)\n* [Linguistic Knowledge and Transferability of Contextual Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.08855) (NAACL2019) [[github](https:\u002F\u002Fgithub.com\u002Fnelson-liu\u002Fcontextual-repr-analysis)]\n* [Probing What Different NLP Tasks Teach Machines about Function Word Comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11544) (*SEM2019)\n* [BERT Rediscovers the Classical NLP Pipeline](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05950) (ACL2019)\n* [Probing Neural Network Comprehension of Natural Language Arguments](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.07355) (ACL2019)\n* [Cracking the Contextual Commonsense Code: Understanding Commonsense Reasoning Aptitude of Deep Contextual Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01157) (EMNLP2019 WS)\n* [What do you mean, BERT? Assessing BERT as a Distributional Semantics Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05758)\n* [Quantity doesn't buy quality syntax with neural language models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00111) (EMNLP2019)\n* [Are Pre-trained Language Models Aware of Phrases? Simple but Strong Baselines for Grammar Induction](https:\u002F\u002Fopenreview.net\u002Fforum?id=H1xPR3NtPB) (ICLR2020)\n* [oLMpics -- On what Language Model Pre-training Captures](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13283)\n* [How Much Knowledge Can You Pack Into the Parameters of a Language Model?](http:\u002F\u002Fcolinraffel.com\u002Fpublications\u002Farxiv2020how.pdf)\n* [What Does My QA Model Know? 
Devising Controlled Probes using Expert Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13337)\n* [Attention is not Explanation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.10186) (ACL2021)\n\n### Multi-lingual  \n* [Multilingual Constituency Parsing with Self-Attention and Pre-Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.11760) (ACL2019)\n* [Cross-lingual Language Model Pretraining](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.07291) (NeurIPS2019) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FXLM)]\n* [75 Languages, 1 Model: Parsing Universal Dependencies Universally](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02099) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002Fhyperparticle\u002Fudify)]\n* [Zero-shot Dependency Parsing with Pre-trained Multilingual Sentence Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05479) (EMNLP2019 WS)\n* [Beto, Bentz, Becas: The Surprising Cross-Lingual Effectiveness of BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09077) (EMNLP2019)\n* [How multilingual is Multilingual BERT?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01502) (ACL2019)\n* [How Language-Neutral is Multilingual BERT?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03310)\n* [Is Multilingual BERT Fluent in Language Generation?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03806)\n* [Unicoder: A Universal Language Encoder by Pre-training with Multiple Cross-lingual Tasks](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1252\u002F) (EMNLP2019)\n* [BERT is Not an Interlingua and the Bias of Tokenization](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-6106\u002F) (EMNLP2019 WS)\n* [Cross-Lingual Ability of Multilingual BERT: An Empirical Study](https:\u002F\u002Fopenreview.net\u002Fforum?id=HJeT3yrtDr) (ICLR2020)\n* [Multilingual Alignment of Contextual Word Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03518) (ICLR2020)\n* [On the Cross-lingual Transferability of Monolingual Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11856)\n* [Unsupervised Cross-lingual Representation Learning at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02116)\n* [Emerging Cross-lingual Structure in Pretrained Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01464)\n* [Can Monolingual Pretrained Models Help Cross-Lingual Classification?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03913)\n* [Fully Unsupervised Crosslingual Semantic Textual Similarity Metric Based on BERT for Identifying Parallel Data](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FK19-1020\u002F) (CoNLL2019)\n* [What the \\[MASK\\]? 
Making Sense of Language-Specific BERT Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02912)\n* [XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11080)\n* [mT5: A Massively Multilingual Pre-trained Text-to-Text Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.11934) (ACL2021) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fmultilingual-t5)] \n\n### Other than English models    \n* [CamemBERT: a Tasty French Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03894)\n* [FlauBERT: Unsupervised Language Model Pre-training for French](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05372)\n* [Multilingual is not enough: BERT for Finnish](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07076)\n* [BERTje: A Dutch BERT Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09582)\n* [RobBERT: a Dutch RoBERTa-based Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.06286)\n* [Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07213)\n* [AraBERT: Transformer-based Model for Arabic Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00104)\n* [PhoBERT: Pre-trained language models for Vietnamese](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00744) \n* [CLUECorpus2020: A Large-scale Chinese Corpus for Pre-training Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.01355)\n\n\n### Domain specific \n* [BioBERT: a pre-trained biomedical language representation model for biomedical text mining](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08746)\n* [Transfer Learning in Biomedical Natural Language Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05474) (ACL2019 WS) \n* [BERT-based Ranking for Biomedical Entity Normalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03548)\n* [PubMedQA: A Dataset for Biomedical Research Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06146) (EMNLP2019)\n* [Pre-trained Language Model for Biomedical Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08229)\n* [How to Pre-Train Your Model? 
Comparison of Different Pre-Training Models for Biomedical Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00712)\n* [ClinicalBERT: Modeling Clinical Notes and Predicting Hospital Readmission](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.05342)\n* [Publicly Available Clinical BERT Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03323) (NAACL2019 WS)\n* [Progress Notes Classification and Keyword Extraction using Attention-based Deep Learning Models with BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05786)\n* [SciBERT: Pretrained Contextualized Embeddings for Scientific Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10676) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fscibert)]\n* [PatentBERT: Patent Classification with Fine-Tuning a pre-trained BERT Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02124)\n\n\n### Multi-modal    \n* [VideoBERT: A Joint Model for Video and Language Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01766) (ICCV2019)\n* [ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.02265) (NeurIPS2019)\n* [VisualBERT: A Simple and Performant Baseline for Vision and Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03557)\n* [Selfie: Self-supervised Pretraining for Image Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02940)\n* [ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.07966)\n* [Contrastive Bidirectional Transformer for Temporal Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05743)\n* [M-BERT: Injecting Multimodal Information in the BERT Structure](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05787)\n* [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07490) (EMNLP2019)\n* [Fusion of Detected Objects in Text for Visual Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05054) (EMNLP2019)\n* [BERT representations for Video Question Answering](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fhtml\u002FYang_BERT_representations_for_Video_Question_Answering_WACV_2020_paper.html) (WACV2020)\n* [Unified Vision-Language Pre-Training for Image Captioning and VQA](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11059) [[github](https:\u002F\u002Fgithub.com\u002FLuoweiZhou\u002FVLP)]\n* [Large-scale Pretraining for Visual Dialog: A Simple State-of-the-Art Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02379)\n* [VL-BERT: Pre-training of Generic Visual-Linguistic Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08530) (ICLR2020)\n* [Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06066)\n* [UNITER: Learning UNiversal Image-TExt Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11740)\n* [Supervised Multimodal Bitransformers for Classifying Images and Text](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02950)\n* [Weak Supervision helps Emergence of Word-Object Alignment and improves Vision-Language Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03063)\n* [BERT Can See Out of the Box: On the Cross-modal Transferability of Text Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10832)\n* [BERT for Large-scale Video Segment 
Classification with Test-time Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01127) (ICCV2019WS)\n* [SpeechBERT: Cross-Modal Pre-trained Language Model for End-to-end Spoken Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11559)\n* [vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05453)\n* [Effectiveness of self-supervised pre-training for speech recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03912)\n* [Understanding Semantics from Speech Through Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10924)\n* [Towards Transfer Learning for End-to-End Speech Synthesis from Deep Pre-Trained Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07307) \n\n\n### Model compression    \n* [Distilling Task-Specific Knowledge from BERT into Simple Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.12136)\n* [Patient Knowledge Distillation for BERT Model Compression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09355) (EMNLP2019)\n* [Small and Practical BERT Models for Sequence Labeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00100) (EMNLP2019)\n* [Pruning a BERT-based Question Answering Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06360)\n* [TinyBERT: Distilling BERT for Natural Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10351) [[github](https:\u002F\u002Fgithub.com\u002Fhuawei-noah\u002FPretrained-Language-Model\u002Ftree\u002Fmaster\u002FTinyBERT)]\n* [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01108) (NeurIPS2019 WS) [[github](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Ftree\u002Fmaster\u002Fexamples\u002Fdistillation)]\n* [Knowledge Distillation from Internal Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03723) (AAAI2020)\n* [PoWER-BERT: Accelerating BERT inference for Classification Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08950)\n* [WaLDORf: Wasteless Language-model Distillation On Reading-comprehension](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06638)\n* [Extreme Language Model Compression with Optimal Subwords and Shared Projections](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11687)\n* [BERT-of-Theseus: Compressing BERT by Progressive Module Replacing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.02925)\n* [Compressing BERT: Studying the Effects of Weight Pruning on Transfer Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08307)\n* [MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10957)\n* [Compressing Large-Scale Transformer-Based Models: A Case Study on BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11985)\n* [Train Large, Then Compress: Rethinking Model Size for Efficient Training and Inference of Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11794)\n* [MobileBERT: Task-Agnostic Compression of BERT by Progressive Knowledge Transfer](https:\u002F\u002Fopenreview.net\u002Fforum?id=SJxjVaNKwB)\n* [Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05840)\n* [Q8BERT: Quantized 8Bit BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06188) (NeurIPS2019 WS)\n\n\n### LLM\n* [Attention Is All You Need](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.03762.pdf)\n* 
[Improving Language Understanding by Generative Pre-Training](https:\u002F\u002Fwww.cs.ubc.ca\u002F~amuham01\u002FLING530\u002Fpapers\u002Fradford2018improving.pdf)\n* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)\n* [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)\n* [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.08053.pdf)\n* [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https:\u002F\u002Fjmlr.org\u002Fpapers\u002Fv21\u002F20-074.html)\n* [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.02054.pdf)\n* [Scaling Laws for Neural Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.08361.pdf)\n* [Language models are few-shot learners](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)\n* [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.03961.pdf)\n* [Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.03374.pdf)\n* [Multitask Prompted Training Enables Zero-Shot Task Generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207)\n* [GLaM: Efficient Scaling of Language Models with Mixture-of-Experts](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.06905.pdf)\n* [WebGPT: Browser-assisted question-answering with human feedback](https:\u002F\u002Fwww.semanticscholar.org\u002Fpaper\u002FWebGPT%3A-Browser-assisted-question-answering-with-Nakano-Hilton\u002F2f3efe44083af91cef562c1a3451eee2f8601d22)\n* [Improving language models by retrieving from trillions of tokens](https:\u002F\u002Fwww.deepmind.com\u002Fpublications\u002Fimproving-language-models-by-retrieving-from-trillions-of-tokens)\n* [Scaling Language Models: Methods, Analysis & Insights from Training Gopher](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.11446.pdf)\n* [Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11903.pdf)\n* [LaMDA: Language Models for Dialog Applications](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.08239.pdf)\n* [Solving Quantitative Reasoning Problems with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14858)\n* [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11990.pdf)\n* [Training language models to follow instructions with human feedback](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155.pdf)\n* [PaLM: Scaling Language Modeling with Pathways](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf)\n* [An empirical analysis of compute-optimal large language model training](https:\u002F\u002Fwww.deepmind.com\u002Fpublications\u002Fan-empirical-analysis-of-compute-optimal-large-language-model-training)\n* [OPT: Open Pre-trained Transformer Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.01068.pdf)\n* [Unifying Language Learning Paradigms](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131v1)\n* [Emergent Abilities of Large Language 
Models](https:\u002F\u002Fopenreview.net\u002Fpdf?id=yzkSU5zdwD)\n* [Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)\n* [Language Models are General-Purpose Interfaces](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.06336.pdf)\n* [Improving alignment of dialogue agents via targeted human judgements](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.14375.pdf)\n* [Scaling Instruction-Finetuned Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf)\n* [GLM-130B: An Open Bilingual Pre-trained Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.02414.pdf)\n* [Holistic Evaluation of Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09110.pdf)\n* [BLOOM: A 176B-Parameter Open-Access Multilingual Language Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.05100.pdf)\n* [Galactica: A Large Language Model for Science](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09085.pdf)\n* [OPT-IML: Scaling Language Model Instruction Meta Learning through the Lens of Generalization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.12017)\n* [The Flan Collection: Designing Data and Methods for Effective Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.13688.pdf)\n* [LLaMA: Open and Efficient Foundation Language Models](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F)\n* [Language Is Not All You Need: Aligning Perception with Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045)\n* [PaLM-E: An Embodied Multimodal Language Model](https:\u002F\u002Fpalm-e.github.io)\n* [GPT-4 Technical Report](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4)\n* [Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01373)\n* [Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03047)\n* [PaLM 2 Technical Report](https:\u002F\u002Fai.google\u002Fstatic\u002Fdocuments\u002Fpalm2techreport.pdf)\n* [RWKV: Reinventing RNNs for the Transformer Era](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048)\n* [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.18290.pdf)\n* [Llama 2: Open Foundation and Fine-Tuned Chat Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)\n* [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https:\u002F\u002Farxiv.org\u002Fftp\u002Farxiv\u002Fpapers\u002F2312\u002F2312.00752.pdf)\n* [TinyLlama: An Open-Source Small Language Model](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.02385.pdf)\n\n\n\n\n### Misc\n* [jiant: A Software Toolkit for Research on General-Purpose Text Understanding Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02249) [[github](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002Fjiant\u002F)]\n* [Cloze-driven Pretraining of Self-attention Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07785)\n* [Learning and Evaluating General Linguistic Intelligence](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11373)\n* [To Tune or Not to Tune? 
Adapting Pretrained Representations to Diverse Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05987) (ACL2019 WS)\n* [Learning to Speak and Act in a Fantasy Text Adventure Game](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1062\u002F) (EMNLP2019)\n* [Conditional BERT Contextual Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06705)\n* [Data Augmentation using Pre-trained Transformer Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02245)\n* [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00962) (ICLR2020)\n* [Mixout: Effective Regularization to Finetune Large-scale Pretrained Language Models](https:\u002F\u002Fopenreview.net\u002Fforum?id=HkgaETNtDB) (ICLR2020)\n* [A Mutual Information Maximization Perspective of Language Representation Learning](https:\u002F\u002Fopenreview.net\u002Fforum?id=Syx79eBKwr) (ICLR2020)\n* [Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11932) (AAAI2020)\n* [Thieves on Sesame Street! Model Extraction of BERT-based APIs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12366) (ICLR2020)\n* [Graph-Bert: Only Attention is Needed for Learning Graph Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.05140)\n* [CodeBERT: A Pre-Trained Model for Programming and Natural Languages](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08155)\n* [Fine-Tuning Pretrained Language Models: Weight Initializations, Data Orders, and Early Stopping](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06305)\n* [Extending Machine Language Models toward Human-Level Language Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05877)\n* [Glyce: Glyph-vectors for Chinese Character Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10125)\n* [Back to the Future -- Sequential Alignment of Text Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03464)\n* [Improving Cuneiform Language Identification with BERT](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-1402\u002F) (NAACL2019 WS)\n* [BERT has a Moral Compass: Improvements of ethical and moral values of machines](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05238)\n* [SMILES-BERT: Large Scale Unsupervised Pre-Training for Molecular Property Prediction](https:\u002F\u002Fdl.acm.org\u002Fcitation.cfm?id=3342186) (ACM-BCB2019)\n* [On the comparability of Pre-trained Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00781)\n* [Transformers: State-of-the-art Natural Language Processing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03771) \n* [Jukebox: A Generative Model for Music](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fjukebox.pdf)  \n* [WT5?! 
Training Text-to-Text Models to Explain their\nPredictions](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14546.pdf)  \n* [TAPAS: Weakly Supervised Table Parsing via Pre-training](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02349.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftapas)]  \n* [TABERT: Pretraining for Joint Understanding of\nTextual and Tabular Data](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.08314.pdf)    \n\n\n# Author\nChangWookJun \u002F @changwookjun (changwookjun@gmail.com)\n","# 自然语言处理论文\n![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)\n\n自然语言处理论文列表\n\n## 目录\n* [Bert系列](#Bert-Series)   \n* [Transformer系列](#Transformer-Series)  \n* [迁移学习](#Transfer-Learning)  \n* [文本摘要](#Text-Summarization)  \n* [情感分析](#Sentiment-Analysis)  \n* [问答](#Question-Answering)  \n* [机器翻译](#Machine-Translation)\n* [综述论文](#survey-paper)  \n* [下游任务](#downstream-task) \n   * [QA MC对话](#QA-MC-Dialogue) \n   * [槽位填充](#Slot-filling)    \n   * [分析](#Analysis) \n   * [分词、句法分析、命名实体识别](#Word-segmentation-parsing-NER)    \n   * [代词指代消解](#Pronoun-coreference-resolution) \n   * [词义消歧](#Word-sense-disambiguation) \n   * [情感分析](#Sentiment-analysis) \n   * [关系抽取](#Relation-extraction)    \n   * [知识库](#Knowledge-base)     \n   * [文本分类](#Text-classification)         \n   * [WSC WNLI NLI](#WSC-WNLI-NLI) \n   * [常识](#Commonsense) \n   * [抽取式摘要](#Extractive-summarization)\n   * [信息检索](#IR)   \n* [生成](#generation) \n* [质量评估器](#quality-evaluator) \n* [改进方法（多任务、掩码策略等）](#modification-multi-task-masking-strategy-etc) \n* [探针](#probe) \n* [多语言](#multi-lingual) \n* [非英语模型](#other-than-english-models) \n* [领域特定](#domain-specific) \n* [多模态](#multi-modal) \n* [模型压缩](#model-compression)\n* [大语言模型](#LLM) \n* [其他](#misc) \n\n### Bert系列\n* [BERT：用于语言理解的深度双向Transformer预训练 - NAACL 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)  \n* [ERNIE 2.0：持续预训练的语言理解框架 - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12412)  \n* [StructBERT：将语言结构融入预训练以实现深度语言理解 - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04577)  \n* [RoBERTa：一种鲁棒优化的BERT预训练方法 - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)  \n* [ALBERT：用于自监督语言表示学习的轻量级BERT - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942)  \n* [用于自然语言理解的多任务深度神经网络 - arXiv 2019)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11504)  \n* [BERT学到了关于语言结构的什么？](https:\u002F\u002Fhal.inria.fr\u002Fhal-02131630\u002Fdocument) (ACL2019)\n* [多头自注意力分析：专用头负责主要工作，其余可剪枝](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09418) (ACL2019) [[github](https:\u002F\u002Fgithub.com\u002Flena-voita\u002Fthe-story-of-heads)]\n* [芝麻开门：深入BERT的语言知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01698) (ACL2019 WS)\n* [Transformer语言模型中注意力结构的分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04284) (ACL2019 WS)\n* [BERT关注的是什么？BERT注意力分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04341) (ACL2019 WS)\n* [BERT中的注意力头是否跟踪句法依赖关系？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12246)\n* [黑盒遇见黑盒：神经语言模型与大脑的表征相似性和稳定性分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01539) (ACL2019 WS)\n* [从BERT表示中诱导句法树](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11511) (ACL2019 WS)\n* [Transformer模型中注意力的多尺度可视化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05714) (ACL2019 Demo)\n* [BERT几何结构的可视化与测量](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02715)\n* 
[上下文相关的词表示到底有多上下文化？比较BERT、ELMo和GPT-2嵌入的几何结构](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00512) (EMNLP2019) \n* [十六个头真的比一个好吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.10650) (NeurIPS2019)\n* [作为Transformer模型解释的自注意力的有效性探讨](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04211)\n* [BERT有效性的可视化与理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05620) (EMNLP2019)\n* [跨NLP任务的注意力可解释性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11218)\n* [揭示BERT的黑暗秘密](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08593) (EMNLP2019)\n* [探究BERT对语言的认知：使用NPIs的五种分析方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02597) (EMNLP2019)\n* [Transformer中表示的自下而上的演化：一项结合机器翻译和语言建模目标的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01380) (EMNLP2019) \n* [BERT学入门：我们所知道的关于BERT工作原理的一切](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12327)\n* [NLP模型知道数字吗？嵌入中的数字能力探测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07940) (EMNLP2019)\n* [BERT如何回答问题？Transformer表示的逐层分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04925) (CIKM2019)\n* [你在看什么呢？在问答任务中通过DeepLIFT提升BERT注意力](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06431)\n* [BERT从多项选择阅读理解数据集中学到了什么？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12391)\n* [预训练Transformer的校准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07892)\n* [exBERT：探索Transformer模型中学习到的表示的可视化分析工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05276) [[github](https:\u002F\u002Fgithub.com\u002Fbhoov\u002Fexbert)]  \n* [MobileBERT：适用于资源受限设备的紧凑型任务无关BERT](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02984.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fmobilebert)]   \n* [测量并减少预训练模型中的性别相关性](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.06032.pdf)\n* [DEBERTA：解耦注意力的解码增强型BERT](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.03654) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FDeBERTa)] (ACL2021)   \n* [STRUCTBERT：将语言结构融入预训练以实现深度语言理解](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.04577) (ACL2021)\n* [SpanBERT：通过表示和预测跨度来改进预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1907.10529) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT)] (ACL2021)\n\n### Transformer系列\n* [注意力就是一切 - arXiv 2017年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762)  \n* [Transformer-XL：超越固定长度上下文的注意力语言模型 - arXiv 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860)  \n* [通用Transformer - ICLR 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03819) \n* [探索统一文本到文本Transformer迁移学习的极限 - arXiv 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) \n* [Reformer：高效的Transformer - ICLR 2020年](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04451) \n* [Transformer中的自适应注意力范围](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07799)（ACL2019）\n* [Transformer-XL：超越固定长度上下文的注意力语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02860)（ACL2019）[[github](https:\u002F\u002Fgithub.com\u002Fkimiyoung\u002Ftransformer-xl)]\n* [使用稀疏Transformer生成长序列](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10509)\n* [自适应稀疏Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00015)（EMNLP2019）\n* [用于长距离序列建模的压缩Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05507)\n* [进化版Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11117)（ICML2019）\n* [Reformer：高效的Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04451)（ICLR2020）[[github](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Ftrax\u002Ftree\u002Fmaster\u002Ftrax\u002Fmodels\u002Freformer)]\n* 
[GRET：全局表示增强型Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10101)（AAAI2020）\n* [节食版Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06170) [[github](https:\u002F\u002Fgithub.com\u002Fcgraywang\u002Ftransformer-on-diet)]\n* [基于路由的高效内容稀疏注意力Transformer](https:\u002F\u002Fopenreview.net\u002Fforum?id=B1gjs6EtDr)\n* [BP-Transformer：通过二元划分建模长距离上下文](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04070)  \n* [构建开放域聊天机器人的方法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13637.pdf)   \n* [Longformer：长文档Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.05150.pdf)  \n* [UnifiedQA：用单一问答系统跨越格式边界](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00700.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Funifiedqa)]  \n* [Big Bird：适用于更长序列的Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.14062.pdf) \n* [Longformer：长文档Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.05150) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Flongformer)]（ACL2021）\n* [REFORMER：高效的Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.04451)（ACL2021）\n* [Linformer：线性复杂度的自注意力机制](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.04768)（ACL2021）\n* [重新思考Performers中的注意力机制](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.14794) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Fblob\u002Fmaster\u002Fperformer\u002Ffast_attention\u002Ftensorflow\u002Ffast_attention.py)]（ICLR2021）\n* [Big Bird：适用于更长序列的Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.14062)（ACL2021）\n\n\n\n\n### 迁移学习\n* [深度上下文化词表示 - NAACL 2018年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05365)  \n* [面向文本分类的通用语言模型微调 - ACL 2018年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.06146)  \n* [通过生成式预训练提升语言理解能力 - Alec Radford](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)  \n* [BERT：面向语言理解的深度双向Transformer预训练 - NAACL 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)  \n* [基于完形填空的自注意力网络预训练 - arXiv 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07785)  \n* [面向自然语言理解和生成的统一语言模型预训练 - arXiv 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.03197)  \n* [MASS：面向语言生成的掩码序列到序列预训练 - ICML 2019年](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02450)  \n* [MPNet：面向语言理解的掩码与置换预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.09297.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMPNet)]    \n* [UNILMv2：用于统一语言模型预训练的伪掩码语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.12804.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)]    \n* [AdapterFusion：用于迁移学习的非破坏性任务组合](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00247)（ACL2022）  \n* [前缀调优：优化连续提示以进行生成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00190)（ACL2022）\n* [LORA：大型语言模型的低秩适配](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.09685)（ACL2022）\n\n\n\n\n### 文本摘要\n* [位置编码控制输出序列长度 - Sho Takase（2019年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07418.pdf)  \n* [微调BERT进行抽取式摘要 - Yang Liu（2019年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.10318.pdf)  \n* [语言模型是无监督的多任务学习者 - Alec Radford（2019年）](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)   \n* [使用不一致性损失的抽取式和抽象式摘要统一模型 - Wan-Ting Hsu（2018年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1805.06266.pdf)   \n* [面向长文档抽象式摘要的话语感知注意力模型 - Arman Cohan（2018年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1804.05685.pdf)   \n* [通过总结长序列生成维基百科 
- Peter J. Liu（2018年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.10198.pdf)   \n* [直奔主题：使用指针-生成器网络进行摘要 - Abigail See（2017年）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.04368.pdf)\n* [面向句子摘要的神经注意力模型 - Alexander M. Rush（2015年）](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD15-1044)   \n* [PEGASUS：通过提取间隔句进行抽象式摘要的预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.08777)（ACL2021）\n* [使用BART进行抽象式文本摘要](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9972639)（ACL2021）  \n* [CTRLSUM：迈向通用可控文本摘要](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.04281)（ACL2021）\n\n### 情感分析\n* [用于自然语言理解的多任务深度神经网络 - 刘晓东（2019）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.11504.pdf)  \n* [基于AS胶囊网络的方面级情感分析 - 王业权（2019）](http:\u002F\u002Fcoai.cs.tsinghua.edu.cn\u002Fhml\u002Fmedia\u002Ffiles\u002FWWW19WangY.pdf) \n* [文本预处理在神经网络架构中的作用：\n文本分类与情感分析的评估研究 - 何塞·卡马乔-科利亚多斯（2018）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01444.pdf) \n* [在翻译中学习：上下文化词向量 - 布莱恩·麦肯（2018）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1708.00107.pdf) \n* [面向文本分类的通用语言模型微调 - 杰里米·霍华德（2018）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.06146.pdf) \n* [带有循环神经网络滤波器的卷积神经网络 - 杨毅（2018）](https:\u002F\u002Faclweb.org\u002Fanthology\u002FD18-1109) \n* [通过动态路由进行信息聚合以实现序列编码 - 龚晶晶（2018）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.01501.pdf) \n* [学习生成评论并发现情感 - 亚历克·拉德福德（2017）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01444.pdf) \n* [一种结构化的自注意力句子嵌入 - 林周涵（2017）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1703.03130.pdf) \n\n### 问答系统  \n* [语言模型是无监督的多任务学习者 - 亚历克·拉德福德（2019）](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)  \n* [通过生成式预训练提升语言理解能力 - 亚历克·拉德福德（2018）](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf) \n* [用于机器阅读理解的双向注意力流 - 徐珉俊（2018）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.01603.pdf) \n* [用于机器阅读理解的强化记忆读者 - 胡明浩（2017）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.02798.pdf)  \n* [用于文本处理的神经变分推断 - 缪一书（2015）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06038.pdf)  \n* [UnifiedQA：用单一问答系统跨越格式边界](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00700) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Funifiedqa)] (ACL2021)\n* [REALM：检索增强型语言模型预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08909) (ACL2021)\n* \n\n### 机器翻译    \n* [进化后的Transformer - 大卫·R·索（2019）](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.11117.pdf)  \n\n### 综述论文    \n* [自然语言处理中迁移学习的发展](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07370)\n* [自然语言处理中的预训练模型：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08271)\n* [关于上下文嵌入的综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07278)\n\n### 下游任务\n#### 问答多选对话\n* [自然问题数据集的 BERT 基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08634)\n* [MultiQA：阅读理解中泛化与迁移的实证研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13453)（ACL2019）\n* [阅读理解中的无监督领域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06137)\n* [BERTQA——强化版注意力机制](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10435)\n* [一种用于需要离散推理的阅读理解的多类型多跨度网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05514)（EMNLP2019）\n* [SDNet：面向会话问答的基于上下文注意力的深度网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03593)\n* [通过推理链实现多跳问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02610)\n* [选择、回答并解释：多文档上的可解释多跳阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00484)\n* [面向多跳问答的多步实体中心信息检索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07598)（EMNLP2019 WS）\n* [使用 BERTserini 
的端到端开放域问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.01718)（NAALC2019）\n* [弱监督开放域问答中的潜在检索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.00300)（ACL2019）\n* [多段落 BERT：一种用于开放域问答的全局归一化 BERT 模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08167)（EMNLP2019）\n* [学习在维基百科图上检索推理路径以进行问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10470)（ICLR2020）\n* [为机器阅读理解学习提出无法回答的问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06045)（ACL2019）\n* [通过完形填空式翻译实现无监督问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04980)（ACL2019）\n* [基于强化学习的图到序列模型用于自然问题生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04942)\n* [一种基于循环 BERT 的问题生成模型](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-5821\u002F)（EMNLP2019 WS）\n* [通过学会提问来学会回答：融合 GPT-2 和 BERT 的优势](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02365)\n* [利用丰富知识增强预训练语言表示以用于机器阅读理解](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FP\u002FP19\u002FP19-1226\u002F)（ACL2019）\n* [通过多任务学习将关系知识融入常识阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04530)（CIKM2019）\n* [SG-Net：语法引导的机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05147)\n* [MMM：用于多选阅读理解的多阶段多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00458)\n* [Cosmos QA：具有情境化常识推理的机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00277)（EMNLP2019）\n* [ReClor：一个需要逻辑推理的阅读理解数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.04326)（ICLR2020）\n* [通过后验正则化结合语言学约束实现鲁棒阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06948)\n* [BAS：一种使用 BERT 语言模型的答案选择方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01528)\n* [战胜 AI：探究阅读理解中的对抗性人工标注](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00293)\n* [一种简单而有效的方法，将多轮对话上下文与 BERT 结合用于会话式机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.12848)（ACL2019 WS）\n* [FlowDelta：为会话式机器阅读理解建模推理中的信息流增益](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05117)（ACL2019 WS）\n* [带有历史答案嵌入的 BERT 用于会话问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05412)（SIGIR2019）\n* [GraphFlow：利用图神经网络捕捉对话流以进行会话式机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00059)（ICML2019 WS）\n* [超越仅英语的阅读理解：针对保加利亚语的零样本多语言迁移实验](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01519)（RANLP2019）\n* [XQA：跨语言开放域问答数据集](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1227\u002F)（ACL2019）\n* [跨语言机器阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00361)（EMNLP2019）\n* [基于多语言语言表示模型的跨语言迁移学习实现零样本阅读理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09587)\n* [应用于对话代理的格式化文本多语言问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.04659)\n* [BiPaR：用于小说的多语言和跨语言阅读理解的双语平行数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05040)（EMNLP2019）\n* [MLQA：评估跨语言抽取式问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07475)\n* [探究先验知识对挑战性中文机器阅读理解的作用](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09679)（TACL）\n* [SberQuAD——俄语阅读理解数据集：描述与分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09723)\n* [给 BERT 装上计算器：通过阅读理解寻找运算和参数](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00109)（EMNLP2019）\n* [BERT-DST：基于 Transformer 双向编码器表示的可扩展端到端对话状态跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03040)（Interspeech2019）\n* [对话状态跟踪：一种基于神经网络的阅读理解方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01946)\n* [一种简单而有效的 BERT 模型，适用于资源受限系统的对话状态跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12995)（ICASSP2020）\n* [针对模式引导的零样本对话状态跟踪微调 BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00181)\n* [面向目标的多任务 BERT 对话状态跟踪器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.02450)\n* [用于响应选择的领域自适应训练 BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04812)\n* [BERT 
进入法学院：量化访问大型法律语料库在合同理解方面的竞争优势](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00473)\n* [自然问题数据集的 BERT 基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00473)\n* [维基百科巫师](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.01241.pdf)\n* [BlenderBot 3：一款持续∗学习并负责任地互动的已部署对话代理](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03188)（ACL2022）\n\n#### 实体填充\n* [用于联合意图分类和实体填充的 BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.10909)\n* [多语言意图识别与实体填充的联合 BERT 模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.02884)\n* [语言理解深度学习方法的比较](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2019\u002Fabstracts\u002F1262.html)（Interspeech2019）\n\n#### 分析\n* [基于话语上下文感知自注意力的细粒度信息状态分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04755)\n* [以挖掘规则作为弱监督的神经方面与观点术语抽取](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03750) (ACL2019) \n* [基于BERT的词汇替换](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1328) (ACL2019) \n* [评估BERT的句法能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.05287)\n* [BERT是否一致？通过一致性关系评估对结构依赖性的理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09892)\n* [用于关系抽取和语义角色标注的简单BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.05255)\n* [LIMIT-BERT：语言学启发的多任务BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14296)\n* [一种简单的基于BERT的词汇简化方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.06226)\n* [基于BERT的多头架构用于语法错误修正](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-4426\u002F) (ACL2019 WS) \n* [迈向低监督的基于BERT的语法错误修正](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03521)\n* [基于BERT的阿拉伯社交媒体作者画像](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04181)\n* [社交媒体中基于句子级别的BERT及年龄和性别多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00637)\n* [评估摘要式文本摘要的事实一致性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12840)\n* [NegBERT：用于否定检测与作用域解析的迁移学习方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04211)\n* [xSLUE：跨风格语言理解与评估的基准及分析平台](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03663)\n* [TabFact：用于基于表格的事实核查的大规模数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02164)\n* [快速适配BERT用于领域特定商业文档的信息抽取](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01861)\n* [LAMBERT：利用BERT进行信息抽取的版面感知语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08087)\n* [基于上下文嵌入的序列标注方法提取学术论文中的关键短语](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.08840) (ECIR2020) [[github](https:\u002F\u002Fgithub.com\u002Fmidas-research\u002Fkeyphrase-extraction-as-sequence-labeling-data)]\n* [基于跨度特征表示的关键短语提取](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05407)\n* [你所说的BERT是什么意思？评估BERT作为分布语义模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05758)\n\n#### 词分割、句法分析、NER  \n* [BERT与中文分词](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09292)\n* [基于多准则学习的快速且准确的神经网络中文分词](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.04190)\n* [为新十年建立强大基线：使用BERT进行序列标注、句法与语义分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04943)\n* [在54种语言上评估上下文嵌入在词性标注、词形还原和依存句法分析中的表现](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07448) \n* [NEZHA：用于中文语言理解的神经上下文表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00204)\n* [过渡型与图型依存句法分析中的深度上下文词嵌入——重温两种分析器的故事](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07397) (EMNLP2019)\n* [词性标注对于神经网络依存句法分析来说是必要的还是有帮助的？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03204)\n* [将句法分析作为预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01685) (AAAI2020)\n* [跨语言BERT转换用于零样本依存句法分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06775)\n* [递归式非自回归图到图Transformer用于依存句法分析，结合迭代精炼](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13118)\n* 
[命名实体识别——是否存在天花板？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02403) (CoNLL2019)\n* [用于命名实体识别的统一MRC框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11476)\n* [利用预训练语言模型训练用于低资源实体标注的紧凑模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06294)\n* [通过真大小写预训练实现鲁棒的命名实体识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07095) (AAAI2020)\n* [LTP：基于Bert-CRF的命名实体识别的新主动学习策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02524)\n* [MT-BioNER：使用深度双向Transformer进行生物医学命名实体识别的多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08904)\n* [使用BERT-CRF进行葡萄牙语命名实体识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10649)\n* [迈向通用语言的命名实体识别，借助BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01389)\n\n#### 代词共指消解\n* [利用BERT解决性别相关的歧义代词问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01161) (ACL2019 WS)\n* [匿名化BERT：应对性别相关代词消解挑战的一种增强方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01780) (ACL2019 WS)\n* [使用BERT和抽取式问答形式解决性别相关代词消解问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03695) (ACL2019 WS)\n* [MSnet：基于BERT的性别相关代词消解网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00308) (ACL2019 WS)\n* [填补空白：利用BERT进行代词消解](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-3815\u002F) (ACL2019 WS)\n* [关于GAP共指消解共享任务：来自第三名解决方案的见解](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FW19-3816\u002F) (ACL2019 WS)\n* [再看句法：用于性别相关歧义代词消解的关系图卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08868) (ACL2019 WS)\n* [BERT掩码语言模型用于共指消解](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-3811\u002F) (ACL2019 WS)\n* [通过实体对等进行共指消解](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1066\u002F) (ACL2019)\n* [BERT用于共指消解：基线与分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09091) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002Fmandarjoshi90\u002Fcoref)]\n* [WikiCREM：用于共指消解的大规模无监督语料库](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08025) (EMNLP2019)\n* [省略与共指消解作为问答问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11141)\n* [将共指消解视为基于查询的跨度预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01746)\n* [基于多任务学习的神经桥接参考消解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03666)\n\n#### 词义消歧  \n* [GlossBERT：利用释义知识进行词义消歧的BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07245)（EMNLP 2019）\n* [基于预训练上下文感知词表示的改进型词义消歧方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00194)（EMNLP 2019）\n* [使用BERT进行词义消歧](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08358)\n* [语言建模让一切更合理：通过WordNet传播表示以实现全覆盖的词义消歧](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1569.pdf)（ACL 2019）\n* [BERT真的有意义吗？基于上下文嵌入的可解释词义消歧](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10430)（KONVENS 2019）\n\n#### 情感分析  \n* [通过构建辅助句子利用BERT进行基于方面的情感分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.09588)（NAACL 2019）\n* [用于评论阅读理解和基于方面的情感分析的BERT后训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02232)（NAACL 2019）\n* [利用BERT实现端到端的基于方面的情感分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.00883)（EMNLP 2019 WS）\n* [适应还是落后：通过微调BERT语言模型实现领域自适应，用于方面—目标情感分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11860)\n* [日语中基于迁移学习的情感分析研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09642)（ACL 2019）\n* [\"掩码与填充\"：将掩码语言模型应用于情感迁移](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08039)\n* [基于BERT的对抗训练在基于方面的情感分析中的应用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.11316)\n* [利用BERT中间层进行基于方面的情感分析和自然语言推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.04815)\n* [利用BERT中间层进行基于方面的情感分析和自然语言推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.04815)\n\n#### 关系抽取\n* 
[填补空白：基于分布相似性的关系学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03158)（ACL 2019）\n* [基于BERT的多头选择用于联合实体—关系抽取](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05908)（NLPCC 2019）\n* [用实体信息丰富预训练语言模型以进行关系分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08284)\n* [基于跨度的联合实体与关系抽取及Transformer预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07755)\n* [两步流程微调BERT用于DocRED](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11898)\n* [基于上下文感知跨度表示的实体、关系和事件抽取](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03546)（EMNLP 2019）\n\n#### 知识图谱\n* [KG-BERT：用于知识图谱补全的BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03193)\n* [语言模型能否作为知识库？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01066)（EMNLP 2019）[[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FLAMA)]\n* [BERT还不是知识库（目前）：无监督问答中的事实性知识与基于名称的推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03681)\n* [从BERT中诱导关系知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12753)（AAAI 2020）\n* [潜在关系语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07690)（AAAI 2020）\n* [预训练百科全书：弱监督知识预训练语言模型](https:\u002F\u002Fopenreview.net\u002Fforum?id=BJlzm64tDH)（ICLR 2020）\n* [基于密集实体检索的零样本实体链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03814)\n* [通过简单的神经网络端到端实体链接研究BERT中的实体知识](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FK19-1063\u002F)（CoNLL 2019）\n* [通过建模潜在实体类型信息提升实体链接性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.01447)（AAAI 2020）\n* [PEL-BERT：协议实体链接的联合模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.00744)\n* [我们如何知道语言模型了解什么？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12543)\n* [REALM：检索增强型语言模型预训练](https:\u002F\u002Fkentonl.com\u002Fpub\u002Fgltpc.2020.pdf)\n\n\n#### 文本分类\n* [如何对BERT进行文本分类的微调？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05583)\n* [X-BERT：基于BERT的极端多标签文本分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02331)\n* [DocBERT：用于文档分类的BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08398)\n* [用知识图谱嵌入丰富BERT以用于文档分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08402)\n* [基于上下文感知词嵌入的论点分类与聚类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.09821)（ACL 2019）\n* [BERT用于证据检索和主张验证](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.02655)\n* [堆叠DeBERT：在不完整数据下对文本分类的全部关注](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00137)\n* [面向不平衡数据的代价敏感BERT用于可泛化句子分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11563)\n\n#### WSC WNLI NLI\n* [探索无监督预训练和句子结构建模以应对维诺格拉德模式挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09705)\n* [一种令人惊讶的稳健技巧用于维诺格拉德模式挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06290)\n* [WinoGrande：大规模的对抗性维诺格拉德模式挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10641)（AAAI 2020）\n* [借助预训练句法分析器改进自然语言推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08217)\n* [对抗性NLI：自然语言理解的新基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14599)\n* [自然语言推理系统的对抗性分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03441)（ICSC 2020）\n* [HypoNLI：探索自然语言推理中仅基于假设的偏见的人工模式](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02756)（LREC 2020）\n* [评估BERT在自然语言推理中的表现：以CommitmentBank为例](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1630\u002F)（EMNLP 2019）\n\n#### 常识推理\n* [CommonsenseQA：一项针对常识知识的问答挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00937)（NAACL2019）\n* [HellaSwag：机器真的能完成你的句子吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07830)（ACL2019）[[官网](https:\u002F\u002Frowanzellers.com\u002Fhellaswag\u002F)]\n* [通过可迁移的BERT进行故事结局预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07504)（IJCAI2019）\n* [解释你自己！利用语言模型进行常识推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02361)（ACL2019）\n* 
[对齐、掩码与选择：一种将常识知识融入语言表示模型的简单方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06725)\n* [利用外部语言学知识指导无监督预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02339)\n* [常识知识+BERT用于二级阅读理解能力测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03415)\n* [BIG MOOD：将Transformer与显式常识知识关联起来](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07713)\n* [从预训练模型中挖掘常识知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00505)（EMNLP2019）\n* [KagNet：面向常识推理的知识感知图网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02151)（EMNLP2019）\n* [破解上下文中的常识密码：理解深度上下文表示的常识推理能力](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-6001\u002F)（EMNLP2019研讨会）\n* [大规模预训练的语言模型是否能成为更好的讲故事者？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10705)（CoNLL2019）\n* [PIQA：关于自然语言中物理常识的推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11641v1)（AAAI2020）\n* [评估预训练语言模型中的常识](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11931)（AAAI2020）\n* [为什么掩码神经语言模型仍然需要常识知识？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03024)\n* [神经语言表示是否学习了物理常识？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.02899)（CogSci2019）\n\n#### 抽取式摘要\n* [HIBERT：用于文档摘要的层次双向Transformer的文档级预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06566)（ACL2019）\n* [Deleter：利用BERT实现无监督的连续文本压缩](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03223)\n* [面向文本摘要的篇章意识神经抽取模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14142)\n* [PEGASUS：基于抽取的间隔句进行抽象式摘要的预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.08777.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fpegasus)]\n* [篇章意识神经抽取式文本摘要](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.14142.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fjiacheng-xu\u002FDiscoBERT)]\n\n\n#### 信息检索\n* [使用BERT进行段落重排序](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.04085)\n* [探究BERT在段落重排序中的成功与失败](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01758)\n* [理解BERT在排序中的行为](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07531)\n* [通过查询预测进行文档扩展](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08375)\n* [CEDR：用于文档排名的上下文化嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07094)（SIGIR2019）\n* [借助上下文神经语言建模实现更深层次的文本理解以用于信息检索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09217)（SIGIR2019）\n* [利用查询-问题相似度和基于BERT的查询-答案相关性进行FAQ检索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02851)（SIGIR2019）\n* [多阶段文档排名与BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.14424)\n* [REALM：检索增强型语言模型预训练](https:\u002F\u002Fkentonl.com\u002Fpub\u002Fgltpc.2020.pdf)\n* [你能将多少知识塞进语言模型的参数里？](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08910.pdf)[[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Ft5_closed_book_qa)]\n* [开放域问答中的密集段落检索](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.04906.pdf)[[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDPR)]\n\n### 生成    \n* [BERT有嘴，它必须说话：BERT作为马尔可夫随机场语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.04094) (NAACL2019 WS)\n* [基于预训练的自然语言生成用于文本摘要](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09243)\n* [使用预训练编码器进行文本摘要](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08345) (EMNLP2019) [[github (原版)](https:\u002F\u002Fgithub.com\u002Fnlpyang\u002FPreSumm)] [[github (Hugging Face)](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Ftree\u002Fmaster\u002Fexamples\u002Fsummarization)]\n* [抽象摘要的多阶段预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10599)\n* [PEGASUS：基于抽取式间隔句的抽象摘要预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08777)\n* [MASS：面向语言生成的掩码序列到序列预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02450) (ICML2019) 
[[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMASS)], [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMASS\u002Ftree\u002Fmaster\u002FMASS-fairseq)]\n* [面向自然语言理解与生成的统一语言模型预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.03197) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)] (NeurIPS2019)\n* [UniLMv2：用于统一语言模型预训练的伪掩码语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12804) [[github](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Funilm)]\n* [ProphetNet：为序列到序列预训练预测未来N-gram](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04063)\n* [在神经机器翻译中最大化利用BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05672)\n* [利用预训练表示改进神经机器翻译](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07688)\n* [关于在神经机器翻译中使用BERT的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12744) (EMNLP2019 WS)\n* [将BERT融入神经机器翻译](https:\u002F\u002Fopenreview.net\u002Fforum?id=Hyl7ygStwB) (ICLR2020)\n* [将预训练的BERT编码器复用于神经机器翻译](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-5603\u002F)\n* [利用预训练检查点处理序列生成任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12461)\n* [Mask-Predict：条件掩码语言模型的并行解码](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09324) (EMNLP2019)\n* [BART：面向自然语言生成、翻译和理解的去噪序列到序列预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.13461)\n* [ERNIE-GEN：增强型多流预训练与微调框架，用于自然语言生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.11314)\n* [通过预训练实现跨语言自然语言生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10481) (AAAI2020) [[github](https:\u002F\u002Fgithub.com\u002FCZWin32768\u002FXNLG)]\n* [面向神经机器翻译的多语言去噪预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08210)\n* [PLATO：具有离散潜在变量的预训练对话生成模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07931)\n* [自然语言生成的无监督预训练：文献综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06171)  \n* [通过生成式预训练提升语言理解能力](https:\u002F\u002Fs3-us-west-2.amazonaws.com\u002Fopenai-assets\u002Fresearch-covers\u002Flanguage-unsupervised\u002Flanguage_understanding_paper.pdf)  \n* [语言模型是无监督的多任务学习者](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)  \n* [语言模型是少样本学习者](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.14165.pdf)  \n\n### 质量评估器   \n* [BERTScore：用BERT评估文本生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09675) (ICLR2020)\n* [用BERT回归器评估机器翻译](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12679)\n* [SumQE：基于BERT的摘要质量评估模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00578) (EMNLP2019)\n* [MoverScore：基于上下文嵌入和地球移动距离的文本生成评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02622) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002FAIPHES\u002Femnlp19-moverscore)]\n* [BERT作为教师：用于序列级奖励的上下文嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02738)\n\n### 修改（多任务、掩码策略等）    \n* [用于自然语言理解的多任务深度神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11504)（ACL2019）\n* [微软用于自然语言理解的多任务深度神经网络工具包](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.07972)\n* [BERT与PALs：用于多任务学习中高效适应的投影注意力层](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.02671)（ICML2019）\n* [通过跨度抽取统一问答和文本分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09286)\n* [ERNIE：融入信息实体的增强语言表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07129)（ACL2019）\n* [ERNIE：通过知识融合增强表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09223)\n* [ERNIE 2.0：面向语言理解的持续预训练框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12412)（AAAI2020）\n* [针对中文BERT的整词掩码预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08101)\n* [SpanBERT：通过表示和预测跨度改进预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10529) 
[[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSpanBERT)]\n* [空白语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03079)\n* [通过渐进式堆叠高效训练BERT](http:\u002F\u002Fproceedings.mlr.press\u002Fv97\u002Fgong19a.html)（ICML2019）[[github](https:\u002F\u002Fgithub.com\u002Fgonglinyuan\u002FStackingBERT)]\n* [RoBERTa：一种鲁棒优化的BERT预训练方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692) [[github](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Ftree\u002Fmaster\u002Fexamples\u002Froberta)]\n* [ALBERT：用于语言表示自监督学习的轻量级BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11942)（ICLR2020）\n* [ELECTRA：将文本编码器作为判别器而非生成器进行预训练](https:\u002F\u002Fopenreview.net\u002Fforum?id=r1xMH1BtvB)（ICLR2020）[[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Felectra)] [[blog](https:\u002F\u002Fai.googleblog.com\u002F2020\u002F03\u002Fmore-efficient-nlp-model-pre-training.html)]\n* [FreeLB：用于语言理解的增强对抗训练](https:\u002F\u002Fopenreview.net\u002Fforum?id=BygzbyHFvB)（ICLR2020）\n* [KERMIT：基于生成式插入的序列建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01604)\n* [DisSent：从显式话语关系中学习句子表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.04334)（ACL2019）\n* [StructBERT：将语言结构融入预训练以实现深度语言理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.04577)（ICLR2020）\n* [面向机器翻译和自然语言理解的注入句法的Transformer和BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06156)\n* [SenseBERT：为BERT注入语义](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05646)\n* [面向语言理解的语义感知BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02209)（AAAI2020）\n* [K-BERT：利用知识图谱赋能语言表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07606)\n* [知识增强的上下文词表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04164)（EMNLP2019）\n* [KEPLER：知识嵌入与预训练语言表示的统一模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06136)\n* [Sentence-BERT：使用暹罗BERT网络生成句子嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10084)（EMNLP2019）\n* [SBERT-WK：通过解构基于BERT的词模型进行句子嵌入的方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06652)\n* [来自BERT的通用文本表示：一项实证研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.07973)\n* [基于对称正则化的BERT用于成对语义推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03405)\n* [迁移微调：一个BERT案例研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00931)（EMNLP2019）\n* [通过词汇扩展改进多语言预训练模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12440)（CoNLL2019）\n* [SesameBERT：无处不在的注意力机制](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03176)\n* [探索统一文本到文本转换Transformer的迁移学习极限](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftext-to-text-transfer-transformer)]\n* [SMART：通过原则性正则化优化实现对预训练自然语言模型的鲁棒高效微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03437)\n\n### 探针   \n* [用于在词表示中发现句法的结构探针](https:\u002F\u002Faclweb.org\u002Fanthology\u002Fpapers\u002FN\u002FN19\u002FN19-1419\u002F)（NAACL2019）\n* [语境表示中的语言学知识与可迁移性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.08855)（NAACL2019）[[github](https:\u002F\u002Fgithub.com\u002Fnelson-liu\u002Fcontextual-repr-analysis)]\n* [探究不同NLP任务教会机器关于功能词理解的内容](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11544)（*SEM2019）\n* [BERT重新发现经典NLP流水线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05950)（ACL2019）\n* [探测神经网络对自然语言论据的理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.07355)（ACL2019）\n* [破解语境常识代码：理解深度语境表示的常识推理能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01157)（EMNLP2019 WS）\n* [你所说的BERT是什么意思？评估BERT作为分布语义模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05758)\n* [对于神经语言模型而言，数量并不能带来高质量的句法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00111)（EMNLP2019）\n* 
[预训练语言模型是否意识到短语的存在？用于语法归纳的简单而强大的基线](https:\u002F\u002Fopenreview.net\u002Fforum?id=H1xPR3NtPB)（ICLR2020）\n* [oLMpics——关于语言模型预训练所捕捉的内容](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13283)\n* [你能将多少知识塞进语言模型的参数里？](http:\u002F\u002Fcolinraffel.com\u002Fpublications\u002Farxiv2020how.pdf)\n* [我的问答模型知道些什么？利用专家知识设计可控探针](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13337)\n* [注意力并非解释](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.10186)（NAACL2019）\n\n### 多语言  \n* [基于自注意力机制和预训练的多语言句法分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.11760) (ACL2019)\n* [跨语言语言模型预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.07291) (NeurIPS2019) [[github](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FXLM)]\n* [75种语言，1个模型：通用依存关系的通用句法分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02099) (EMNLP2019) [[github](https:\u002F\u002Fgithub.com\u002Fhyperparticle\u002Fudify)]\n* [利用预训练的多语言句子表示进行零样本依存句法分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05479) (EMNLP2019 WS)\n* [Beto、Bentz、Becas：BERT令人惊讶的跨语言有效性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09077) (EMNLP2019)\n* [多语言BERT到底有多“多语言”？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01502) (ACL2019)\n* [多语言BERT究竟有多“语言中立”？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03310)\n* [多语言BERT在语言生成方面是否流利？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03806)\n* [Unicoder：通过多任务跨语言预训练构建的通用语言编码器](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1252\u002F) (EMNLP2019)\n* [BERT并非中间语，以及分词偏见](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-6106\u002F) (EMNLP2019 WS)\n* [多语言BERT的跨语言能力：一项实证研究](https:\u002F\u002Fopenreview.net\u002Fforum?id=HJeT3yrtDr) (ICLR2020)\n* [上下文词表示的多语言对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03518) (ICLR2020)\n* [单语表示的跨语言迁移性研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11856)\n* [大规模无监督跨语言表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02116)\n* [预训练语言模型中涌现的跨语言结构](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01464)\n* [单语预训练模型能否帮助跨语言分类？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03913)\n* [基于BERT的完全无监督跨语言语义文本相似度度量，用于识别平行数据](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FK19-1020\u002F) (CoNLL2019)\n* [那个“[MASK]”是什么？解读特定语言的BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02912)\n* [XTREME：一个大规模多语言多任务基准，用于评估跨语言泛化能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11080)\n* [mT5：一个大规模多语言预训练的文本到文本转换器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.11934) (NAACL2021) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fmultilingual-t5)] \n\n### 非英语语言模型    \n* [CamemBERT：一款美味的法语语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03894)\n* [FlauBERT：面向法语的无监督语言模型预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05372)\n* [仅多语言还不够：芬兰语版BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07076)\n* [BERTje：荷兰版BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09582)\n* [RobBERT：基于RoBERTa的荷兰语语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.06286)\n* [深度双向多语言Transformer在俄语中的适配](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07213)\n* [AraBERT：基于Transformer的阿拉伯语理解模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00104)\n* [PhoBERT：面向越南语的预训练语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00744) \n* [CLUECorpus2020：用于语言模型预训练的大规模中文语料库](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.01355)\n\n\n### 领域专用  \n* [BioBERT：用于生物医学文本挖掘的预训练生物医学语言表示模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08746)\n* [生物医学自然语言处理中的迁移学习：BERT和ELMo在十个基准数据集上的评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05474) (ACL2019 WS) \n* 
[基于BERT的排序方法用于生物医学实体归一化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03548)\n* [PubMedQA：生物医学研究问答数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06146) (EMNLP2019)\n* [用于生物医学问答的预训练语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08229)\n* [如何预训练你的模型？不同预训练方式在生物医学问答中的比较](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.00712)\n* [ClinicalBERT：临床笔记建模与医院再入院预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.05342)\n* [公开可用的临床BERT嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03323) (NAACL2019 WS)\n* [利用基于注意力的深度学习模型结合BERT进行病程记录分类与关键词提取](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05786)\n* [SciBERT：面向科学文本的预训练上下文嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10676) [[github](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fscibert)]\n* [PatentBERT：通过微调预训练BERT模型进行专利分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02124)\n\n### 多模态    \n* [VideoBERT：视频与语言表示学习的联合模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01766)（ICCV2019）\n* [ViLBERT：面向视觉-语言任务的、与任务无关的视觉语言表示预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.02265)（NeurIPS2019）\n* [VisualBERT：一种简单高效的视觉与语言基线模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03557)\n* [Selfie：用于图像嵌入的自监督预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.02940)\n* [ImageBERT：基于大规模弱监督图文数据的跨模态预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.07966)\n* [用于时间表示学习的对比双向Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.05743)\n* [M-BERT：在BERT结构中注入多模态信息](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05787)\n* [LXMERT：从Transformer中学习跨模态编码器表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.07490)（EMNLP2019）\n* [文本中检测到的物体融合用于视觉问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.05054)（EMNLP2019）\n* [用于视频问答的BERT表示](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fhtml\u002FYang_BERT_representations_for_Video_Question_Answering_WACV_2020_paper.html)（WACV2020）\n* [面向图像字幕生成和VQA的统一视觉-语言预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11059) [[github](https:\u002F\u002Fgithub.com\u002FLuoweiZhou\u002FVLP)]\n* [面向视觉对话的大规模预训练：一种简单的最先进基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02379)\n* [VL-BERT：通用视觉-语言表示的预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.08530)（ICLR2020）\n* [Unicoder-VL：通过跨模态预训练构建的视觉与语言通用编码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06066)\n* [UNITER：学习通用的图文表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11740)\n* [用于分类图像和文本的有监督多模态双变换器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02950)\n* [弱监督有助于词-物体对齐的涌现，并提升视觉-语言任务性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03063)\n* [BERT开箱即用即可“看见”：关于文本表示的跨模态可迁移性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10832)\n* [结合测试时增强的BERT用于大规模视频片段分类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01127)（ICCV2019WS）\n* [SpeechBERT：面向端到端语音问答的跨模态预训练语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11559)\n* [vq-wav2vec：离散语音表示的自监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05453)\n* [自监督预训练对语音识别的有效性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.03912)\n* [通过预训练理解语音语义](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10924)\n* [迈向基于深度预训练语言模型的端到端语音合成迁移学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07307) \n\n\n### 模型压缩    \n* [将BERT中的特定任务知识蒸馏到简单神经网络中](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.12136)\n* [针对BERT模型压缩的耐心知识蒸馏](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09355)（EMNLP2019）\n* [用于序列标注的小型实用BERT模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00100)（EMNLP2019）\n* [剪枝基于BERT的问答模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06360)\n* [TinyBERT：为自然语言理解蒸馏BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.10351) 
[[github](https:\u002F\u002Fgithub.com\u002Fhuawei-noah\u002FPretrained-Language-Model\u002Ftree\u002Fmaster\u002FTinyBERT)]\n* [DistilBERT，BERT的蒸馏版本：更小、更快、更便宜且更轻](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01108)（NeurIPS2019 WS）[[github](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Ftree\u002Fmaster\u002Fexamples\u002Fdistillation)]\n* [从内部表示进行知识蒸馏](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03723)（AAAI2020）\n* [PoWER-BERT：加速BERT在分类任务中的推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.08950)\n* [WaLDORf：无浪费的语言模型蒸馏——以阅读理解为例](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06638)\n* [利用最优子词和共享投影实现极端语言模型压缩](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11687)\n* [忒修斯之船BERT：通过渐进式模块替换压缩BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.02925)\n* [压缩BERT：研究权重剪枝对迁移学习的影响](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08307)\n* [MiniLM：面向任务无关压缩的预训练Transformer的深度自注意力蒸馏](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10957)\n* [大规模基于Transformer的模型压缩：以BERT为例](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11985)\n* [先大后小：重新思考模型规模以实现Transformer的高效训练与推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11794)\n* [MobileBERT：通过渐进式知识迁移实现BERT的任务无关压缩](https:\u002F\u002Fopenreview.net\u002Fforum?id=SJxjVaNKwB)\n* [Q-BERT：基于Hessian矩阵的超低精度量化BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05840)\n* [Q8BERT：量化为8位的BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06188)（NeurIPS2019 WS）\n\n### 大语言模型\n* [注意力就是一切](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.03762.pdf)\n* [通过生成式预训练提升语言理解能力](https:\u002F\u002Fwww.cs.ubc.ca\u002F~amuham01\u002FLING530\u002Fpapers\u002Fradford2018improving.pdf)\n* [BERT：用于语言理解的深度双向Transformer预训练](https:\u002F\u002Faclanthology.org\u002FN19-1423.pdf)\n* [语言模型是无监督的多任务学习者](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf)\n* [Megatron-LM：利用模型并行训练数十亿参数的语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.08053.pdf)\n* [使用统一的文本到文本Transformer探索迁移学习的极限](https:\u002F\u002Fjmlr.org\u002Fpapers\u002Fv21\u002F20-074.html)\n* [ZeRO：面向训练万亿参数模型的内存优化技术](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.02054.pdf)\n* [神经语言模型的扩展定律](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.08361.pdf)\n* [语言模型是少样本学习者](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf)\n* [Switch Transformer：通过简单高效的稀疏性扩展至万亿参数模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.03961.pdf)\n* [对基于代码训练的大规模语言模型进行评估](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.03374.pdf)\n* [多任务提示训练实现零样本任务泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.08207)\n* [GLaM：基于专家混合的高效语言模型扩展](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.06905.pdf)\n* [WebGPT：结合浏览器辅助与人类反馈的问答系统](https:\u002F\u002Fwww.semanticscholar.org\u002Fpaper\u002FWebGPT%3A-Browser-assisted-question-answering-with-Nakano-Hilton\u002F2f3efe44083af91cef562c1a3451eee2f8601d22)\n* [通过检索数万亿个词元改进语言模型](https:\u002F\u002Fwww.deepmind.com\u002Fpublications\u002Fimproving-language-models-by-retrieving-from-trillions-of-tokens)\n* [语言模型的扩展：训练Gopher的经验、方法与见解](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.11446.pdf)\n* [思维链提示在大型语言模型中激发推理能力](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11903.pdf)\n* [LaMDA：面向对话应用的语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.08239.pdf)\n* [利用语言模型解决定量推理问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14858)\n* [使用DeepSpeed和Megatron训练Megatron-Turing NLG 530B，一个大规模生成式语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.11990.pdf)\n* 
[通过人类反馈训练语言模型遵循指令](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.02155.pdf)\n* [PaLM：通过Pathways扩展语言建模](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.02311.pdf)\n* [计算最优的大规模语言模型训练的实证分析](https:\u002F\u002Fwww.deepmind.com\u002Fpublications\u002Fan-empirical-analysis-of-compute-optimal-large-language-model-training)\n* [OPT：开放的预训练Transformer语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.01068.pdf)\n* [统一语言学习范式](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.05131v1)\n* [大型语言模型的涌现能力](https:\u002F\u002Fopenreview.net\u002Fpdf?id=yzkSU5zdwD)\n* [超越模仿游戏：量化并外推语言模型的能力](https:\u002F\u002Fgithub.com\u002Fgoogle\u002FBIG-bench)\n* [语言模型是通用接口](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.06336.pdf)\n* [通过有针对性的人类判断改进对话代理的对齐](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.14375.pdf)\n* [指令微调语言模型的扩展](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.11416.pdf)\n* [GLM-130B：一个开放的双语预训练模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.02414.pdf)\n* [语言模型的全面评估](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09110.pdf)\n* [BLOOM：一个拥有1760亿参数的开源多语言语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.05100.pdf)\n* [Galactica：一个面向科学领域的大型语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09085.pdf)\n* [OPT-IML：从泛化的视角扩展语言模型指令元学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.12017)\n* [Flan系列：为有效指令微调设计数据与方法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.13688.pdf)\n* [LLaMA：开放且高效的基座语言模型](https:\u002F\u002Fresearch.facebook.com\u002Fpublications\u002Fllama-open-and-efficient-foundation-language-models\u002F)\n* [语言并非一切：将感知与语言模型对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.14045)\n* [PaLM-E：具身多模态语言模型](https:\u002F\u002Fpalm-e.github.io)\n* [GPT-4技术报告](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4)\n* [Pythia：一套用于分析训练与扩展过程中大型语言模型的工具](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.01373)\n* [基于原则的自对齐语言模型：从零开始，在极少人类监督下实现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.03047)\n* [PaLM 2技术报告](https:\u002F\u002Fai.google\u002Fstatic\u002Fdocuments\u002Fpalm2techreport.pdf)\n* [RWKV：为Transformer时代重新发明RNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048)\n* [直接偏好优化：你的语言模型其实是一个奖励模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.18290.pdf)\n* [Llama 2：开放的基座模型与微调后的聊天模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf)\n* [Mamba：具有选择性状态空间的线性时间序列建模](https:\u002F\u002Farxiv.org\u002Fftp\u002Farxiv\u002Fpapers\u002F2312\u002F2312.00752.pdf)\n* [TinyLlama：一个开源的小型语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2401.02385.pdf)\n\n### 杂项\n* [jiant：用于通用文本理解模型研究的软件工具包](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02249) [[github](https:\u002F\u002Fgithub.com\u002Fnyu-mll\u002Fjiant\u002F)]\n* [基于完形填空的自注意力网络预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07785)\n* [通用语言智能的学习与评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.11373)\n* [微调还是不微调？将预训练表示适配到多样化任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05987)（ACL2019 WS）\n* [在奇幻文字冒险游戏中学习说话与行动](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1062\u002F)（EMNLP2019）\n* [条件式BERT上下文增强](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06705)\n* [利用预训练Transformer模型进行数据增强](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02245)\n* [深度学习的大批量优化：76分钟内训练BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00962)（ICLR2020）\n* [Mixout：有效正则化方法，用于微调大规模预训练语言模型](https:\u002F\u002Fopenreview.net\u002Fforum?id=HkgaETNtDB)（ICLR2020）\n* 
[从互信息最大化视角看语言表示学习](https:\u002F\u002Fopenreview.net\u002Fforum?id=Syx79eBKwr)（ICLR2020）\n* [BERT真的鲁棒吗？针对文本分类与蕴含任务的自然语言攻击](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11932)（AAAI2020）\n* [芝麻街上的窃贼！对基于BERT的API进行模型提取](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12366)（ICLR2020）\n* [Graph-Bert：学习图表示仅需注意力机制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.05140)\n* [CodeBERT：面向编程与自然语言的预训练模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08155)\n* [预训练语言模型的微调：权重初始化、数据顺序与早停策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06305)\n* [将机器语言模型扩展至人类水平的语言理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05877)\n* [Glyce：用于汉字表示的字形向量](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10125)\n* [回到未来——文本表示的序列对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03464)\n* [利用BERT提升楔形文字的语言识别能力](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002Fpapers\u002FW\u002FW19\u002FW19-1402\u002F)（NAACL2019 WS）\n* [BERT拥有道德指南针：提升机器的伦理与道德价值](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.05238)\n* [SMILES-BERT：用于分子性质预测的大规模无监督预训练](https:\u002F\u002Fdl.acm.org\u002Fcitation.cfm?id=3342186)（ACM-BCB2019）\n* [关于预训练语言模型的可比较性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00781)\n* [Transformers：最先进的自然语言处理技术](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03771)\n* [Jukebox：音乐生成模型](https:\u002F\u002Fcdn.openai.com\u002Fpapers\u002Fjukebox.pdf)\n* [WT5？！训练文本到文本模型以解释其预测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14546.pdf)\n* [TAPAS：通过预训练实现弱监督表格解析](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02349.pdf) [[github](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Ftapas)]\n* [TABERT：面向文本与表格数据联合理解的预训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.08314.pdf)\n\n\n# 作者\nChangWookJun \u002F @changwookjun（changwookjun@gmail.com）","# NLP Paper 快速上手指南\n\n`nlp-paper` 并非一个可安装的软件库或框架，而是一个**自然语言处理（NLP）领域的经典论文索引清单**。它汇集了 BERT、Transformer、迁移学习、文本摘要等方向的核心研究成果。\n\n本指南旨在帮助开发者快速利用该资源查找文献、获取代码实现并跟进前沿技术。\n\n## 环境准备\n\n由于本项目本质是文档列表，无需特定的运行时环境。但为了阅读论文和运行相关代码，建议准备以下基础环境：\n\n*   **操作系统**：Windows \u002F macOS \u002F Linux 均可\n*   **浏览器**：推荐 Chrome 或 Edge，用于访问 arXiv 论文链接\n*   **开发依赖**（若需复现论文代码）：\n    *   Python 3.8+\n    *   PyTorch 或 TensorFlow\n    *   Git（用于克隆关联的 GitHub 仓库）\n*   **网络环境**：\n    *   访问 arXiv 可能需要科学网络连接。\n    *   **国内加速方案**：若直连 [arXiv](https:\u002F\u002Farxiv.org\u002F) 速度较慢，可通过高校\u002F研究所提供的镜像站下载论文 PDF。\n\n## 安装步骤\n\n本项目无需通过 `pip` 或 `conda` 安装。你可以通过以下两种方式获取内容：\n\n### 方式一：在线浏览（推荐）\n直接访问该项目的 GitHub 页面查看分类清晰的论文列表：\n1. 打开浏览器访问项目主页 `https:\u002F\u002Fgithub.com\u002Fchangwookjun\u002Fnlp-paper`（或直接在 GitHub 搜索 `nlp-paper`）。\n2. 利用目录（Contents）快速跳转至你感兴趣的领域（如 `Bert Series`, `Transformer Series` 等）。\n\n### 方式二：本地克隆\n如果你希望离线查看或贡献更新，可以使用 Git 克隆仓库：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fchangwookjun\u002Fnlp-paper.git\ncd nlp-paper\n```\n*(注：若该仓库不可用，可在 GitHub 搜索同名高星项目)*\n\n## 基本使用\n\n### 1. 按主题查找论文\n打开 `README.md` 文件，利用目录结构定位研究方向。例如，如果你想研究 **BERT 的变体**，直接滚动到 `### Bert Series` 章节。\n\n**示例内容结构：**\n*   **论文标题**：[RoBERTa: A Robustly Optimized BERT Pretraining Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.11692)\n*   **来源**：arXiv 2019（预印本）\n*   **操作**：点击链接直接在浏览器中打开 arXiv 页面阅读摘要或下载 PDF。\n\n### 2. 获取配套代码\n许多条目在标题后附带了 `[[github](url)]` 标记，这表示该论文有开源代码实现。\n\n**操作步骤：**\n1. 在列表中找到带 GitHub 标记的论文，例如 `exBERT`：\n   > [exBERT: A Visual Analysis Tool to Explore Learned Representations in Transformers Models](...) [[github](https:\u002F\u002Fgithub.com\u002Fbhoov\u002Fexbert)]\n2. 点击 `github` 链接进入代码仓库。\n3. 
按照该仓库具体的 `README` 指示进行安装和运行。\n\n**通用代码复现流程示例：**\n```bash\n# 假设找到了某个论文的 GitHub 地址\ngit clone \u003C论文对应的 GitHub 仓库地址>\ncd \u003C仓库目录>\n\n# 安装依赖 (具体文件名参考该仓库要求)\npip install -r requirements.txt\n\n# 运行示例脚本\npython run_example.py\n```\n\n### 3. 追踪特定领域进展\n利用清单的分类特性，快速构建知识体系：\n*   **入门学习**：从 `Transformer Series` 和 `Bert Series` 开始，阅读奠基性论文。\n*   **任务导向**：根据需求跳转至 `Text Summarization` (文本摘要)、`Sentiment Analysis` (情感分析) 或 `Machine Translation` (机器翻译)。\n*   **进阶研究**：查看 `Model compression` (模型压缩)、`Multi-lingual` (多语言) 或 `LLM` (大语言模型) 章节了解最新优化策略。\n\n---\n**提示**：该列表持续更新，建议定期查看最新版本以获取最新的 SOTA (State-of-the-Art) 论文信息。","某金融科技公司算法团队正着手构建新一代智能客服系统，急需调研最新的自然语言处理（NLP）论文以优化意图识别与情感分析模块。\n\n### 没有 nlp-paper 时\n- **检索效率低下**：研究人员需在 arXiv、ACL Anthology 等多个平台分散搜索，耗费数天才能拼凑出关于 BERT 变体或情感分析的文献列表。\n- **分类体系混乱**：找到的论文缺乏统一标签，难以快速区分哪些属于“迁移学习”，哪些专攻“槽位填充”或“指代消解”，导致技术选型方向模糊。\n- **遗漏关键成果**：由于缺乏系统性整理，极易错过如 RoBERTa 优化策略或特定领域（Domain specific）的最新突破性研究，影响模型基线性能。\n- **复现成本高昂**：部分论文链接失效或缺少对应的代码仓库指引，团队在验证算法可行性上走了许多弯路。\n\n### 使用 nlp-paper 后\n- **一站式获取资源**：团队直接通过 nlp-paper 的结构化目录，几分钟内即可锁定\"Sentiment Analysis\"和\"QA MC Dialogue\"等核心板块的全部高质论文。\n- **精准技术映射**：利用其细致的子分类（如 Word segmentation parsing NER、Relation extraction），迅速将业务需求与具体学术成果对应，明确了从 BERT 到 ALBERT 的演进路线。\n- **前沿动态同步**：借助涵盖 LLM、多模态及模型压缩等最新板块，及时引入了适合部署的轻量化模型方案，显著提升了系统响应速度。\n- **链路完整可溯**：每条记录均附带权威会议来源及 ArXiv 链接，部分还关联 GitHub 项目，大幅缩短了从理论调研到代码复现的周期。\n\nnlp-paper 将碎片化的学术海洋转化为结构清晰的导航图，让研发团队能从繁琐的文献挖掘中解脱，专注于核心算法的创新与落地。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fchangwookjun_nlp-paper_d4ebd727.png","changwookjun","전창욱","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fchangwookjun_b5e9aa90.jpg","Deep Learning Engineer for NLP   \r\n\r\nSoftware Engineer","LG AI Research","Korea","changwookjun@gmail.com",null,"https:\u002F\u002Fwww.facebook.com\u002Fchangwookjun0","https:\u002F\u002Fgithub.com\u002Fchangwookjun",595,122,"2026-04-02T08:38:55","","未说明",{"notes":92,"python":90,"dependencies":93},"该仓库仅为自然语言处理（NLP）领域的论文列表索引，不包含可执行的源代码、安装脚本或具体的环境配置要求。所列项目（如 BERT, Transformer 等）为学术论文链接，用户需根据具体论文的官方实现仓库自行查询运行环境需求。",[],[15],[96,97,98,99,100,101,102,103],"deeplearning","nlp","meachinelearning","paper","bert","transformer","language-model","transfer-learning","2026-03-27T02:49:30.150509","2026-04-19T06:02:08.931846",[],[]]