[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-ict-bigdatalab--awesome-pretrained-models-for-information-retrieval":3,"tool-ict-bigdatalab--awesome-pretrained-models-for-information-retrieval":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",150037,2,"2026-04-10T23:33:47",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":78,"owner_twitter":77,"owner_website":79,"owner_url":80,"languages":77,"stars":81,"forks":82,"last_commit_at":83,"license":77,"difficulty_score":84,"env_os":85,"env_gpu":86,"env_ram":86,"env_deps":87,"category_tags":90,"github_topics":92,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":101,"updated_at":102,"faqs":103,"releases":104},6584,"ict-bigdatalab\u002Fawesome-pretrained-models-for-information-retrieval","awesome-pretrained-models-for-information-retrieval","A curated list of awesome papers related to pre-trained models for information retrieval (a.k.a., pretraining for IR).","awesome-pretrained-models-for-information-retrieval 是一个精心整理的开源资源库，专注于收录与信息检索（IR）领域预训练模型相关的顶尖学术论文。在海量数据时代，如何让用户快速、精准地找到所需信息是核心挑战，而传统的检索方法往往难以应对复杂的语义理解需求。该项目通过系统梳理“预训练用于信息检索”的前沿成果，为从业者提供了一条清晰的技术演进路径。\n\n它详细涵盖了从初步检索（包括稀疏、稠密及混合检索策略）到重排序阶段的全流程技术，并深入探讨了长文档处理、效率优化、跨语言检索以及大语言模型（LLM）与检索系统的融合等热点方向。无论是利用硬负样本采样提升稠密向量质量，还是探索生成式重排序模型，这里都提供了丰富的理论依据和实践参考。\n\n这份资源特别适合人工智能研究人员、算法工程师以及对搜索技术感兴趣的开发者使用。对于希望构建高效搜索引擎、优化推荐系统或深入研究 LLM 增强检索（RAG）的团队而言，awesome-pretrained-models-for-information-retrieval 不仅是一份论文清单，更是一张指引技术选型与创新方","awesome-pretrained-models-for-information-retrieval 是一个精心整理的开源资源库，专注于收录与信息检索（IR）领域预训练模型相关的顶尖学术论文。在海量数据时代，如何让用户快速、精准地找到所需信息是核心挑战，而传统的检索方法往往难以应对复杂的语义理解需求。该项目通过系统梳理“预训练用于信息检索”的前沿成果，为从业者提供了一条清晰的技术演进路径。\n\n它详细涵盖了从初步检索（包括稀疏、稠密及混合检索策略）到重排序阶段的全流程技术，并深入探讨了长文档处理、效率优化、跨语言检索以及大语言模型（LLM）与检索系统的融合等热点方向。无论是利用硬负样本采样提升稠密向量质量，还是探索生成式重排序模型，这里都提供了丰富的理论依据和实践参考。\n\n这份资源特别适合人工智能研究人员、算法工程师以及对搜索技术感兴趣的开发者使用。对于希望构建高效搜索引擎、优化推荐系统或深入研究 LLM 增强检索（RAG）的团队而言，awesome-pretrained-models-for-information-retrieval 不仅是一份论文清单，更是一张指引技术选型与创新方向的实用地图，帮助大家站在巨人的肩膀上加速研发进程。","\u003Cp align=\"center\">\n  \u003Cbr>\n  \u003Cimg width=\"300\" src=\".\u002Fimgs\u002Flogo.svg\" alt=\"logo of awesome repository\">\n  \u003Cbr>\n  \u003Cbr>\n\u003C\u002Fp>\n\n# awesome-pretrained-models-for-information-retrieval \n\n> A curated list of awesome papers related to pre-trained models for information retrieval (a.k.a., **pre-training for IR**). If I missed any papers, feel free to open a PR to include them! And any feedback and contributions are welcome! \n\n\n\n## Pre-training for IR\n\n- [Survey Papers](#survey-papers)\n- [Phase 1: First-stage Retrieval](#first-stage-retrieval)\n  \u003Cdetails>\n  \u003Csummary>\n  \u003Ca href=\"#sparse-retrieval\">Sparse Retrieval \u003C\u002Fa>\n\n  \u003C\u002Fsummary>\n\n    - [Neural term re-weighting](#neural-term-re-weighting)\n    - [Query or document expansion](#query-or-document-expansion)\n    - [Sparse representation learning](#sparse-representation-learning)\n    \u003C!-- - [Combining neural term re-weighting and document expansion](#combining-neural-term-re-weighting-and-document-expansion) -->\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#dense-retrieval\">Dense Retrieval \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [Hard negative sampling](#hard-negative-sampling)\n    - [Late interaction and multi-vector representation](#late-interaction-and-multi-vector-representation)\n    - [Knowledge distillation](#knowledge-distillation)\n    - [Pre-training tailored for dense retrieval](#pre-training-tailored-for-dense-retrieval)\n    - [Jointly learning retrieval and indexing](#jointly-learning-retrieval-and-indexing)\n    - [Domain adaptation](#domain-adaptation)\n    - [Query reformulation](#query-reformulation)\n    - [Bias](#bias)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#hybrid-retrieval\">Hybrid Retrieval \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n\n- [Phase 2: Re-ranking Stage](#re-ranking-stage)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#basic-usage\">Basic Usage \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [Discriminative ranking models](#discriminative-ranking-models)\n    - [Generative ranking models](#generative-ranking-models)\n    - [Hybrid ranking models](#hybrid-ranking-models)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#long-document-processing-techniques\">Long Document Processing Techniques \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [Passage score aggregation](#passage-score-aggregation)\n    - [Passage representation aggregation](#passage-representation-aggregation)\n    - [Designing new architectures](#designing-new-architectures)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#improving-efficiency\">Improving Efficiency \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [Decoupling the interaction](#decoupling-the-interaction)\n    - [Knowledge distillation](#knowledge-distillation)\n    - [Partial Fine-tuning](#partial-fine-tuning)\n    - [Early exit](#early-exit)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#other-topics\">Other Topics \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  - [Query Expansion](#query-expansion)\n  - [Re-weighting Training Samples](#re-weighting-training-samples)\n  - [Pre-training Tailored for Re-ranking](#pre-training-tailored-for-re-ranking)\n  - [Adversarial Attack and Defence](#adversarial-attack-and-defence)\n  - [Cross-lingual Retrieval](#cross-lingual-retrieval)\n  \u003C\u002Fdetails>\n\n- [Jointly Learning Retrieval and Re-ranking](#jointly-learning-retrieval-and-re-ranking)\n- [Model-based IR System](#model-based-ir-system)\n\n- [LLM and IR](#llm-and-ir)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#retrieval-augmented-llm\">Retrieval Augmented LLM \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#llm-for-ir\">LLM for IR\u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [Perspectives or Surveys](#perspectives-or-surveys)\n    - [Synthetic Query Generation](#synthetic-query-generation)\n    - [Synthetic Document Generation](#synthetic-document-generation)\n    - [LLM for Relevance Scoring](#llm-for-relevance-scoring)\n    - [Text Generation based on IR](#text-generation-based-on-ir)\n    - [Others](#others)\n  \u003C\u002Fdetails>\n\n\n- [Multimodal Retrieval](#multimodal-retrieval)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#unified-single-stream-architecture\">Unified Single-stream Architecture \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#multi-stream-architecture-applied-on-input\">Multi-stream Architecture Applied on Input \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n- [Other Resources](#other-resources)\n\n\n\n \n## Survey Papers\n- [Pre-training Methods in Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.13853.pdf) *Yixing Fan, Xiaohui Xie et.al.*  FnTIR 2022\n- [Dense Text Retrieval based on Pretrained Language Models: A Survey.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.14876.pdf) *Wayne Xin Zhao, Jing Liu et.al.* Arxiv 2022\n- [Pretrained Transformers for Text Ranking: BERT and Beyond.](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06467) *Jimmy Lin et.al.*  M&C 2021\n- [Semantic Models for the First-stage Retrieval: A Comprehensive Review.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.04831.pdf) *Jiafeng Guo et.al.* TOIS 2021\n- [A Deep Look into neural ranking models for information retrieval.](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06902) *Jiafeng Guo et.al.* IPM 2020\n\n\n\n\n## First Stage Retrieval\n\n### Sparse Retrieval\n#### Neural term re-weighting\n- [Learning to Reweight Terms with Distributed Representations.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F2766462.2767700) *Guoqing Zheng, Jamie Callan* SIGIR 2015.(**DeepTR**)\n- [Context-Aware Term Weighting For First Stage Passage Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3397271.3401204) *Zhuyun Dai et.al.* SIGIR 2020 short. [[code](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FDeepCT)] (**DeepCT**)\n- [Context-Aware Document Term Weighting for Ad-Hoc Search.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3366423.3380258) *Zhuyun Dai et.al.* WWW 2020. [[code](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FDeepCT\u002Ftree\u002Fmaster\u002FHDCT)] (**HDCT**)\n- [Learning Term Discrimination.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11759.pdf) *Jibril Frej et.al.* SIGIR 2020. (**IDF-reweighting**)\n- [COIL: Revisit Exact Lexical Match in Information Retrieval with Contextualized Inverted List.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07186.pdf) *Luyu Gao et.al.* NAACL 2020. [[code](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCOIL)] (**COIL**)\n- [Learning Passage Impacts for Inverted Indexes.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.12016.pdf) *Antonio Mallia et.al.* SIGIR 2021 short. [[code](https:\u002F\u002Fgithub.com\u002FDI4IR\u002FSIGIR2021)] (**DeepImapct**)\n\n\n#### Query or document expansion\n- [Document Expansion by Query Prediction.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.08375.pdf) *Rodrigo Nogueira et.al.* [[doc2query code](https:\u002F\u002Fgithub.com\u002Fnyu-dl\u002Fdl4ir-doc2query), [docTTTTTquery code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002FdocTTTTTquery)] (**doc2query, docTTTTTquery**)\n- [Generation-Augmented Retrieval for Open-Domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.08553.pdf) *Yuning Mao et.al.* ACL 2021. [[code](https:\u002F\u002Fgithub.com\u002Fmorningmoni\u002FGAR)] (**query expansion with BART**)\n- [Unsupervised Document Expansion for Information Retrieval with Stochastic Text Generation.](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00666) *Jeong et.al.* arXiv 2021. [[code](https:\u002F\u002Fgithub.com\u002Fstarsuzi\u002FUDEG)] (**unsupervised document expansion**)\n\n\n\u003C!-- #### Combining neural term re-weighting and document expansion -->\n#### Sparse representation learning\n- [SparTerm: Learning Term-based Sparse Representation for Fast Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.00768.pdf) *Yang Bai, Xiaoguang Li et.al.* Arxiv 2020. (**SparTerm: Term importance distribution from MLM+Binary Term Gating**)\n- [Contextualized Sparse Representations for Real-Time Open-Domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.02896.pdf) *Jinhyuk Lee, Minjoon Seo et.al.* ACL 2020. [[code](https:\u002F\u002Fgithub.com\u002Fjhyuklee\u002Fsparc)] (**SPARC, sparse vectors**)\n- [SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.05720.pdf), and [v2.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.10086.pdf) *Thibault Formal et.al.* SIGIR 2021. [[code](https:\u002F\u002Fgithub.com\u002Fnaver\u002Fsplade)](**SPLADE**)\n- [Ultra-High Dimensional Sparse Representations with Binarization for Efficient Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07198.pdf) *Kyoung-Rok Jang et.al.* EMNLP 2021. (**UHD**)\n- [Efficient Passage Retrieval with Hashing for Open-domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.00882.pdf) *Ikuya Yamada et.al.* ACL 2021. [[code](https:\u002F\u002Fgithub.com\u002Fstudio-ousia\u002Fbpr)] (**BPR, convert embedding vector to binary codes**)\n\n\n\n### Dense Retrieval\n\n\n#### Hard negative sampling\n- [Dense Passage Retrieval for Open-Domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.04906.pdf) *Vladimir Karpukhin,Barlas Oguz et.al.* EMNLP 2020 [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDPR)] (**DPR, in-batch negatives**)\n- [RepBERT: Contextualized Text Embeddings for First-Stage Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.15498.pdf) *Jingtao Zhan et.al.* Arxiv 2020. [[code](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FRepBERT-Index)] (**RepBERT**)\n- [Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.00808.pdf) *Lee Xiong, Chenyan Xiong et.al.* [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FANCE)] (**ANCE, refresh index during training**)\n- [RocketQA: An Optimized Training Approach to Dense Passage Retrieval for Open-Domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.08191.pdf) *Yingqi Qu et.al.* NAACL 2021. (**RocketQA: cross-batch negatives, denoise hard negatives and data augementation**)\n- [Optimizing Dense Retrieval Model Training with Hard Negatives.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.08051.pdf) *Jingtao Zhan et.al.* SIGIR 2021.[[code](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FDRhard)] (**ADORE&STAR, query-side finetuning build on pretrained document encoders**)\n- [Efficiently Teaching an Effective Dense Retriever with Balanced Topic Aware Sampling.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06967.pdf) *Sebastian Hofstätter et.al.* SIGIR 2021.[[code](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftas-balanced-dense-retrieval)] (**TAS-Balanced, sample from query cluster and distill from BERT ensemble**)\n- [PAIR: Leveraging Passage-Centric Similarity Relation for Improving Dense Passage Retrieval](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.06027.pdf) *Ruiyang Ren et.al.* EMNLP Findings 2021. [[code](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FResearch\u002Ftree\u002Fmaster\u002FNLP\u002FACL2021-PAIR)] (**PAIR**)\n\n\n\n#### Late interaction and multi-vector representation\n- [ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.12832.pdf) *Omar Khattab et.al.* SIGIR 2020. [[code](https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FColBERT)] (**ColBERT**)\n- [Poly-encoders: Architectures and pre-training strategies for fast and accurate multi-sentence scoring.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01969.pdf) *Samuel Humeau,Kurt Shuster et.al.* ICLR 2020. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI\u002Ftree\u002Fmaster\u002Fprojects\u002Fpolyencoder)] (**Poly-encoders**)\n- [Sparse, Dense, and Attentional Representations for Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00181.pdf) *Yi Luan, Jacob Eisenstein et.al.* TACL 2020. (**ME-BERT, multi-vectors**)\n- [Improving Document Representations by Generating Pseudo Query Embeddings for Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03599.pdf) *Hongyin Tang, Xingwu Sun et.al.* ACL 2021.\n- [Real-Time Open-Domain Question Answering with Dense-Sparse Phrase Index.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.05807.pdf) *Minjoon Seo,Jinhyuk Lee et.al.* ACL 2019. [[code](https:\u002F\u002Fgithub.com\u002Fuwnlp\u002Fdenspi)] (**DENSPI**)\n- [Learning Dense Representations of Phrases at Scale.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.12624.pdf) *Jinhyuk Lee, Danqi Chen et.al.* ACL 2021. [[code](https:\u002F\u002Fgithub.com\u002Fjhyuklee\u002FDensePhrases)] (**DensePhrases**)\n- [Multi-View Document Representation Learning for Open-Domain Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.08372.pdf) *Shunyu Zhang et.al.* ACL 2022. (**MVR**)\n- [Multivariate Representation Learning for Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14522.pdf) *Hamed Zamani et.al.* SIGIR 2023. (**Learn multivariate distributions**)\n\n#### Knowledge distillation\n- [Distilling Knowledge from Reader to Retriever for Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.04584.pdf) *Gautier Izacard, Edouard Grave.* ICLR 2020. [[unofficial code](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fdistilled-retriever-pytorch)] (**Distill cross-attention of reader to retriever**)\n- [Distilling Knowledge for Fast Retrieval-based Chat-bots.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11045.pdf) *Amir Vakili Tahami et.al.* SIGIR 2020. [[code](https:\u002F\u002Fgithub.com\u002FKamyarGhajar\u002FDistilledNeuralResponseRanker)] (**Distill from cross-encoders to bi-encoders**)\n- [Improving Efficient Neural Ranking Models with Cross-Architecture Knowledge Distillation.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.02666.pdf) *Sebastian Hofstätter et.al.* Arxiv 2020. [[code](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Fneural-ranking-kd)] (**Distill from BERT ensemble**)\n- [Distilling Dense Representations for Ranking using Tightly-Coupled Teachers.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.11386.pdf) *Sheng-Chieh Lin, Jheng-Hong Yang, Jimmy Lin.* Arxiv 2020. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpyserini\u002Fblob\u002Fmaster\u002Fdocs\u002Fexperiments-tct_colbert.md)] (**TCTColBERT: distill from ColBERT**)\n- [Efficiently Teaching an Effective Dense Retriever with Balanced Topic Aware Sampling.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06967.pdf) *Sebastian Hofstätter et.al.* SIGIR 2021.[[code](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftas-balanced-dense-retrieval)] (**TAS-Balanced, sample from query cluster and distill from BERT ensemble**)\n- [RocketQAv2: A Joint Training Method for Dense Passage Retrieval and Passage Re-ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.07367.pdf) *Ruiyang Ren, Yingqi Qu et.al.* EMNLP 2021. [[code](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FRocketQA)] (**RocketQAv2, joint learning by distillation**)\n- [Curriculum Contrastive Context Denoising for Few-shot Conversational Dense Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3531961) *Kelong Mao et.al.* SIGIR 2022.\n\n \n#### Pre-training tailored for dense retrieval\n- [Latent Retrieval for Weakly Supervised Open Domain Question Answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.00300.pdf) *Kenton Lee et.al.* ACL 2019. [[code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Flanguage\u002Fblob\u002Fmaster\u002Flanguage\u002Forqa\u002FREADME.md)] (**ORQA, ICT**)\n- [Pre-training tasks for embedding-based large scale retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.03932.pdf) *Wei-Cheng Chang et.al.* ICLR 2020. (**ICT, BFS and WLP**)\n- [REALM: Retrieval-Augmented Language Model Pre-Training.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08909.pdf) *Kelvin Guu, Kenton Lee et.al.* ICML 2020. [[code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Flanguage\u002Fblob\u002Fmaster\u002Flanguage\u002Frealm\u002FREADME.md)] (**REALM**)\n- [Less is More: Pre-train a Strong Text Encoder for Dense Retrieval Using a Weak Decoder.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.09206.pdf) *Shuqi Lu, Di He, Chenyan Xiong et.al.* EMNLP 2021. [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSEED-Encoder\u002F)] (**Seed**)\n- [Condenser: a Pre-training Architecture for Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.08253.pdf) *Luyu Gao et.al.* EMNLP 2021. [[code](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCondenser)](**Condenser**)\n- [Unsupervised Context Aware Sentence Representation Pretraining for Multi-lingual Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.03281.pdf) *Ning Wu et.al.* JICAI 2022. [[code](https:\u002F\u002Fgithub.com\u002Fwuning0929\u002FCCP_IJCAI22)](**CCP, cross-lingual pre-training**)\n- [Unsupervised Corpus Aware Language Model Pre-training for Dense Passage Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.05540.pdf) *Luyu Gao et.al.* ACL 2022. [[code](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCondenser)](**coCondenser**)\n- [LaPraDoR: Unsupervised Pretrained Dense Retriever for Zero-Shot Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.06169.pdf) *Canwen Xu, Daya Guo et.al.* ACL 2022. [[code](https:\u002F\u002Fgithub.com\u002FJetRunner\u002FLaPraDoR)] (**LaPraDoR, ICT+dropout**)\n- [A Contrastive Pre-training Approach to Learn Discriminative Autoencoder for Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.09846.pdf) *Xinyu Ma et.al.* CIKM 2022. (**CPADE, document term distribution-based contrastive pretraining**)\n- [Pre-train a Discriminative Text Encoder for Dense Retrieval via Contrastive Span Prediction](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.10641.pdf) *Xinyu Ma et.al.* SIGIR 2022. [[code](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FCOSTA)](**COSTA, group-wise contrastive learning**)\n- [H-ERNIE: A Multi-Granularity Pre-Trained Language Model for Web Search.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3531986) *Xiaokai Chu et.al.* SIGIR 2022. (**H-ERNIE**)\n- [Structure and Semantics Preserving Document Representations.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.03720.pdf) *Natraj Raman et.al.* SIGIR 2022.\n- [Contriever: Unsupervised Dense Information Retrieval with Contrastive Learning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.09118.pdf) *Gautier Izacard et.al.* TMLR 2022. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fcontriever)] (**Contriever**)\n- [Augmenting Document Representations for Dense Retrieval with Interpolation and Perturbation.](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07735) *Jeong et.al.* ACL 2022. [[code](https:\u002F\u002Fgithub.com\u002Fstarsuzi\u002FDAR)] (**Augmentation for Dense Retrieval**)\n\n\n#### Jointly learning retrieval and indexing\n- [Joint Learning of Deep Retrieval Model and Product Quantization based Embedding Index.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03933.pdf) *Han Zhang et.al.* SIGIR 2021 short. [[code](https:\u002F\u002Fgithub.com\u002Fjdcomsearch\u002Fpoeem)] (**Poeem**)\n- [Jointly Optimizing Query Encoder and Product Quantization to Improve Retrieval Performance.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.00644.pdf) *Jingtao Zhan et.al.* CIKM 2021. [[code](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FJPQ)] (**JPQ**)\n- [Learning Discrete Representations via Constrained Clustering for Effective and Efficient Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.05789.pdf)*Jingtao Zhan et.al.* WSDM 2022. [[code](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FRepCONC)] (**RepCONC**)\n- [Matchingoriented Embedding Quantization For Ad-hoc Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07858.pdf) *Shitao Xiao et.al.* EMNLP 2021. [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMoPQ)]\n- [Distill-VQ: Learning Retrieval Oriented Vector Quantization By Distilling Knowledge from Dense Embeddings.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.00185.pdf) *Shitao Xiao et.al.* SIGIR 2022. [[code](https:\u002F\u002Fgithub.com\u002Fstaoxiao\u002FLibVQ)]\n\n\n#### Multi-hop dense retrieval\n- [Answering Complex Open-Domain Questions with Multi-Hop Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.12756.pdf) *Wenhan Xiong, Xiang Lorraine Li et.al.* ICLR 2021 [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmultihop_dense_retrieval)] (**Iteratively encode the question and previously retrieved documents as query vectors**)\n\n#### Domain adaptation\n- [Multi-Task Retrieval for Knowledge-Intensive Tasks.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00117.pdf) *Jean Maillard, Vladimir Karpukhin^ et.al.*  ACL 2021. (**Multi-task learning**)\n- [Evaluating Extrapolation Performance of Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.11447.pdf) *Jingtao Zhan et.al.* CIKM 2022. [[code](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002Fextrapolate-eval)]\n\n\n#### Query reformulation\n- [PseudoRelevance Feedback for Multiple Representation Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.11251.pdf) *Xiao Wang et.al.* ICTIR 2021 (**ColBERT-PRF**)\n- [Improving Query Representations for Dense Retrieval with Pseudo Relevance Feedback.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.13454.pdf) *HongChien Yu et.al.* CIKM 2021. [[code](https:\u002F\u002Fgithub.com\u002Fyuhongqian\u002FANCE-PRF)] (**ANCE-PRF**)\n- [LoL: A Comparative Regularization Loss over Query Reformulation Losses for Pseudo-Relevance Feedback.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.11545.pdf) *Yunchang Zhu et.al.* SIGIR 2022. [[code](https:\u002F\u002Fgithub.com\u002Fzycdev\u002FLoL)] (**LoL, Pseudo-relevance feedback**)\n\n\n\n#### Bias\n- [Implicit Feedback for Dense Passage Retrieval: A Counterfactual Approach.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.00718.pdf) *Shengyao Zhuang et.al.* SIGIR 2022. [[code](https:\u002F\u002Fgithub.com\u002Fielab\u002FCounterfactual-DR)] (**CoRocchio, Counterfactual Rocchio algorithm**)\n- [Hard Negatives or False Negatives: Correcting Pooling Bias in Training Neural Ranking Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.05072.pdf) *Yinqiong Cai et.al.* CIKM 2022.\n\n\n\n### Hybrid Retrieval\n- [Real-Time Open-Domain Question Answering with Dense-Sparse Phrase Index.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.05807.pdf) *Minjoon Seo,Jinhyuk Lee et.al.* ACL 2019. [[code](https:\u002F\u002Fgithub.com\u002Fuwnlp\u002Fdenspi)] (**DENSPI**)\n- [Complement Lexical Retrieval Model with Semantic Residual Embeddings.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13969.pdf) *Luyu Gao et.al.* ECIR 2021.\n- [BERT-based Dense Retrievers Require Interpolation with BM25 for Effective Passage Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3471158.3472233) *Shuai Wang et.al.* ICTIR 2021.\n- [Progressively Optimized Bi-Granular Document Representation for Scalable Embedding Based Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.05409.pdf) *Shitao Xiao et.al.* WWW 2022. [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBiDR)]\n\n\n\n## Re-ranking Stage\n\n\n### Basic Usage\n\n#### Discriminative ranking models\n\n##### Representation-focused\n- [Understanding the Behaviors of BERT in Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07531.pdf) *Yifan Qiao et.al.* Aixiv 2019. (**Representation-focused and Interanction-focused**)\n\n##### Interanction-focused\n- [Passage Re-ranking with BERT.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.04085.pdf) *Rodrigo Nogueira et.al.* [[code](https:\u002F\u002Fgithub.com\u002Fnyu-dl\u002Fdl4marco-bert)] (**monoBERT: Maybe the first work on applying BERT to IR**)\n- [Multi-Stage Document Ranking with BERT,](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.14424.pdf) [The Expando-Mono-Duo Design Pattern for Text Ranking with Pretrained Sequence-to-Sequence Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.05667.pdf) *Rodrigo Nogueira et.al.* Arxiv 2020. (**Expando-Mono-Duo: doc2query+pointwise+pairwise**)\n- [CEDR: Contextualized Embeddings for Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07094.pdf) *Sean MacAvaney et.al.* SIGIR 2020 short. [[code](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fcedr)] (**CEDR: BERT+neuIR model**)\n\n\n#### Generative ranking models\n- [Beyond [CLS] through Ranking by Generation.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.03073.pdf) *Cicero Nogueira dos Santos et.al.* EMNLP 2020 short. (**Query generation using GPT and BART**)\n- [Document Ranking with a Pretrained Sequence-to-Sequence Model.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.06713.pdf) *Rodrigo Nogueira, Zhiying Jiang et.al.* EMNLP 2020. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpygaggle\u002F)] (**Relevance token  generation using T5**)\n- [RankT5: Fine-Tuning T5 for Text Ranking with Ranking Losses.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.10634.pdf) *Honglei Zhuang et.al.* Arxiv 2022.\n\n\n#### Hybrid ranking models\n- [Generalizing Discriminative Retrieval Models using Generative Tasks.](https:\u002F\u002Fciir-publications.cs.umass.edu\u002Fpub\u002Fweb\u002Fgetpdf.php?id=1414) *Bingsheng Liu, Hamed Zamani et.al.* WWW 2021. (**GDMTL,joint discriminative and generative model with multitask learning**)\n\n\n\n### Long Document Processing Techniques\n#### Passage score aggregation\n- [Deeper Text Understanding for IR with Contextual Neural Language Modeling.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.09217.pdf) *Zhuyun Dai et.al.* SIGIR 2020 short. [[code](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FSIGIR19-BERT-IR)] (**BERT-MaxP, BERT-firstP, BERT-sumP: Passage-level**)\n- [Simple Applications of BERT for Ad Hoc Document Retrieval,](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.10972.pdf) [Applying BERT to Document Retrieval with Birch,](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-3004.pdf) [Cross-Domain Modeling of Sentence-Level Evidence for Document Retrieval.](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1352.pdf) *Wei Yang, Haotian Zhang et.al.* Arxiv 2020, *Zeynep Akkalyoncu Yilmaz et.al.* EMNLP 2019 short. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fbirch)] (**Birch: Sentence-level**)\n- [Intra-Document Cascading: Learning to Select Passages for Neural Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.09816.pdf) *Sebastian Hofstätter et.al.* SIGIR 2021. [[code](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Fintra-document-cascade)] (**Distill a ranking model to conv-knrm to select top-k passages**)\n\n\n#### Passage representation aggregation\n- [PARADE: Passage Representation Aggregation for Document Reranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2008.09093.pdf) *Canjia Li et.al.* Arxiv 2020. [[code](https:\u002F\u002Fgithub.com\u002Fcanjiali\u002FPARADE\u002F)] (**An extensive comparison of various Passage Representation Aggregation methods**)\n- [Leveraging Passage-level Cumulative Gain for Document Ranking.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3366423.3380305) *Zhijing Wu et.al.* WWW 2020. (**PCGM**)\n\n\n#### Designing new architectures\n- [Local Self-Attention over Long Text for Efficient Document Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.04908.pdf) *Sebastian Hofstätter et.al.* SIGIR 2020 short. [[code](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftransformer-kernel-ranking)] (**TKL:Transformer-Kernel for long text**)\n- [Beyond 512 Tokens: Siamese Multi-depth Transformer-based Hierarchical Encoder for Long-Form Document Matching.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.12297v2.pdf) *Liu Yang et.al.* CIKM 2020. [[code](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fsmith)] (**SMITH for doc2doc matching**)\n- [Socialformer: Social Network Inspired Long Document Modeling for Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.10870.pdf) *Yujia Zhou et.al.* WWW 2022. (**Socialformer**)\n\n\n### Improving Efficiency\n\n#### Decoupling the interaction\n- [DC-BERT: Decoupling Question and Document for Efficient Contextual Encoding.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.12591.pdf) *Yuyu Zhang, Ping Nie et.al.* SIGIR 2020 short. (**DC-BERT**)\n- [Efficient Document Re-Ranking for Transformers by Precomputing Term Representations.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14255.pdf) *Sean MacAvaney et.al.* SIGIR 2020. [[code](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fprettr-neural-ir)] (**PreTTR**)\n- [Modularized Transfomer-based Ranking Framework.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13313.pdf) *Luyu Gao et.al.* EMNLP 2020. [[code](https:\u002F\u002Fgithub.com\u002Fluyug\u002FMORES)] (**MORES, similar to PreTTR**)\n- [TILDE: Term Independent Likelihood moDEl for Passage Re-ranking.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3404835.3462922) *Shengyao Zhuang, Guido Zuccon* SIGIR 2021. [[code](https:\u002F\u002Fgithub.com\u002Fielab\u002FTILDE)] (**TILDE**)\n- [Fast Forward Indexes for Efficient Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.06051.pdf) *Jurek Leonhardt et.al.* WWW 2022. (**Fast forward index**)\n\n\n#### Knowledge distillation\n- [Understanding BERT Rankers Under Distillation.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.11088.pdf) *Luyu Gao et.al.* ICTIR 2020. (**LM Distill + Ranker Distill**)\n- [Simplified TinyBERT: Knowledge Distillation for Document Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.07531.pdf) *Xuanang Chen et.al.* ECIR 2021. [[code](https:\u002F\u002Fgithub.com\u002Fcxa-unique\u002FSimplified-TinyBERT)] (**TinyBERT+knowledge distillation**)\n\n\n#### Partial Fine-tuning\n- [Semi-Siamese Bi-encoder Neural Ranking Model Using Lightweight Fine-Tuning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14943.pdf) *Euna Jung, Jaekeol Choi et.al.* WWW 2022. [[code](https:\u002F\u002Fgithub.com\u002Fxlpczv\u002FSemi_Siamese)] (**Lightweight Fine-Tuning**)\n- [Scattered or Connected? An Optimized Parameter-efficient Tuning Approach for Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.09847.pdf) *Xinyu Ma et.al.* CIKM 2022.(**IAA, introduce the aside module to stabilize training**)\n\n\n#### Early exit\n- [The Cascade Transformer: an Application for Efficient Answer Sentence Selection.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.02534.pdf) *Luca Soldaini et.al.* ACL 2020.[[code](https:\u002F\u002Fgithub.com\u002Falexa\u002Fwqa-cascade-transformers)] (**Cascade Transformer: prune candidates by layer**)\n- [Early Exiting BERT for Efficient Document Ranking.](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002F2020.sustainlp-1.11.pdf) *Ji Xin et.al.* EMNLP 2020 SustaiNLP Workshop. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fearlyexiting-monobert)] (**Early exit**)\n\n\n\n### Other Topics\n\n#### Query Expansion \n- [BERT-QE: Contextualized Query Expansion for Document Re-ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.07258.pdf) *Zhi Zheng et.al.* EMNLP 2020 Findings. [[code](https:\u002F\u002Fgithub.com\u002Fzh-zheng\u002FBERT-QE)] (**BERT-QE**)\n\n\n#### Re-weighting Training Samples\n- [Training Curricula for Open Domain Answer Re-Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14269.pdf) *Sean MacAvaney et.al.* SIGIR 2020. [[code](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fcurricula-neural-ir)] (**curriculum learning based on BM25**)\n- [Not All Relevance Scores are Equal: Efficient Uncertainty and Calibration Modeling for Deep Retrieval Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.04651.pdf) *Daniel Cohen et.al.* SIGIR 2021.\n\n#### Pre-training Tailored for Re-ranking\n\n- [MarkedBERT: Integrating Traditional IR Cues in Pre-trained Language Models for Passage Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3397271.3401194) *Lila Boualili et.al.* SIGIR 2020 short. [[code](https:\u002F\u002Fgithub.com\u002FBOUALILILila\u002Fmarkers_bert)] (**MarkedBERT**)\n- [Selective Weak Supervision for Neural Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.10382.pdf) *Kaitao Zhang et.al.* WWW 2020. [[code](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FReInfoSelect)] (**ReInfoSelect**)\n- [PROP: Pre-training with Representative Words Prediction for Ad-hoc Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.10137.pdf) *Xinyu Ma et.al.* WSDM 2021. [[code](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FPROP)] (**PROP**)\n- [Cross-lingual Language Model Pretraining for Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3442381.3449830) *Puxuan Yu et.al.* WWW 2021. \n- [B-PROP: Bootstrapped Pre-training with Representative Words Prediction for Ad-hoc Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.09791.pdf) *Xinyu Ma et.al.* SIGIR 2021. [[code](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FPROP)] (**B-PROP**)\n- [Pre-training for Ad-hoc Retrieval: Hyperlink is Also You Need.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.09346.pdf) *Zhengyi Ma et.al.* CIKM 2021. [[code](https:\u002F\u002Fgithub.com\u002Fzhengyima\u002FAnchors)] (**HARP**)\n- [Contrastive Learning of User Behavior Sequence for Context-Aware Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.10510.pdf) *Yutao Zhu et.al.* CIKM 2021. [[code](https:\u002F\u002Fgithub.com\u002FDaoD\u002FCOCA)](**COCA**)\n- [Pre-trained Language Model based Ranking in Baidu Search.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.11108.pdf) *Lixin Zou et.al.* KDD 2021.\n- [A Unified Pretraining Framework for Passage Ranking and Expansion.](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F16584) *Ming Yan et.al.* AAAI 2021. (**UED, jointly training ranking and query generation**)\n- [Axiomatically Regularized Pre-training for Ad hoc Search.](https:\u002F\u002Fxuanyuan14.github.io\u002Ffiles\u002FSIGIR22Chen.pdf) *Jia Chen et.al.* SIGIR 2022. [[code](https:\u002F\u002Fgithub.com\u002Fxuanyuan14\u002FARES)] (**ARES**)\n- [Webformer: Pre-training with Web Pages for Information Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532086) *Yu Guo et.al.* SIGIR 2022. (**Webformer**)\n\n\n\n#### Adversarial Attack and Defence\n- [Competitive Search.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532771) *Oren Kurland et.al.* SIGIR 2022.\n- [PRADA: Practical Black-Box Adversarial Attacks against Neural Ranking Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.01321) *Chen Wu et.al.* Arxiv 2022\n- [Order-Disorder: Imitation Adversarial Attacks for Black-box Neural Ranking Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.06506.pdf) *Jiawei Liu et.al.* CCS 2022\n- [Are Neural Ranking Models Robust?](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.05018.pdf) *Chen Wu et.al.* TOIS\n- [Certified Robustness to Word Substitution Ranking Attack for Neural Ranking Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.06691.pdf) *Chen Wu et.al.* CIKM 2022\n- [Topic-oriented Adversarial Attacks against Black-box Neural Ranking Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14867.pdf) *Yu-An Liu et.al.* SIGIR 2023. \n\n\n\n#### Cross-lingual Retrieval\n- [Cross-lingual Retrieval for Iterative Self-Supervised Training.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.09526.pdf) *Chau Tran et.al.* NIPS 2020. [[code](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Ftree\u002Fmaster\u002Fexamples\u002Fcriss)] (**CRISS**)\n- [CLIRMatrix: A massively large collection of bilingual and multilingual datasets for Cross-Lingual Information Retrieval.](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002F2020.emnlp-main.340.pdf) *Shuo Sun et.al.* EMNLP 2020. [[code](https:\u002F\u002Fgithub.com\u002Fssun32\u002FCLIRMatrix)] (**Multilingual dataset-CLIRMatrix and multilingual BERT**)\n\n\n\n\n## Jointly Learning Retrieval and Re-ranking\n- [RocketQAv2: A Joint Training Method for Dense Passage Retrieval and Passage Re-ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.07367.pdf) *Ruiyang Ren, Yingqi Qu et.al.* EMNLP 2021. [[code](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FRocketQA)] (**RocketQAv2**)\n- [Adversarial Retriever-Ranker for dense text retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.03611.pdf) *Hang Zhang et.al.* ICLR 2022. [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FAR2)] (**AR2**)\n- [RankFlow: Joint Optimization of Multi-Stage Cascade Ranking Systems as Flows.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532050) *Jiarui Qin et.al.* SIGIR 2022. (**RankFlow**)\n\n\n\n## Model-based IR System\n- [Rethinking Search: Making Domain Experts out of Dilettantes.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.02274.pdf) *Donald Metzler et.al.* SIGIR Forum 2020. \n(**Envisioned the model-based IR system**)\n- [Transformer Memory as a Differentiable Search Index.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.06991.pdf) *Yi Tay et.al.* Arxiv 2022. \n(**DSI**)\n- [DynamicRetriever: A Pre-training Model-based IR System with Neither Sparse nor Dense Index.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.00537.pdf) *Yujia Zhou et.al.* Arxiv 2022. \n(**DynamicRetriever**)\n- [A Neural Corpus Indexer for Document Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.02743.pdf) *Yujing Wang et.al.* Arxiv 2022. (**NCI**)\n- [Autoregressive Search Engines: Generating Substrings as Document Identifiers.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.10628.pdf) *Michele Bevilacqua et.al.* Arxiv 2022. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSEAL)] (**SEAL**)\n- [CorpusBrain: Pre-train a Generative Retrieval Model for Knowledge-Intensive Language Tasks.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.07652.pdf) *Jiangui Chen et.al.* CIKM 2022. [[code](https:\u002F\u002Fgithub.com\u002Fict-bigdatalab\u002FCorpusBrain)] (**CorpusBrain**)\n- [A Unified Generative Retriever for Knowledge-Intensive Language Tasks via Prompt Learning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14856.pdf) *Jiangui Chen et.al.* SIGIR 2023. [[code](https:\u002F\u002Fgithub.com\u002Fict-bigdatalab\u002FUGR)] (**UGR**)\n- [TOME: A Two-stage Approach for Model-based Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11161.pdf) *Ruiyang Ren et.al.* ACL 2023. (**TOME: Passage generation then URL generation**)\n- [How Does Generative Retrieval Scale to Millions of Passages?](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11841.pdf) *Ronak Pradeep, Kai Hui et.al.* Arxiv 2023. (**Comprehensive study on proposed methods, using synthetic queries as document ids**)\n- [Semantic-Enhanced Differentiable Search Index Inspired by Learning Strategies.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.15115.pdf) *Yubao Tang et.al.* KDD 2023. (**Semantic-Enhanced DSI**)\n\n\n\n## LLM and IR\n\n### Perspectives or Surveys\n- [Information Retrieval meets Large Language Models: A strategic report from Chinese IR community.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09751.pdf) *Qingyao AI et.al.* The CCIR community. AI Open 2023.\n- [Large Language Models for Information Retrieval: A Survey.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07107.pdf) *Yutao Zhu et.al.*  Renmin University of China. Arxiv 2023. \n- [Navigating Complex Search Tasks with AI Copilots.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11401.pdf) *Ryen W. White*  Microsoft Research. Arxiv 2023. \n\n\n### Retrieval Augmented LLM\n- [Retrieval-augmented generation for knowledge-intensive NLP tasks.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11401.pdf) *Patrick Lewis, Ethan Perez et.al.* NIPS 2020. (**RAG, for 440M BART**)\n- [Improving Language Models by Retrieving from Trillions of Tokens.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.04426.pdf) *Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann et.al.*  ICML 2022. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FFiD)](***RETRO, enc-dec 7.5B**)\n- [Atlas: Few-shot Learning with Retrieval Augmented Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03299.pdf) *Gautier Izacard, Patrick Lewis et.al.* Arxiv 2022. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fatlas)] (**Atlas, T5, 11B**)\n- [Internet-augmented language models through few-shot prompting for open-domain question answering.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.05115.pdf) *Angeliki Lazaridou et.al.* Arxiv 2022. (**Gopher 280B, Conditioning on Google search results**)\n- [Enhancing Retrieval-Augmented Large Language Models with Iterative Retrieval-Generation Synergy.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.15294.pdf) *Zhihong Shao et.al.* Arxiv 2023.\n- [Instruction Tuning post Retrieval-Augmented Pretraining.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07713.pdf) *Boxin Wang et.al.* Arxiv 2023.\n- [Retrieve Anything To Augment Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07554.pdf)\n\n### LLM for IR\n\n\u003C!-- #### Survey or Perspectives\n\n- []\n- [Large Search Model: Redefining Search Stack in the Era of LLMs.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14587.pdf) *Liang Wang, Nan Yang et.al.* -->\n\n\n#### Synthetic Query Generation\n- [Improving Passage Retrieval with Zero-Shot Question Generation.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.07496.pdf) *Devendra Singh Sachan et.al.* EMNLP 2022. [[code](https:\u002F\u002Fgithub.com\u002FDevSinghSachan\u002Funsupervised-passage-reranking)](**UPR, rerank docs based on query likelihood of GPT-neo 2.7B\u002FT0 3B,11B**)\n- [Promptagator: Few-shot Dense Retrieval From 8 Examples.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.11755.pdf) *Zhuyun Dai et.al.* ICLR 2023. (**Generate pseudo queries using in-context learning, FLAN 137B**)\n- [UDAPDR: Unsupervised Domain Adaptation via LLM Prompting and Distillation of Rerankers.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.00807.pdf) *Jon Saad-Falcon, Omar Khattab et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fprimeqa\u002Fprimeqa)](**Train reranker with generated pseudo quereis with GPT3**)\n- [InPars: Data Augmentation for Information Retrieval using Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.05144.pdf) *Luiz Bonifacio et.al.* Arxiv 2022. [[code](https:\u002F\u002Fgithub.com\u002Fzetaalphavector\u002FInPars\u002Ftree\u002Fmaster\u002Flegacy\u002Finpars-v1)](**Use GPT-3 Curie to generate pseudo quereis with in-context learning, query generation probs to select top-k q-d pairs**)\n- [InPars-v2: Large Language Models as Efficient Dataset Generators for Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.01820.pdf) *Vitor Jeronymo et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fzetaalphavector\u002FinPars\u002Ftree\u002Fmaster\u002Flegacy\u002Finpars-v2)](**silimar to InPars, use GPT-J 6B LLM, and a finetuned reranker as selector**)\n- [InPars-Light: Cost-Effective Unsupervised Training of Efficient Rankers.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.02998.pdf) *Leonid Boytsov et.al.* Arxiv 2023. (**silimar to InPars, use GPT-J 6B and BLOOM 7B**)\n- [Generative Relevance Feedback with Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.13157.pdf) *Iain Mackie et.al.* SIGIR 2023 short. (**GRF, generate various info with GPT3 for relevance feedback**)\n- [Query Expansion by Prompting Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.03653.pdf) *Rolf Jagerman et.al.* Arxiv 2023.\n- [Exploring the Viability of Synthetic Query Generation for Relevance Prediction.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11944.pdf) *Aditi Chaudhary et.al.* Arxiv 2023. (**FLAN-137B label conditioned generation**)\n- [Large Language Model based Long-tail Query Rewriting in Taobao Search.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03758.pdf) *Wenjun Peng et.al.* Arxiv 2023.\n- [Generate, Filter, and Fuse: Query Expansion via Multi-Step Keyword Generation for Zero-Shot Neural Rankers.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09175.pdf) *Minghan Li et.al.* Arxiv 2023. (**Use Flan-PaLM2-S for keywords generation**)\n- [Leveraging LLMs for Synthesizing Training Data Across Many Languages in Multilingual Dense Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.05800.pdf) *Nandan Thakur et.al.* Arxiv 2023.\n\n#### Synthetic Document Generation\n- [Generate rather than Retrieve: Large Language Models are Strong Context Generators.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.10063.pdf) *Wenhao Yu et.al.* ICLR 2023. [[code](https:\u002F\u002Fgithub.com\u002Fwyu97\u002FGenRead)] (**GenRead,generate pseudo doc with InstructGPT for reader**)\n- [Recitation-Augmented Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.01296.pdf) *Zhiqing Sun et.al.* ICLR 2023. [[code](https:\u002F\u002Fgithub.com\u002FEdward-Sun\u002FRECITE)] (**similar to GenRead**)\n- [Precise Zero-Shot Dense Retrieval without Relevance Labels.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.10496.pdf) *Luyu Gao, Xueguang Ma et.al.* Arxiv 2022. [[code](https:\u002F\u002Fgithub.com\u002Ftexttron\u002Fhyde)] (**HyDE,InstructGPT generate pseudo doc and Contriever retireve the real one**)\n- [Query2doc: Query Expansion with Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.07678.pdf) *Liang Wang et.al.* Arxiv 2023. (**Generate pseudo docs using in-context learning and then concat with queries, text-davinci-003**)\n- [Large Language Models are Strong Zero-Shot Retriever.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14233.pdf) *Tao Shen et.al.* Arxiv 2023. (**similar to Hyde, augment the LLM with retrieved docs using BM25**)\n- [Generating Synthetic Documents for Cross-Encoder Re-Rankers: A Comparative Study of ChatGPT and Human Experts.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02320.pdf) *Arian Askari et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Farian-askari\u002FChatGPT-RetrievalQA)] (**Ranking with synthetic data generated by ChatGPT**)\n\n\n\n#### LLM for Relevance Scoring\n- [Task-aware Retrieval with Instructions.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09260.pdf) *Akari Asai, Timo Schick et.al.* Arxiv 2022. [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ftart)] (**TART, BERRI 40 tasks with instructions,1.5B FLAN-T5**)\n- [One Embedder, Any Task: Instruction-Finetuned Text Embeddings.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.09741.pdf) *Hongjin Su, Weijia Shi et.al.* [[code](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding)](**Intructor, 330 diverse tasks, 1.5B model**)\n- [ExaRanker: Explanation-Augmented Neural Ranker.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.10521.pdf) *Fernando Ferraretto et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Funicamp-dl\u002FExaRanker)] (**Training monoT5 with both relevance score and explanations generated by GPT-3.5 (text-davinci-002)**)\n- [Perspectives on Large Language Models for Relevance Judgment.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09161.pdf) *Guglielmo Faggioli et.al.* Arxiv 2023. (**Perspective Paper**)\n- [Zero-Shot Listwise Document Reranking with a Large Language Model.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02156.pdf) *Xueguang Ma et.al.* Arxiv 2023. (**LRL, generate rank list with GPT3**)\n- [Large Language Models are Built-in Autoregressive Search Engines.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.09612.pdf) *Noah Ziems et.al.* Arxiv 2023. (**LLM-URL, use GPT-3 text-davinci-003 to generate URL, model-based IR**)\n- [Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agent.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09542.pdf) *Weiwei Sun  et.al.* EMNLP main 2023.[[code](https:\u002F\u002Fgithub.com\u002Fsunnweiwei\u002FRankGPT)](**Zero-shot Passage reranking with ChatGPT\u002FGPT4**)\n- [Large Language Models are Effective Text Rankers with Pairwise Ranking Prompting.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17563.pdf) *Zhen Qin et.al.* Arxiv 2023.\n- [RankVicuna: Zero-Shot Listwise Document Reranking with Open-Source Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.15088.pdf) *Ronak Pradeep et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Frank_llm)]\n- [Found in the Middle: Permutation Self-Consistency Improves Listwise Ranking in Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07712.pdf) *Raphael Tang, Xinyu Zhang et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fperm-sc)]\n- [Fine-Tuning LLaMA for Multi-Stage Text Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.08319.pdf) *Xueguang Ma et.al.* Arxiv 2023.\n- [A Setwise Approach for Effective and Highly Efficient Zero-shot Ranking with Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.09497.pdf) *Shengyao Zhuang et.al.* Arxiv 2023.\n- [Open-source Large Language Models are Strong Zero-shot Query Likelihood Models for Document Ranking.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.13243.pdf) *Shengyao Zhuang et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fielab\u002Fllm-qlm)]\n- [PaRaDe: Passage Ranking using Demonstrations with Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14408.pdf) *Andrew Drozdov et.al.* Arxiv 2023.\n- [Beyond Yes and No: Improving Zero-Shot LLM Rankers via Scoring Fine-Grained Relevance Labels.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14122.pdf) *Honglei Zhuang et.al.* Arxiv 2023.\n- [Large Language Models can Accurately Predict Searcher Preferences.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10621.pdf) *Paul Thomas et.al.* Arxiv 2023.\n- [RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a Breeze!](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.02724.pdf) *Ronak Pradeep et.al.*  Arxiv 2023.\n- [Rank-without-GPT: Building GPT-Independent Listwise Rerankers on Open-Source Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.02969.pdf) *Xinyu Zhang et.al.* Arxiv 2023.\n\n\n#### LLM for Generative Retrieval\n- [ACID: Abstractive, Content-Based IDs for Document Retrieval with Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.08593.pdf) *Haoxin Li et.al.* Arxiv 2023. (**Using GPT-3.5 generate keyphrases**)\n\n\n#### Retrieval-Augmented Text Generation\n- [WebGPT: Browser-assisted question-answering with human feedback.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.09332.pdf) *Reiichiro Nakano,Jacob Hilton,Suchir Balaji et.al.* Arxiv 2022. (**WebGPT, GPT3**)\n- [Teaching language models to support answers with verified quotes.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.11147.pdf) *DeepMind* Arxiv 2022.\n- [Evaluating Verifiability in Generative Search Engines.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09848.pdf) *Nelson F. Liu et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fnelson-liu\u002Fevaluating-verifiability-in-generative-search-engines)]\n- [Enabling Large Language Models to Generate Text with Citations.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14627.pdf) *Tianyu Gao et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FALCE)] (**ALCE benchmark**)\n- [FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03214.pdf) *Tu Vu et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002Ffreshllms\u002Ffreshqa)]\n- [Retrieve Anything To Augment Large Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07554.pdf) *Peitian Zhang, Shitao Xiao et.al.* Arxiv 2023. [[code](https:\u002F\u002Fgithub.com\u002FFlagOpen\u002FFlagEmbedding)]\n- [Leveraging Event Schema to Ask Clarifying Questions for\nConversational Legal Case Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3583780.3614953) *Bulou Liu et.al.* CIKM 2023.\n- [Know Where to Go: Make LLM a Relevant, Responsible, and Trustworthy Searcher.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12443.pdf) *Xiang Shi et.al.*\n- [Evaluating Generative Ad Hoc Information Retrieval.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.04694.pdf) *Lukas Gienapp et.al.* Arxiv 2023.\n\n#### Others\n- [Demonstrate–Search–Predict: Composing retrieval and language models for knowledge-intensive NLP.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.14024.pdf) *Omar Khattab  et.al.* Arxiv 2023.[[code](https:\u002F\u002Fgithub.com\u002Fstanfordnlp\u002Fdsp)](**DSP program, GPT3.5**)\n\n\n\n\n## Multimodal Retrieval\n\n\n### Unified Single-stream Architecture\n- [Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.06066.pdf) *Gen Li, Nan Duan et.al.* AAAI 2020.  [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUnicoder)] (**Unicoder-VL**)\n- [XGPT: Cross-modal Generative Pre-Training for Image Captioning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.01473.pdf) *Qiaolin Xia, Haoyang Huang, Nan Duan et.al.* Arxiv 2020.  [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUnicoder)] (**XGPT**)\n- [UNITER: UNiversal Image-TExt Representation Learning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.11740.pdf) *Yen-Chun Chen, Linjie Li et.al.* ECCV 2020.  [[code](https:\u002F\u002Fgithub.com\u002FChenRocks\u002FUNITER)] (**UNITER**)\n- [Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.06165.pdf) *Xiujun Li, Xi Yin et.al.* ECCV 2020.  [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOscar)] (**Oscar**)\n- [VinVL: Making Visual Representations Matter in Vision-Language Models.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00529.pdf) *Pengchuan Zhang, Xiujun Li et.al.* ECCV 2020.  [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOscar)] (**VinVL**)\n- [Dynamic Modality Interaction Modeling for Image-Text Retrieval.](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3404835.3462829) *Leigang Qu et.al.* SIGIR 2021 **Best student paper**. [[code](https:\u002F\u002Fsigir21.wixsite.com\u002Fdime)] (**DIME**)\n\n### Multi-stream Architecture Applied on Input\n- [ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.02265.pdf) *Jiasen Lu, Dhruv Batra et.al.* NeurIPS 2019.  [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvilbert-multi-task)] (**VilBERT**)\n- [12-in-1: Multi-Task Vision and Language Representation Learning.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.02315.pdf) *Jiasen Lu, Dhruv Batra et.al.* CVPR 2020.  [[code](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvilbert-multi-task)] (**A multi-task model based on VilBERT**)\n- [Learning Transferable Visual Models From Natural Language Supervision.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.00020.pdf) *Alec Radford et.al.* CVPR 2020.  [[code](https:\u002F\u002Fgithub.com\u002FOpenAI\u002FCLIP)] (**CLIP, GPT team**)\n- [ERNIE-ViL: Knowledge Enhanced Vision-Language Representations Through Scene Graph.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.16934.pdf) *Fei Yu, Jiji Tang et.al.* Arxiv 2020. [[code](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FERNIE\u002Ftree\u002Frepro\u002Fernie-vil)]  (**ERNIE-ViL，1st place on the VCR leaderboard**)\n- [M6-v0: Vision-and-Language Interaction for Multi-modal Pretraining.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.13198.pdf) *Junyang Lin, An Yang et.al.* KDD 2020.  (**M6-v0\u002FInterBERT**)\n- [M3P: Learning Universal Representations via Multitask Multilingual Multimodal Pre-training.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.02635.pdf) *Haoyang Huang, Lin Su et.al.* CVPR 2021. [[code](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FM3P)]  (**M3P, MILD dataset**)\n\n\n## Other Resources\n\n### Some Retrieval Toolkits\n- [Faiss: a library for efficient similarity search and clustering of dense vectors](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffaiss)\n- [Pyserini: a Python Toolkit to Support Sparse and Dense Representations](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpyserini\u002F)\n- [MatchZoo: a library consisting of many popular neural text matching models](https:\u002F\u002Fgithub.com\u002FNTMC-Community\u002FMatchZoo)\n\n### Other Resources About Pre-trained Models in NLP\n- [Pre-trained Models for Natural Language Processing: A Survey.](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08271) *Xipeng Qiu et.al.* \n- [BERT-related-papers](https:\u002F\u002Fgithub.com\u002Ftomohideshibata\u002FBERT-related-papers)\n- [Pre-trained Languge Model Papers from THU-NLP](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPLMpapers)\n\n### Surveys About Efficient Transformers\n- [Efficient Transformers: A Survey.](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.06732.pdf) *Yi Tay, Mostafa Dehghani et.al.* Arxiv 2020. \n\n","\u003Cp align=\"center\">\n  \u003Cbr>\n  \u003Cimg width=\"300\" src=\".\u002Fimgs\u002Flogo.svg\" alt=\"awesome仓库的logo\">\n  \u003Cbr>\n  \u003Cbr>\n\u003C\u002Fp>\n\n# awesome-pretrained-models-for-information-retrieval \n\n> 一个精心整理的、与信息检索预训练模型相关的优秀论文列表（即“IR领域的预训练”）。如果我遗漏了某些论文，欢迎随时提交PR将其加入！同时也非常欢迎大家提出反馈和贡献！\n\n\n\n## IR领域的预训练\n\n- [综述论文](#survey-papers)\n- [第一阶段：初检](#first-stage-retrieval)\n  \u003Cdetails>\n  \u003Csummary>\n  \u003Ca href=\"#sparse-retrieval\">稀疏检索 \u003C\u002Fa>\n\n  \u003C\u002Fsummary>\n\n    - [神经网络项权重重估](#neural-term-re-weighting)\n    - [查询或文档扩展](#query-or-document-expansion)\n    - [稀疏表示学习](#sparse-representation-learning)\n    \u003C!-- - [结合神经网络项权重重估与文档扩展](#combining-neural-term-re-weighting-and-document-expansion) -->\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#dense-retrieval\">稠密检索 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [困难负样本采样](#hard-negative-sampling)\n    - [晚期交互与多向量表示](#late-interaction-and-multi-vector-representation)\n    - [知识蒸馏](#knowledge-distillation)\n    - [针对稠密检索定制的预训练](#pre-training-tailored-for-dense-retrieval)\n    - [联合学习检索与索引构建](#jointly-learning-retrieval-and-indexing)\n    - [领域适应](#domain-adaptation)\n    - [查询改写](#query-reformulation)\n    - [偏差问题](#bias)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#hybrid-retrieval\">混合检索 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails)\n\n\n- [第二阶段：重排序](#re-ranking-stage)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#basic-usage\">基本用法 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [判别式排序模型](#discriminative-ranking-models)\n    - [生成式排序模型](#generative-ranking-models)\n    - [混合型排序模型](#hybrid-ranking-models)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#long-document-processing-techniques\">长文档处理技术 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [段落得分聚合](#passage-score-aggregation)\n    - [段落表示聚合](#passage-representation-aggregation)\n    - [设计新架构](#designing-new-architectures)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#improving-efficiency\">提升效率 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [解耦交互过程](#decoupling-the-interaction)\n    - [知识蒸馏](#knowledge-distillation)\n    - [部分微调](#partial-fine-tuning)\n    - [提前退出](#early-exit)\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#other-topics\">其他主题 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  - [查询扩展](#query-expansion)\n  - [训练样本权重调整](#re-weighting-training-samples)\n  - [专为重排序定制的预训练](#pre-training-tailored-for-re-ranking)\n  - [对抗攻击与防御](#adversarial-attack-and-defence)\n  - [跨语言检索](#cross-lingual-retrieval)\n  \u003C\u002Fdetails>\n\n- [联合学习检索与重排序](#jointly-learning-retrieval-and-re-ranking)\n- [基于模型的信息检索系统](#model-based-ir-system)\n\n- [大语言模型与IR](#llm-and-ir)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#retrieval-augmented-llm\">检索增强型大语言模型 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#llm-for-ir\">用于IR的大语言模型\u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n    - [观点或综述](#perspectives-or-surveys)\n    - [合成查询生成](#synthetic-query-generation)\n    - [合成文档生成](#synthetic-document-generation)\n    - [用于相关性打分的大语言模型](#llm-for-relevance-scoring)\n    - [基于IR的文本生成](#text-generation-based-on-ir)\n    - [其他](#others)\n  \u003C\u002Fdetails>\n\n\n- [多模态检索](#multimodal-retrieval)\n  \u003Cdetails>\n  \u003Csummary>\n    \u003Ca href=\"#unified-single-stream-architecture\">统一单流架构 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n  \u003Cdetails>\n  \u003Csummary>\n      \u003Ca href=\"#multi-stream-architecture-applied-on-input\">应用于输入的多流架构 \u003C\u002Fa>\n  \u003C\u002Fsummary>\n\n  \u003C\u002Fdetails>\n\n- [其他资源](#other-resources)\n\n\n\n \n## 综述论文\n- [信息检索中的预训练方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.13853.pdf) *Yixing Fan, Xiaohui Xie 等.* FnTIR 2022\n- [基于预训练语言模型的稠密文本检索：综述。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.14876.pdf) *Wayne Xin Zhao, Jing Liu 等.* Arxiv 2022\n- [用于文本排序的预训练Transformer：BERT及其之后。](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06467) *Jimmy Lin 等.* M&C 2021\n- [初检阶段的语义模型：全面回顾。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.04831.pdf) *Jiafeng Guo 等.* TOIS 2021\n- [深入探讨信息检索中的神经网络排序模型。](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06902) *Jiafeng Guo 等.* IPM 2020\n\n\n\n\n## 第一阶段检索\n\n### 稀疏检索\n#### 神经网络术语重加权\n- [利用分布式表示学习术语重加权。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F2766462.2767700) *Guoqing Zheng, Jamie Callan* SIGIR 2015。（**DeepTR**）\n- [面向第一阶段段落检索的上下文感知术语权重调整。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3397271.3401204) *Zhuyun Dai等* SIGIR 2020 短文。[[代码](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FDeepCT)]（**DeepCT**）\n- [面向即席检索的上下文感知文档术语权重调整。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3366423.3380258) *Zhuyun Dai等* WWW 2020。[[代码](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FDeepCT\u002Ftree\u002Fmaster\u002FHDCT)]（**HDCT**）\n- [学习术语判别能力。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11759.pdf) *Jibril Frej等* SIGIR 2020。（**IDF-重加权**）\n- [COIL：用上下文化倒排索引重新审视信息检索中的精确词汇匹配。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07186.pdf) *Luyu Gao等* NAACL 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCOIL)]（**COIL**）\n- [为倒排索引学习段落影响。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.12016.pdf) *Antonio Mallia等* SIGIR 2021 短文。[[代码](https:\u002F\u002Fgithub.com\u002FDI4IR\u002FSIGIR2021)]（**DeepImapct**）\n\n\n#### 查询或文档扩展\n- [基于查询预测的文档扩展。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.08375.pdf) *Rodrigo Nogueira等* [[doc2query 代码](https:\u002F\u002Fgithub.com\u002Fnyu-dl\u002Fdl4ir-doc2query), [docTTTTTquery 代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002FdocTTTTTquery)]（**doc2query, docTTTTTquery**）\n- [用于开放域问答的生成增强型检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.08553.pdf) *Yuning Mao等* ACL 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fmorningmoni\u002FGAR)]（**使用 BART 进行查询扩展**）\n- [利用随机文本生成进行信息检索的无监督文档扩展。](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00666) *Jeong等* arXiv 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fstarsuzi\u002FUDEG)]（**无监督文档扩展**）\n\n\n\u003C!-- #### 结合神经网络术语重加权与文档扩展 -->\n#### 稀疏表示学习\n- [SparTerm：学习基于术语的稀疏表示以实现快速文本检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.00768.pdf) *Yang Bai, Xiaoguang Li等* Arxiv 2020。（**SparTerm：由 MLM+二值术语门控得到的术语重要性分布**）\n- [面向实时开放域问答的上下文化稀疏表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.02896.pdf) *Jinhyuk Lee, Minjoon Seo等* ACL 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fjhyuklee\u002Fsparc)]（**SPARC，稀疏向量**）\n- [SPLADE：用于第一阶段排序的稀疏词汇及扩展模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.05720.pdf)，以及 [v2。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.10086.pdf) *Thibault Formal等* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fnaver\u002Fsplade)]（**SPLADE**）\n- [通过二值化实现超高维稀疏表示以提高文本检索效率。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07198.pdf) *Kyoung-Rok Jang等* EMNLP 2021。（**UHD**）\n- [用于开放域问答的哈希高效段落检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.00882.pdf) *Ikuya Yamada等* ACL 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fstudio-ousia\u002Fbpr)]（**BPR，将嵌入向量转换为二进制码**）\n\n\n\n### 密集检索\n\n\n#### 硬负采样\n- [用于开放域问答的密集段落检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.04906.pdf) *Vladimir Karpukhin, Barlas Oguz等* EMNLP 2020 [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDPR)]（**DPR，批次内负样本**）\n- [RepBERT：用于第一阶段检索的上下文化文本嵌入。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.15498.pdf) *Jingtao Zhan等* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FRepBERT-Index)]（**RepBERT**）\n- [密集文本检索中的近似最近邻负对比学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.00808.pdf) *Lee Xiong, Chenyan Xiong等* [[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FANCE)]（**ANCE，训练期间刷新索引**）\n- [RocketQA：一种优化的训练方法，用于开放域问答的密集段落检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.08191.pdf) *Yingqi Qu等* NAACL 2021。（**RocketQA：跨批次负样本、去噪硬负样本和数据增强**）\n- [利用硬负样本优化密集检索模型训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.08051.pdf) *Jingtao Zhan等* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FDRhard)]（**ADORE&STAR，基于预训练文档编码器的查询端微调**）\n- [通过平衡的主题感知采样高效地训练有效的密集检索器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06967.pdf) *Sebastian Hofstätter等* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftas-balanced-dense-retrieval)]（**TAS-Balanced，从查询簇中采样并从 BERT 集成模型中蒸馏**）\n- [PAIR：利用以段落为中心的相似性关系改进密集段落检索](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.06027.pdf) *Ruiyang Ren等* EMNLP Findings 2021。[[代码](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FResearch\u002Ftree\u002Fmaster\u002FNLP\u002FACL2021-PAIR)]（**PAIR**）\n\n\n\n#### 晚期交互与多向量表示\n- [ColBERT：通过在 BERT 上进行上下文化的晚期交互实现高效且有效的段落搜索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.12832.pdf) *Omar Khattab等* SIGIR 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FColBERT)]（**ColBERT**）\n- [多编码器：用于快速准确的多句评分的架构和预训练策略。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01969.pdf) *Samuel Humeau, Kurt Shuster等* ICLR 2020。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FParlAI\u002Ftree\u002Fmaster\u002Fprojects\u002Fpolyencoder)]（**多编码器**）\n- [用于文本检索的稀疏、密集和注意力机制表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.00181.pdf) *Yi Luan, Jacob Eisenstein等* TACL 2020。（**ME-BERT，多向量**）\n- [通过生成伪查询嵌入来改进文档表示，以用于密集检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03599.pdf) *Hongyin Tang, Xingwu Sun等* ACL 2021。\n- [使用密集-稀疏短语索引实现实时开放域问答。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.05807.pdf) *Minjoon Seo, Jinhyuk Lee等* ACL 2019。[[代码](https:\u002F\u002Fgithub.com\u002Fuwnlp\u002Fdenspi)]（**DENSPI**）\n- [大规模学习短语的密集表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.12624.pdf) *Jinhyuk Lee, Danqi Chen等* ACL 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fjhyuklee\u002FDensePhrases)]（**DensePhrases**）\n- [面向开放域密集检索的多视角文档表示学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.08372.pdf) *Shunyu Zhang等* ACL 2022。（**MVR**）\n- [用于信息检索的多变量表示学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14522.pdf) *Hamed Zamani等* SIGIR 2023。（**学习多变量分布**）\n\n#### 知识蒸馏\n- [从阅读器向检索器蒸馏知识用于问答。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.04584.pdf) *Gautier Izacard, Edouard Grave.* ICLR 2020。[[非官方代码](https:\u002F\u002Fgithub.com\u002Flucidrains\u002Fdistilled-retriever-pytorch)] (**将阅读器的交叉注意力蒸馏到检索器**)\n- [为快速检索型聊天机器人进行知识蒸馏。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11045.pdf) *Amir Vakili Tahami等。* SIGIR 2020。[[代码](https:\u002F\u002Fgithub.com\u002FKamyarGhajar\u002FDistilledNeuralResponseRanker)] (**从交叉编码器蒸馏到双编码器**)\n- [通过跨架构知识蒸馏改进高效的神经排序模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.02666.pdf) *Sebastian Hofstätter等。* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Fneural-ranking-kd)] (**从BERT集成模型蒸馏**)\n- [利用紧密耦合的教师模型蒸馏密集表示以用于排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.11386.pdf) *Sheng-Chieh Lin, Jheng-Hong Yang, Jimmy Lin。* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpyserini\u002Fblob\u002Fmaster\u002Fdocs\u002Fexperiments-tct_colbert.md)] (**TCTColBERT：从ColBERT蒸馏**)\n- [通过平衡的主题感知采样高效地训练有效的密集检索器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.06967.pdf) *Sebastian Hofstätter等。* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftas-balanced-dense-retrieval)] (**TAS-Balanced，从查询簇中采样并从BERT集成模型蒸馏**)\n- [RocketQAv2：一种用于密集段落检索和段落重排序的联合训练方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.07367.pdf) *Ruiyang Ren, Yingqi Qu等。* EMNLP 2021。[[代码](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FRocketQA)] (**RocketQAv2，通过知识蒸馏进行联合学习**)\n- [面向少样本对话式密集检索的课程式对比上下文去噪。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3531961) *Kelong Mao等。* SIGIR 2022。\n\n \n#### 针对密集检索量身定制的预训练\n- [用于弱监督开放域问答的潜在检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.00300.pdf) *Kenton Lee等。* ACL 2019。[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Flanguage\u002Fblob\u002Fmaster\u002Flanguage\u002Forqa\u002FREADME.md)] (**ORQA，ICT**)\n- [基于嵌入的大规模检索的预训练任务。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.03932.pdf) *Wei-Cheng Chang等。* ICLR 2020。(**ICT、BFS和WLP**)\n- [REALM：检索增强的语言模型预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.08909.pdf) *Kelvin Guu、Kenton Lee等。* ICML 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Flanguage\u002Fblob\u002Fmaster\u002Flanguage\u002Frealm\u002FREADME.md)] (**REALM**)\n- [少即是多：使用弱解码器为密集检索预训练强大的文本编码器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.09206.pdf) *Shuqi Lu、Di He、Chenyan Xiong等。* EMNLP 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSEED-Encoder)] (**Seed**)\n- [Condenser：一种用于密集检索的预训练架构。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.08253.pdf) *Luyu Gao等。* EMNLP 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCondenser)] (**Condenser**)\n- [面向多语言密集检索的无监督上下文感知句子表示预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.03281.pdf) *Ning Wu等。* JICAI 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fwuning0929\u002FCCP_IJCAI22)] (**CCP，跨语言预训练**)\n- [面向密集段落检索的无监督语料库感知语言模型预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.05540.pdf) *Luyu Gao等。* ACL 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fluyug\u002FCondenser)] (**coCondenser**)\n- [LaPraDoR：用于零样本文本检索的无监督预训练密集检索器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.06169.pdf) *Canwen Xu、Daya Guo等。* ACL 2022。[[代码](https:\u002F\u002Fgithub.com\u002FJetRunner\u002FLaPraDoR)] (**LaPraDoR，ICT+dropout**)\n- [一种通过对比预训练学习用于密集检索的判别自编码器的方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.09846.pdf) *Xinyu Ma等。* CIKM 2022。(**CPADE，基于文档词频分布的对比预训练**)\n- [通过对比跨度预测为密集检索预训练判别性文本编码器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.10641.pdf) *Xinyu Ma等。* SIGIR 2022。[[代码](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FCOSTA)] (**COSTA，分组对比学习**)\n- [H-ERNIE：一种用于网页搜索的多粒度预训练语言模型。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3531986) *Xiaokai Chu等。* SIGIR 2022。(**H-ERNIE**)\n- [保持结构与语义的文档表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.03720.pdf) *Natraj Raman等。* SIGIR 2022。\n- [Contriever：利用对比学习实现无监督的密集信息检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.09118.pdf) *Gautier Izacard等。* TMLR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fcontriever)] (**Contriever**)\n- [通过插值与扰动生成密集检索用的文档表示。](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07735) *Jeong等。* ACL 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fstarsuzi\u002FDAR)] (**用于密集检索的增强**)\n\n\n#### 检索与索引的联合学习\n- [深度检索模型与基于产品量化技术的嵌入索引的联合学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03933.pdf) *Han Zhang等。* SIGIR 2021短会。[[代码](https:\u002F\u002Fgithub.com\u002Fjdcomsearch\u002Fpoeem)] (**Poeem**)\n- [联合优化查询编码器和产品量化以提升检索性能。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.00644.pdf) *Jingtao Zhan等。* CIKM 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FJPQ)] (**JPQ**)\n- [通过约束聚类学习离散表示以实现有效且高效的密集检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.05789.pdf) *Jingtao Zhan等。* WSDM 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002FRepCONC)] (**RepCONC**)\n- [面向即席检索的匹配导向嵌入量化。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.07858.pdf) *Shitao Xiao等。* EMNLP 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FMoPQ)]\n- [Distill-VQ：通过从密集嵌入中蒸馏知识来学习面向检索的向量量化。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.00185.pdf) *Shitao Xiao等。* SIGIR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fstaoxiao\u002FLibVQ)]\n\n\n#### 多跳密集检索\n- [利用多跳密集检索回答复杂的开放域问题。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.12756.pdf) *Wenhan Xiong、Xiang Lorraine Li等。* ICLR 2021 [[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fmultihop_dense_retrieval)] (**迭代地将问题和先前检索到的文档编码为查询向量**)\n\n#### 领域适应\n- [面向知识密集型任务的多任务检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00117.pdf) *Jean Maillard、Vladimir Karpukhin^等。* ACL 2021。(**多任务学习**)\n- [评估密集检索的外推性能。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.11447.pdf) *Jingtao Zhan等。* CIKM 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fjingtaozhan\u002Fextrapolate-eval)]\n\n#### 查询改写\n- [用于多表示密集检索的伪相关反馈。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.11251.pdf) *Xiao Wang等* ICTIR 2021 (**ColBERT-PRF**)\n- [利用伪相关反馈改进密集检索的查询表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.13454.pdf) *HongChien Yu等* CIKM 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fyuhongqian\u002FANCE-PRF)] (**ANCE-PRF**)\n- [LoL：一种用于伪相关反馈的查询改写损失比较正则化损失。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.11545.pdf) *Yunchang Zhu等* SIGIR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fzycdev\u002FLoL)] (**LoL，伪相关反馈**)\n\n\n\n#### 偏差\n- [密集段落检索中的隐式反馈：一种反事实方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.00718.pdf) *Shengyao Zhuang等* SIGIR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fielab\u002FCounterfactual-DR)] (**CoRocchio，反事实Rocchio算法**)\n- [硬负样本还是假负样本：纠正训练神经排序模型中的池化偏差。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.05072.pdf) *Yinqiong Cai等* CIKM 2022。\n\n\n\n\n\n### 混合检索\n- [基于稠密-稀疏短语索引的实时开放域问答。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1906.05807.pdf) *Minjoon Seo、Jinhyuk Lee等* ACL 2019。[[代码](https:\u002F\u002Fgithub.com\u002Fuwnlp\u002Fdenspi)] (**DENSPI**)\n- [用语义残差嵌入补充词汇检索模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13969.pdf) *Luyu Gao等* ECIR 2021。\n- [基于BERT的密集检索器需要与BM25进行插值才能有效进行段落检索。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3471158.3472233) *Shuai Wang等* ICTIR 2021。\n- [渐进优化的双粒度文档表示用于可扩展的基于嵌入的检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.05409.pdf) *Shitao Xiao等* WWW 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBiDR)]\n\n\n\n## 重排序阶段\n\n\n### 基本用法\n\n#### 判别式排序模型\n\n##### 表示聚焦型\n- [理解BERT在排序中的行为。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07531.pdf) *Yifan Qiao等* Arxiv 2019。(**表示聚焦型和交互聚焦型**)\n\n##### 交互聚焦型\n- [使用BERT对段落进行重排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.04085.pdf) *Rodrigo Nogueira等* [[代码](https:\u002F\u002Fgithub.com\u002Fnyu-dl\u002Fdl4marco-bert)] (**monoBERT：可能是最早将BERT应用于信息检索的工作**)\n- [使用BERT进行多阶段文档排序，](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.14424.pdf) [预训练序列到序列模型用于文本排序的Expando-Mono-Duo设计模式。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.05667.pdf) *Rodrigo Nogueira等* Arxiv 2020。(**Expando-Mono-Duo：doc2query+点对点+成对**)\n- [CEDR：用于文档排序的上下文化嵌入。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07094.pdf) *Sean MacAvaney等* SIGIR 2020简报。[[代码](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fcedr)] (**CEDR：BERT+neuIR模型**)\n\n\n#### 生成式排序模型\n- [通过生成式排序超越[CLS]。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.03073.pdf) *Cicero Nogueira dos Santos等* EMNLP 2020简报。(**使用GPT和BART生成查询**)\n- [使用预训练序列到序列模型进行文档排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.06713.pdf) *Rodrigo Nogueira、Zhiying Jiang等* EMNLP 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpygaggle\u002F)] (**使用T5生成相关性标记**)\n- [RankT5：使用排序损失对T5进行微调以用于文本排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.10634.pdf) *Honglei Zhuang等* Arxiv 2022。\n\n\n#### 混合排序模型\n- [利用生成任务泛化判别式检索模型。](https:\u002F\u002Fciir-publications.cs.umass.edu\u002Fpub\u002Fweb\u002Fgetpdf.php?id=1414) *Bingsheng Liu、Hamed Zamani等* WWW 2021。(**GDMTL，多任务学习的联合判别式和生成式模型**)\n\n\n\n### 长文档处理技术\n#### 段落得分聚合\n- [借助上下文神经语言建模实现更深入的IR文本理解。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.09217.pdf) *Zhuyun Dai等* SIGIR 2020简报。[[代码](https:\u002F\u002Fgithub.com\u002FAdeDZY\u002FSIGIR19-BERT-IR)] (**BERT-MaxP、BERT-firstP、BERT-sumP：段落级别**)\n- [BERT在即席文档检索中的简单应用，](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.10972.pdf) [使用Birch将BERT应用于文档检索，](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-3004.pdf) [跨领域建模句子级证据用于文档检索。](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FD19-1352.pdf) *Wei Yang、Haotian Zhang等* Arxiv 2020，*Zeynep Akkalyoncu Yilmaz等* EMNLP 2019简报。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fbirch)] (**Birch：句子级别**)\n- [文档内级联：学习选择段落以进行神经文档排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.09816.pdf) *Sebastian Hofstätter等* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Fintra-document-cascade)] (**将排序模型蒸馏到conv-knrm以选择前k个段落**)\n\n\n#### 段落表示聚合\n- [PARADE：用于文档重排序的段落表示聚合。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2008.09093.pdf) *Canjia Li等* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fcanjiali\u002FPARADE\u002F)] (**对各种段落表示聚合方法进行了广泛比较**)\n- [利用段落级累积增益进行文档排序。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3366423.3380305) *Zhijing Wu等* WWW 2020。(**PCGM**)\n\n\n#### 设计新架构\n- [针对长文本的局部自注意力机制以实现高效文档检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.04908.pdf) *Sebastian Hofstätter等* SIGIR 2020简报。[[代码](https:\u002F\u002Fgithub.com\u002Fsebastian-hofstaetter\u002Ftransformer-kernel-ranking)] (**TKL：用于长文本的Transformer-Kernel**)\n- [超越512个token：用于长文档匹配的暹罗多深度Transformer分层编码器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.12297v2.pdf) *Liu Yang等* CIKM 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Fsmith)] (**SMITH用于文档到文档的匹配**)\n- [Socialformer：受社交网络启发的长文档建模用于文档排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.10870.pdf) *Yujia Zhou等* WWW 2022。(**Socialformer**)\n\n### 提高效率\n\n#### 解耦交互\n- [DC-BERT：为高效上下文编码解耦问题与文档。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2002.12591.pdf) *张宇宇、聂平等* SIGIR 2020 短文。（**DC-BERT**）\n- [通过预计算词项表示实现 Transformer 的高效文档重排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14255.pdf) *肖恩·麦卡维尼等* SIGIR 2020。[[代码](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fprettr-neural-ir)]（**PreTTR**）\n- [基于 Transformer 的模块化排序框架。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.13313.pdf) *高璐宇等* EMNLP 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fluyug\u002FMORES)]（**MORES，类似于 PreTTR**）\n- [TILDE：用于段落重排序的词项独立似然模型。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3404835.3462922) *庄圣尧、圭多·祖孔* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fielab\u002FTILDE)]（**TILDE**）\n- [用于高效文档排序的快速前向索引。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.06051.pdf) *尤雷克·莱昂哈特等* WWW 2022。（**快速前向索引**）\n\n\n#### 知识蒸馏\n- [理解蒸馏下的 BERT 排序器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2007.11088.pdf) *高璐宇等* ICTIR 2020。（**语言模型蒸馏 + 排序器蒸馏**）\n- [简化版 TinyBERT：用于文档检索的知识蒸馏。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.07531.pdf) *陈宣昂等* ECIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fcxa-unique\u002FSimplified-TinyBERT)]（**TinyBERT+知识蒸馏**）\n\n\n#### 部分微调\n- [采用轻量级微调的半暹罗双编码器神经排序模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.14943.pdf) *郑恩娜、崔在赫等* WWW 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fxlpczv\u002FSemi_Siamese)]（**轻量级微调**）\n- [分散还是连接？一种面向信息检索的优化参数高效微调方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.09847.pdf) *马鑫宇等* CIKM 2022。（**IAA，引入旁路模块以稳定训练**）\n\n\n#### 早期退出\n- [级联 Transformer：一种用于高效答案句子选择的应用。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.02534.pdf) *卢卡·索尔代尼等* ACL 2020。[[代码](https:\u002F\u002Fgithub.com\u002Falexa\u002Fwqa-cascade-transformers)]（**级联 Transformer：按层剪枝候选**）\n- [用于高效文档排序的 BERT 早期退出。](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002F2020.sustainlp-1.11.pdf) *辛吉等* EMNLP 2020 SustaiNLP 工作坊。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fearlyexiting-monobert)]（**早期退出**）\n\n\n\n### 其他主题\n\n#### 查询扩展\n- [BERT-QE：用于文档重排序的上下文化查询扩展。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.07258.pdf) *郑志等* EMNLP 2020 Findings。[[代码](https:\u002F\u002Fgithub.com\u002Fzh-zheng\u002FBERT-QE)]（**BERT-QE**）\n\n\n#### 重新加权训练样本\n- [开放域答案重排序的训练课程。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.14269.pdf) *肖恩·麦卡维尼等* SIGIR 2020。[[代码](https:\u002F\u002Fgithub.com\u002FGeorgetown-IR-Lab\u002Fcurricula-neural-ir)]（**基于 BM25 的课程学习**）\n- [并非所有相关性得分都相等：深度检索模型的高效不确定性与校准建模。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.04651.pdf) *丹尼尔·科恩等* SIGIR 2021。\n\n\n#### 针对重排序量身定制的预训练\n\n- [MarkedBERT：将传统 IR 特征融入预训练语言模型以进行段落检索。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3397271.3401194) *莉拉·布阿利利等* SIGIR 2020 短文。[[代码](https:\u002F\u002Fgithub.com\u002FBOUALILILila\u002Fmarkers_bert)]（**MarkedBERT**）\n- [面向神经信息检索的选择性弱监督。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.10382.pdf) *张凯涛等* WWW 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FReInfoSelect)]（**ReInfoSelect**）\n- [PROP：针对即席检索的代表性词预测预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.10137.pdf) *马鑫宇等* WSDM 2021。[[代码](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FPROP)]（**PROP**）\n- [用于检索的跨语言语言模型预训练。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3442381.3449830) *于普轩等* WWW 2021。\n- [B-PROP：基于代表性词预测的即席检索自举式预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.09791.pdf) *马鑫宇等* SIGIR 2021。[[代码](https:\u002F\u002Fgithub.com\u002FAlbert-Ma\u002FPROP)]（**B-PROP**）\n- [即席检索的预训练：超链接也是你需要的。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.09346.pdf) *马正毅等* CIKM 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fzhengyima\u002FAnchors)]（**HARP**）\n- [面向上下文文档排序的用户行为序列对比学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.10510.pdf) *朱宇涛等* CIKM 2021。[[代码](https:\u002F\u002Fgithub.com\u002FDaoD\u002FCOCA)]（**COCA**）\n- [百度搜索中基于预训练语言模型的排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.11108.pdf) *邹立新等* KDD 2021。\n- [用于段落排序与扩展的统一预训练框架。](https:\u002F\u002Fojs.aaai.org\u002Findex.php\u002FAAAI\u002Farticle\u002Fview\u002F16584) *严明等* AAAI 2021。（**UED，联合训练排序与查询生成**）\n- [面向即席搜索的公理化正则化预训练。](https:\u002F\u002Fxuanyuan14.github.io\u002Ffiles\u002FSIGIR22Chen.pdf) *陈佳等* SIGIR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fxuanyuan14\u002FARES)]（**ARES**）\n- [Webformer：面向信息检索的网页预训练。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532086) *郭宇等* SIGIR 2022。（**Webformer**）\n\n\n\n#### 对抗攻击与防御\n- [竞争性搜索。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532771) *奥伦·库兰德等* SIGIR 2022。\n- [PRADA：针对神经排序模型的实用黑盒对抗攻击。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.01321.pdf) *吴晨等* Arxiv 2022。\n- [有序—无序：针对黑盒神经排序模型的模仿式对抗攻击](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.06506.pdf) *刘嘉伟等* CCS 2022。\n- [神经排序模型是否鲁棒？](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.05018.pdf) *吴晨等* TOIS。\n- [神经排序模型对词语替换排序攻击的认证鲁棒性](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.06691.pdf) *吴晨等* CIKM 2022。\n- [面向黑盒神经排序模型的主题导向对抗攻击。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14867.pdf) *刘宇安等* SIGIR 2023。\n\n\n#### 跨语言检索\n- [用于迭代自监督训练的跨语言检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.09526.pdf) *周 Tran 等* NIPS 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ffairseq\u002Ftree\u002Fmaster\u002Fexamples\u002Fcriss)]（**CRISS**）\n- [CLIRMatrix：一个超大规模的双语和多语数据集，用于跨语言信息检索。](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002F2020.emnlp-main.340.pdf) *孙硕等* EMNLP 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fssun32\u002FCLIRMatrix)]（**多语数据集—CLIRMatrix 和多语 BERT**）\n\n## 检索与重排序的联合学习\n- [RocketQAv2：密集段落检索与段落重排序的联合训练方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.07367.pdf) *任瑞阳、屈英琪等* EMNLP 2021。[[代码](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FRocketQA)] (**RocketQAv2**)\n- [用于密集文本检索的对抗性检索器-排序器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.03611.pdf) *张航等* ICLR 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FAR2)] (**AR2**)\n- [RankFlow：将多阶段级联排序系统作为流进行联合优化。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3477495.3532050) *秦嘉睿等* SIGIR 2022。(**RankFlow**)\n\n\n\n## 基于模型的IR系统\n- [重新思考搜索：让业余爱好者成为领域专家。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.02274.pdf) *唐纳德·梅茨勒等* SIGIR Forum 2020。\n(**构想了基于模型的IR系统**)\n- [Transformer内存作为可微分的搜索索引。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.06991.pdf) *泰毅等* Arxiv 2022。\n(**DSI**)\n- [DynamicRetriever：一种既无稀疏索引也无稠密索引的预训练模型IR系统。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.00537.pdf) *周宇佳等* Arxiv 2022。\n(**DynamicRetriever**)\n- [用于文档检索的神经语料库索引器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.02743.pdf) *王玉静等* Arxiv 2022。（**NCI**）\n- [自回归搜索引擎：生成子字符串作为文档标识符。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.10628.pdf) *米凯莱·贝维拉夸等* Arxiv 2022。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FSEAL)] (**SEAL**)\n- [CorpusBrain：为知识密集型语言任务预训练生成式检索模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.07652.pdf) *陈江贵等* CIKM 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fict-bigdatalab\u002FCorpusBrain)] (**CorpusBrain**)\n- [通过提示学习实现知识密集型语言任务的统一生成式检索器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14856.pdf) *陈江贵等* SIGIR 2023。[[代码](https:\u002F\u002Fgithub.com\u002Fict-bigdatalab\u002FUGR)] (**UGR**)\n- [TOME：一种基于模型的两阶段检索方法。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11161.pdf) *任瑞阳等* ACL 2023。（**TOME：先生成段落，再生成URL**）\n- [生成式检索如何扩展到数百万个段落？](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11841.pdf) *罗纳克·普拉迪普、凯辉等* Arxiv 2023。（**对所提出方法的全面研究，使用合成查询作为文档ID**）\n- [受学习策略启发的语义增强可微分搜索索引。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.15115.pdf) *唐宇宝等* KDD 2023。（**语义增强版DSI**）\n\n\n\n## LLM与IR\n\n### 观点或综述\n- [信息检索遇上大语言模型：来自中国IR界的策略报告。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09751.pdf) *青瑶AI等* CCIR社区。AI Open 2023。\n- [面向信息检索的大语言模型：综述。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2308.07107.pdf) *朱宇涛等* 中国人民大学。Arxiv 2023。\n- [借助AI副驾驶应对复杂的搜索任务。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11401.pdf) *赖恩·W·怀特* 微软研究院。Arxiv 2023。\n\n\n### 检索增强LLM\n- [用于知识密集型NLP任务的检索增强生成。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11401.pdf) *帕特里克·刘易斯、伊森·佩雷斯等* NIPS 2020。（**RAG，针对4.4亿参数的BART**）\n- [通过从数万亿个标记中检索来改进语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.04426.pdf) *塞巴斯蒂安·博尔戈、阿瑟·门施、乔丹·霍夫曼等* ICML 2022。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FFiD)]（***RETRO，编码解码架构，75亿参数**）\n- [Atlas：利用检索增强语言模型进行少样本学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.03299.pdf) *高蒂埃·伊扎卡尔、帕特里克·刘易斯等* Arxiv 2022。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fatlas)] (**Atlas，T5，110亿参数**)\n- [通过少样本提示对开放域问答进行互联网增强的语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.05115.pdf) *安杰莉基·拉扎里杜等* Arxiv 2022。（**Gopher 280B，以谷歌搜索结果为条件**）\n- [通过迭代的检索-生成协同作用增强检索增强大型语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.15294.pdf) *邵志宏等* Arxiv 2023。\n- [检索增强预训练后的指令调优。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07713.pdf) *王博欣等* Arxiv 2023。\n- [检索任何内容以增强大型语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07554.pdf)\n\n### LLM用于IR\n\n\u003C!-- #### 综述或观点\n\n- []\n- [大型搜索模型：在LLM时代重新定义搜索栈。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14587.pdf) *王亮、杨楠等* -->\n\n#### 合成查询生成\n- [通过零样本问题生成改进段落检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.07496.pdf) *Devendra Singh Sachan 等人* EMNLP 2022。[[代码](https:\u002F\u002Fgithub.com\u002FDevSinghSachan\u002Funsupervised-passage-reranking)](**UPR，基于GPT-neo 2.7B\u002FT0 3B、11B的查询似然对文档进行重排序**)\n- [Promptagator：仅用8个示例实现少样本密集检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.11755.pdf) *Zhuyun Dai 等人* ICLR 2023。(**使用上下文学习和FLAN 137B生成伪查询**)\n- [UDAPDR：通过LLM提示和重排序器蒸馏实现无监督领域适应。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.00807.pdf) *Jon Saad-Falcon、Omar Khattab 等人* Arxiv 2023。[[代码](https:\u002F\u002Fgithub.com\u002Fprimeqa\u002Fprimeqa)](**使用GPT3生成的伪查询训练重排序器**)\n- [InPars：利用大型语言模型进行信息检索的数据增强。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2202.05144.pdf) *Luiz Bonifacio 等人* Arxiv 2022。[[代码](https:\u002F\u002Fgithub.com\u002Fzetaalphavector\u002FInPars\u002Ftree\u002Fmaster\u002Flegacy\u002Finpars-v1)](**使用GPT-3 Curie通过上下文学习生成伪查询，并根据查询生成概率选择前k个q-d对**)\n- [InPars-v2：大型语言模型作为信息检索的高效数据生成器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.01820.pdf) *Vitor Jeronymo 等人* Arxiv 2023。[[代码](https:\u002F\u002Fgithub.com\u002Fzetaalphavector\u002FinPars\u002Ftree\u002Fmaster\u002Flegacy\u002Finpars-v2)](**与InPars类似，使用GPT-J 6B LLM及微调后的重排序器作为选择器**)\n- [InPars-Light：经济高效的高效排序器无监督训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.02998.pdf) *Leonid Boytsov 等人* Arxiv 2023。(**与InPars相似，使用GPT-J 6B和BLOOM 7B**)\n- [基于大型语言模型的生成式相关性反馈。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.13157.pdf) *Iain Mackie 等人* SIGIR 2023简报。(**GRF，使用GPT3生成多种信息用于相关性反馈**)\n- [通过提示大型语言模型进行查询扩展。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.03653.pdf) *Rolf Jagerman 等人* Arxiv 2023。\n- [探索合成查询生成在相关性预测中的可行性。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.11944.pdf) *Aditi Chaudhary 等人* Arxiv 2023。(**使用FLAN-137B进行标签条件生成**)\n- [基于大型语言模型的淘宝搜索长尾查询改写。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.03758.pdf) *Wenjun Peng 等人* Arxiv 2023。\n- [生成、过滤与融合：面向零样本神经排序器的多步关键词生成查询扩展。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.09175.pdf) *Minghan Li 等人* Arxiv 2023。(**使用Flan-PaLM2-S生成关键词**)\n- [利用LLM在多语言密集检索中跨多种语言合成训练数据。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.05800.pdf) *Nandan Thakur 等人* Arxiv 2023。\n\n#### 合成文档生成\n- [与其检索，不如生成：大型语言模型是强大的上下文生成器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.10063.pdf) *Wenhao Yu 等人* ICLR 2023。[[代码](https:\u002F\u002Fgithub.com\u002Fwyu97\u002FGenRead)] (**GenRead，使用InstructGPT生成读者所需的伪文档**)\n- [背诵增强型语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.01296.pdf) *Zhiqing Sun 等人* ICLR 2023。[[代码](https:\u002F\u002Fgithub.com\u002FEdward-Sun\u002FRECITE)] (**与GenRead类似**)\n- [无需相关性标签的精准零样本密集检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.10496.pdf) *Luyu Gao、Xueguang Ma 等人* Arxiv 2022。[[代码](https:\u002F\u002Fgithub.com\u002Ftexttron\u002Fhyde)] (**HyDE，InstructGPT生成伪文档，Contriever检索真实文档**)\n- [Query2doc：利用大型语言模型进行查询扩展。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.07678.pdf) *Liang Wang 等人* Arxiv 2023。(**使用上下文学习生成伪文档，然后与查询拼接，采用text-davinci-003**)\n- [大型语言模型是强大的零样本检索器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.14233.pdf) *Tao Shen 等人* Arxiv 2023。(**与Hyde类似，使用BM25将检索到的文档补充到LLM中**)\n- [为交叉编码器重排序器生成合成文档：ChatGPT与人类专家的比较研究。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02320.pdf) *Arian Askari 等人* Arxiv 2023。[[代码](https:\u002F\u002Fgithub.com\u002Farian-askari\u002FChatGPT-RetrievalQA)] (**使用ChatGPT生成的合成数据进行排序**)\n\n#### 用于相关性评分的大语言模型\n- [基于指令的任务感知检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.09260.pdf) *浅井明里、蒂莫·希克等* Arxiv 2022年。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ftart)] (**TART，包含40个带指令的任务数据集，15亿参数的FLAN-T5**)\n- [一个嵌入器，任意任务：指令微调的文本嵌入。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.09741.pdf) *苏洪进、史伟嘉等* [[代码](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding)](**Intructor，330种多样化任务，15亿参数模型**)\n- [ExaRanker：基于解释增强的神经排序器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.10521.pdf) *费尔南多·费拉雷托等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Funicamp-dl\u002FExaRanker)] (**使用monoT5同时训练相关性分数和由GPT-3.5（text-davinci-002）生成的解释**)\n- [关于大语言模型在相关性判断中的视角。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09161.pdf) *古列尔莫·法吉奥利等* Arxiv 2023年。(**观点论文**)\n- [利用大语言模型进行零样本列表式文档重排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.02156.pdf) *马学光等* Arxiv 2023年。(**LRL，使用GPT3生成排序列表**)\n- [大语言模型本身就是自回归搜索引擎。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.09612.pdf) *诺亚·齐姆斯等* Arxiv 2023年。(**LLM-URL，使用GPT-3 text-davinci-003生成URL，基于模型的信息检索**)\n- [ChatGPT擅长搜索吗？探究大语言模型作为重排序代理。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09542.pdf) *孙伟伟等* EMNLP主会2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fsunnweiwei\u002FRankGPT)](**使用ChatGPT\u002FGPT4进行零样本段落重排序**)\n- [通过成对排序提示，大语言模型可有效进行文本排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.17563.pdf) *秦振等* Arxiv 2023年。\n- [RankVicuna：利用开源大语言模型进行零样本列表式文档重排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.15088.pdf) *罗纳克·普拉迪普等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Frank_llm)]\n- [中间发现：排列自一致性提升大语言模型中的列表式排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07712.pdf) *拉斐尔·唐、张欣宇等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fperm-sc)]\n- [为多阶段文本检索微调LLaMA。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.08319.pdf) *马学光等* Arxiv 2023年。\n- [一种集合式方法，用于高效且有效的零样本排序大语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.09497.pdf) *庄圣尧等* Arxiv 2023年。\n- [开源大语言模型是强大的零样本查询似然模型，可用于文档排序。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.13243.pdf) *庄圣尧等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fielab\u002Fllm-qlm)]\n- [PaRaDe：利用大语言模型进行演示的段落排名。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14408.pdf) *安德鲁·德罗佐夫等* Arxiv 2023年。\n- [超越“是”与“否”：通过细粒度的相关性标签评分改进零样本大语言模型排序器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14122.pdf) *庄宏磊等* Arxiv 2023年。\n- [大语言模型可以准确预测搜索者的偏好。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2309.10621.pdf) *保罗·托马斯等* Arxiv 2023年。\n- [RankZephyr：高效稳健的零样本列表式重排序轻而易举！](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.02724.pdf) *罗纳克·普拉迪普等* Arxiv 2023年。\n- [无需GPT的排序：基于开源大语言模型构建独立于GPT的列表式重排序器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.02969.pdf) *张欣宇等* Arxiv 2023年。\n\n\n#### 用于生成式检索的大语言模型\n- [ACID：基于内容的抽象ID，用于语言模型驱动的文档检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.08593.pdf) *李浩鑫等* Arxiv 2023年。(**使用GPT-3.5生成关键词**)\n\n\n#### 检索增强型文本生成\n- [WebGPT：浏览器辅助的人工反馈问答系统。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.09332.pdf) *中野玲一郎、雅各布·希尔顿、苏奇尔·巴拉吉等* Arxiv 2022年。(**WebGPT，GPT3**)\n- [教导语言模型用经过验证的引文支持答案。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2203.11147.pdf) *DeepMind* Arxiv 2022年。\n- [评估生成式搜索引擎中的可验证性。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.09848.pdf) *尼尔森·F·刘等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fnelson-liu\u002Fevaluating-verifiability-in-generative-search-engines)]\n- [使大语言模型能够生成带有引用的文本。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.14627.pdf) *高天宇等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fprinceton-nlp\u002FALCE)] (**ALCE基准测试**)\n- [FreshLLMs：通过搜索引擎增强刷新大语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.03214.pdf) *涂武等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Ffreshllms\u002Ffreshqa)]\n- [检索任何内容以增强大语言模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.07554.pdf) *张培田、肖世涛等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002FFlagOpen\u002FFlagEmbedding)]\n- [利用事件模式提出澄清问题，用于对话式法律案件检索。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3583780.3614953) *刘步楼等* CIKM 2023年。\n- [知道该去哪里：让大语言模型成为相关、负责且值得信赖的搜索引擎。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12443.pdf) *石翔等*。\n- [评估生成式临时信息检索。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.04694.pdf) *卢卡斯·吉纳普等* Arxiv 2023年。\n\n#### 其他\n- [演示–搜索–预测：将检索与语言模型结合用于知识密集型自然语言处理。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.14024.pdf) *奥马尔·哈塔卜等* Arxiv 2023年。[[代码](https:\u002F\u002Fgithub.com\u002Fstanfordnlp\u002Fdsp)](**DSP程序，GPT3.5**)\n\n\n\n\n\n\n## 多模态检索\n\n### 统一单流架构\n- [Unicoder-VL：通过跨模态预训练实现视觉与语言的通用编码器。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.06066.pdf) *李根、段楠等* AAAI 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUnicoder)] (**Unicoder-VL**)\n- [XGPT：用于图像字幕生成的跨模态生成式预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.01473.pdf) *夏乔林、黄浩洋、段楠等* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FUnicoder)] (**XGPT**)\n- [UNITER：通用图像-文本表示学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.11740.pdf) *陈彦淳、李林杰等* ECCV 2020。[[代码](https:\u002F\u002Fgithub.com\u002FChenRocks\u002FUNITER)] (**UNITER**)\n- [Oscar：面向视觉-语言任务的物体语义对齐预训练。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.06165.pdf) *李秀军、尹曦等* ECCV 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOscar)] (**Oscar**)\n- [VinVL：在视觉-语言模型中强化视觉表示的重要性。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.00529.pdf) *张鹏川、李秀军等* ECCV 2020。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FOscar)] (**VinVL**)\n- [用于图文检索的动态模态交互建模。](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fpdf\u002F10.1145\u002F3404835.3462829) *Qu Leigang等* SIGIR 2021 **最佳学生论文**。[[代码](https:\u002F\u002Fsigir21.wixsite.com\u002Fdime)] (**DIME**)\n\n### 应用于输入的多流架构\n- [ViLBERT：面向视觉-语言任务的预训练无关任务视觉语言表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.02265.pdf) *陆嘉森、Dhruv Batra等* NeurIPS 2019。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvilbert-multi-task)] (**VilBERT**)\n- [12-in-1：多任务视觉与语言表示学习。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.02315.pdf) *陆嘉森、Dhruv Batra等* CVPR 2020。[[代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fvilbert-multi-task)] (**基于VilBERT的多任务模型**)\n- [从自然语言监督中学习可迁移的视觉模型。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.00020.pdf) *Alec Radford等* CVPR 2020。[[代码](https:\u002F\u002Fgithub.com\u002FOpenAI\u002FCLIP)] (**CLIP，GPT团队**)\n- [ERNIE-ViL：通过场景图增强知识的视觉-语言表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.16934.pdf) *于飞、唐继吉等* Arxiv 2020。[[代码](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FERNIE\u002Ftree\u002Frepro\u002Fernie-vil)] (**ERNIE-ViL，VCR排行榜第一名**)\n- [M6-v0：面向多模态预训练的视觉-语言交互。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.13198.pdf) *林俊阳、杨安等* KDD 2020。(**M6-v0\u002FInterBERT**)\n- [M3P：通过多任务、多语言、多模态预训练学习通用表示。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.02635.pdf) *黄浩洋、苏琳等* CVPR 2021。[[代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FM3P)] (**M3P，MILD数据集**)\n\n\n## 其他资源\n\n### 一些检索工具包\n- [Faiss：用于高效相似性搜索和稠密向量聚类的库](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Ffaiss)\n- [Pyserini：支持稀疏和稠密表示的Python工具包](https:\u002F\u002Fgithub.com\u002Fcastorini\u002Fpyserini\u002F)\n- [MatchZoo：包含多种流行神经网络文本匹配模型的库](https:\u002F\u002Fgithub.com\u002FNTMC-Community\u002FMatchZoo)\n\n### 关于NLP预训练模型的其他资源\n- [自然语言处理中的预训练模型：综述。](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08271) *邱锡鹏等*\n- [BERT相关论文](https:\u002F\u002Fgithub.com\u002Ftomohideshibata\u002FBERT-related-papers)\n- [清华大学NLP组的预训练语言模型论文](https:\u002F\u002Fgithub.com\u002Fthunlp\u002FPLMpapers)\n\n### 关于高效Transformer的综述\n- [高效Transformer：综述。](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.06732.pdf) *Yi Tay、Mostafa Dehghani等* Arxiv 2020。","# awesome-pretrained-models-for-information-retrieval 快速上手指南\n\n本项目并非一个单一的 Python 包，而是一个**精选列表（Awesome List）**，收录了信息检索（IR）领域预训练模型相关的论文、代码库和资源。本指南将指导你如何利用该列表找到合适的模型，并以列表中热门的 **DPR (Dense Passage Retrieval)** 和 **SPLADE** 为例，演示如何快速搭建环境并运行代码。\n\n## 环境准备\n\n在开始之前，请确保你的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+), macOS, 或 Windows (WSL2 推荐)\n*   **Python**: 3.8 或更高版本\n*   **硬件**: 建议配备 NVIDIA GPU (显存 8GB+) 以进行模型微调或推理；仅使用预训练权重进行轻量级推理可在 CPU 上运行。\n*   **前置依赖**:\n    *   `git`: 用于克隆代码库\n    *   `pip` 或 `conda`: 包管理工具\n    *   `CUDA` & `cuDNN`: 如需使用 GPU 加速，请预先安装与 PyTorch 版本匹配的驱动。\n\n## 安装步骤\n\n由于本项目是资源列表，你需要根据需求选择具体的模型仓库进行安装。以下以两个代表性项目为例：\n\n### 1. 获取资源列表\n首先克隆本仓库以浏览所有可用资源：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FFlagOpen\u002Fawesome-pretrained-models-for-information-retrieval.git\ncd awesome-pretrained-models-for-information-retrieval\n```\n*(注：如果访问 GitHub 较慢，可使用国内镜像源如 `https:\u002F\u002Fgitee.com\u002Fmirrors\u002Fawesome-pretrained-models-for-information-retrieval` 若存在，或直接访问原库)*\n\n### 2. 安装具体模型示例：DPR (Facebook Research)\nDPR 是稠密检索的基准模型。\n\n```bash\n# 克隆 DPR 官方代码库\ngit clone https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FDPR.git\ncd DPR\n\n# 创建虚拟环境 (推荐)\npython -m venv dpr_env\nsource dpr_env\u002Fbin\u002Factivate  # Windows 用户请使用: dpr_env\\Scripts\\activate\n\n# 安装依赖 (使用国内 pip 源加速)\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n\n# 安装 DPR 包本身\npip install -e . -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n### 3. 安装具体模型示例：SPLADE (Naver)\nSPLADE 是稀疏检索的代表模型。\n\n```bash\n# 克隆 SPLADE 代码库\ngit clone https:\u002F\u002Fgithub.com\u002Fnaver\u002Fsplade.git\ncd splade\n\n# 创建并激活环境\nconda create -n splade python=3.8 -y\nconda activate splade\n\n# 安装依赖 (使用国内源)\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n## 基本使用\n\n以下展示如何加载预训练模型并进行简单的检索测试。\n\n### 示例 1：使用 DPR 进行编码 (Dense Retrieval)\n\n此示例演示如何加载预训练的查询编码器（Query Encoder）并将文本转换为向量。\n\n```python\nimport torch\nfrom dpr.models import init_encoder_components\nfrom dpr.options import add_encoder_params, setup_args_gpu, print_args\nfrom dpr.utils.model_utils import setup_for_distributed_mode, get_model_obj\n\n# 1. 初始化参数 (模拟命令行参数)\n# 这里以 bi-encoder 为例，使用 facebook-dpr-question_encoder-multiset-base\nclass Args:\n    encoder_type = \"hf_bert\"\n    pretrained_file = \"facebook\u002Fdpr-question_encoder-multiset-base\"\n    sequence_length = 256\n    dropout = 0.1\n    fp16 = False\n    local_rank = -1\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\nargs = Args()\n\n# 2. 初始化模型组件\nencoder, tokenizer = init_encoder_components(args.encoder_type, args)\n\n# 3. 准备输入数据\nquestions = [\"What is the capital of France?\", \"How does photosynthesis work?\"]\ntokenized_data = tokenizer(questions, return_tensors=\"pt\", padding=True, truncation=True)\n\n# 4. 生成嵌入向量\nwith torch.no_grad():\n    outputs = encoder(tokenized_data[\"input_ids\"], tokenized_data[\"attention_mask\"])\n    embeddings = outputs.pooler_output  # 获取 [CLS] 向量作为句子表示\n\nprint(f\"Input questions: {questions}\")\nprint(f\"Embedding shape: {embeddings.shape}\")\n# 输出示例: Embedding shape: torch.Size([2, 768])\n```\n\n### 示例 2：使用 SPLADE 进行加权 (Sparse Retrieval)\n\nSPLADE 通过 MLM 头预测词的重要性权重。以下是基于 Hugging Face `transformers` 的简化调用逻辑（需先安装 `transformers`）。\n\n```python\nfrom transformers import AutoModelForMaskedLM, AutoTokenizer\nimport torch\n\n# 1. 加载预训练模型 (以 SPLADE v2 为例)\nmodel_name = \"naver\u002Fsplade-cocondenser-ensembledistil\"\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nmodel = AutoModelForMaskedLM.from_pretrained(model_name)\n\n# 2. 准备文档或查询\ntext = \"The quick brown fox jumps over the lazy dog.\"\ninputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n\n# 3. 前向传播获取词权重\nwith torch.no_grad():\n    outputs = model(**inputs)\n    logits = outputs.logits\n    \n    # SPLADE 核心操作: ReLU + Log(1+x) 激活\n    weights = torch.relu(logits).log1p()\n    \n    # 获取每个 token 的最大权重 (max pooling over vocab dimension for each position)\n    # 注意：实际 SPLADE 实现通常会对同一词的不同子词进行聚合\n    aggregated_weights = torch.max(weights, dim=1).values \n\n# 4. 映射回词汇表查看高权重词\nvocab = tokenizer.get_vocab()\nids = inputs[\"input_ids\"][0]\nfor i, weight in enumerate(aggregated_weights[0]):\n    if weight > 0.5: # 过滤低权重词\n        token = tokenizer.convert_ids_to_tokens([ids[i]])[0]\n        print(f\"Token: {token}, Weight: {weight.item():.4f}\")\n```\n\n### 下一步建议\n浏览本项目根目录下的 `README.md`，根据 **\"First-stage Retrieval\"** (第一阶段检索) 或 **\"Re-ranking Stage\"** (重排序阶段) 分类，查找更多特定任务的论文链接和对应的 GitHub 代码库地址，按照各仓库的具体说明进行深度使用。","某电商公司的搜索算法团队正致力于优化内部商品检索系统，试图将传统的关键词匹配升级为基于语义的深度学习检索架构。\n\n### 没有 awesome-pretrained-models-for-information-retrieval 时\n- **文献调研效率低下**：团队成员需在 arXiv、Google Scholar 等平台手动海量筛选论文，耗时数周仍难以穷尽“稠密检索”或“长文档处理”领域的最新进展。\n- **技术选型盲目**：面对稀疏检索、混合检索及重排序等多种技术路线，缺乏系统的分类指引，导致难以判断哪种预训练模型最适合当前的业务场景。\n- **复现成本高昂**：由于找不到经过验证的权威论文列表，团队常误选未成熟或已废弃的模型，造成大量算力浪费在无效的代码复现与调试上。\n- **前沿趋势脱节**：容易忽略如\"LLM 赋能信息检索”或“对抗攻击防御”等新兴交叉方向，致使系统架构在设计之初就缺乏前瞻性。\n\n### 使用 awesome-pretrained-models-for-information-retrieval 后\n- **调研路径清晰化**：直接利用其结构化的论文清单，按“第一阶段检索”或“重排序阶段”快速定位到神经项重加权、硬负样本采样等关键技术的顶会论文。\n- **决策依据科学化**：参考列表中关于领域自适应和知识蒸馏的分类综述，迅速锁定适合电商垂直领域的预训练模型，大幅缩短技术验证周期。\n- **落地成功率提升**：依托 curated list 中收录的高质量成果，团队直接复现成熟的稠密检索方案，避免了踩坑，将模型上线时间从数月压缩至数周。\n- **架构演进前瞻化**：通过关注\"LLM for IR\"和“多模态检索”板块，及时引入生成式查询扩展等新技术，显著提升了系统对模糊查询的语义理解能力。\n\nawesome-pretrained-models-for-information-retrieval 将分散的学术成果转化为系统的工程指南，帮助开发者在信息检索的深海中精准导航，实现从理论到落地的高效跨越。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fict-bigdatalab_awesome-pretrained-models-for-information-retrieval_e5e7d563.png","ict-bigdatalab","ICT-BigData","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fict-bigdatalab_306f7501.png","CAS Key Lab of Network Data Science and Technology, Institute of Computing Technology, Chinese Academy of Sciences",null,"ict.bigdata@gmail.com","http:\u002F\u002Fwww.bigdatalab.ac.cn\u002F","https:\u002F\u002Fgithub.com\u002Fict-bigdatalab",676,49,"2026-04-07T18:26:57",1,"","未说明",{"notes":88,"python":86,"dependencies":89},"该仓库是一个信息检索（IR）预训练模型相关论文的资源列表（Awesome List），并非一个可直接运行的单一软件工具或代码库。因此，README 中未包含具体的操作系统、硬件配置、Python 版本或依赖库安装要求。所列出的每个模型（如 DPR, ColBERT, SPLADE 等）都有独立的代码仓库和特定的环境需求，用户需参考各模型对应的链接获取具体运行指南。",[],[35,91,14],"其他",[93,94,95,96,97,98,99,100],"pretraining-for-ir","dense-retrieval","bert-for-ir","information-retrieval","pretrain-for-search","reranking","web-search","pretrained-language-models","2026-03-27T02:49:30.150509","2026-04-11T17:43:02.444979",[],[105],{"id":106,"version":107,"summary_zh":77,"released_at":108},206288,"v1.0","2023-05-17T03:47:56"]