[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-EdinburghNLP--awesome-hallucination-detection":3,"tool-EdinburghNLP--awesome-hallucination-detection":65},[4,17,27,35,48,57],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",150037,2,"2026-04-10T23:33:47",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,3,"2026-04-06T11:19:32",[15,26,14,13],"图像",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":10,"last_commit_at":33,"category_tags":34,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85092,"2026-04-10T11:13:16",[26,43,44,45,14,46,15,13,47],"数据工具","视频","插件","其他","音频",{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":54,"last_commit_at":55,"category_tags":56,"status":16},5784,"funNLP","fighting41love\u002FfunNLP","funNLP 是一个专为中文自然语言处理（NLP）打造的超级资源库，被誉为\"NLP 民工的乐园”。它并非单一的软件工具，而是一个汇集了海量开源项目、数据集、预训练模型和实用代码的综合性平台。\n\n面对中文 NLP 领域资源分散、入门门槛高以及特定场景数据匮乏的痛点，funNLP 提供了“一站式”解决方案。这里不仅涵盖了分词、命名实体识别、情感分析、文本摘要等基础任务的标准工具，还独特地收录了丰富的垂直领域资源，如法律、医疗、金融行业的专用词库与数据集，甚至包含古诗词生成、歌词创作等趣味应用。其核心亮点在于极高的全面性与实用性，从基础的字典词典到前沿的 BERT、GPT-2 模型代码，再到高质量的标注数据和竞赛方案，应有尽有。\n\n无论是刚刚踏入 NLP 领域的学生、需要快速验证想法的算法工程师，还是从事人工智能研究的学者，都能在这里找到急需的“武器弹药”。对于开发者而言，它能大幅减少寻找数据和复现模型的时间；对于研究者，它提供了丰富的基准测试资源和前沿技术参考。funNLP 以开放共享的精神，极大地降低了中文自然语言处理的开发与研究成本，是中文 AI 社区不可或缺的宝藏仓库。",79857,1,"2026-04-08T20:11:31",[15,43,46],{"id":58,"name":59,"github_repo":60,"description_zh":61,"stars":62,"difficulty_score":54,"last_commit_at":63,"category_tags":64,"status":16},6590,"gpt4all","nomic-ai\u002Fgpt4all","GPT4All 是一款让普通电脑也能轻松运行大型语言模型（LLM）的开源工具。它的核心目标是打破算力壁垒，让用户无需依赖昂贵的显卡（GPU）或云端 API，即可在普通的笔记本电脑和台式机上私密、离线地部署和使用大模型。\n\n对于担心数据隐私、希望完全掌控本地数据的企业用户、研究人员以及技术爱好者来说，GPT4All 提供了理想的解决方案。它解决了传统大模型必须联网调用或需要高端硬件才能运行的痛点，让日常设备也能成为强大的 AI 助手。无论是希望构建本地知识库的开发者，还是单纯想体验私有化 AI 聊天的普通用户，都能从中受益。\n\n技术上，GPT4All 基于高效的 `llama.cpp` 后端，支持多种主流模型架构（包括最新的 DeepSeek R1 蒸馏模型），并采用 GGUF 格式优化推理速度。它不仅提供界面友好的桌面客户端，支持 Windows、macOS 和 Linux 等多平台一键安装，还为开发者提供了便捷的 Python 库，可轻松集成到 LangChain 等生态中。通过简单的下载和配置，用户即可立即开始探索本地大模型的无限可能。",77307,"2026-04-11T06:52:37",[15,13],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":71,"readme_en":72,"readme_zh":73,"quickstart_zh":74,"use_case_zh":75,"hero_image_url":76,"owner_login":77,"owner_name":78,"owner_avatar_url":79,"owner_bio":80,"owner_company":81,"owner_location":81,"owner_email":81,"owner_twitter":81,"owner_website":82,"owner_url":83,"languages":81,"stars":84,"forks":85,"last_commit_at":86,"license":87,"difficulty_score":54,"env_os":88,"env_gpu":89,"env_ram":89,"env_deps":90,"category_tags":93,"github_topics":94,"view_count":98,"oss_zip_url":81,"oss_zip_packed_at":81,"status":16,"created_at":99,"updated_at":100,"faqs":101,"releases":126},931,"EdinburghNLP\u002Fawesome-hallucination-detection","awesome-hallucination-detection","List of papers on hallucination detection in LLMs.","awesome-hallucination-detection 是爱丁堡大学 NLP 团队维护的一份精选论文清单，专注于大语言模型（LLM）和多模态大语言模型（MLLM）的幻觉检测研究。这份资源系统整理了 139 篇前沿论文，每篇都附有关键技术要点、评测指标和数据集的精炼总结，帮助读者快速把握领域全貌。\n\n幻觉问题——即 AI 生成看似合理但实际错误的内容——是当前大模型落地的核心瓶颈之一。awesome-hallucination-detection 通过分类梳理检测方法与缓解策略，为研究者节省了大量文献调研时间。收录的技术路线涵盖训练后干预（如 VISTA 通过激活空间操控强化视觉信息）、解码时修正（如 MemVR 模拟人类\"再看一眼\"的认知机制）、以及强化学习对齐等多元方案，并涉及纯文本、视觉-语言等多种模态场景。\n\n这份资源尤其适合从事大模型安全研究、幻觉评估工具开发，或需要为特定应用场景选择检测方案的算法工程师与科研人员。项目采用 Apache 2.0 协议开源，持续接受社区投稿维护，保持了较高的时效性和学术严谨性。对于希望系统了解幻觉检测从基础理论到最新进展（如 2025 ","awesome-hallucination-detection 是爱丁堡大学 NLP 团队维护的一份精选论文清单，专注于大语言模型（LLM）和多模态大语言模型（MLLM）的幻觉检测研究。这份资源系统整理了 139 篇前沿论文，每篇都附有关键技术要点、评测指标和数据集的精炼总结，帮助读者快速把握领域全貌。\n\n幻觉问题——即 AI 生成看似合理但实际错误的内容——是当前大模型落地的核心瓶颈之一。awesome-hallucination-detection 通过分类梳理检测方法与缓解策略，为研究者节省了大量文献调研时间。收录的技术路线涵盖训练后干预（如 VISTA 通过激活空间操控强化视觉信息）、解码时修正（如 MemVR 模拟人类\"再看一眼\"的认知机制）、以及强化学习对齐等多元方案，并涉及纯文本、视觉-语言等多种模态场景。\n\n这份资源尤其适合从事大模型安全研究、幻觉评估工具开发，或需要为特定应用场景选择检测方案的算法工程师与科研人员。项目采用 Apache 2.0 协议开源，持续接受社区投稿维护，保持了较高的时效性和学术严谨性。对于希望系统了解幻觉检测从基础理论到最新进展（如 2025 年 ICML 收录的多篇工作）的技术人员，这是一份不可多得的入门与参考手册。","# awesome-hallucination-detection\n\n[![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection) [![License: Apache 2.0](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache_2.0-blue.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FApache-2.0) [![PRs Welcome](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPRs-welcome-brightgreen.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fpulls) [![Papers](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPapers-139-blue.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection) [![Maintained](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMaintained-yes-green.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection)\n\n## Papers and Summaries\n\n### [The Hidden Life of Tokens: Reducing Hallucination of Large Vision-Language Models via Visual Information Steering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03628)\n- **Metrics:** CHAIRs, CHAIRi, POPE Accuracy\u002FF1, MMHal-Bench GPT-4 score, MME score\n- **Datasets:** MSCOCO 2014 (CHAIR, 500 images), POPE (COCO subset), MMHal-Bench (96 image-question pairs), MME\n- **Comments:** Introduces **VISTA**, a training-free inference-time framework that combats hallucination in LVLMs by steering visual information in the activation space. Reveals three phenomena during LVLM generation: gradual visual information loss, early excitation of semantically meaningful tokens, and hidden genuine information in vocabulary rankings. Reinforces visual grounding at inference time by leveraging early-layer logits, reducing hallucinations by ~40% across four architectures (LLaVA-1.5, MiniGPT-4, Shikra, InstructBLIP) under three decoding strategies. (ICML 2025)\n\n### [Look Twice Before You Answer: Memory-Space Visual Retracing for Hallucination Mitigation in Multimodal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.03577)\n- **Metrics:** CHAIRs, CHAIRi, POPE Accuracy\u002FF1, HallusionBench (fACC, qACC, aACC), MME, MMBench, MM-Vet, LLaVA-Bench, VizWiz-VQA\n- **Datasets:** POPE (MSCOCO, A-OKVQA, GQA), CHAIR, HallusionBench, MME, MMBench, MM-Vet, LLaVA-Bench, VizWiz-VQA\n- **Comments:** Proposes **MemVR**, a training-free decoding approach inspired by human cognition: when the model exhibits uncertainty during generation, visual tokens are reinjected as key-value memory through the Feed Forward Network, analogous to \"looking twice\" at an image when memory fades. Significantly reduces hallucinations while preserving general capabilities across eight benchmarks and multiple MLLM architectures (LLaVA-1.5, Qwen-VL, GLM4V). (ICML 2025)\n\n### [Robust Multimodal Large Language Models Against Modality Conflict](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.07151)\n- **Metrics:** ROUGE-L F1, Hallucination Rate, LLM-Judge Score (0–4)\n- **Datasets:** MMMC (Multimodal Modality Conflict, 20K image-question-answer triples from Visual Genome), HallusionBench, MMBench, MMStar, MMMU, MathVista, OCRBench, AI2D, MMVet, MME\n- **Comments:** Formally defines **modality conflict**—where contradictions between visual and textual inputs trap MLLMs in a dilemma—as a primary driver of hallucinations. Introduces the **MMMC** dataset with object, attribute, and relationship conflicts, and evaluates three mitigation strategies: prompt engineering, supervised fine-tuning, and reinforcement learning. Finds that RL provides the most robust defense, training the model to prioritize visual evidence over misleading textual cues. (ICML 2025)\n\n### [GLSim: Detecting Object Hallucinations in LVLMs via Global-Local Similarity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.19972)\n- **Metrics:** AUROC, AUPR\n- **Datasets:** MSCOCO (5K validation images, 80 object classes), Objects365 (5K validation images, 365 object classes)\n- **Comments:** Proposes **GLSim**, a training-free hallucination detection framework that combines complementary global and local embedding similarity signals between image and text modalities. By extracting continuous hallucination likelihood scores from intermediate-layer embeddings, GLSim captures both contextual and fine-grained perspectives. Outperforms competitive baselines across multiple LVLMs (LLaVA-1.5, MiniGPT-4, Shikra, InstructBLIP, Qwen2.5-VL) without requiring external supervision or judge models. (NeurIPS 2025)\n\n### [Intervene-All-Paths: Unified Mitigation of LVLM Hallucinations across Alignment Formats](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.17254)\n- **Metrics:** POPE Accuracy\u002FF1, MCQ-POPE Accuracy\u002FMacro-F1, CHAIRs, CHAIRi, MME (Existence, Count, Position, Color)\n- **Datasets:** POPE (COCO, A-OKVQA, GQA), MCQ-POPE, CHAIR (COCO), MME\n- **Comments:** Demonstrates that LVLM hallucinations stem from three interacting causal pathways: image-to-input-text, image-to-output-text, and text-to-text. Proposes a training-free, head-level intervention framework that identifies critical **hallucination heads** across all pathways and applies targeted corrections customized for different question-answer formats (yes\u002Fno, multiple-choice, open-ended). Achieves consistent hallucination reduction across diverse alignment types. (NeurIPS 2025)\n\n### [One SPACE to Rule Them All: Jointly Mitigating Factuality and Faithfulness Hallucinations in LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.11088)\n- **Metrics:** TruthfulQA (MC1, MC2, Truthfulness, Informativeness, True*Info), PDTB DISQ Score (Targeted, Counterfactual, Consistency, Overall)\n- **Datasets:** TruthfulQA, PDTB (Penn Discourse TreeBank)\n- **Comments:** Exposes a critical zero-sum dynamic: interventions targeting factuality often degrade faithfulness, and vice versa. **SPACE** resolves this by showing that both hallucination types share overlapping subspaces within neural representations. Uses dual-task feature modeling, spectral clustering, and attention head saliency scoring to identify and edit shared activation subspaces, enabling concurrent mitigation of both hallucination types without sacrificing instruction-following capability. (NeurIPS 2025)\n\n### [Reasoning Models Hallucinate More: Factuality-Aware Reinforcement Learning for Large Reasoning Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.24630)\n- **Metrics:** Pass@1 Accuracy (reasoning), Truthfulness ratio (TruthfulQA), Accuracy (HaluEval-QA), Truthfulness ratio (HalluQA)\n- **Datasets:** TruthfulQA (817 samples), HaluEval-QA (10K samples), HalluQA (450 samples), GSM8K (1,319 samples), MATH-500, AIME 2024, AIME 2025\n- **Comments:** Reveals that reasoning models (using CoT) actually hallucinate **more** than base models on complex factual questions, as extended generation provides more surface area for factuality drift. Introduces **FSPO** (Factuality-aware Step-wise Policy Optimization), an RL fine-tuning algorithm that incorporates explicit factuality verification at each reasoning step, dynamically adjusting token-level advantage values to maintain factual correctness throughout the reasoning trace. (NeurIPS 2025)\n\n### [The Illusion of Progress: Re-evaluating Hallucination Detection in LLMs](https:\u002F\u002Faclanthology.org\u002F2025.emnlp-main.1761\u002F)\n- **Metrics:** AUROC, PR-AUC, Precision, Recall, F1, ROUGE-L F1\n- **Datasets:** NQ-Open (3,610 QA pairs), TriviaQA (3,842 examples), SQuADv2 (4,150 examples)\n- **Comments:** Demonstrates that ROUGE-based evaluation systematically overestimates hallucination detection performance in QA. Through comprehensive human studies, shows that ROUGE exhibits alarmingly low precision, and several established detection methods (Perplexity, EigenScore, eRank) suffer performance drops of up to **45.9% AUROC** when evaluated with human-aligned LLM-as-Judge metrics instead of ROUGE. Reveals that simple length-based heuristics can match or exceed sophisticated detectors like Semantic Entropy, exposing a fundamental flaw in current evaluation practices. (EMNLP 2025)\n\n### [Bold Claims or Self-Doubt? Factuality Hallucination Type Detection via Belief State](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.527\u002F)\n- **Metrics:** Truthful Rate, OH (Overconfident Hallucination detection rate), UH (Unaware Hallucination detection rate), AUC\n- **Datasets:** TriviaQA, NQOPEN, ALCUNA\n- **Comments:** Introduces the concept of **belief state**—a measure of model confidence based on answer repetition consistency across multiple samples—to categorize factuality hallucinations into two types: **Overconfident** (model confidently lies) and **Unaware** (model guesses due to missing knowledge). Proposes **BAFH**, a lightweight framework that trains a feedforward classifier on hidden states to determine belief states and classify hallucination types. Evaluated across eight LLMs (Gemma-2, Llama-3.1, Mistral) against MIND and SAR baselines. (EMNLP 2025 Findings)\n\n### [Exploring the Generalizability of Factual Hallucination Mitigation via Enhancing Precise Knowledge Utilization](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.211\u002F)\n- **Metrics:** Accuracy across 21 domains\n- **Datasets:** FactualBench (181K Chinese factual QA pairs spanning 21 domains)\n- **Comments:** Proposes **PKUE**, which mitigates factual hallucinations by strengthening the LLM's internal mapping between queries and parametric knowledge. Fine-tunes the model on self-generated responses to precise factual questions via preference optimization, rather than using post-hoc corrections that may overfit to specific prompt templates. Introduces **FactualBench**, a large-scale Chinese factual QA dataset, and demonstrates broad generalization across factual tasks, general tasks, and multilingual settings. (EMNLP 2025 Findings)\n\n### [Towards Faithful Natural Language Explanations: A Study Using Activation Patching in Large Language Models](https:\u002F\u002Faclanthology.org\u002F2025.emnlp-main.529\u002F)\n- **Metrics:** Accuracy, CaF (Causal Faithfulness, with variants CaF(M), CaF(T), CaF(L)), CC-SHAP, CFF (Counterfactual Faithfulness), Plausibility\n- **Datasets:** CoS-E (Commonsense Reasoning), e-SNLI (Natural Language Inference), ComVE (Commonsense Validation and Explanation)\n- **Comments:** Introduces **Causal Faithfulness (CaF)**, a metric that uses activation patching to measure the divergence between the causal effects underlying the model's answer and its natural language explanation. Unlike prior faithfulness tests based on input perturbations (which are prone to out-of-distribution issues), CaF operates on internal hidden states via Symmetric Token Replacement. Finds that alignment-tuned models produce more faithful explanations than base models, and that faithfulness and plausibility are positively correlated. Evaluates six Gemma-2 models (2B–27B). (EMNLP 2025)\n\n### [InteGround: On the Evaluation of Verification and Retrieval Planning in Integrative Grounding](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.732\u002F)\n- **Metrics:** Accuracy, Groundedness verification scores\n- **Datasets:** Data repurposed from four domains (claim verification, multi-hop QA)\n- **Comments:** Introduces **integrative grounding**, a task requiring LLMs to retrieve and verify multiple interdependent pieces of evidence for complex queries. Finds that when external information is incomplete, LLMs default to hallucinating rationalizations using internal knowledge. Shows that premise abduction with logical constraints and zero-shot self-reflection outperform undirected retrieval planning in constraining hallucination cascades in complex RAG scenarios. (EMNLP 2025 Findings)\n\n### [Hallucination Detection in LLMs Using Spectral Features of Attention Maps](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17598)\n- **Metrics:** AUROC, Precision, Recall, Cohen's Kappa\n- **Datasets:** NQ-Open, TriviaQA, CoQA, SQuADv2, HaluEval-QA, TruthfulQA, GSM8K\n- **Comments:** Approaches the LLM as a dynamic graph and analyzes structural properties of internal attention mechanisms. Extracts **spectral features** (eigenvalues) from attention maps to predict when the model is fabricating information: factual retrieval produces stable eigen-structures, while hallucination leads to diffuse, chaotic patterns. Creates a white-box hallucination detector that operates independently of the generated semantic content, evaluated across seven QA benchmarks. (EMNLP 2025)\n\n### [PruneCD: Contrasting Pruned Self Model to Improve Decoding Factuality](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.16598)\n- **Metrics:** TruthfulQA (MC1, MC2, %Truth, %Info), FACTOR, StrategyQA Accuracy\n- **Datasets:** TruthfulQA, FACTOR (News, Wiki), StrategyQA\n- **Comments:** Addresses limitations of early-exit contrastive decoding by constructing the \"amateur\" model via **dynamic layer pruning** rather than simple truncation. Removing specific intermediate reasoning layers produces a better-calibrated contrastive prior with more informative logits, steering generation away from factually incorrect but high-probability tokens while maintaining fluency. Achieves consistent factuality improvements with minimal inference overhead. (EMNLP 2025)\n\n### [Re-FRAME the Meeting Summarization SCOPE: Fact-Based Summarization and Personalization via Questions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.15901)\n- **Metrics:** ROUGE (R-1, R-2, R-L), BERTScore F1, MESA (8 dimensions including hallucination), P-MESA (7 personalization dimensions), Balanced Accuracy, Cohen's kappa\n- **Datasets:** QMSum (ICSI, AMI, WPCP meetings), FAME (500 English, 300 German synthetic meetings)\n- **Comments:** Tackles meeting-summary hallucinations by introducing the **FRAME** pipeline and **SCOPE** protocol. FRAME extracts salient facts and scores them thematically; SCOPE forces the model to structurally justify context selection via a nine-question reasoning trace before generating the summary. Introduces **P-MESA**, a multi-dimensional personalized evaluation framework, and shows that on QMSum and FAME, FRAME reduces hallucination and omission by 2 out of 5 points on MESA, while SCOPE improves knowledge fit and goal alignment over prompt-only baselines. (EMNLP 2025 Findings)\n\n### [CCHall: A Novel Benchmark for Joint Cross-Lingual and Cross-Modal Hallucinations Detection in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.19108)\n- **Metrics:** Accuracy, Macro-F1\n- **Datasets:** CCHall (built from AMBER, GQA, XM3600, xFlickr&Co; 9 languages across 3 resource levels: high—French, Spanish, Portuguese; medium—Czech, Dutch, Swedish; low—Croatian, Welsh, Swahili)\n- **Comments:** Introduces **CCHall**, the first benchmark for the joint intersection of **cross-lingual and cross-modal hallucinations**. Evaluates leading models (GPT-4o, Gemini-1.5, Llama-3.2-Vision) in scenarios where a model might correctly identify an object visually in English but hallucinate its properties when generating in another language. Categorizes hallucinations into four types: non-hallucination, cross-lingual only, cross-modal only, and joint cross-lingual\u002Fcross-modal. (ACL 2025)\n\n### [Hallucination Detox: Sensitivity Dropout (SenD) for Large Language Model Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.15460)\n- **Metrics:** SelfCheckGPT, FactScore, EigenScore, Efficient EigenScore (EES), Semantic Entropy, Perplexity, HaluEval Accuracy, ROUGE-1 (XSum)\n- **Datasets:** HELM (50K Wikipedia articles), MedHALT, LegalBench, HaluEval, XSum\n- **Comments:** Proposes **Sensitivity Dropout (SenD)**, a training protocol that reduces hallucination variance by deterministically dropping sensitive embedding indices with high variability during training. Pairs this intervention with **Efficient EigenScore (EES)**, an unsupervised metric that approximates EigenScore at 2x speed. Improves test-time reliability and factual accuracy across Pythia and Llama models without hurting downstream task performance. (ACL 2025)\n\n### [ETF: An Entity Tracing Framework for Hallucination Detection in Code Summaries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.14748)\n- **Metrics:** Precision, Recall, F1, Accuracy (instance-level and entity-level), Jaccard Similarity\n- **Datasets:** Custom dataset (411 summaries from 7 LLMs, 9,933 entity-level samples, sourced from CodeXGLUE Java Code-To-Text)\n- **Comments:** Introduces **ETF**, the first hallucination detection framework tailored for **code summarization**. Detects intrinsic and extrinsic hallucinations by tracing code entities (variables, methods, classes) from source code to generated summaries using static analysis principles. Identifies fabricated entities (extrinsic hallucinations) and incorrect entity attributions (intrinsic hallucinations). Provides a novel annotated dataset with 4,354 human-reviewed entity tuples (Cohen's Kappa: 0.72). (ACL 2025)\n\n### [Localizing and Mitigating Errors in Long-form Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11930)\n- **Metrics:** Human preference rate, span-level error annotation (5 error types)\n- **Datasets:** HaluQuestQA (698 QA pairs, 1.8K span-level expert annotations)\n- **Comments:** Introduces **HaluQuestQA**, a dataset with expert span-level annotations of specific hallucination and omission errors in complex long-form QA answers. Trains an automated feedback model on these annotations to detect problematic spans, then applies **Error-Informed Refinement** to rewrite hallucinated or incomplete spans. Achieves an **84% human preference rate** over baseline generations. (ACL 2025 Findings)\n\n### [Can Hallucination Correction Improve Video-Language Alignment?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.15079)\n- **Metrics:** Accuracy (VELOCITI), Mean Average Precision (SSv2-Temporal, SSv2-Events), GPT-Evaluated Score (MSRVTT-QA)\n- **Datasets:** VELOCITI, SSv2-Temporal, SSv2-Events, MSRVTT-QA, VideoCon (115K training triplets)\n- **Comments:** Flips the traditional mitigation paradigm by leveraging **hallucination correction as a self-training objective** to actively improve video-language alignment. The **HACA** framework learns to identify and rewrite hallucinated spatial and temporal descriptions during self-training, treating hallucinations as informative training signals rather than noise. Leads to significant improvements in zero-shot video-caption binding and complex text-to-video retrieval. (ACL 2025 Findings)\n\n### [Monitoring Decoding: Mitigating Hallucination via Evaluating the Factuality of Partial Response during Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.03106)\n- **Metrics:** Exact Match (TriviaQA, NQ-Open), Truth\u002FInfo\u002FTruth×Info scores (TruthfulQA), Accuracy (GSM8K), Latency (ms\u002Ftoken), Throughput (token\u002Fs)\n- **Datasets:** TruthfulQA (817 questions), TriviaQA (1,200 samples), NQ-Open (1,000 samples), GSM8K (1,319 samples)\n- **Comments:** Introduces a real-time, token-by-token monitoring framework that continuously scores the factuality of the partial response during generation. When an imminent hallucination is detected mid-generation, the decoding trajectory is dynamically altered to enforce factual consistency, addressing the **snowballing** effect where a single incorrect token forces subsequent fabrications. Shifts the paradigm from post-generation correction to active preventative decoding. (ACL 2025 Findings)\n\n### [DRIFT: Detecting Representational Inconsistencies for Factual Truthfulness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.14210)\n\n- **Metrics:** AUROC, AURAC, Accuracy\n- **Datasets:** TriviaQA, NQ-Open, MMLU-Pro, WebQuestions\n- **Comments:** Trains a lightweight probe (3M–37M params) on **intermediate hidden states** instead of final-layer outputs, since middle layers retain uncertainty signals that the token-projection step throws away. Two modes: **question-only** (runs before or alongside generation, no added latency) and **question+answer** (waits for the full response, more accurate). A built-in **LLM router** uses probe confidence to decide whether to return the answer or hand off to a stronger model \u002F RAG. Beats **HaloScope** and **Semantic Entropy** on 10 of 12 model–dataset combinations (LLaMA-2, Qwen-2.5, Gemma-3 × four QA benchmarks), with up to 13-point AUROC gains. Probes trained on one dataset transfer to others without retraining (72–93 AUROC).\n\n### [Joint Evaluation of Answer and Reasoning Consistency for Hallucination Detection in Large Reasoning Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.04832)\n\n- **Metrics:** AUROC\n- **Dataset:** HotpotQA, TriviaQA, NQ-Open, SQuAD\n- **Comments:** Introduces **RACE**, the first black-box hallucination detection framework designed for **Large Reasoning Models (LRMs)** like DeepSeek-R1, addressing a critical gap overlooked by existing methods: hallucinations often arise from the model’s reasoning traces rather than its final answers. By jointly assessing reasoning consistency, answer uncertainty, reasoning–answer alignment, and internal coherence, RACE provides a fine-grained and robust detector that consistently outperforms state-of-the-art baselines across multiple datasets and model families. RACE demonstrates that effective hallucination detection for modern reasoning models must evaluate **both what the model answers and how it reasons**, and **pioneers the direction of black-box hallucination detection for LRMs**. Furthermore, RACE offers a user-friendly code interface to facilitate testing and improvement.\n\n### [Learning to Reason for Hallucination Span Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.02173)\n\n- **Metrics:** Precision, Recall, F1-score\n- **Datasets:** RAGTruth\n- **Comments:** Proposes **RL4HS**, a reinforcement-learning framework for *span-level hallucination detection* that couples *chain-of-thought reasoning* with *span-level rewards*. Built on **Group Relative Policy Optimization (GRPO)** with **Class-Aware Policy Optimization (CAPO)** to address reward imbalance between hallucinated and non-hallucinated spans. On RAGTruth (QA, Summarization, Data-to-Text), RL4HS substantially improves fine-grained detection over CoT-based and supervised baselines.\n\n### [When Models Lie, We Learn: Multilingual Span-Level Hallucination Detection with PsiloQA](https:\u002F\u002Fhuggingface.co\u002Fpapers\u002F2510.04849)\n\n- **Metrics:** IoU, AP\n- **Datasets:** PsiloQA, RAG-Truth, Mu-SHROOM, FAVA, HalluEntity\n- **Comments:** Introduces **PsiloQA**, a large-scale dataset for **multilingual span-level hallucination detection** that serves both as a **benchmark** and a **training resource** for detector models. Building upon prior English-only and sequence-level datasets, **PsiloQA offers fine-grained supervision across 14 languages** through an automated three-stage pipeline of QA generation, hallucinated answer elicitation, and GPT-4o–based span annotation. It enables comprehensive evaluation and training of uncertainty-based, encoder-based, and LLM-based detectors, demonstrating strong cross-lingual generalization and cost-efficient scalability.\n\n### [Mitigating Object Hallucination in MLLMs via Data-augmented Phrase-level Alignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.18654)\n\n- **Metrics:** CHAIRi, CHAIRs; AMBER (CHAIR, Cover, Hall. rate, F1); MME-Hall score\n- **Datasets:** MSCOCO (CHAIR), AMBER, MME-Hall, MMHal-Bench, HallusionBench\n- **Comments:** Introduces Data-augmented Phrase-level Alignment (DPA) and **HALVA**, which build hallucinated\u002Fcorrect response pairs via phrase-level augmentation and train with a phrase-level alignment loss to downweight hallucinated phrases. Reduces object hallucinations while preserving general vision-language performance across multiple benchmarks.\n\n### [MLLM Can See? Dynamic Correction Decoding for Hallucination Mitigation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.11779)\n\n- **Metrics:** CHAIRs, CHAIRi; POPE F1; MME score; AMBER (CHAIR, Cover, Hal, Cog)\n- **Datasets:** MSCOCO 2014 (captioning), POPE, MME, AMBER\n- **Comments:** Proposes **DeCo**, a model-agnostic decoding method that adaptively mixes earlier-layer representations to counteract language-prior suppression of visual evidence. Cuts object hallucinations across MLLMs with modest latency overhead and compatibility with standard decoding strategies.\n\n### [AVHBench: A Cross-Modal Hallucination Benchmark for Audio-Visual Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.18325)\n\n- **Metrics:** Accuracy, Precision, Recall, F1 (judgment); METEOR, CIDEr, GAVIE (captioning)\n- **Datasets:** AVHBench (2,136 videos; 4 tasks: A->V, V->A, A-VMat, A-VCap; 5,302 QA pairs; 1,106 captions)\n- **Comments:** Introduces **AVHBench**, a cross-modal hallucination benchmark for audio-visual LLMs with three judgment tasks and one captioning task that probe audio\u002Fvideo grounding mismatches. Provides a curated dataset and evaluation protocol for both perception and comprehension.\n\n\n### [Teaching with Lies: Curriculum DPO on Synthetic Negatives for Hallucination Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.17558v1)\n\n- **Metrics:** Accuracy, Precision, Recall, F1-score\n- **Datasets:** MedHallu, HaluEval, DROP, CovidQA, PubMedQA\n- **Comments:** Proposes **HaluCheck**, a family of 1B–3B parameter LLM detectors aligned via **Direct Preference Optimization (DPO)** using *synthetic hallucinated negatives* ranked by grounding difficulty (via MiniCheck). Introduces a **curriculum learning** strategy that transitions training from easier to harder negatives. Demonstrates up to **24% relative F1 gains** on MedHallu and HaluEval and strong zero-shot robustness—outperforming larger state-of-the-art models on DROP, CovidQA, and PubMedQA.\n\n\n### [MultiHal: Multilingual Dataset for Knowledge-Graph Grounded Evaluation of LLM Hallucinations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.14101)\n- **Metrics:** Semantic similarity with sentence embeddings\n- **Datasets:** MultiHal\n- **Comments:** Introduces a new factual language modeling benchmark **MultiHal**. Building upon past benchmarks such as Shroom2024, HaluEval, HaluBench, TruthfulQA, Felm, Defan and SimpleQA, the aforementioned benchmarks are extended by mining relevant KG paths from Wikidata. **MultiHal** can be used for comparisons of knowledge updating methods such as RAG and KG-RAG, as well as factual evaluation using the mined KG paths.\n\n### [A Probabilistic Framework for LLM Hallucination Detection via Belief Tree Propagation](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.158\u002F)\n- **Metrics:** AUROC, AUC-PR, F1, Accuracy\n- **Datasets:** Wikibio-GPT3, FELM-Science, FactCheckGPT\n- **Comments:** Proposes **BTProp**, a probabilistic belief-tree framework that recursively expands a target statement into related claims and performs hidden Markov tree inference to reconcile LM self-beliefs with logical relations. Improves hallucination detection by 3-9% AUROC\u002FAUC-PR over baselines across multiple benchmarks.\n\n### [Mitigating Hallucinations in Multi-modal Large Language Models via Image Token Attention-Guided Decoding](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.75\u002F)\n- **Metrics:** CHAIRs (CS), CHAIRi (CI), F1, GPT-4V correctness score, MME score\n- **Datasets:** CHAIR (COCO), POPE, GPT-4V-assisted evaluation (COCO), MME\n- **Comments:** Introduces **iTaD**, a plug-and-play decoding strategy that uses attention to image tokens to select layers and apply inter-layer contrastive decoding. By amplifying image grounding when attention drops, it consistently reduces hallucinations across multiple MLLMs and benchmarks.\n\n### [Beyond Logit Lens: Contextual Embeddings for Robust Hallucination Detection & Grounding in VLMs](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.488\u002F)\n- **Metrics:** mAP, Precision-Recall (grounding)\n- **Datasets:** HQH, TextVQA-X, VizWiz-G\n- **Comments:** Proposes **ContextualLens**, a training-free method that uses middle-layer contextual token embeddings to score answer-image alignment. It improves hallucination detection across diverse categories and produces precise grounding boxes for GVQA tasks.\n\n### [HALoGEN: Fantastic LLM Hallucinations and Where to Find Them](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.71\u002F)\n- **Metrics:** Hallucination Score, Response Ratio, Utility Score, FActScore (biographies)\n- **Datasets:** HALoGEN (10,923 prompts across 9 tasks: code packages, scientific attribution, summarization, simplification, biographies, historical events, false presuppositions, rationalization binary\u002Fnumerical)\n- **Comments:** Introduces a multi-domain hallucination benchmark with task-specific decomposition and verification pipelines that score atomic facts at scale. Provides a Type A\u002FB\u002FC error taxonomy tied to training-data provenance for analyzing hallucination sources.\n\n### [Visual Evidence Prompting Mitigates Hallucinations in Large Vision-Language Models](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.205\u002F)\n- **Metrics:** Accuracy, CHAIR\n- **Datasets:** POPE, AMBER, RPE (Relation Probing Evaluation built from Visual Genome)\n- **Comments:** Proposes visual evidence prompting that injects outputs from object detection and scene graph models as structured prompts to reduce object and relation hallucinations in LVLMs. Introduces RPE to assess relation hallucination.\n\n### [ICRProbe: Tracking Hidden State Dynamics for Reliable Hallucination Detection in LLMs](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.880\u002F)\n- **Metrics:** AUROC\n- **Datasets:** HaluEval, SQuAD, TriviaQA, HotpotQA\n- **Comments:** Defines the ICR Score (Information Contribution to Residual Stream) and an ICR Probe that aggregates layer-wise residual updates for reference-free hallucination detection, outperforming prior hidden-state baselines with a lightweight MLP.\n\n### [HalluLens: LLM Hallucination Benchmark](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.1176\u002F)\n- **Metrics:** False refusal rate, hallucination rate, correct answer rate, Precision, Recall@K, F1@K, false acceptance rate\n- **Datasets:** HalluLens (PreciseWikiQA, LongWiki, NonExistentRefusal with MixedEntities\u002FGeneratedEntities)\n- **Comments:** Introduces a taxonomy separating intrinsic vs. extrinsic hallucinations and a benchmark with dynamically generated extrinsic tasks to reduce data leakage. Provides new evaluation tasks for short QA, long-form QA, and knowledge-gap refusal.\n\n### [Similarity-Distance-Magnitude Universal Verification](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.20167)\n- **Metrics:** Index-conditional calibration (i.e., joint prediction- and class-conditional calibration at a given alpha' value)\n- **Datasets:** Factcheck (Azaria and Mitchell, 2023); sentiment; MMLU; MMLU-pro (importantly these include distribution-shifted and out-of-distribution evaluations)\n- **Comments:** Introduces Similarity-Distance-Magnitude (SDM) activation functions, SDM calibration, and SDM networks, which are neural networks (e.g., LLMs) with uncertainty-aware verification and interpretability-by-exemplar as intrinsic properties. An example of an SDM estimator that can be used for hallucination detection when combined with retrieval (or other applicable grounding) is available via the [open-source Reexpress MCP server](https:\u002F\u002Fgithub.com\u002FReexpressAI\u002Freexpress_mcp_server). (2025)\n\n### [Coarse-to-Fine Memory Matching for Joint Retrieval and Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02287)\n- **Metrics:** Accuracy and FEVER score\n- **Datasets:** FEVER and the 2-class analysis sets of Schuster et al. (2019)\n- **Comments:** Introduces interpretability-by-exemplar for multi-stage retrieval and classification with a single model, including feature detection via alignment of bi-encoded sequences. Includes a method for beam search through the search graph of bi- and cross-encoded sequences, and an early approach for constraining the output of a retrieval system based on dense matching into the support set. (This is, in effect, an early example of test-time compute with a Transformer language model. Instead of using reinforcement learning, multi-stage search is learned end-to-end via a contrastive loss over bi- and cross-encoded sequences.) (2020)\n\n### [Detecting Local Insights from Global Labels: Supervised & Zero-Shot Sequence Labeling via a Convolutional Decomposition](https:\u002F\u002Fdirect.mit.edu\u002Fcoli\u002Farticle\u002F47\u002F4\u002F729\u002F106772\u002FDetecting-Local-Insights-from-Global-Labels)\n- **Metrics:** F_1, F_0.5, and accuracy against the ground-truth, as well as the model-approximations against the original model's predictions\n- **Datasets:** Grammatical error detection and sentiment datasets\n- **Comments:** Introduces instance-based, metric-learner approximations of neural network models and hard-attention mechanisms that can be constructed with task-specific inductive biases for effective semi-supervised learning (i.e., feature detection). These mechanisms combine to yield effective methods for interpretability-by-exemplar over the representation space of neural models. Direct relevance for hallucination detection (which is a classification task): This precedes SAE and other contrastive-representation-based interpretability methods, while providing an explicit connection between a test-instance and the training (support set) and providing a pathway for controlling for the epistemic uncertainty. The latter in particular is a limiting factor for the real-world application of many subsequent interpretability methods, including for hallucination detection tasks. (This was submitted to Computational Linguistics for review in 2020 and accepted for publication in 2021, with an additional presentation at EMNLP in 2021.)\n\n### [♟️FactCheckmate: Preemptively Detecting and Mitigating Hallucinations in LMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.02899)\n- **Metrics:** For detection: Acc.\n- **Datasets:** QA: NQ-Open, MMLU, MedMCQA, GSM8K.\n- **Comments:** This work introduces a lightweight classifier that detects hallucinations preemptively conditioning only on the input hidden states, before any text is generated. When hallucinations are predicted, it intervenes in these hidden states to steer the model toward more factual outputs.  Experiments show consistent improvements in factual accuracy across various LLMs, with minimal computational overhead.\n  \n### [Kernel Language Entropy: Fine-grained Uncertainty Quantification for LLMs from Semantic Similarities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.20003)\n- **Metrics:** For detection: AUROC, AURAC.\n- **Datasets:** QA: TriviaQA, SQuAD, BioASQ, NQ, SVAMP.\n- **Comments:** This work presents a method for evaluating the semantic uncertainty of LLM responses. The approach generates multiple response samples and measures their semantic similarity, which is represented as a density matrix (semantic kernel). Semantic uncertainty is then quantified using the von Neumann entropy of this matrix. High uncertainty suggests potential hallucinations, alowing for their detection and mitigation.\n\n### [Steering Knowledge Selection Behaviours in LLMs via SAE-Based Representation Engineering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.15999)\n- **Metrics:** Exact Match\n- **Datasets:** NQSwap, Macnoise\n- **Comments:** The first work that uses sparse auto-encoders (SAEs) to enhance both the usage of contextual and parametric knowledge.\n\n### [DeCoRe: Decoding by Contrasting Retrieval Heads to Mitigate Hallucinations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.18860)\n- **Metrics:** MC1, MC2, MC3 scores for TruthfulQA multiple-choice task; %Truth, %Info, %Truth*Info for TruthfulQA open-ended generation task; subspan Exact Match for the open-domain QA tasks (NQ-Open, NQ-Swap, TriviaQA, PopQA, MuSiQue); accuracy for MemoTrap; Prompt-level and Instruction-level accuracies for IFEval.\n- **Datasets:** TruthfulQA, NQ-Open, NQ-Swap, TriviaQA, PopQA, MemoTrap, IFEval, MuSiQue\n\n### [Semantic Density: Uncertainty Quantification for Large Language Models through Confidence Measurement in Semantic Space](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2024\u002Fposter\u002F95598)\n- **Metrics:** AUROC, AUPR\n- **Datasets:** CoQA, TriviaQA, SciQ, NQ\n- **Comments:** Proposes a new method, namely semantic density, to provide response-wise confidence\u002Funcertainty scores for detecting LLM hallucinations. Semantic density extracts uncertainty\u002Fconfidence information for each response from a probability distribution perspective in semantic space. It has no restriction on task types and is \"off-the-shelf\" for new models and tasks. Significant improvement over other SOTA methods are consistently observed across different datasets and base LLMs.\n\n### [MedHallu: A Comprehensive Benchmark for Detecting Medical Hallucinations in LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14302)\n- **Metrics:** Binary hallucination detection (Precision, Recall, F1).\n- **Datasets:** *MedHallu* – derived from PubMedQA, containing 10k QA pairs with deliberately planted plausible hallucinations.\n- **Comments:** Presents a large-scale medical-focused hallucination detection benchmark. Evaluations show that, on the hardest subset, even top models like GPT-4 achieve only ~0.625 F1 in detecting subtle falsehoods, pointing to the difficulty of medical hallucination detection.\n\n### [Smoothing Out Hallucinations: Mitigating LLM Hallucination with Smoothed Knowledge Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11306)\n- **Metrics:** ROUGE-L, BERTScore, factual consistency rate on XSum\u002FCNN-DM (measured via QA-based metrics like QuestEval).\n- **Datasets:** CNN\u002FDailyMail, XSum\n- **Comments:** Proposes training with soft labels from a teacher LLM to reduce overconfidence and lower hallucination rates in summarization tasks. Maintains quality (ROUGE\u002FBERTScore) while significantly decreasing factual errors.\n\n### [Large Legal Fictions: Profiling Legal Hallucinations in LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01301)\n- **Metrics:** Hallucination rate (% of outputs containing any unsupported legal claim).\n- **Datasets:** Custom set of factual US case queries, where ground-truth outcomes can be verified.\n- **Comments:** Empirical study finding that GPT-3.5 and LLaMA-2 hallucinate in 69% and 88% of legal Q&A, respectively. Highlights the risks of using off-the-shelf LLMs in legal contexts without further training or validation.\n\n### [Hallucination-Minimized Data-to-Answer Framework for Financial Decision-Makers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.07592)\n- **Metrics:** Custom confidence score combining factual verification (data overlap), retrieval correctness, and final QA consistency.\n- **Datasets:** Proprietary financial tables and queries.\n- **Comments:** Shows how grounding LLMs in relevant financial data and applying multi-metric validation can exceed 90% confident correctness. Demonstrates an effective approach to curbing hallucinations in finance.\n\n### [Hallucination Detection: A Probabilistic Framework Using Embeddings Distance Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.08663)\n  * **Metrics**: Accuracy, F1-score (hallucination detection performance).\n  * **Datasets**: Synthetic Q&A dataset (generated with Llama2-7B and Llama3-8B) labeled for hallucinated vs. non-hallucinated responses.\n  * **Comments**: Proposes analyzing the embedding space of LLM outputs to detect hallucinations. By measuring Minkowski distance between embedded keywords in genuine vs. hallucinated answers, it uncovers structural differences and achieves competitive hallucination detection accuracy (~66%) without external fact-checking.\n\n### [Detecting and Mitigating Hallucination in Large Vision-Language Models via Fine-Grained AI Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.14233)\n  * **Metrics**: Accuracy, Precision\u002FRecall, F1 (hallucination detection on benchmarks like MHaluBench, MFHaluBench); Hallucination Rate and metrics such as CHAIR, Cover, Hal, Cog (for generation benchmarks like Object HalBench and AMBER).\n  * **Datasets**: MHaluBench, MFHaluBench (vision-language hallucination detection datasets); Object HalBench, AMBER, MMHal-Bench, POPE (hallucination mitigation benchmarks for LVLMs).\n  * **Comments**: Introduces *HSA-DPO*, a severity-aware direct preference optimization method that uses fine-grained AI feedback to label hallucination severity and prioritize critical errors in training. This approach achieves state-of-the-art performance in detecting visual hallucinations (outperforming GPT-4V and other models) and significantly lowers hallucination occurrence in generated outputs (e.g., 36% reduction on AMBER, 76% on Object HalBench versus the base model).\n\n### [Pelican: Correcting Hallucination in Vision-LLMs via Claim Decomposition and Program of Thought Verification](https:\u002F\u002Faclanthology.org\u002F2024.emnlp-main.470\u002F)\n  * **Metrics**: Hallucination rate reduction (%) and factual accuracy improvements on multiple vision–language instruction benchmarks (e.g., MMHal-Bench, GAVIE, MME).\n  * **Datasets**: MMHal-Bench, GAVIE (hallucination evaluation benchmarks for LVLMs); MME (general vision-language understanding benchmark).\n  * **Comments**: Proposes a framework (*Pelican*) that detects and mitigates visual hallucinations through claim verification. It decomposes an image-grounded claim into sub-claims and uses *program-of-thought* (code execution with external tools) to verify each sub-claim’s truth. An LLM then assesses overall consistency. Pelican significantly reduces hallucination rates (approx. 8–32% drop across various LVLMs, and 27% lower than prior mitigation methods) while maintaining or improving the models’ factual accuracy in following visual instructions.\n\n### [Distinguishing Ignorance from Error in LLM Hallucinations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22071)\n  * **Metrics**: Hallucination type distribution (prevalence of hallucinations despite knowledge vs. due to lack of knowledge); classification accuracy distinguishing HK+ vs. HK- cases; improvements in detection\u002Fmitigation when handling these types separately.\n  * **Datasets**: *WACK* (Wrong Answers despite Correct Knowledge) – a constructed dataset based on TriviaQA and NaturalQuestions, containing QA instances labeled as HK- (hallucination caused by missing knowledge) or HK+ (hallucination even though the model knows the answer) for specific LLMs.\n  * **Comments**: Investigates two distinct causes of hallucination: when the model truly doesn’t know the answer (ignorance) vs. when it knows the answer but still responds incorrectly (error). Introduces an automated approach to generate model-specific labeled examples (the WACK dataset) by testing the model under various prompted scenarios. Shows that a simple classifier on the LLM’s internal representations can differentiate these cases, and that tailoring detection\u002Fmitigation to a model’s HK+ (knowledgeable error) cases yields better results than a one-size-fits-all approach.\n\n### [A Genetic Approach to Mitigate Hallucination in Generative IR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00085)\n  * **Metrics**: Factuality verification accuracy (FEVER-style support\u002Frefute classification of answers), and answer relevance metrics (n-gram overlap, ROUGE\u002FNDCG) in open-domain QA.\n  * **Datasets**: TREC Deep Learning 2019 & 2020 (passage ranking QA tasks) and a subset of MS MARCO Dev (for open-domain answer generation).\n  * **Comments**: Models the generative information retrieval process as a genetic algorithm (called *GAuGE*: Genetic Approach using Grounded Evolution) to reduce hallucinations in answers. Candidate answers evolve through iterative “mutation” and selection, guided by a simple n-gram overlap fitness score to ensure consistency with retrieved documents. Experiments across several IR datasets show that GAuGE produces highly relevant answers with significantly fewer hallucinated statements (substantially higher fact verification scores) compared to standard RAG-style generation, all without sacrificing answer relevance.\n### [MARS: Meaning-Aware Response Scoring for Uncertainty Estimation in Generative LLMs](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.419.pdf)\n- **Metrics:** AUROC\n- **Datasets:** TriviaQA, NaturalQA, WebQA\n- **Comments:** The LLM uncertainty estimation technique called MARS replaces length-normalized probability scoring by assigning greater weights to tokens that contribute more significantly to correctness. \n\n### [Do Not Design, Learn: A Trainable Scoring Function for Uncertainty Estimation in Generative LLMs](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11278)\n- **Metrics:** AUROC, PRR\n- **Datasets:** TriviaQA, GSM8k, NaturalQA, WebQA\n- **Comments:** The LLM uncertainty estimation technique called LARS trains an encoder-based transformer that takes a query, generation, and token probabilities as input and returns an uncertainty score as output\n\n### [Quantifying Uncertainty in Answers from any Language Model and Enhancing their Trustworthiness](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.283\u002F)\n- **Metrics:** Accuracy, Precision\u002FRecall\u002FAuroc\n- **Datasets:** TriviaQA, GSM8k, SVAMP, Common-sense QA\n- **Comments:** LLM uncertainty estimation technique called BSDetector that combines self-reflection certainty and observed consistency into a single confidence score. Detects incorrect\u002Fhallucinated LLM responses with high precision\u002Frecall, and can also automatically boost the accuracy of LLM responses. \n\n### [Leveraging Hallucinations to Reduce Manual Prompt Dependency in Promptable Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.15205)\n- **Metrics:** MAE, F_{beta}, S_{alpha}\n- **Datasets:** CHAMELEON, CAMO, COD10K, CVC-ColonDB, Kvasir, ISIC\n- **Comments:** The first study does not regard hallucinations as purely negative, but as a common aspect of model pre-training. Unlike previous approaches that directly eliminate hallucinations, ProMaC first stimulates hallucinations to mine the prior knowledge from model pre-training to gather task-relevant information in images. Then, it eliminates irrelevant hallucinations to mitigate their negative impact. The effectiveness of this method has been demonstrated in multiple challenging segmentation tasks.\n\n### [GraphEval: A Knowledge-Graph Based LLM Hallucination Evaluation Framework](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.10793)\n- **Metrics:** Accuracy (detection), Rouge (correction)\n- **Datasets:** SummEval, QAGS-C, QAGS-X\n- **Comments:** Proposes a hallucination detection *GraphEval* and corection framework *GraphCorrect*. Hallucination detection is done by extracting KG triples from an LLM output and comparing the entailment of the triples with respect to the provided context. Correction is done by taking triples likely to contain hallucinations (entailment below 0.5) are then prompting an LLM to generate a new, factually correct triple with respect to a provided context. Afterwards in a seperate inference pass an LLM is prompted to replace the information in the non-factual LLM output based on the corrected triple. Underlying NLI models that are used for experiments are *HHEM* (DeBERTaV3), *TRUE* and *TrueTeacher* (T5-XXL). The underlying LLM used is Claude2. Final experiments are conducted by computing Rouge scores between reference text and the proposed mitigation method.\n\n### [Lynx: An Open Source Hallucination Evaluation Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08488)\n- **Metrics:** Accuracy\n- **Datasets:** HaluBench (consists of ~500 random samples from CovidQA, PubMedQA, DROP, FinanceBench and another set of perturbations based on the retrieved samples)\n- **Comments:** Proposes a resource HaluBench and Lynx (Llama3-70bn-instruct based model) for a reference-free metric evaluation. The focus is on instrinsic hallucination evaluation, meaning answers faithful to the given context instead of world knowledge. Hallucinated examples for HaluBench are gathered with GPT-4o. Training of Lynx is done on 2400 samples from RAGTruth, DROP, CovidQA, PubMedQA with GPT4o generated reasoning as part of the training samples. Evaluation is done by extracting a response-level binary label indicating response's faithfulness to the context.\n\n### [LLMs Prompted for Graphs: Hallucinations and Generative Capabilities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00159)\n- **Metrics:**  Graph edit distance, spectral distance, distance between degree distributions.\n- **Datasets:** Graph Atlas Distance\n- **Comments:** This benchmark presents the capability to directly prompt LLMs for known graph structures. Distances from the outputs of LLMs and of the ground truth graphs are studied. A ranking based on graph edit distance sorts LLMs in their hallucination amplitude.\n\n### [HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14566.pdf)\n- **Metrics:**  Accuracy.\n- **Datasets:** HallusionBench\n- **Comments:** This benchmark presents significant challenges to advanced large visual-language models (LVLMs), such as GPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing nuanced understanding and interpretation of visual data. This paper introduces a novel structure for these visual questions designed to establish control groups. This structure is able to conduct a quantitative analysis of the models' response tendencies, logical consistency, and various failure modes.\n\n### [Unified Hallucination Detection for Multimodal Large Language Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.03190)\n- **Metrics:**  Accuracy, F1\u002FPrecision\u002FRecall.\n- **Datasets:** MHaluBench\n- **Framework:** UniHD\n- **Comments:** This paper proposes a more unified problem setting for hallucination detection in MLLMs, unveils a meta-evaluation benchmark MHaluBench that encompasses various hallucination categories and multimodal tasks, and introduces UNIHD, a unified framework for the detection of hallucinations in content produced by MLLMs.\n\n### [FactCHD: Benchmarking Fact-Conflicting Hallucination Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12086)\n- **Metrics:**  F1 of detection, Match of explanation\n- **Datasets:** FactCHD\n- **Highlights:** This paper introduces the FACTCHD benchmark, which focuses on detecting fact-conflicting hallucinations. FACTCHD integrates factual knowledge from multiple domains, encompassing a wide range of fact patterns, including raw facts, multi-hop reasoning, comparison, and set operations. Its distinguishing feature lies in its goal to combine evidence chains rooted in factual information, enabling persuasive reasoning in predicting the factuality or non-factuality of a claim.\n\n### [Attention Satisfies: A Constraint-Satisfaction Lens on Factual Errors of Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15098)\n- **Metrics:** AUROC, risk-coverage curve operating points\n- **Datasets:** CounterFact, factual queries generated from Wikidata\n- **Comments:** This paper models factual queries as constraint-satisfaction problems and finds that attention to constraint tokens significantly correlates with factual correctness\u002Fhallucinations.\n\n### [TRUE: Re-Evaluating Factual Consistency Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.04991)\n- **Metrics:** AUROC, across multiple datasets and evaluation methods\n- **Datasets:** PAWS, XSum, QAGS, FRANK, SummEval, BEGIN, Q^2, DialFact, FEVER, VitaminC\n\n### [TrueTeacher: Learning Factual Consistency Evaluation with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11171)\n- **Metrics:** AUROC, across multiple datasets and evaluation methods\n- **Datasets:** XSum, QAGS, FRANK, SummEval\n\n### [SAC$`^3`$: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01740)\n- **Metrics:** Accuracy and AUROC: classification QA and open-domain QA\n- **Datasets:** Prime number and senator search from Snowball Hallucination, HotpotQA and Nq-open QA\n\n### [Elastic Weight Removal for Faithful and Abstractive Dialogue Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17574)\n- **Metrics:** Faithfulness between predicted response and ground-truth knowledge (Tab. 1) -- Critic, Q², BERT F1, F1.\n- **Datasets:** Wizard-of-Wikipedia (WoW), the DSTC9 and DSTC11 extensions of MultiWoZ 2.1, FaithDial -- a de-hallucinated subset of WoW.\n\n### [Trusting Your Evidence: Hallucinate Less with Context-aware Decoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14739)\n- **Metrics:** Factual consistency of summaries: BERT-Precision and FactKB. MemoTrap and NQ-Swap: Exact Match.\n- **Datasets:** Summarisation: CNN-DM, XSUM. Knowledge Conflicts: MemoTrap, NQ-Swap.\n\n### [When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10511)\n- **Metrics:** Exact Match\u002FAccuracy.\n- **Datasets:** QA datasets with long-tail entities: PopQA, EntityQuestions; NQ.\n\n### [Retrieval Augmentation Reduces Hallucination in Conversation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.07567)\n- **Metrics:** Generation: Perplexity, Unigram Overlap (F1), BLEU-4, ROUGE-L. Overlap between generation and knowledge on which the human grounded during dataset collection: Knowledge F1; only consider words that are infrequent in the dataset when calculating F1: Rare F1.\n- **Datasets:** Wow, CMU Document Grounded Conversations (CMU_DoG). Knowledge source: KiLT Wikipedia dump.\n\n### [Just Ask for Calibration: Strategies for Eliciting Calibrated Confidence Scores from Language Models Fine-Tuned with Human Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14975)\n- **Metrics:** Expected Calibration Error (ECE) with temperature scaling (ECE-t); accuracy@coverage and coverage@accuracy.\n- **Datasets:** Question Answering datasets assessing factual knowledge: TriviaQA, SciQ, TruthfulQA.\n\n### [How Language Model Hallucinations Can Snowball](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13534)\n- **Metrics:** Percentage of Wrong Answers (Hallucinations) and cases where \"the model knows it's wrong\" (Snowballed Hallucinations).\n- **Datasets:** Primality Testing, Senator Search, Graph Connectivity.\n\n### [Leftover Lunch: Advantage-based Offline Reinforcement Learning for Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14718)\n- **Metrics:** Faithfulness evaluation for Knowledge-Grounded response generation on FaithDial -- FaithCritic, CoLA (Fluency), Dialog Engagement, Length-penalised TF-IDF Diversity. \n- **Datasets:** Faithful Knowledge-Grounded Dialog: FaithDial, a more faithful subset of WoW.\n\n### [Generating with Confidence: Uncertainty Quantification for Black-box Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19187)\n- **Metrics:** AUROC, AUARC, Uncertainty and Confidence metrics (NumSet, Deg, EigV).\n- **Datasets:** CoQA (Open-book Conversational QA dataset), TriviaQA and Natural Questions (Closed-book QA).\n\n### [Contextualized Sequence Likelihood: Enhanced Confidence Scores for Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01806)\n- **Metrics:** AUROC, AUARC; Improved sequence likelihood (log probability of generated sequence) used in Confidence or Uncertainty computation.\n- **Datasets:** CoQA (Open-book Conversational QA dataset), TriviaQA and Natural Questions (Closed-book QA).\n\n### [FaithDial: A Faithful Benchmark for Information-Seeking Dialogue](https:\u002F\u002Fdirect.mit.edu\u002Ftacl\u002Farticle\u002Fdoi\u002F10.1162\u002Ftacl_a_00529\u002F114373\u002FFaithDial-A-Faithful-Benchmark-for-Information)\n- **Metrics:** Metrics measure either the degree of hallucination of generated responses wrt to some given knowledge or their overlap with gold faithful responses: Critic, Q² (F1, NLI), BERTScore, F1, BLEU, ROUGE.\n- **Datasets:** FaithDial, WoW.\n\n### [Neural Path Hunter: Reducing Hallucination in Dialogue Systems via Path Grounding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08455)\n- **Metrics:** FeQA, a faithfulness metric; Critic, a hallucination critic; BLEU.\n- **Datasets:** OpenDialKG, a dataset that provides open-ended dialogue responses grounded on paths from a KG.\n\n### [HaluEval: A Large-Scale Hallucination Evaluation Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11747)\n- **Metrics:** Accuracy: QA, Dialogue, Summarisation.\n- **Datasets:** HaluEval, a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations.\n\n### [The Knowledge Alignment Problem: Bridging Human and External Knowledge for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13669)\n- **Metrics:** *Coverage*: a binary metric that determines whether all the correct gold answer values are included in the generated value. *Hallucination*: a binary indicator that assesses the presence of generated values that do not exist in the question values and gold grounding values. *User Simulator*: user simulator as an \"oracle\" language model with access to attribution information about the target answer.\n- **Datasets:** FuzzyQA, a dataset based on HybridDialogue and MuSiQue where complex questions were simplified using ChatGPT.\n\n### [Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813)\n- **Metrics:** KF1, BLEU, ROUGE, chrF, METEOR, BERTScore, BARTScore, BLEURT, Avg length.\n- **Datasets:** News Chat: DSTC7 Track 2 was repurposed as an evaluation corpus for news conversation. Customer Service: uses DSTC11 Track 5 as a showcase in a conversational customer service scenario, expanding upon DSTC9 Track 1 by incorporating subjective information.\n\n### [SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08896)\n- **Metrics:** Sentence-level Hallucination Detection (AUC-PR), and Passage-level Hallucination Detection (Pearson and Spearman's correlation coefficients).\n- **Datasets:** Generated Wikipedia articles from WikiBio, with annotated hallucinations.\n\n### [INSIDE: LLMs' Internal States Retain the Power of Hallucination Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03744)\n- **Metrics:** AUROC, PCC; Accuracy (TruthfulQA).\n- **Datasets:** CoQA, SQuAD, Natural Questions (NQ), TriviaQA, TruthfulQA.\n- **Comments:** Introduces **INSIDE**, a hallucination detection framework that operates on LLM internal states. Proposes **EigenScore**, which uses eigenvalues of the covariance matrix of multiple response embeddings to measure semantic consistency, and a test-time feature clipping strategy to truncate extreme activations that drive overconfident hallucinations. Evaluated on LLaMA and OPT models across QA benchmarks, improving detection compared with uncertainty- and lexical-similarity baselines.\n\n### [The Internal State of an LLM Knows When it's Lying](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13734)\n- **Metrics:** Per-topic and average accuracy.\n- **Datasets:** The True-False Dataset contains true and false statements covering several topics -- Cities, Inventions, Chemical Elements, Animals, Companies, and Scientific Facts.\n\n### [Chain-of-Knowledge: Grounding Large Language Models via Dynamic Knowledge Adapting over Heterogeneous Sources](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13269)\n- **Metrics:** Exact Match.\n- **Datasets:** FEVER, Adversarial HotpotQA.\n\n### [Halo: Estimation and Reduction of Hallucinations in Open-Source Weak Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.11764)\n- **Metrics:** HaloCheck and SelfCheckGPT scores; consistency, factuality.\n- **Datasets:** Generated and reviewed questions in the NBA domain.\n\n### [A Stitch in Time Saves Nine: Detecting and Mitigating Hallucinations of LLMs by Validating Low-Confidence Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03987)\n- **Metrics:** Precision and Recall when detecting Sentence-level and Concept-level Hallucinations.\n- **Datasets:** ChatGPT-generated paragraphs spanning 150 topics from diverse domains.\n\n### [Sources of Hallucination by Large Language Models on Inference Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14552)\n- **Metrics:** Directional Levy\u002FHolt precision and recall with entity insertions and replacements.\n- **Datasets:** Levy\u002FHolt dataset, containing premise-hypothesis pairs with a task formatted as *Given [premise P], is it true that [hypothesis H]?*, where the model is evaluated with random premises.\n\n### [Hallucinations in Large Multilingual Translation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16104)\n- **Metrics:** Rate to which MT system produces hallucinations under perturbation (Language Pair fraction, rate).\n- **Datasets:** Flores-101, WMT, TICO.\n\n### [Citation: A Key to Building Responsible and Accountable Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02185)\n- **Metrics:** N\u002FA\n- **Datasets:** N\u002FA\n\n### [Zero-Resource Hallucination Prevention for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.02654)\n- **Metrics:** Hallucinatory instruction classification: AUC, ACC, F1, PEA.\n- **Datasets:** Concept-7, which focuses on classifying potential hallucinatory instructions.\n\n### [RARR: Researching and Revising What Language Models Say, Using Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08726)\n- **Metrics:** Attributable to Identified Sources (AIS) scores before and after editing.\n- **Datasets:** Generated statements by creating task inputs from three datasets and prompting different models to produce long-form outputs which may contain hallucinations -- Factoid statements, Reasoning chains, and Knowledge-intensive dialogues.\n\n### [Q²: Evaluating Factual Consistency in Knowledge-Grounded Dialogues via Question Generation and Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08202)\n- **Metrics:** Q² is a metric itself, and it is compared with F1 token-level overlap, Precision and Recall, Q² w\u002Fo NLI, E2E NLI, Overlap, BERTScore, and BLEU.\n- **Datasets:** WoW which contains dialogues in which a bot needs to respond to user inputs in a knowledgeable way; Topical-Chat, a human-human knowledge-grounded conversation dataset; Dialogue NLI, a dataset based on the Persona-Chat dialogue task consisting of premise-hypothesis pairs.\n\n### [Do We Know What We Don’t Know? Studying Unanswerable Questions beyond SQuAD 2.0](https:\u002F\u002Faclanthology.org\u002F2021.findings-emnlp.385.pdf)\n- **Metrics:** EM on All, \"Has answer\", and \"IDK\"\n- **Datasets:** MNLI, SQuAD 2.0, ACE-whQA.\n\n### [Chain-of-Verification Reduces Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.11495)\n- **Metrics:** Wikidata and Wiki-Category List: test precision, average number of positive and negative (hallucination) entities for list-based questions; MultiSpanQA: F1, Precision, Recall; Longform generation of biographies: FactScore.\n- **Datasets:** Wikidata, Wiki-Category List, MultiSpanQA, Longform Generation of Biographies.\n\n### [Detecting and Mitigating Hallucinations in Multilingual Summarisation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13632)\n- **Metrics:** mFACT, a novel multilingual faithful metric developed from four English faithfulness metrics: DAE, QAFactEval, ENFS%, and EntFA.\n- **Datasets:** XL-Sum, a multilingual summarisation dataset.\n\n### [Hallucinated but Factual! Inspecting the Factuality of Hallucinations in Abstractive Summarization](https:\u002F\u002Faclanthology.org\u002F2022.acl-long.236\u002F)\n- **Metrics:** XEnt: Hallucination (Accuracy, F1), Factuality (Accuracy, F1), ROUGE, % of novel n-gram, Faithfulness (%ENFS, FEQA, DAE), EntFA (% Factual Ent., % Factual Hal.)\n- **Datasets:** A novel dataset, XEnt, for analysing entity hallucination and factuality in abstractive summarisation, consisting of 800 summaries generated by BART and annotated. MEnt, a set of factuality and hallucination annotations for XSum.\n- **Comments:** Tab. 2 outlines several types of hallucinations (e.g., factual, non-factual, intrinsic).\n\n### [Enabling Large Language Models to Generate Text with Citations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14627)\n- **Metrics:** Fluency (MAUVE), Correctness (EM recall for ASQA, recall-5 for QAMPARI, claim recall for ELI5), Citation quality (citation recall, citation precision).\n- **Datasets:** QA datasets such that 1) they contain factual questions in which references are important, 2) questions require long-text answers covering multiple aspects, and 3) answering the questions requires synthesising multiple sources: ASQA, QAMPARI, ELI5.\n\n### [A Token-level Reference-free Hallucination Detection Benchmark for Free-form Text Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08704)\n- **Metrics:** Acc, G-Mean, BSS, AUC, Not Hallucination (P, R, F1), Hallucination (P, R, F1).\n- **Datasets:** HaDes (HAllucination DEtection dataSet), a novel token-level reference-free annotated hallucination detection dataset obtained by perturbing a large number of text segments extracted from the English Wikipedia and verified with crowd-sourced annotations.\n- **Comments:** Fig. 3 outlines several hallucination types (domain-specific knowledge, commonsense knowledge, incoherence or improper collocation, unrelated to central topic, conflict with preceding context, conflict with succeeding context, ..)\n\n### [Generating Benchmarks for Factuality Evaluation of Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06908)\n- **Metrics:** Percentage of examples it assigns the highest probability to the factual completion.\n- **Datasets:** Wiki-FACTOR and News-FACTOR: two novel factuality evaluation benchmarks for LLMs, based on Wikipedia and News articles. Each example consists of a prefix, a factual completion and three similar but non-factual alternatives.\n- **Comments:** The paper introduces a framework for automatically generating such datasets from a given corpus, detailed in Section 3.\n\n### [Do Language Models Know When They're Hallucinating References?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18248)\n- **Metrics:** Hallucination rate (H%, out of 1000 generated titles)\n- **Datasets:** Generated (true and hallucinated) references on topics from the ACM Computing Classification System.\n\n### [Why Does ChatGPT Fall Short in Providing Truthful Answers?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10513)\n- **Metrics:** #Correct and #Wrong answers, and different type of failure counts: Comprehension, Factualness, Specificity, Inference.\n- **Datasets:** HotpotQA, BoolQ\n- **Comments:** This has a nice taxonomy on different error types -- e.g., *comprehension*, *factualness*, *specifity*, *inference*.\n\n### [LM vs LM: Detecting Factual Errors via Cross Examination](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13281)\n- **Metrics:** Precision, Recall, F1 (under different cross-examination strategies: AYS, IDK, Confidence-Based, IC-IDK)\n- **Datasets:** TriviaQA, NQ, PopQA\n\n### [RHO (ρ): Reducing Hallucination in Open-domain Dialogues with Knowledge Grounding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01588)\n- **Metrics:** BLEU, ROUGE-L; FeQA, QuestEval, EntityCoverage (Precision, Recall, F1) to estimate the hallucination degree -- FrQA and QuestEval are QA-based metrics for evaluating the faithfulness of the output in the generation task.\n- **Datasets:** OpenDialKG\n\n### [FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14251)\n- **Metrics:** %Supported statements across varying frequency levels of human entities.\n- **Datasets:** People biographies generated from LLMs, where human annotators break them into supporting facts.\n\n### [ExpertQA: Expert-Curated Questions and Attributed Answers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07852)\n- **Metrics:** zero-shot (P, R, F1) and fine-tuned (P, R, F1) of AutoAIS labels; FActScore F1 scores on reference factuality labels; AutoAIS (Attributable to Identified Sources) scores.\n- **Datasets:** Expert-curated questions across multiple fields (e.g., Anthropology, Architecture, Biology, Chemistry, Engineering & Technology, Healthcare\u002FMedicine; see Tab. 1 for a sample) organised by Question Type (e.g., Directed question with single unambiguous answer, open-ended potentially ambiguous question, summarisation of information of a topic, advice or suggestion on how to approach a problem; see Tab. 2)\n\n### [DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.03883)\n- **Metrics:** TruthffulQA: MC1, MC2, MC3 scores; FACTOR: News, Wiki; these were multiple-choice results. Open-ended generation: for TruthfulQA, they use %Truth, %Info, %Truth*Info, %Reject; for CoT tasks (StrategyQA and GSM8K) they go with accuracy.\n- **Datasets:** TruthfulQA, FACTOR (news\u002Fwiki), StrategyQA, GSM8K\n\n### [FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03214)\n- **Metrics:** Accuracy (Strict, Relaxed on Fast-changing questions, Slow-changing questions, Never-changing questions, False-premise questions involve knowledge before 2022 and since 2022, 1-hop and multi-hop questions, and Overall).\n- **Datasets:** FreshQA, a new QA benchmark with 600 questions covering a wide spectrum of question and answer types.\n\n### [Beyond Factuality: A Comprehensive Evaluation of Large Language Models as Knowledge Generators](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289)\n- **Metrics:** Factuality, Relevance, Coherence, Informativeness, Helpfulness and Validity.\n- **Datasets:** Natural Questions, Wizard of Wikipedia.\n\n### [Complex Claim Verification with Evidence Retrieved in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11859)\n- **Metrics:** Accuracy, MAE, Macro-F1, soft accuracy.\n- **Datasets:** ClaimDecomp, which contains 1200 complex claims from PolitiFactL each claim is labeled with one of the six veracity labels, a justification paragraph written by expect fact-checkers, and subquestions annotated by prior work.\n\n### [FELM: Benchmarking Factuality Evaluation of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00741)\n- **Metrics:** Accuracy, F1\u002FPrecision\u002FRecall.\n- **Datasets:** Reasoning, Math, Writing\u002FRec, Science\u002FTech, World Knowledge: GSM8K, ChatGPT, MATH, TruthfulQA, Quora, MMLU\u002Fhc3.\n\n### [Evaluating Hallucinations in Chinese Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03368)\n- **Metrics:** Humand and GPT-4 evaluations.\n- **Datasets:** HalluQA (which they propose), and mention TruthfulQA, ChineseFactEval, HaluEval.\n\n### [On Faithfulness and Factuality in Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00661)\n- **Metrics:** Human annotations of hallucinated spans (intrinsic\u002Fextrinsic) and factuality (with external evidence); ROUGE-1\u002F2\u002FL; BERTScore; textual entailment; QA-based consistency; Spearman correlation with human scores.\n- **Datasets:** XSum (BBC articles); 500 test articles sampled for human evaluation (2,500 document-summary pairs).\n- **Comments:** Large-scale human study of hallucinations in extreme summarization; finds extrinsic hallucinations frequent (including in gold summaries) and textual entailment correlates best with human faithfulness\u002Ffactuality versus ROUGE\u002FBERTScore\u002FQA.\n\n### [QuestEval: Summarization Asks for Fact-based Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12693)\n- **Metrics:** QuestEval (proposed in this work), for testing for *consistency*, *coherence*, *fluency*, and *relevance*. ROUGE, BLUE, METEOR, BERTScore. SummaQA, QAGS.\n- **Datasets:** SummEval, QAGS-XSUM, SQuAD-v2.\n\n### [QAFactEval: Improved QA-Based Factual Consistency Evaluation for Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08542)\n- **Metrics:** QAFactEval (proposed in this work), measuring answer selection, question generation, question answering, answer overlap, and filtering\u002Fanswerability.\n- **Datasets:** SummaC, a collection of benchmarks for binary factual consistency evaluation; CGS, correct and incorrect sentences from CNN\u002FDailyMail; XSF; Polytope; FactCC; SummEval; FRANK; QAGs.\n\n### [Fast and Accurate Factual Inconsistency Detection Over Long Documents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.13189)\n- **Metrics:** SCALE (new metric proposed in this work). Compared with Q², ANLI, SummaC, F1, BLEURT, QuestEval, BARTScore, BERTScore (Table 3). \n- **Datasets:** TRUE benchmark and ScreenEval, new dataset proposed in this work to assess the factual inconsistency in long-form dialogues (52 documents from SummScreen).\n\n### [Understanding Factuality in Abstractive Summarization with FRANK: A Benchmark for Factuality Metrics](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.13346)\n- **Metrics:** BERTScore, FEQA, QGFS, DAE, FactCC\n- **Datasets:** Proposed a new dataset FRANK: human annotated factual errors for CNN\u002FDM and XSum dataset\n\n### [The Curious Case of Hallucinatory (Un)answerability: Finding Truths in the Hidden States of Over-Confident Large Language Models](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.220\u002F)\n- **Metrics:** (classification) F-1, Exact Match, (token) F-1\n- **Datasets:** SQuAD, Natural Questions, MuSiQue\n- **Comments:** This paper models explores LLMs' handling of (un)answerable questions in a closed-book setting, namely answering a question based on a given passage, where the passage doesn't have the answer. The paper shows that despite LLMs' tendency to hallucinate contextual answers, rather than state that they cannot answer the question, they possess internal understanding of the question's (un)answerability.\n\n### [Do Androids Know They're Only Dreaming of Electric Sheep?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17249)\n- **Metrics:** (Hallucination detection) Response-level F1, Span-level Partial Credit Match F1\n- **Datasets:** Organically generated and synthetically edited CNN DailyMail, ConvFEVER, and E2E, labeled span-wise for hallucinations\n- **Comments:** Language models know when they're hallucinating, and we can train probes on LLM hidden states during decoding to reliably detect them. \n\n### [Correction with Backtracking Reduces Hallucination in Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.16176)\n- **Metrics:** AlignScore, FactCC, BS-Fact, ROUGE-L\n- **Datasets:** CNN\u002FDM, XSum, Newsroom\n\n### [Fine-grained Hallucination Detection and Editing for Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.06855)\n- **Metrics:** Precision, Recall, F1.\n- **Datasets:** Custom fine-grained hallucination detection\u002Fediting dataset for various types of (factual) hallucinations: Entity, Relation, Contradictory, Invented, Subjective, Unverifiable.\n\n### [LLMs as Factual Reasoners: Insights from Existing Benchmarks and Beyond](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14540)\n- **Metrics:** Accuracy for various error types -- positive examples, date swap, entity swap, negated sentences, number swap, pronoun swap.\n- **Datasets:** They propose SummEdits, a 10-domain inconsistency detection benchmark.\n\n### [Evaluating the Factual Consistency of Abstractive Text Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12840)\n- **Metrics:** They propose FactCC, a metric that measures the factual consistency of abstractive text summarization (intuition: a summary is factually consistent if it contains the same facts as the source document)\n- **Datasets:** CNN\u002FDM for generating training data; MNLI and FEVER for training models. Human-based experiments for evaluation on claims about CNN\u002FDM articles.\n\n### [SummaC: Re-Visiting NLI-based Models for Inconsistency Detection in Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09525)\n- **Metrics:** Each dataset comes with its metrics (e.g., CoGenSumm uses a reranking-based measure; XSumFaith, SummEval, and FRANK propose several metrics and analyse how they correlate with human annotations; etc.) -- for SummaC, authors propose using balanced accuracy.\n- **Datasets:** They propose SummaC (Summary Consistency), a benchmark consisting of six large inconsistency detection datasets: CoGenSumm, XSumFaith, Polytope, FactCC, SummEval, and FRANK.\n\n### [On the Origin of Hallucinations in Conversational Models: Is it the Datasets or the Models?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07931)\n- **Metrics:** Expert and non-expert annotations: Partial Hallucination, Entailment, Hallucination, Uncoop, Generic (each of these categories has more fine-grained sub-classes -- see e.g., Fig. 2) -- annotations follow the BEGIN and VRM taxonomies.\n- **Datasets:** Knowledge-grounded conversational benchmarks: Wizard of Wikipedia (WoW), CMU-DoG, and TopicalChat -- datasets consisting of dialogues between two speakers where the goal is to communicate information about particular topics while speakers are presented with a knowledge snippet relevant to the current turn.\n\n### [Teaching Language Models to Hallucinate Less with Synthetic Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06827)\n- **Metrics:** Hallucination rate in several settings (original, with optimised system message, with full LLM weights, with synthetic data, or with mixtures of synthetic and reference data); BLEU, ROUGE-1, ROUGE-2, ROUGE-L.\n- **Datasets:** Search-and-retrieve (MS MARCO), meeting summarisation (QMSum), automated clinical report generation (ACI-Bench).\n\n### [Faithfulness-Aware Decoding Strategies for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03278)\n- **Metrics:** ROUGE-L, BERTScore, BS-Fact, FactCC, DAE, QuestEval\n- **Datasets:** CNN\u002FDM, XSum\n\n### [KL-Divergence Guided Temperature Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.01286)\n- **Metrics:** Conversational QA: models fine-tuned on MNLI, SNLI, FEVER, PAWS, ScTail, and VitaminC. Summarisation: models fine-tuned on ANLI and XNLI.\n- **Datasets:** Question Rewriting in Conversational Context (QReCC), XLSum.\n\n### [Investigating Hallucinations in Pruned Large Language Models for Abstractive Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09335)\n- **Metrics:** Hallucination Risk Metrics (HaRiM+), SummaC, SummaCzs, SummaCconv, Hallucination Risk Ratio (HRR)\n- **Datasets:** FactCC, Polytope, SummEval, Legal Contracts, RCT\n\n### [Entity-Based Knowledge Conflicts in Question Answering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.05052)\n- **Metrics:** EM, Memorisation ratio.\n- **Datasets:** NQ Dev with Answer Overlap (AO) and No Answer Overlap (NAO), NewsQA.\n\n### [TruthX: Alleviating Hallucinations by Editing Large Language Models in Truthful Space](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17811)\n- **Metrics:** MC1\u002FMC2\u002FMC3 scores for TruthffulQA multiple-choice task; %Truth, %Info, %Truth*Info for TruthffulQA open-ended generation task; Choice accuracy for Natural Questions, TriviaQA and FACTOR (news, expert, wiki).\n- **Datasets:** TruthfulQA, Natural Questions, TriviaQA, FACTOR (news, expert, wiki)\n\n### [Question Decomposition Improves the Faithfulness of Model-Generated Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.11768)\n- **Metrics:** Accuracy, Final Answer Truncation Sensitivity, Final Answer Corruption Sensitivity, Biased-Context Accuracy Change.\n- **Datasets:** HotpotQA, OpenbookQA, StrategyQA, TruthfulQA.\n\n### [Self-contradictory Hallucinations of Large Language Models: Evaluation, Detection and Mitigation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15852)\n- **Metrics:** For detection: Precision, Recall, F1. For Mitigation: Ratio of self-contradiction removed, Ratio of informative facts retained, perplexity increased.\n- **Datasets:** Custom Open-domain Text Generation dataset, LLM-generated encyclopedic text descriptions for Wikipedia entities, PopQA.\n\n### [Detecting hallucinations in large language models using semantic entropy](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-024-07421-0)\n- **Metrics:** For detection: AUROC, AURAC.\n- **Datasets:** QA: TriviaQA, SQuAD, BioASQ, NQ-Open, SVAMP. FactualBio, a biography-generation dataset, accompanying this paper.\n\n### [CAST: Cross-modal Alignment Similarity Test for Vision Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.11007)\n- **Metrics:** Propose CAST, a simple self-consistency metric that seeks to evaluate whether multimodal models are consistent across modalities. This works in two stage, in the first stage the models generate similarities\u002Ftrue statements comparing two inputs, and in the second stage the model judges its own output for truthfulness. A consistent model should therefore always evaluate its own outputs as true.\n\n### [Uncertainty Quantification for Language Models: A Suite of Black-Box, White-Box, LLM Judge, and Ensemble Scorers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.19254)\n- **Metrics:** Black-box (consistency-based scorers): non-contradiction probability, normalized semantic negentropy, normalized cosine similarity, BERTSCore, BLEURT, and exact match rate. White-box (token-probability-based) scorers: minimum token probability, length-normalized token probability. LLM-as-a-Judge scorers: categorical (incorrect\u002Funcertain\u002Fcorrect). Proposed novel, tunable ensemble scorer that is a weighted average of any combination of black-box, white-box, and LLM-as-a-Judge scorers, where weights can be tuned using a user-provided set of graded LLM responses. \n\n## Domain-specific Entries\n\n### [Med-HALT: Medical Domain Hallucination Test for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15343)\n- **Metrics:** Reasoning Hallucination Tests (False Confidence Tests, None of the Above Tests, Fake Questions Tests), Memory Hallucination Tests (Abstract-to-Link Tests, PMID-to-Title Tests, Title-to-Link Tests, Link-to-Title Tests); Accuracy, Pointwise Score.\n- **Datasets:** Med-HALT: MEDMCQA, Headqa, Medqa USMILE, Medqa (Taiwan), Pubmed.\n\n### [Retrieval-Based Prompt Selection for Code-Related Few-Shot Learning](https:\u002F\u002Fpeople.ece.ubc.ca\u002Famesbah\u002Fresources\u002Fpapers\u002Fcedar-icse23.pdf)\n- **Metrics:** Accuracy, Accuracy plausible match\n- **Datasets:** ATLAS dataset, TFix dataset\n- **Comments:**: Published at ICSE 2023\n\n\n\n## Overviews, Surveys, and Shared Tasks\n\n- [Mitigating LLM Hallucinations: a multifaceted approach](https:\u002F\u002Famatriain.net\u002Fblog\u002Fhallucinations)\n- [Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.01219)\n- [Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629)\n- [A Survey of Hallucination in Large Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05922)\n- [A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions](https:\u002F\u002Fgithub.com\u002FLuckyyySTA\u002FAwesome-LLM-hallucination)\n    - Paper available [here](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05232)\n    - Two main categories: *factuality hallucinations* and *faithfulness hallucinations*. Factuality hallucinations emphasise the discrepancy between generated content and verifiable real-world facts, typically manifesting as factual inconsistencies or fabrications. Faithfulness hallucinations refer to the divergence of generated content from user instructions or the context provided by the input, as well as self-consistency within generated content.\n- [LLM Powered Autonomous Agents](https:\u002F\u002Flilianweng.github.io\u002Fposts\u002F2023-06-23-agent\u002F)\n- [SemEval-2024 Task-6 - SHROOM, a Shared-task on Hallucinations and Related Observable Overgeneration Mistakes](https:\u002F\u002Fhelsinki-nlp.github.io\u002Fshroom\u002F)\n- [llm-hallucination-survey](https:\u002F\u002Fgithub.com\u002FHillZhang1999\u002Fllm-hallucination-survey)\n- [How Do Large Language Models Capture the Ever-changing World Knowledge? A Review of Recent Advances](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07343)\n- [The Dawn After the Dark: An Empirical Study on Factuality Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03205)\n\n![Taxonomy from Huang et al.](figures\u002Fhuang_taxonomy.png \"Taxonomy\")\n\n## Taxonomies\n\n[Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629) classifies metrics in *Statistical* (ROUGE, BLEU, PARENT, Knowledge F1, ..) and *Model-based* metrics. The latter are further structured in the following classes:\n- **Information-Extraction (IE)-based**: retrieve an answer from a knowledge source and compare it with the generated answer -- there might be problems due to the error propagation from the IE model.\n- **QA-based**: measure the overlap\u002Fconsistency between generation and source reference, based on the intuition that similar answers will be generated from the same question if the generation is factually consistent with the source reference. Used to evaluate hallucinations in summarisation, dialogue, and data2text generation. Composed of a *question generation* model and a *question answering* model.\n- **Natural Language Inference (NLI)-based**: based on the idea that only the source knowledge reference should entail the entirety of the information in faithful and hallucination-free generation.\n\n[A Survey of Hallucination in “Large” Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05922) surveys papers flagging them for *detection*, *mitigation*, *tasks*, *datasets*, and *evaluation metrics*. Regarding hallucinations in text, it categorises papers by *LLMs*, *Multilingual LLMs*, and *Domain-specific LLMs*.\n\n[The Dawn After the Dark: An Empirical Study on Factuality Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03205) proposed a taxonomy of different types of hallucinations: Entity-error Hallucination, Relation-error Hallucination, Incompleteness Hallucination, Outdatedness Hallucination, Overclaim Hallucination, Unverifiability Hallucination.\n\n[Internal Consistency and Self-Feedback in Large Language Models: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14507) proposed a new perspective, **Internal Consistency**, to approach \"enhancing reasoning\" and \"\"alleviating hallucinations\". This perspective allowed us to unify many seemingly unrelated works into a single framework. To improve internal consistency (which in turn enhances reasoning ability and mitigates hallucinations), this paper identified common elements across various works and summarized them into a Self-Feedback framework.\n\nThis framework consists of three components: Self-Evaluation, Internal Consistency Signal, and Self-Update.\n\n- **Self-Evaluation**: Responsible for evaluating the model's internal consistency based on its language expressions, decoding layer probability distributions, and hidden states.\n- **Internal Consistency Signal**: Through Self-Evaluation, we can obtain numerical, textual, external, and even comparative signals.\n- **Self-Update**: Using these signals, we can update the model's expressions or even the model itself to improve internal consistency.\n\n## Measuring Hallucinations in LLMs\n- [AnyScale - Llama 2 is about as factually accurate as GPT-4 for summaries and is 30X cheaper](https:\u002F\u002Fwww.anyscale.com\u002Fblog\u002Fllama-2-is-about-as-factually-accurate-as-gpt-4-for-summaries-and-is-30x-cheaper)\n- [Arthur.ai - Hallucination Experiment](https:\u002F\u002Fwww.arthur.ai\u002Fgap-articles\u002Fhallucination-experiment)\n- [Vectara - Cut the Bull…. Detecting Hallucinations in Large Language Models](https:\u002F\u002Fvectara.com\u002Fcut-the-bull-detecting-hallucinations-in-large-language-models\u002F)\n- [Vectara LLM Hallucination Leaderboard](https:\u002F\u002Fgithub.com\u002Fvectara\u002Fhallucination-leaderboard)\n- [TofuEval: Evaluating Hallucinations of LLMs on Topic-Focused Dialogue Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13249)\n- [UQLM: Uncertainty Quantification for Language Models](https:\u002F\u002Fgithub.com\u002Fcvs-health\u002Fuqlm)\n\n\n## Open Source Models for Measuring Hallucinations\n- [MiniCheck Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002FLiyan06\u002FMiniCheck)\n- [AlignScore Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Fyuh-zha\u002FAlignScore)\n- [Google True Teacher Model - HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ft5_11b_trueteacher_and_anli)\n- [Hallucination Evaluation Model - HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fvectara\u002Fhallucination_evaluation_model)\n- [Summac Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Ftingofurro\u002Fsummac)\n- [SCALE Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Fasappresearch\u002Fscale-score)\n\n## Definitions and Notes\n\n### Extrinsic and Intrinsic Hallucinations\n\n[Neural Path Hunter](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08455) defines as *extrinsic hallucination* as an utterance that brings a new span of text that does not correspond\nto a valid triple in a KG, and as *intrinsic hallucination* as an utterance that misuses either the subject or object in a KG triple such that there is no direct path between the two entities. [Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629) defines as *extrinsic hallucination* a case where  the generated output that cannot be verified from the source content, and as an *intrinsic hallucination* a case where the generated output contradicts the source content.\n\n## Citing this repository\n\n```\n@misc{MinerviniAHD2024,\n  author = {Pasquale Minervini and Aryo Pradipta Gema and others},\n  title = {awesome-hallucination-detection},\n  year = {2024},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection}}\n}\n```\n","# awesome-hallucination-detection\n\n[![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection) [![License: Apache 2.0](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache_2.0-blue.svg)](https:\u002F\u002Fopensource.org\u002Flicenses\u002FApache-2.0) [![PRs Welcome](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPRs-welcome-brightgreen.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fpulls) [![Papers](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPapers-139-blue.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection) [![Maintained](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMaintained-yes-green.svg)](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection)\n\n## 论文与摘要\n\n### [The Hidden Life of Tokens: Reducing Hallucination of Large Vision-Language Models via Visual Information Steering](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.03628)\n- **评估指标（Metrics）：** CHAIRs, CHAIRi, POPE Accuracy\u002FF1, MMHal-Bench GPT-4 score, MME score\n- **数据集（Datasets）：** MSCOCO 2014 (CHAIR, 500 images), POPE (COCO subset), MMHal-Bench (96 image-question pairs), MME\n- **备注（Comments）：** 提出了 **VISTA**，一种无需训练的推理时框架，通过在激活空间中引导视觉信息来对抗大型视觉语言模型（LVLM，Large Vision-Language Model）的幻觉。揭示了 LVLM 生成过程中的三种现象：视觉信息逐渐丢失、语义有意义的 token 早期激活、以及词汇排序中隐藏的 genuine 信息。通过利用早期层 logits 在推理时强化视觉 grounding，在三种解码策略下，四种架构（LLaVA-1.5, MiniGPT-4, Shikra, InstructBLIP）的幻觉减少了约 40%。（ICML 2025）\n\n### [Look Twice Before You Answer: Memory-Space Visual Retracing for Hallucination Mitigation in Multimodal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.03577)\n- **评估指标（Metrics）：** CHAIRs, CHAIRi, POPE Accuracy\u002FF1, HallusionBench (fACC, qACC, aACC), MME, MMBench, MM-Vet, LLaVA-Bench, VizWiz-VQA\n- **数据集（Datasets）：** POPE (MSCOCO, A-OKVQA, GQA), CHAIR, HallusionBench, MME, MMBench, MM-Vet, LLaVA-Bench, VizWiz-VQA\n- **备注（Comments）：** 提出了 **MemVR**，一种受人类认知启发的无需训练解码方法：当模型在生成过程中表现出不确定性时，视觉 token 通过前馈网络（Feed Forward Network）作为键值记忆（key-value memory）重新注入，类似于记忆消退时\"再看一眼\"图像。在八个基准测试和多种多模态大语言模型（MLLM，Multimodal Large Language Model）架构（LLaVA-1.5, Qwen-VL, GLM4V）上显著减少幻觉，同时保持通用能力。（ICML 2025）\n\n### [Robust Multimodal Large Language Models Against Modality Conflict](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.07151)\n- **评估指标（Metrics）：** ROUGE-L F1, Hallucination Rate, LLM-Judge Score (0–4)\n- **数据集（Datasets）：** MMMC (Multimodal Modality Conflict, 20K image-question-answer triples from Visual Genome), HallusionBench, MMBench, MMStar, MMMU, MathVista, OCRBench, AI2D, MMVet, MME\n- **备注（Comments）：** 正式定义了**模态冲突（modality conflict）**——视觉输入与文本输入之间的矛盾使 MLLM 陷入两难困境——是幻觉的主要驱动因素。引入了 **MMMC** 数据集，包含对象、属性和关系冲突，并评估了三种缓解策略：提示工程（prompt engineering）、监督微调（supervised fine-tuning）和强化学习（reinforcement learning）。发现强化学习（RL，Reinforcement Learning）提供了最稳健的防御，训练模型优先采信视觉证据而非误导性文本线索。（ICML 2025）\n\n### [GLSim: Detecting Object Hallucinations in LVLMs via Global-Local Similarity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.19972)\n- **评估指标（Metrics）：** AUROC, AUPR\n- **数据集（Datasets）：** MSCOCO (5K validation images, 80 object classes), Objects365 (5K validation images, 365 object classes)\n- **备注（Comments）：** 提出了 **GLSim**，一种无需训练的幻觉检测框架，结合了图像与文本模态之间的全局和局部嵌入相似性（embedding similarity）互补信号。通过从中间层嵌入中提取连续的幻觉似然分数，GLSim 同时捕捉上下文和细粒度视角。在多个 LVLM（LLaVA-1.5, MiniGPT-4, Shikra, InstructBLIP, Qwen2.5-VL）上优于竞争性基线，无需外部监督或评判模型。（NeurIPS 2025）\n\n### [Intervene-All-Paths: Unified Mitigation of LVLM Hallucinations across Alignment Formats](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.17254)\n- **评估指标（Metrics）：** POPE Accuracy\u002FF1, MCQ-POPE Accuracy\u002FMacro-F1, CHAIRs, CHAIRi, MME (Existence, Count, Position, Color)\n- **数据集（Datasets）：** POPE (COCO, A-OKVQA, GQA), MCQ-POPE, CHAIR (COCO), MME\n- **备注（Comments）：** 证明了 LVLM 幻觉源于三种相互作用的因果路径：image-to-input-text、image-to-output-text 和 text-to-text。提出了一种无需训练的 head-level 干预框架，识别所有路径中的关键**幻觉头（hallucination heads）**，并针对不同问答格式（是\u002F否、多选、开放式）应用定制化修正。在不同对齐类型上实现一致的幻觉减少。（NeurIPS 2025）\n\n### [One SPACE to Rule Them All: Jointly Mitigating Factuality and Faithfulness Hallucinations in LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.11088)\n- **评估指标（Metrics）：** TruthfulQA (MC1, MC2, Truthfulness, Informativeness, True*Info), PDTB DISQ Score (Targeted, Counterfactual, Consistency, Overall)\n- **数据集（Datasets）：** TruthfulQA, PDTB (Penn Discourse TreeBank)\n- **备注（Comments）：** 揭示了一个关键的零和动态：针对事实性（factuality）的干预往往会损害忠实性（faithfulness），反之亦然。**SPACE** 通过证明两种幻觉类型在神经表征中共享重叠子空间来解决这一问题。使用双任务特征建模、谱聚类（spectral clustering）和注意力头显著性评分来识别和编辑共享的激活子空间，从而在不牺牲指令遵循能力的情况下，同时缓解两种幻觉类型。（NeurIPS 2025）\n\n### [Reasoning Models Hallucinate More: Factuality-Aware Reinforcement Learning for Large Reasoning Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.24630)\n- **评估指标（Metrics）：** Pass@1 Accuracy (reasoning), Truthfulness ratio (TruthfulQA), Accuracy (HaluEval-QA), Truthfulness ratio (HalluQA)\n- **数据集（Datasets）：** TruthfulQA (817 samples), HaluEval-QA (10K samples), HalluQA (450 samples), GSM8K (1,319 samples), MATH-500, AIME 2024, AIME 2025\n- **备注（Comments）：** 揭示了推理模型（使用思维链 CoT，Chain-of-Thought）在复杂事实问题上实际上比基础模型产生**更多**幻觉，因为扩展的生成提供了更大的事实性漂移（factuality drift）表面。引入了 **FSPO**（Factuality-aware Step-wise Policy Optimization，事实感知逐步策略优化），一种强化学习微调算法，在每个推理步骤中纳入显式的事实性验证，动态调整 token-level 优势值以在整个推理轨迹中保持事实正确性。（NeurIPS 2025）\n\n### [幻觉的假象：重新评估大语言模型中的幻觉检测](https:\u002F\u002Faclanthology.org\u002F2025.emnlp-main.1761\u002F)\n- **评估指标（Metrics）：** AUROC（受试者工作特征曲线下面积）、PR-AUC（精确率-召回率曲线下面积）、Precision（精确率）、Recall（召回率）、F1、ROUGE-L F1\n- **数据集：** NQ-Open（3,610 个问答对）、TriviaQA（3,842 个样本）、SQuADv2（4,150 个样本）\n- **评论：** 研究表明，基于 ROUGE 的评估系统性地高估了问答任务中的幻觉检测性能。通过全面的人工研究，发现 ROUGE 的精确率极低，且多种已建立的检测方法（Perplexity（困惑度）、EigenScore（特征值分数）、eRank）在使用与人类对齐的 LLM-as-Judge（大语言模型作为评判者）指标而非 ROUGE 进行评估时，性能下降高达 **45.9% AUROC**。研究还揭示，简单的基于长度的启发式方法可以匹敌甚至超越 Semantic Entropy（语义熵）等复杂检测器，暴露了当前评估实践中的根本性缺陷。（EMNLP 2025）\n\n### [大胆断言还是自我怀疑？基于信念状态的事实性幻觉类型检测](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.527\u002F)\n- **评估指标：** Truthful Rate（真实率）、OH（Overconfident Hallucination detection rate，过度自信幻觉检测率）、UH（Unaware Hallucination detection rate，未知幻觉检测率）、AUC\n- **数据集：** TriviaQA、NQOPEN、ALCUNA\n- **评论：** 引入了**信念状态（belief state）**的概念——一种基于多次采样中答案重复一致性来衡量模型置信度的指标——将事实性幻觉分为两类：**Overconfident（过度自信）**（模型自信地撒谎）和**Unaware（未知）**（模型因知识缺失而猜测）。提出了 **BAFH**，一种轻量级框架，通过在隐藏状态上训练前馈分类器来确定信念状态并分类幻觉类型。在八个 LLM（Gemma-2、Llama-3.1、Mistral）上与 MIND 和 SAR 基线进行了评估。（EMNLP 2025 Findings）\n\n### [通过增强精确知识利用探索事实性幻觉缓解的泛化性](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.211\u002F)\n- **评估指标：** 21 个领域的准确率\n- **数据集：** FactualBench（涵盖 21 个领域的 181K 中文事实性问答对）\n- **评论：** 提出了 **PKUE**，通过加强 LLM 内部查询与参数化知识之间的映射来缓解事实性幻觉。通过偏好优化对模型进行微调，使用自生成的精确事实问题回答，而非可能过度拟合特定提示模板的事后修正方法。引入了 **FactualBench**，一个大规模中文事实性问答数据集，并展示了在事实性任务、通用任务和多语言设置中的广泛泛化能力。（EMNLP 2025 Findings）\n\n### [迈向忠实的自然语言解释：大语言模型中激活修补的研究](https:\u002F\u002Faclanthology.org\u002F2025.emnlp-main.529\u002F)\n- **评估指标：** Accuracy（准确率）、CaF（Causal Faithfulness，因果忠实度，变体包括 CaF(M)、CaF(T)、CaF(L)）、CC-SHAP、CFF（Counterfactual Faithfulness，反事实忠实度）、Plausibility（合理性）\n- **数据集：** CoS-E（Commonsense Reasoning，常识推理）、e-SNLI（Natural Language Inference，自然语言推理）、ComVE（Commonsense Validation and Explanation，常识验证与解释）\n- **评论：** 引入了 **Causal Faithfulness (CaF)**，一种使用激活修补（activation patching）来衡量模型答案背后的因果效应与其自然语言解释之间差异的指标。与先前基于输入扰动的忠实度测试（容易出现分布外问题）不同，CaF 通过 Symmetric Token Replacement（对称 token 替换）在内部隐藏状态上操作。发现对齐调优模型比基础模型产生更忠实的解释，且忠实度与合理性呈正相关。评估了六个 Gemma-2 模型（2B–27B）。（EMNLP 2025）\n\n### [InteGround：关于整合性 grounding 中验证与检索规划评估的研究](https:\u002F\u002Faclanthology.org\u002F2025.findings-emnlp.732\u002F)\n- **评估指标：** Accuracy、Groundedness verification scores（groundedness 验证分数）\n- **数据集：** 从四个领域重新利用的数据（声明验证、多跳问答）\n- **评论：** 引入了**整合性 grounding（integrative grounding）**，一项要求 LLM 为复杂查询检索和验证多个相互依赖证据的任务。发现当外部信息不完整时，LLM 会默认使用内部知识幻觉合理化。研究表明，在复杂 RAG（Retrieval-Augmented Generation，检索增强生成）场景中，带有逻辑约束的前提溯因（premise abduction）和零样本自我反思在约束幻觉级联方面优于无方向的检索规划。（EMNLP 2025 Findings）\n\n### [使用注意力图谱的谱特征进行大语言模型幻觉检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.17598)\n- **评估指标：** AUROC、Precision、Recall、Cohen's Kappa（科恩卡帕系数）\n- **数据集：** NQ-Open、TriviaQA、CoQA、SQuADv2、HaluEval-QA、TruthfulQA、GSM8K\n- **评论：** 将 LLM 视为动态图，分析内部注意力机制的结构特性。从注意力图谱中提取**谱特征（spectral features）**（特征值）来预测模型何时在编造信息：事实检索产生稳定的特征结构，而幻觉导致扩散、混乱的模式。创建了一个白盒幻觉检测器，独立于生成的语义内容运行，在七个问答基准上进行了评估。（EMNLP 2025）\n\n### [PruneCD：对比剪枝自模型以提高解码事实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.16598)\n- **评估指标：** TruthfulQA（MC1、MC2、%Truth、%Info）、FACTOR、StrategyQA Accuracy\n- **数据集：** TruthfulQA、FACTOR（News、Wiki）、StrategyQA\n- **评论：** 通过**动态层剪枝（dynamic layer pruning）**而非简单截断来构建\"业余\"模型，解决了早退对比解码的局限性。移除特定的中间推理层会产生更好的校准对比先验，具有更有信息量的 logits，引导生成远离事实上错误但高概率的 token，同时保持流畅性。以最小推理开销实现一致的事实性改进。（EMNLP 2025）\n\n### [Re-FRAME 会议摘要 SCOPE：基于事实的摘要与通过问题的个性化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.15901)\n- **评估指标：** ROUGE（R-1、R-2、R-L）、BERTScore F1、MESA（8 个维度包括幻觉）、P-MESA（7 个个性化维度）、Balanced Accuracy（平衡准确率）、Cohen's kappa\n- **数据集：** QMSum（ICSI、AMI、WPCP 会议）、FAME（500 个英文、300 个德文合成会议）\n- **评论：** 通过引入 **FRAME** 流程和 **SCOPE** 协议解决会议摘要幻觉问题。FRAME 提取显著事实并按主题评分；SCOPE 强制模型在生成摘要前通过九个问题的推理轨迹对上下文选择进行结构性论证。引入了 **P-MESA**，一个多维个性化评估框架，并表明在 QMSum 和 FAME 上，FRAME 将 MESA 上的幻觉和遗漏降低 2\u002F5 分，而 SCOPE 在知识契合度和目标对齐方面优于仅提示基线。（EMNLP 2025 Findings）\n\n### [CCHall: 面向大语言模型跨语言与跨模态幻觉联合检测的新型基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.19108)\n- **评估指标（Metrics）：** Accuracy（准确率）、Macro-F1（宏平均 F1 分数）\n- **数据集（Datasets）：** CCHall（基于 AMBER、GQA、XM3600、xFlickr&Co 构建；涵盖 3 个资源级别的 9 种语言：高资源——法语、西班牙语、葡萄牙语；中资源——捷克语、荷兰语、瑞典语；低资源——克罗地亚语、威尔士语、斯瓦希里语）\n- **备注（Comments）：** 首次提出 **CCHall** 基准，用于 **跨语言（cross-lingual）与跨模态（cross-modal）幻觉** 的联合交叉研究。评估了领先模型（GPT-4o、Gemini-1.5、Llama-3.2-Vision）在以下场景中的表现：模型可能在英语视觉识别中正确识别物体，但在用其他语言生成描述时对其属性产生幻觉。将幻觉分为四类：非幻觉、仅跨语言幻觉、仅跨模态幻觉，以及联合跨语言\u002F跨模态幻觉。（ACL 2025）\n\n### [Hallucination Detox: 面向大语言模型训练的敏感性 Dropout（SenD）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.15460)\n- **评估指标（Metrics）：** SelfCheckGPT、FactScore、EigenScore、Efficient EigenScore（EES，高效特征分数）、Semantic Entropy（语义熵）、Perplexity（困惑度）、HaluEval Accuracy、ROUGE-1（XSum）\n- **数据集（Datasets）：** HELM（5 万篇维基百科文章）、MedHALT、LegalBench、HaluEval、XSum\n- **备注（Comments）：** 提出 **Sensitivity Dropout（SenD，敏感性 Dropout）**，一种通过在训练过程中确定性丢弃高变异性敏感嵌入索引来降低幻觉方差的训练协议。将该干预方法与 **Efficient EigenScore（EES）** 配对使用，EES 是一种无监督指标，以 2 倍速度近似 EigenScore。在不损害下游任务性能的前提下，提升 Pythia 和 Llama 模型的测试时可靠性和事实准确性。（ACL 2025）\n\n### [ETF: 面向代码摘要幻觉检测的实体追踪框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.14748)\n- **评估指标（Metrics）：** Precision（精确率）、Recall（召回率）、F1、Accuracy（准确率，实例级和实体级）、Jaccard Similarity（Jaccard 相似度）\n- **数据集（Datasets）：** 自定义数据集（来自 7 个大语言模型的 411 条摘要，9,933 个实体级样本，源自 CodeXGLUE Java Code-To-Text）\n- **备注（Comments）：** 首次提出 **ETF**，首个专为 **代码摘要（code summarization）** 定制的幻觉检测框架。利用静态分析原理，通过追踪源代码到生成摘要中的代码实体（变量、方法、类）来检测内在幻觉（intrinsic hallucinations）和外在幻觉（extrinsic hallucinations）。识别虚构实体（外在幻觉）和错误的实体归属（内在幻觉）。提供一个新颖的标注数据集，包含 4,354 个人工审核的实体元组（Cohen's Kappa：0.72）。（ACL 2025）\n\n### [长文本问答中的错误定位与缓解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11930)\n- **评估指标（Metrics）：** Human preference rate（人类偏好率）、span-level error annotation（片段级错误标注，5 种错误类型）\n- **数据集（Datasets）：** HaluQuestQA（698 个问答对，1.8K 个片段级专家标注）\n- **备注（Comments）：** 首次提出 **HaluQuestQA**，一个包含专家片段级标注的数据集，用于标注复杂长文本问答答案中的特定幻觉和遗漏错误。在这些标注上训练自动化反馈模型以检测问题片段，然后应用 **Error-Informed Refinement（错误感知优化）** 重写幻觉或不完整的片段。在基线生成结果上达到 **84% 的人类偏好率**。（ACL 2025 Findings）\n\n### [幻觉校正能否提升视频-语言对齐？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.15079)\n- **评估指标（Metrics）：** Accuracy（准确率，VELOCITI）、Mean Average Precision（平均精度均值，SSv2-Temporal、SSv2-Events）、GPT-Evaluated Score（GPT 评估分数，MSRVTT-QA）\n- **数据集（Datasets）：** VELOCITI、SSv2-Temporal、SSv2-Events、MSRVTT-QA、VideoCon（11.5 万训练三元组）\n- **备注（Comments）：** 颠覆传统的缓解范式，利用 **幻觉校正作为自训练目标（self-training objective）** 来主动提升视频-语言对齐。**HACA** 框架在自训练过程中学习识别并重写幻觉化的空间和时间描述，将幻觉视为信息性训练信号而非噪声。在零样本视频-字幕绑定和复杂文本到视频检索方面取得显著提升。（ACL 2025 Findings）\n\n### [监控解码：通过评估生成过程中部分回复的事实性来缓解幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.03106)\n- **评估指标（Metrics）：** Exact Match（精确匹配，TriviaQA、NQ-Open）、Truth\u002FInfo\u002FTruth×Info scores（TruthfulQA）、Accuracy（准确率，GSM8K）、Latency（延迟，ms\u002Ftoken）、Throughput（吞吐量，token\u002Fs）\n- **数据集（Datasets）：** TruthfulQA（817 个问题）、TriviaQA（1,200 个样本）、NQ-Open（1,000 个样本）、GSM8K（1,319 个样本）\n- **备注（Comments）：** 提出一种实时、逐词元（token-by-token）监控框架，在生成过程中持续对部分回复的事实性进行评分。当在生成中期检测到即将发生的幻觉时，动态调整解码轨迹以强制事实一致性，解决 **雪球效应（snowballing）** 问题——即单个错误词元导致后续虚构内容的连锁反应。将范式从生成后校正转变为主动预防性解码。（ACL 2025 Findings）\n\n### [DRIFT: 检测表征不一致性以实现事实真实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.14210)\n\n- **评估指标（Metrics）：** AUROC、AURAC、Accuracy（准确率）\n- **数据集（Datasets）：** TriviaQA、NQ-Open、MMLU-Pro、WebQuestions\n- **备注（Comments）：** 在**中间层隐藏状态（intermediate hidden states）**而非最终层输出上训练轻量级探测模型（300 万至 3700 万参数），因为中间层保留了词元投影步骤会丢弃的不确定性信号。两种模式：**question-only（仅问题）**（在生成前或生成过程中运行，无额外延迟）和 **question+answer（问题+答案）**（等待完整回复，更准确）。内置的 **LLM router（大语言模型路由器）** 利用探测模型置信度决定是返回答案还是转交给更强的模型 \u002F RAG（检索增强生成）。在 12 个模型-数据集组合中的 10 个上超越 **HaloScope** 和 **Semantic Entropy**（LLaMA-2、Qwen-2.5、Gemma-3 × 四个 QA 基准），AUROC 最高提升 13 个百分点。在一个数据集上训练的探测模型无需重新训练即可迁移到其他数据集（AUROC 72-93）。\n\n### [联合评估答案与推理一致性以检测大型推理模型中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.04832)\n\n- **评估指标：** AUROC（受试者工作特征曲线下面积）\n- **数据集：** HotpotQA、TriviaQA、NQ-Open、SQuAD\n- **说明：** 提出了 **RACE**，首个专为 **大型推理模型（Large Reasoning Models, LRMs）** 如 DeepSeek-R1 设计的黑盒幻觉检测框架，解决了现有方法忽视的关键问题：幻觉往往源于模型的推理轨迹（reasoning traces）而非最终答案。通过联合评估推理一致性、答案不确定性、推理-答案对齐度以及内部一致性，RACE 提供了一种细粒度且鲁棒的检测器，在多个数据集和模型家族上持续超越最先进的基线方法。RACE 表明，针对现代推理模型的有效幻觉检测必须同时评估**模型回答了什么**以及**它是如何推理的**，并**开创了 LRMs 黑盒幻觉检测的方向**。此外，RACE 提供了用户友好的代码接口，便于测试和改进。\n\n### [学习推理以实现幻觉片段检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.02173)\n\n- **评估指标：** 精确率（Precision）、召回率（Recall）、F1 分数\n- **数据集：** RAGTruth\n- **说明：** 提出了 **RL4HS**，一种用于*片段级幻觉检测（span-level hallucination detection）*的强化学习框架，将*思维链推理（chain-of-thought reasoning）*与*片段级奖励（span-level rewards）*相结合。基于 **组相对策略优化（Group Relative Policy Optimization, GRPO）** 构建，并采用 **类别感知策略优化（Class-Aware Policy Optimization, CAPO）** 解决幻觉片段与非幻觉片段之间的奖励不平衡问题。在 RAGTruth（问答、摘要、数据到文本）上，RL4HS 显著优于基于 CoT 和监督学习的基线方法，实现了更精细的检测。\n\n### [当模型说谎，我们学习：基于 PsiloQA 的多语言片段级幻觉检测](https:\u002F\u002Fhuggingface.co\u002Fpapers\u002F2510.04849)\n\n- **评估指标：** IoU（交并比）、AP（平均精度）\n- **数据集：** PsiloQA、RAG-Truth、Mu-SHROOM、FAVA、HalluEntity\n- **说明：** 提出了 **PsiloQA**，一个用于**多语言片段级幻觉检测**的大规模数据集，既可作为检测器模型的**基准测试（benchmark）**，也可作为**训练资源**。在先前仅支持英语和序列级数据集的基础上，**PsiloQA 通过自动化的三阶段流程（问答生成、幻觉答案诱导、基于 GPT-4o 的片段标注），为 14 种语言提供细粒度监督**。它支持对基于不确定性、基于编码器和基于大语言模型的检测器进行全面评估和训练，展现出强大的跨语言泛化能力和成本高效的可扩展性。\n\n### [通过数据增强的短语级对齐缓解多模态大语言模型中的目标幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.18654)\n\n- **评估指标：** CHAIRi、CHAIRs；AMBER（CHAIR、Cover、Hall. rate、F1）；MME-Hall 分数\n- **数据集：** MSCOCO（CHAIR）、AMBER、MME-Hall、MMHal-Bench、HallusionBench\n- **说明：** 提出了数据增强的短语级对齐（Data-augmented Phrase-level Alignment, DPA）和 **HALVA**，通过短语级增强构建幻觉\u002F正确响应对，并使用短语级对齐损失进行训练以降低幻觉短语的权重。在多个基准测试中减少目标幻觉的同时，保持通用视觉-语言性能。\n\n### [多模态大语言模型能看见吗？用于幻觉缓解的动态校正解码](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.11779)\n\n- **评估指标：** CHAIRs、CHAIRi；POPE F1；MME 分数；AMBER（CHAIR、Cover、Hal、Cog）\n- **数据集：** MSCOCO 2014（图像描述）、POPE、MME、AMBER\n- **说明：** 提出了 **DeCo**，一种模型无关的解码方法，通过自适应混合较早层的表示来抵消语言先验对视觉证据的抑制。以适度的延迟开销和标准解码策略的兼容性，减少多模态大语言模型中的目标幻觉。\n\n### [AVHBench：面向视听大语言模型的跨模态幻觉基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.18325)\n\n- **评估指标：** 准确率、精确率、召回率、F1（判断任务）；METEOR、CIDEr、GAVIE（描述任务）\n- **数据集：** AVHBench（2,136 个视频；4 项任务：A->V、V->A、A-VMat、A-VCap；5,302 个问答对；1,106 个描述）\n- **说明：** 提出了 **AVHBench**，一个面向视听大语言模型的跨模态幻觉基准，包含三项判断任务和一项描述任务，用于探测音频\u002F视频基础不匹配问题。为感知和理解任务提供了精选数据集和评估协议。\n\n### [以谎言教学：基于合成负样本的课程 DPO 用于幻觉检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.17558v1)\n\n- **评估指标：** 准确率、精确率、召回率、F1 分数\n- **数据集：** MedHallu、HaluEval、DROP、CovidQA、PubMedQA\n- **说明：** 提出了 **HaluCheck**，一系列 1B–3B 参数的大语言模型检测器，通过 **直接偏好优化（Direct Preference Optimization, DPO）** 使用*合成幻觉负样本*进行对齐，负样本按基础难度（通过 MiniCheck）排序。引入了**课程学习（curriculum learning）**策略，将训练从较容易的负样本过渡到较困难的负样本。在 MedHallu 和 HaluEval 上实现了高达 **24% 的相对 F1 提升**，并在 DROP、CovidQA 和 PubMedQA 上展现出强大的零样本鲁棒性——超越更大的最先进模型。\n\n### [MultiHal：用于基于知识图谱的大语言模型幻觉评估的多语言数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.14101)\n- **评估指标：** 基于句子嵌入的语义相似度\n- **数据集：** MultiHal\n- **说明：** 提出了一个新的真实语言建模基准 **MultiHal**。在 Shroom2024、HaluEval、HaluBench、TruthfulQA、Felm、Defan 和 SimpleQA 等过往基准的基础上，通过从 Wikidata 挖掘相关知识图谱路径进行扩展。**MultiHal** 可用于比较知识更新方法（如 RAG 和 KG-RAG），以及使用挖掘的 KG 路径进行事实性评估。\n\n### [基于信念树传播的大语言模型幻觉检测概率框架](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.158\u002F)\n- **评估指标：** AUROC、AUC-PR、F1、准确率\n- **数据集：** Wikibio-GPT3、FELM-Science、FactCheckGPT\n- **说明：** 提出了 **BTProp**，一种概率信念树框架，将目标陈述递归扩展为相关主张，并执行隐马尔可夫树推断以调和语言模型的自我信念与逻辑关系。在多个基准测试上，幻觉检测的 AUROC\u002FAUC-PR 比基线提升 3-9%。\n\n### [通过图像 Token 注意力引导解码缓解多模态大语言模型的幻觉](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.75\u002F)\n- **评估指标（Metrics）：** CHAIRs (CS)、CHAIRi (CI)、F1、GPT-4V 正确性评分、MME 评分\n- **数据集（Datasets）：** CHAIR (COCO)、POPE、GPT-4V 辅助评估 (COCO)、MME\n- **备注（Comments）：** 提出 **iTaD**（image Token attention-guided Decoding，图像 Token 注意力引导解码），一种即插即用的解码策略，利用对图像 Token 的注意力来选择层并应用层间对比解码。通过在注意力下降时增强图像 grounding（图像 grounding 指模型输出与图像内容的对齐），该方法在多个多模态大语言模型（MLLMs，Multi-modal Large Language Models）和基准测试中持续减少幻觉。\n\n### [超越 Logit Lens：用于视觉语言模型中鲁棒幻觉检测与 Grounding 的上下文嵌入](https:\u002F\u002Faclanthology.org\u002F2025.naacl-long.488\u002F)\n- **评估指标（Metrics）：** mAP（平均精度均值）、精确率-召回率（grounding）\n- **数据集（Datasets）：** HQH、TextVQA-X、VizWiz-G\n- **备注（Comments）：** 提出 **ContextualLens**，一种无需训练的方法，利用中间层上下文 Token 嵌入来评分答案-图像对齐度。它在多种类别的幻觉检测上取得提升，并为 GVQA（Grounded Visual Question Answering，有根据的视觉问答）任务生成精确的 grounding 框。\n\n### [HALoGEN：奇妙的 LLM 幻觉及其发现之处](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.71\u002F)\n- **评估指标（Metrics）：** 幻觉评分、响应比率、效用评分、FActScore（传记）\n- **数据集（Datasets）：** HALoGEN（10,923 条提示，涵盖 9 项任务：代码包、科学归因、摘要、简化、传记、历史事件、错误预设、二元\u002F数值合理化）\n- **备注（Comments）：** 引入一个多领域幻觉基准测试，包含任务特定的分解和验证流程，可大规模评分原子事实。提供与训练数据来源相关的 A\u002FB\u002FC 类错误分类法，用于分析幻觉来源。\n\n### [视觉证据提示缓解大视觉语言模型中的幻觉](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.205\u002F)\n- **评估指标（Metrics）：** 准确率、CHAIR\n- **数据集（Datasets）：** POPE、AMBER、RPE（关系探测评估，基于 Visual Genome 构建）\n- **备注（Comments）：** 提出视觉证据提示，将目标检测和场景图模型的输出作为结构化提示注入，以减少大视觉语言模型（LVLMs，Large Vision-Language Models）中的目标和关系幻觉。引入 RPE 用于评估关系幻觉。\n\n### [ICRProbe：追踪隐藏状态动态以实现 LLM 中可靠的幻觉检测](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.880\u002F)\n- **评估指标（Metrics）：** AUROC（受试者工作特征曲线下面积）\n- **数据集（Datasets）：** HaluEval、SQuAD、TriviaQA、HotpotQA\n- **备注（Comments）：** 定义 ICR 评分（Information Contribution to Residual Stream，对残差流的信息贡献）和 ICR Probe，通过聚合层级的残差更新进行无需参考的幻觉检测，以轻量级 MLP（多层感知机）超越先前的隐藏状态基线方法。\n\n### [HalluLens：LLM 幻觉基准测试](https:\u002F\u002Faclanthology.org\u002F2025.acl-long.1176\u002F)\n- **评估指标（Metrics）：** 错误拒绝率、幻觉率、正确回答率、精确率、Recall@K、F1@K、错误接受率\n- **数据集（Datasets）：** HalluLens（PreciseWikiQA、LongWiki、NonExistentRefusal 含 MixedEntities\u002FGeneratedEntities）\n- **备注（Comments）：** 引入区分内在幻觉与外在幻觉的分类法，以及包含动态生成外在任务的基准测试以减少数据泄露。为短问答、长问答和知识缺口拒绝提供新的评估任务。\n\n### [相似度-距离-幅度通用验证](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.20167)\n- **评估指标（Metrics）：** 索引条件校准（即给定 alpha' 值下的联合预测-类别条件校准）\n- **数据集（Datasets）：** Factcheck (Azaria and Mitchell, 2023)；情感分析；MMLU；MMLU-pro（重要的是这些包含分布偏移和分布外评估）\n- **备注（Comments）：** 引入相似度-距离-幅度（SDM，Similarity-Distance-Magnitude）激活函数、SDM 校准和 SDM 网络，后者是具有不确定性感知验证和基于示例的可解释性作为内在属性的神经网络（如 LLM）。一个可用于结合检索（或其他适用的 grounding）进行幻觉检测的 SDM 估计器示例可通过 [开源 Reexpress MCP 服务器](https:\u002F\u002Fgithub.com\u002FReexpressAI\u002Freexpress_mcp_server) 获取。(2025)\n\n### [用于联合检索与分类的粗到细记忆匹配](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02287)\n- **评估指标（Metrics）：** 准确率和 FEVER 评分\n- **数据集（Datasets）：** FEVER 和 Schuster 等人 (2019) 的二分类分析集\n- **备注（Comments）：** 引入基于示例的可解释性，用于单模型的多阶段检索和分类，包括通过双编码序列对齐进行特征检测。包含一种在双编码和交叉编码序列的搜索图上进行束搜索（beam search）的方法，以及一种早期方法，用于基于密集匹配将检索系统的输出约束到支持集。（这实际上是使用 Transformer 语言模型进行测试时计算（test-time compute）的早期示例。不同于使用强化学习，多阶段搜索通过双编码和交叉编码序列上的对比损失进行端到端学习。）(2020)\n\n### [从全局标签检测局部洞察：通过卷积分解进行监督与零样本序列标注](https:\u002F\u002Fdirect.mit.edu\u002Fcoli\u002Farticle\u002F47\u002F4\u002F729\u002F106772\u002FDetecting-Local-Insights-from-Global-Labels)\n- **评估指标（Metrics）：** 相对于真实标签的 F_1、F_0.5 和准确率，以及模型近似相对于原始模型预测的指标\n- **数据集（Datasets）：** 语法错误检测和情感数据集\n- **备注（Comments）：** 引入基于实例的神经网络模型度量学习器近似和硬注意力机制，可通过任务特定的归纳偏置构建以实现有效的半监督学习（即特征检测）。这些机制结合产生有效的方法，用于神经网络模型表示空间上的基于示例的可解释性。与幻觉检测（一项分类任务）直接相关：这早于 SAE（Sparse Autoencoder，稀疏自编码器）和其他基于对比表示的可解释性方法，同时提供测试实例与训练（支持集）之间的显式连接，并提供控制认知不确定性（epistemic uncertainty）的途径。后者尤其是许多后续可解释性方法（包括幻觉检测任务）实际应用中的限制因素。（本文于 2020 年提交至 Computational Linguistics 审稿，2021 年被接受发表，并于 2021 年在 EMNLP 上额外报告。）\n\n### [♟️FactCheckmate: 基于输入隐状态预先检测和缓解语言模型幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.02899)\n- **评估指标（Metrics）：** 检测任务：准确率（Acc）\n- **数据集（Datasets）：** 问答（QA）：NQ-Open、MMLU、MedMCQA、GSM8K\n- **备注：** 本研究提出了一种轻量级分类器，仅基于输入隐状态（input hidden states）即可在生成任何文本之前预先检测幻觉。当预测到幻觉时，该方法会干预这些隐状态，引导模型生成更符合事实的输出。实验表明，该方法在多种大语言模型（LLMs）上均能持续提升事实准确性，且计算开销极小。\n\n### [核语言熵（Kernel Language Entropy）：基于语义相似性的大语言模型细粒度不确定性量化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.20003)\n- **评估指标：** 检测任务：AUROC（受试者工作特征曲线下面积）、AURAC（接收者操作特征曲线下面积-覆盖率）\n- **数据集：** 问答：TriviaQA、SQuAD、BioASQ、NQ、SVAMP\n- **备注：** 本研究提出了一种评估大语言模型回复语义不确定性（semantic uncertainty）的方法。该方法生成多个回复样本并测量其语义相似性，以密度矩阵（density matrix，即语义核 semantic kernel）表示。随后通过该矩阵的冯·诺依曼熵（von Neumann entropy）量化语义不确定性。高不确定性表明可能存在幻觉，从而实现幻觉的检测与缓解。\n\n### [基于稀疏自编码器（SAE）表示工程引导大语言模型的知识选择行为](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.15999)\n- **评估指标：** 精确匹配（Exact Match）\n- **数据集：** NQSwap、Macnoise\n- **备注：** 首个利用稀疏自编码器（Sparse Auto-Encoders, SAEs）增强上下文知识（contextual knowledge）和参数知识（parametric knowledge）使用的研究。\n\n### [DeCoRe：通过对比检索头（Retrieval Heads）解码以缓解幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.18860)\n- **评估指标：** TruthfulQA 多选任务：MC1、MC2、MC3 分数；TruthfulQA 开放式生成任务：%Truth（真实率）、%Info（信息率）、%Truth*Info（真实信息率）；开放域问答任务（NQ-Open、NQ-Swap、TriviaQA、PopQA、MuSiQue）：子跨度精确匹配（subspan Exact Match）；MemoTrap：准确率；IFEval：提示级（Prompt-level）和指令级（Instruction-level）准确率\n- **数据集：** TruthfulQA、NQ-Open、NQ-Swap、TriviaQA、PopQA、MemoTrap、IFEval、MuSiQue\n\n### [语义密度（Semantic Density）：通过语义空间置信度测量实现大语言模型的不确定性量化](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2024\u002Fposter\u002F95598)\n- **评估指标：** AUROC、AUPR（精确率-召回率曲线下面积）\n- **数据集：** CoQA、TriviaQA、SciQ、NQ\n- **备注：** 提出了一种新方法——语义密度（semantic density），为检测大语言模型幻觉提供回复级别的置信度\u002F不确定性分数。语义密度从概率分布视角在语义空间中为每个回复提取不确定性\u002F置信度信息。该方法对任务类型无限制，对新模型和任务具有\"开箱即用\"（off-the-shelf）特性。在不同数据集和基础大语言模型上，该方法相比其他最先进（SOTA）方法均表现出显著提升。\n\n### [MedHallu：大语言模型医学幻觉检测综合基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14302)\n- **评估指标：** 二分类幻觉检测（精确率 Precision、召回率 Recall、F1 分数）。\n- **数据集：** *MedHallu*——源自 PubMedQA，包含 10,000 个问答对，其中植入了故意设计的似是而非的幻觉。\n- **备注：** 提出了一个大规模医学聚焦的幻觉检测基准。评估显示，在最困难的子集上，即使是 GPT-4 等顶级模型在检测细微虚假陈述时也仅能达到约 0.625 的 F1 分数，凸显了医学幻觉检测的难度。\n\n### [平滑幻觉：通过平滑知识蒸馏（Smoothed Knowledge Distillation）缓解大语言模型幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.11306)\n- **评估指标：** ROUGE-L、BERTScore、XSum\u002FCNN-DM 上的事实一致性率（通过 QuestEval 等基于问答的指标测量）。\n- **数据集：** CNN\u002FDailyMail、XSum\n- **备注：** 提出使用教师大语言模型的软标签（soft labels）进行训练，以降低过度自信并减少摘要任务中的幻觉率。在保持质量（ROUGE\u002FBERTScore）的同时显著降低事实错误。\n\n### [大型法律虚构：剖析大语言模型中的法律幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.01301)\n- **评估指标：** 幻觉率（包含任何无依据法律主张的输出百分比）。\n- **数据集：** 自定义的美国案例事实查询集，可验证真实结果。\n- **备注：** 实证研究发现，GPT-3.5 和 LLaMA-2 分别在 69% 和 88% 的法律问答中出现幻觉。强调了在未经过进一步训练或验证的情况下，将现成大语言模型应用于法律领域的风险。\n\n### [面向金融决策者的幻觉最小化数据到答案框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.07592)\n- **评估指标：** 自定义置信度分数，结合事实验证（数据重叠）、检索正确性和最终问答一致性。\n- **数据集：** 专有金融表格和查询。\n- **备注：** 展示了将大语言模型锚定于相关金融数据并应用多指标验证如何实现超过 90% 的置信正确率。证明了在金融领域有效抑制幻觉的方法。\n\n### [幻觉检测：基于嵌入距离分析的概率框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.08663)\n  * **评估指标**：准确率、F1 分数（幻觉检测性能）。\n  * **数据集**：合成问答数据集（使用 Llama2-7B 和 Llama3-8B 生成），标注为幻觉回复与非幻觉回复。\n  * **备注**：提出分析大语言模型输出的嵌入空间（embedding space）以检测幻觉。通过测量真实答案与幻觉答案中嵌入关键词的闵可夫斯基距离（Minkowski distance），揭示结构差异，在不依赖外部事实核查的情况下实现具有竞争力的幻觉检测准确率（约 66%）。\n\n### [通过细粒度人工智能反馈检测和缓解大型视觉语言模型中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.14233)\n  * **评估指标**：准确率、精确率\u002F召回率、F1（MHaluBench、MFHaluBench 等基准上的幻觉检测）；幻觉率以及 CHAIR、Cover、Hal、Cog 等指标（用于 Object HalBench 和 AMBER 等生成基准的幻觉缓解评估）。\n  * **数据集**：MHaluBench、MFHaluBench（视觉语言幻觉检测数据集）；Object HalBench、AMBER、MMHal-Bench、POPE（大型视觉语言模型的幻觉缓解基准）。\n  * **备注**：提出 *HSA-DPO*（严重程度感知直接偏好优化），一种利用细粒度人工智能反馈标注幻觉严重程度并在训练中优先处理关键错误的方法。该方法在视觉幻觉检测方面达到最先进性能（优于 GPT-4V 及其他模型），并显著降低生成输出中的幻觉发生率（例如，在 AMBER 上降低 36%，在 Object HalBench 上相比基础模型降低 76%）。\n\n### [Pelican: 通过声明分解和思维程序验证纠正视觉大语言模型中的幻觉](https:\u002F\u002Faclanthology.org\u002F2024.emnlp-main.470\u002F)\n  * **评估指标（Metrics）**：幻觉率降低百分比（%）以及在多个视觉-语言指令基准测试（如 MMHal-Bench、GAVIE、MME）上的事实准确性提升。\n  * **数据集（Datasets）**：MMHal-Bench、GAVIE（用于大型视觉语言模型 LVLM 的幻觉评估基准）；MME（通用视觉-语言理解基准）。\n  * **说明（Comments）**：提出一个名为 *Pelican* 的框架，通过声明验证来检测和缓解视觉幻觉。该框架将基于图像的声明分解为子声明，并使用 *思维程序（program-of-thought）*（结合外部工具的代码执行）来验证每个子声明的真实性。随后由大语言模型（LLM）评估整体一致性。Pelican 显著降低了幻觉率（在各种 LVLM 上降幅约为 8–32%，比之前缓解方法低 27%），同时保持或提升了模型在遵循视觉指令时的事实准确性。\n\n### [区分大语言模型幻觉中的无知与错误](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22071)\n  * **评估指标（Metrics）**：幻觉类型分布（尽管有知识但仍出现幻觉的普遍性 vs. 因缺乏知识导致的幻觉）；区分 HK+ 与 HK- 情况的分类准确率；分别处理这些类型时检测\u002F缓解效果的提升。\n  * **数据集（Datasets）**：*WACK*（Wrong Answers despite Correct Knowledge，尽管知识正确但答案错误）——基于 TriviaQA 和 NaturalQuestions 构建的数据集，包含标注为 HK-（因知识缺失导致的幻觉）或 HK+（尽管模型知道答案但仍出现幻觉）的问答实例，针对特定大语言模型。\n  * **说明（Comments）**：研究幻觉的两种不同成因：模型确实不知道答案（无知）vs. 模型知道答案但仍回答错误（错误）。提出一种自动化方法，通过在多种提示场景下测试模型来生成模型特定的标注样本（WACK 数据集）。研究表明，基于大语言模型内部表征的简单分类器可以区分这两种情况，并且针对模型的 HK+（有知识的错误）情况定制检测\u002F缓解策略，比一刀切的方法效果更好。\n\n### [一种缓解生成式信息检索中幻觉的遗传算法方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00085)\n  * **评估指标（Metrics）**：事实验证准确率（FEVER 风格的支持\u002F反驳分类），以及开放域问答中的答案相关性指标（n-gram 重叠、ROUGE\u002FNDCG）。\n  * **数据集（Datasets）**：TREC Deep Learning 2019 & 2020（段落排序问答任务）以及 MS MARCO Dev 的子集（用于开放域答案生成）。\n  * **说明（Comments）**：将生成式信息检索过程建模为遗传算法（称为 *GAuGE*：基于 grounded evolution 的遗传方法），以减少答案中的幻觉。候选答案通过迭代的\"变异\"和选择进行演化，由简单的 n-gram 重叠适应度分数指导，以确保与检索文档的一致性。在多个 IR 数据集上的实验表明，GAuGE 生成的答案相关性高，且幻觉陈述显著减少（事实验证分数大幅提高），相比标准 RAG 风格生成方法，同时不牺牲答案相关性。\n\n### [MARS：面向生成式大语言模型不确定性估计的语义感知响应评分](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.419.pdf)\n- **评估指标（Metrics）：** AUROC\n- **数据集（Datasets）：** TriviaQA、NaturalQA、WebQA\n- **说明（Comments）：** 名为 MARS 的大语言模型不确定性估计技术，通过为对正确性贡献更大的 token 分配更高权重，替代了长度归一化概率评分。\n\n### [无需设计，只需学习：面向生成式大语言模型不确定性估计的可训练评分函数](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2406.11278)\n- **评估指标（Metrics）：** AUROC、PRR\n- **数据集（Datasets）：** TriviaQA、GSM8k、NaturalQA、WebQA\n- **说明（Comments）：** 名为 LARS 的大语言模型不确定性估计技术，训练一个基于编码器的 transformer，将查询、生成内容和 token 概率作为输入，输出不确定性分数。\n\n### [量化任何语言模型答案中的不确定性并增强其可信度](https:\u002F\u002Faclanthology.org\u002F2024.acl-long.283\u002F)\n- **评估指标（Metrics）：** 准确率、精确率\u002F召回率\u002FAUROC\n- **数据集（Datasets）：** TriviaQA、GSM8k、SVAMP、常识问答（Common-sense QA）\n- **说明（Comments）：** 名为 BSDetector 的大语言模型不确定性估计技术，将自我反思确定性和观察一致性结合为单一置信度分数。以高精确率\u002F召回率检测错误\u002F幻觉的大语言模型响应，并可自动提升大语言模型响应的准确率。\n\n### [利用幻觉减少可提示分割中对人工提示的依赖](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.15205)\n- **评估指标（Metrics）：** MAE、F_{beta}、S_{alpha}\n- **数据集（Datasets）：** CHAMELEON、CAMO、COD10K、CVC-ColonDB、Kvasir、ISIC\n- **说明（Comments）：** 首个不将幻觉纯粹视为负面现象，而是将其视为模型预训练常见方面的研究。与之前直接消除幻觉的方法不同，ProMaC 首先激发幻觉，以挖掘模型预训练中的先验知识，收集图像中的任务相关信息。然后，消除无关幻觉以减轻其负面影响。该方法在多个具有挑战性的分割任务中证明了有效性。\n\n### [GraphEval：基于知识图谱的大语言模型幻觉评估框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.10793)\n- **评估指标（Metrics）：** 准确率（检测）、Rouge（纠正）\n- **数据集（Datasets）：** SummEval、QAGS-C、QAGS-X\n- **说明（Comments）：** 提出幻觉检测框架 *GraphEval* 和纠正框架 *GraphCorrect*。幻觉检测通过从大语言模型输出中提取知识图谱（KG）三元组，并比较这些三元组与所提供上下文之间的蕴含关系来完成。纠正过程则是将可能包含幻觉的三元组（蕴含分数低于 0.5）输入大语言模型，生成与所提供上下文一致的新的事实正确三元组。随后在单独的推理过程中，提示大语言模型根据纠正后的三元组替换非事实性大语言模型输出中的信息。实验中使用的底层自然语言推理（NLI）模型包括 *HHEM*（DeBERTaV3）、*TRUE* 和 *TrueTeacher*（T5-XXL）。使用的底层大语言模型为 Claude2。最终实验通过计算参考文本与所提出缓解方法之间的 Rouge 分数进行。\n\n### [Lynx: 一个开源的幻觉评估模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.08488)\n- **评估指标（Metrics）：** Accuracy（准确率）\n- **数据集（Datasets）：** HaluBench（包含约500个从CovidQA、PubMedQA、DROP、FinanceBench中随机抽取的样本，以及基于检索样本的另一组扰动数据）\n- **说明（Comments）：** 提出了HaluBench资源库和Lynx（基于Llama3-70bn-instruct的模型），用于无参考的指标评估。重点在于内在幻觉评估，即答案对给定上下文的忠实性而非对世界知识的忠实性。HaluBench的幻觉示例通过GPT-4o收集。Lynx的训练使用了来自RAGTruth、DROP、CovidQA、PubMedQA的2400个样本，其中GPT-4o生成的推理作为训练样本的一部分。评估通过提取响应级别的二元标签来指示响应与上下文的忠实程度。\n\n### [面向图的LLM提示：幻觉与生成能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.00159)\n- **评估指标（Metrics）：** Graph edit distance（图编辑距离）、spectral distance（谱距离）、degree distributions（度分布）之间的距离。\n- **数据集（Datasets）：** Graph Atlas Distance\n- **说明（Comments）：** 该基准测试展示了直接提示LLM生成已知图结构的能力。研究了LLM输出与真实图（ground truth graphs）之间的距离。基于图编辑距离的排序可对LLM的幻觉幅度进行排序。\n\n### [HallusionBench：面向大型视觉语言模型中纠缠的语言幻觉与视觉错觉的高级诊断套件](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.14566.pdf)\n- **评估指标（Metrics）：** Accuracy（准确率）。\n- **数据集（Datasets）：** HallusionBench\n- **说明（Comments）：** 该基准测试通过强调对视觉数据的细致理解与解释，为先进的视觉语言模型（LVLMs）如GPT-4V(Vision)、Gemini Pro Vision、Claude 3和LLaVA-1.5带来了重大挑战。本文引入了一种新颖的视觉问题结构以建立对照组。该结构能够对模型的响应倾向、逻辑一致性以及各种失效模式进行定量分析。\n\n### [多模态大型语言模型的统一幻觉检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2402.03190)\n- **评估指标（Metrics）：** Accuracy（准确率）、F1\u002FPrecision\u002FRecall（F1分数\u002F精确率\u002F召回率）。\n- **数据集（Datasets）：** MHaluBench\n- **框架（Framework）：** UniHD\n- **说明（Comments）：** 本文提出了一个更统一的MLLMs幻觉检测问题设定，揭示了一个元评估基准MHaluBench，涵盖多种幻觉类别和多模态任务，并引入了UNIHD，一个用于检测MLLMs生成内容中幻觉的统一框架。\n\n### [FactCHD：事实冲突幻觉检测基准测试](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2310.12086)\n- **评估指标（Metrics）：** 检测的F1分数、解释匹配度\n- **数据集（Datasets）：** FactCHD\n- **亮点（Highlights）：** 本文引入了FACTCHD基准测试，专注于检测事实冲突幻觉。FACTCHD整合了来自多个领域的事实知识，涵盖广泛的事实模式，包括原始事实、多跳推理（multi-hop reasoning）、比较和集合操作。其显著特点在于旨在结合根植于事实信息的证据链，从而在预测声明的事实性或非事实性时实现有说服力的推理。\n\n### [注意力满足：从约束满足视角看语言模型的事实错误](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.15098)\n- **评估指标（Metrics）：** AUROC（受试者工作特征曲线下面积）、risk-coverage curve（风险-覆盖率曲线）操作点\n- **数据集（Datasets）：** CounterFact、从Wikidata生成的事实查询\n- **说明（Comments）：** 本文将事实查询建模为约束满足问题（constraint-satisfaction problems），发现对约束token的注意力与事实正确性\u002F幻觉显著相关。\n\n### [TRUE：重新评估事实一致性评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.04991)\n- **评估指标（Metrics）：** AUROC，跨多个数据集和评估方法\n- **数据集（Datasets）：** PAWS、XSum、QAGS、FRANK、SummEval、BEGIN、Q^2、DialFact、FEVER、VitaminC\n\n### [TrueTeacher：利用大型语言模型学习事实一致性评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11171)\n- **评估指标（Metrics）：** AUROC，跨多个数据集和评估方法\n- **数据集（Datasets）：** XSum、QAGS、FRANK、SummEval\n\n### [SAC$`^3`$：通过语义感知交叉验证一致性实现黑盒语言模型中的可靠幻觉检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.01740)\n- **评估指标（Metrics）：** 分类问答和开放域问答的Accuracy（准确率）和AUROC\n- **数据集（Datasets）：** Snowball Hallucination中的素数测试和参议员搜索、HotpotQA和Nq-open QA\n\n### [弹性权重移除以实现忠实且抽象的对话生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17574)\n- **评估指标（Metrics）：** 预测响应与真实知识之间的忠实性（表1）——Critic、Q²、BERT F1、F1。\n- **数据集（Datasets）：** Wizard-of-Wikipedia (WoW)、MultiWoZ 2.1的DSTC9和DSTC11扩展版、FaithDial——WoW的去幻觉子集。\n\n### [信任你的证据：通过上下文感知解码减少幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14739)\n- **评估指标（Metrics）：** 摘要的事实一致性：BERT-Precision和FactKB。MemoTrap和NQ-Swap：Exact Match（精确匹配）。\n- **数据集（Datasets）：** 摘要：CNN-DM、XSUM。知识冲突：MemoTrap、NQ-Swap。\n\n### [何时不应信任语言模型：探究参数化与非参数化记忆的有效性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10511)\n- **评估指标（Metrics）：** Exact Match\u002FAccuracy（精确匹配\u002F准确率）。\n- **数据集（Datasets）：** 包含长尾实体的问答数据集：PopQA、EntityQuestions；NQ。\n\n### [检索增强减少对话中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.07567)\n- **评估指标（Metrics）：** 生成：Perplexity（困惑度）、Unigram Overlap (F1)（一元词重叠）、BLEU-4、ROUGE-L。生成与数据集收集过程中人类所依据知识之间的重叠：Knowledge F1；计算F1时仅考虑数据集中不频繁的词：Rare F1。\n- **数据集（Datasets）：** Wow、CMU Document Grounded Conversations (CMU_DoG)。知识来源：KiLT Wikipedia dump。\n\n### [只需请求校准：从使用人类反馈微调的语言模型中获取校准置信度分数的策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14975)\n- **评估指标（Metrics）：** 带温度缩放的期望校准误差（ECE-t）；accuracy@coverage和coverage@accuracy。\n- **数据集（Datasets）：** 评估事实知识的问答数据集：TriviaQA、SciQ、TruthfulQA。\n\n### [语言模型幻觉如何滚雪球](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13534)\n- **评估指标（Metrics）：** 错误答案百分比（幻觉）以及\"模型知道自己错了\"的情况（滚雪球式幻觉）。\n- **数据集（Datasets）：** Primality Testing（素性测试）、Senator Search（参议员搜索）、Graph Connectivity（图连通性）。\n\n### [Leftover Lunch: 基于优势函数的离线强化学习用于语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14718)\n- **评估指标（Metrics）:** 在 FaithDial 上进行知识增强响应生成的忠实度（Faithfulness）评估 —— FaithCritic、CoLA（流畅度，Fluency）、对话参与度（Dialog Engagement）、长度惩罚 TF-IDF 多样性（Length-penalised TF-IDF Diversity）。\n- **数据集（Datasets）:** 忠实知识增强对话：FaithDial，WoW 的一个更忠实的子集。\n\n### [Generating with Confidence: 黑盒大语言模型的不确定性量化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.19187)\n- **评估指标（Metrics）:** AUROC、AUARC、不确定性和置信度指标（NumSet、Deg、EigV）。\n- **数据集（Datasets）:** CoQA（开放式对话问答数据集，Open-book Conversational QA dataset）、TriviaQA 和 Natural Questions（闭卷问答，Closed-book QA）。\n\n### [Contextualized Sequence Likelihood: 用于自然语言生成的增强置信度分数](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.01806)\n- **评估指标（Metrics）:** AUROC、AUARC；改进的序列似然（生成序列的对数概率，log probability of generated sequence）用于置信度或不确定性计算。\n- **数据集（Datasets）:** CoQA（开放式对话问答数据集，Open-book Conversational QA dataset）、TriviaQA 和 Natural Questions（闭卷问答，Closed-book QA）。\n\n### [FaithDial: 面向信息检索对话的忠实基准测试](https:\u002F\u002Fdirect.mit.edu\u002Ftacl\u002Farticle\u002Fdoi\u002F10.1162\u002Ftacl_a_00529\u002F114373\u002FFaithDial-A-Faithful-Benchmark-for-Information)\n- **评估指标（Metrics）:** 指标衡量生成响应相对于给定知识产生幻觉（hallucination）的程度，或与黄金忠实响应的重叠度：Critic、Q²（F1、NLI）、BERTScore、F1、BLEU、ROUGE。\n- **数据集（Datasets）:** FaithDial、WoW。\n\n### [Neural Path Hunter: 通过路径 grounding 减少对话系统中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08455)\n- **评估指标（Metrics）:** FeQA，一个忠实度指标；Critic，一个幻觉评判器；BLEU。\n- **数据集（Datasets）:** OpenDialKG，一个提供基于知识图谱（KG）路径的开放式对话响应的数据集。\n\n### [HaluEval: 大规模幻觉评估基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11747)\n- **评估指标（Metrics）:** 准确率：问答（QA）、对话、摘要。\n- **数据集（Datasets）:** HaluEval，一个用于评估大语言模型（LLMs）识别幻觉性能的生成和人工标注幻觉样本集合。\n\n### [The Knowledge Alignment Problem: 弥合人类知识与外部知识在大语言模型中的鸿沟](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13669)\n- **评估指标（Metrics）:** *覆盖率（Coverage）*：一个二元指标，判断所有正确的黄金答案值是否包含在生成值中。*幻觉（Hallucination）*：一个二元指标，评估生成值中是否存在问题值和黄金 grounding 值中不存在的值。*用户模拟器（User Simulator）*：作为\"预言机（oracle）\"语言模型的用户模拟器，可访问目标答案的归因信息。\n- **数据集（Datasets）:** FuzzyQA，一个基于 HybridDialogue 和 MuSiQue 的数据集，其中复杂问题使用 ChatGPT 进行了简化。\n\n### [Check Your Facts and Try Again: 利用外部知识和自动反馈改进大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12813)\n- **评估指标（Metrics）:** KF1、BLEU、ROUGE、chrF、METEOR、BERTScore、BARTScore、BLEURT、平均长度。\n- **数据集（Datasets）:** 新闻对话：DSTC7 Track 2 被重新用作新闻对话的评估语料库。客户服务：使用 DSTC11 Track 5 作为对话客户服务场景的展示，通过整合主观信息扩展了 DSTC9 Track 1。\n\n### [SelfCheckGPT: 面向生成式大语言模型的零资源黑盒幻觉检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08896)\n- **评估指标（Metrics）:** 句子级幻觉检测（AUC-PR），以及段落级幻觉检测（Pearson 和 Spearman 相关系数）。\n- **数据集（Datasets）:** 来自 WikiBio 的生成维基百科文章，带有标注的幻觉。\n\n### [INSIDE: 大语言模型的内部状态保留幻觉检测能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03744)\n- **评估指标（Metrics）:** AUROC、PCC；准确率（TruthfulQA）。\n- **数据集（Datasets）:** CoQA、SQuAD、Natural Questions（NQ）、TriviaQA、TruthfulQA。\n- **备注（Comments）:** 介绍了 **INSIDE**，一个基于大语言模型内部状态运行的幻觉检测框架。提出了 **EigenScore**，使用多个响应嵌入的协方差矩阵特征值（eigenvalues of the covariance matrix of multiple response embeddings）来衡量语义一致性，以及一种测试时特征裁剪策略来截断导致过度自信幻觉的极端激活。在 LLaMA 和 OPT 模型上跨问答基准进行评估，相比不确定性和词汇相似性基线改进了检测效果。\n\n### [大语言模型的内部状态知道它在说谎](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13734)\n- **评估指标（Metrics）:** 每主题和平均准确率。\n- **数据集（Datasets）:** True-False 数据集包含涵盖多个主题的真假陈述 —— 城市、发明、化学元素、动物、公司和科学事实。\n\n### [Chain-of-Knowledge: 通过异构源上的动态知识适配来 grounding 大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13269)\n- **评估指标（Metrics）:** 精确匹配（Exact Match）。\n- **数据集（Datasets）:** FEVER、对抗性 HotpotQA。\n\n### [Halo: 开源弱大语言模型中幻觉的估计与减少](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.11764)\n- **评估指标（Metrics）:** HaloCheck 和 SelfCheckGPT 分数；一致性、事实性。\n- **数据集（Datasets）:** NBA 领域中生成和审核的问题。\n\n### [A Stitch in Time Saves Nine: 通过验证低置信度生成来检测和缓解大语言模型的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03987)\n- **评估指标（Metrics）:** 检测句子级和概念级幻觉时的精确率和召回率。\n- **数据集（Datasets）:** 涵盖 150 个不同领域主题的 ChatGPT 生成段落。\n\n### [大语言模型在推理任务上产生幻觉的来源](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14552)\n- **评估指标（Metrics）:** 带实体插入和替换的方向性 Levy\u002FHolt 精确率和召回率。\n- **数据集（Datasets）:** Levy\u002FHolt 数据集，包含前提-假设对，任务格式为*给定[前提 P]，[假设 H]是否为真？*，其中模型使用随机前提进行评估。\n\n### [大型多语言翻译模型中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.16104)\n- **评估指标（Metrics）:** 机器翻译（MT）系统在扰动下产生幻觉的比率（语言对比例、比率）。\n- **数据集（Datasets）:** Flores-101、WMT、TICO。\n\n### [Citation: 构建负责任和可问责大语言模型的关键](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.02185)\n- **评估指标（Metrics）:** 不适用\n- **数据集（Datasets）:** 不适用\n\n### [大语言模型的零资源幻觉预防](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.02654)\n- **评估指标（Metrics）:** 幻觉指令分类：AUC、ACC、F1、PEA。\n- **数据集（Datasets）:** Concept-7，专注于分类潜在幻觉指令。\n\n### [RARR: 使用语言模型研究和修正语言模型的输出](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08726)\n- **评估指标（Metrics）:** 编辑前后的可归因于已识别来源（Attributable to Identified Sources, AIS）分数。\n- **数据集（Datasets）:** 通过从三个数据集创建任务输入并提示不同模型生成长文本输出（可能包含幻觉）来生成陈述——事实性陈述（Factoid statements）、推理链（Reasoning chains）和知识密集型对话（Knowledge-intensive dialogues）。\n\n### [Q²: 通过问题生成和问题回答评估知识 grounded 对话中的事实一致性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08202)\n- **评估指标（Metrics）:** Q² 本身就是一个指标，它与 F1 词元级重叠（token-level overlap）、精确率（Precision）和召回率（Recall）、无 NLI 的 Q²、端到端 NLI（E2E NLI）、重叠度（Overlap）、BERTScore 和 BLEU 进行比较。\n- **数据集（Datasets）:** WoW，包含机器人需要以知识丰富的方式响应用户输入的对话；Topical-Chat，一个人类-人类知识 grounded 对话数据集；Dialogue NLI，一个基于 Persona-Chat 对话任务的数据集，由前提-假设对组成。\n\n### [我们知道自己不知道什么吗？研究 SQuAD 2.0 之外无法回答的问题](https:\u002F\u002Faclanthology.org\u002F2021.findings-emnlp.385.pdf)\n- **评估指标（Metrics）:** 全部、有答案（\"Has answer\"）和不知道（\"IDK\"）上的精确匹配（EM）。\n- **数据集（Datasets）:** MNLI、SQuAD 2.0、ACE-whQA。\n\n### [验证链减少大语言模型中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.11495)\n- **评估指标（Metrics）:** Wikidata 和 Wiki-Category 列表：基于列表问题的测试精确率、平均正例和负例（幻觉）实体数量；MultiSpanQA：F1、精确率、召回率；长文本传记生成：FactScore。\n- **数据集（Datasets）:** Wikidata、Wiki-Category 列表、MultiSpanQA、长文本传记生成。\n\n### [检测和缓解多语言摘要中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13632)\n- **评估指标（Metrics）:** mFACT，一种从四种英语忠实度指标（DAE、QAFactEval、ENFS%、EntFA）开发的新型多语言忠实度指标。\n- **数据集（Datasets）:** XL-Sum，一个多语言摘要数据集。\n\n### [幻觉但事实正确！检查抽象摘要中幻觉的事实性](https:\u002F\u002Faclanthology.org\u002F2022.acl-long.236\u002F)\n- **评估指标（Metrics）:** XEnt：幻觉（准确率、F1）、事实性（准确率、F1）、ROUGE、新颖 n-gram 百分比、忠实度（%ENFS、FEQA、DAE）、EntFA（% 事实实体、% 事实幻觉）。\n- **数据集（Datasets）:** XEnt，一个用于分析抽象摘要中实体幻觉和事实性的新数据集，包含 800 个由 BART 生成并标注的摘要。MEnt，XSum 的事实性和幻觉标注集。\n- **备注（Comments）:** 表 2 概述了几种幻觉类型（例如，事实性、非事实性、内在性）。\n\n### [使大语言模型能够生成带引用的文本](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14627)\n- **评估指标（Metrics）:** 流畅度（MAUVE）、正确性（ASQA 的 EM 召回率、QAMPARI 的 recall-5、ELI5 的声明召回率）、引用质量（引用召回率、引用精确率）。\n- **数据集（Datasets）:** 问答数据集，满足以下条件：1）包含引用很重要的事实性问题；2）问题需要涵盖多个方面的长文本答案；3）回答问题需要综合多个来源：ASQA、QAMPARI、ELI5。\n\n### [面向自由形式文本生成的词元级无参考幻觉检测基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08704)\n- **评估指标（Metrics）:** 准确率（Acc）、几何平均数（G-Mean）、BSS、AUC、非幻觉（精确率 P、召回率 R、F1）、幻觉（精确率 P、召回率 R、F1）。\n- **数据集（Datasets）:** HaDes（HAllucination DEtection dataSet，幻觉检测数据集），一个新颖的词元级无参考标注幻觉检测数据集，通过扰动从英文维基百科提取的大量文本片段并通过众包标注验证获得。\n- **备注（Comments）:** 图 3 概述了几种幻觉类型（领域特定知识、常识知识、不连贯或不恰当搭配、与中心主题无关、与前面上下文冲突、与后面上下文冲突等）。\n\n### [为语言模型的事实性评估生成基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.06908)\n- **评估指标（Metrics）:** 将最高概率分配给事实性补全的示例百分比。\n- **数据集（Datasets）:** Wiki-FACTOR 和 News-FACTOR：两个用于大语言模型事实性评估的新基准，分别基于维基百科和新闻文章。每个示例包含一个前缀、一个事实性补全和三个相似但非事实性的替代选项。\n- **备注（Comments）:** 该论文介绍了一个从给定语料库自动生成此类数据集的框架，详见第 3 节。\n\n### [语言模型知道自己何时在幻觉引用吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.18248)\n- **评估指标（Metrics）:** 幻觉率（H%，在 1000 个生成的标题中）。\n- **数据集（Datasets）:** 在 ACM 计算分类系统主题上生成的（真实和幻觉）引用。\n\n### [为什么 ChatGPT 在提供真实答案方面表现不佳？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10513)\n- **评估指标（Metrics）:** 正确答案数和错误答案数，以及不同类型失败的计数：理解（Comprehension）、事实性（Factualness）、特异性（Specificity）、推理（Inference）。\n- **数据集（Datasets）:** HotpotQA、BoolQ。\n- **备注（Comments）:** 本文对不同错误类型有一个很好的分类——例如，理解（comprehension）、事实性（factualness）、特异性（specificity）、推理（inference）。\n\n### [LM vs LM：通过交叉检验检测事实错误](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13281)\n- **评估指标（Metrics）:** 精确率、召回率、F1（在不同交叉检验策略下：AYS、IDK、基于置信度、IC-IDK）。\n- **数据集（Datasets）:** TriviaQA、NQ、PopQA。\n\n### [RHO (ρ)：通过知识 grounding 减少开放域对话中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01588)\n- **评估指标（Metrics）:** BLEU、ROUGE-L；FeQA、QuestEval、实体覆盖率（EntityCoverage，精确率、召回率、F1）用于估计幻觉程度——FrQA 和 QuestEval 是基于问答的指标，用于评估生成任务中输出的忠实度。\n- **数据集（Datasets）:** OpenDialKG。\n\n### [FActScore：长文本生成中事实精确性的细粒度原子评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14251)\n- **评估指标（Metrics）:** 在不同频率水平的人类实体中，被支持的陈述百分比。\n- **数据集（Datasets）:** 从大语言模型生成的人物传记，人类标注者将其分解为支持性事实。\n\n### [ExpertQA：专家策划的问题和归因答案](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07852)\n- **评估指标（Metrics）:** AutoAIS 标签的零样本（精确率 P、召回率 R、F1）和微调（精确率 P、召回率 R、F1）；参考事实性标签上的 FActScore F1 分数；AutoAIS（可归因于已识别来源）分数。\n- **数据集（Datasets）:** 跨多个领域的专家策划问题（例如，人类学、建筑学、生物学、化学、工程与技术、医疗保健\u002F医学；示例见表 1），按问题类型组织（例如，具有单一明确答案的定向问题、可能模糊的开放式问题、主题信息摘要、解决问题的方法建议；见表 2）。\n\n### [DoLa: 通过对比层解码提升大语言模型的事实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.03883)\n- **评估指标（Metrics）：** TruthfulQA：MC1、MC2、MC3 分数；FACTOR：News、Wiki；这些均为多选题结果。开放式生成：对于 TruthfulQA，使用 %Truth、%Info、%Truth*Info、%Reject；对于 CoT（Chain-of-Thought，思维链）任务（StrategyQA 和 GSM8K），采用准确率（accuracy）。\n- **数据集（Datasets）：** TruthfulQA、FACTOR（新闻\u002F维基）、StrategyQA、GSM8K\n\n### [FreshLLMs：通过搜索引擎增强刷新大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03214)\n- **评估指标（Metrics）：** 准确率（Accuracy）（严格、宽松，针对快速变化问题、慢速变化问题、永不变化问题、错误前提问题——涉及 2022 年前和 2022 年后的知识、单跳和多跳问题，以及总体）。\n- **数据集（Datasets）：** FreshQA，一个新的 QA 基准测试，包含 600 个问题，涵盖广泛的问题和答案类型。\n\n### [超越事实性：大语言模型作为知识生成器的全面评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07289)\n- **评估指标（Metrics）：** 事实性（Factuality）、相关性（Relevance）、连贯性（Coherence）、信息丰富度（Informativeness）、有用性（Helpfulness）和有效性（Validity）。\n- **数据集（Datasets）：** Natural Questions、Wizard of Wikipedia。\n\n### [基于野外检索证据的复杂声明验证](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11859)\n- **评估指标（Metrics）：** 准确率（Accuracy）、MAE（Mean Absolute Error，平均绝对误差）、Macro-F1、软准确率（soft accuracy）。\n- **数据集（Datasets）：** ClaimDecomp，包含来自 PolitiFact 的 1200 个复杂声明；每个声明标注有六种真实性标签之一、由专家事实核查员撰写的理由段落，以及先前工作标注的子问题。\n\n### [FELM：大语言模型事实性评估基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.00741)\n- **评估指标（Metrics）：** 准确率（Accuracy）、F1\u002F精确率（Precision）\u002F召回率（Recall）。\n- **数据集（Datasets）：** 推理、数学、写作\u002F推荐、科学\u002F技术、世界知识：GSM8K、ChatGPT、MATH、TruthfulQA、Quora、MMLU\u002Fhc3。\n\n### [评估中文大语言模型中的幻觉现象](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.03368)\n- **评估指标（Metrics）：** 人工评估和 GPT-4 评估。\n- **数据集（Datasets）：** HalluQA（本文提出），并提及 TruthfulQA、ChineseFactEval、HaluEval。\n\n### [关于抽象摘要中的忠实性与事实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.00661)\n- **评估指标（Metrics）：** 人工标注的幻觉片段（内在\u002F外在）和事实性（基于外部证据）；ROUGE-1\u002F2\u002FL；BERTScore；文本蕴含（textual entailment）；基于 QA 的一致性；与人类评分的 Spearman 相关性。\n- **数据集（Datasets）：** XSum（BBC 文章）；抽取 500 篇测试文章进行人工评估（2,500 个文档-摘要对）。\n- **备注（Comments）：** 针对极端摘要中幻觉现象的大规模人工研究；发现外在幻觉频繁出现（包括在金标准摘要中），且文本蕴含与人类忠实性\u002F事实性的相关性优于 ROUGE\u002FBERTScore\u002FQA。\n\n### [QuestEval：摘要需要基于事实的评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12693)\n- **评估指标（Metrics）：** QuestEval（本文提出），用于测试*一致性（consistency）*、*连贯性（coherence）*、*流畅性（fluency）*和*相关性（relevance）*。ROUGE、BLEU、METEOR、BERTScore。SummaQA、QAGS。\n- **数据集（Datasets）：** SummEval、QAGS-XSUM、SQuAD-v2。\n\n### [QAFactEval：基于改进 QA 的摘要事实一致性评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.08542)\n- **评估指标（Metrics）：** QAFactEval（本文提出），衡量答案选择、问题生成、问题回答、答案重叠以及过滤\u002F可回答性。\n- **数据集（Datasets）：** SummaC，二元事实一致性评估的基准集合；CGS，来自 CNN\u002FDailyMail 的正确和错误句子；XSF；Polytope；FactCC；SummEval；FRANK；QAGs。\n\n### [长文档上的快速准确事实不一致检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.13189)\n- **评估指标（Metrics）：** SCALE（本文提出的新指标）。与 Q²、ANLI、SummaC、F1、BLEURT、QuestEval、BARTScore、BERTScore 进行比较（表 3）。\n- **数据集（Datasets）：** TRUE 基准和 ScreenEval，本文提出的新数据集，用于评估长对话中的事实不一致性（来自 SummScreen 的 52 个文档）。\n\n### [通过 FRANK 理解抽象摘要中的事实性：事实性指标基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.13346)\n- **评估指标（Metrics）：** BERTScore、FEQA、QGFS、DAE、FactCC\n- **数据集（Datasets）：** 提出了新数据集 FRANK：针对 CNN\u002FDM 和 XSum 数据集的人工标注事实错误\n\n### [幻觉性（不可）回答性的奇特案例：在过度自信大语言模型的隐藏状态中寻找真相](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.220\u002F)\n- **评估指标（Metrics）：**（分类）F-1、精确匹配（Exact Match），（token）F-1\n- **数据集（Datasets）：** SQuAD、Natural Questions、MuSiQue\n- **备注（Comments）：** 本文探讨了 LLM 在闭卷设置中处理（不可）回答问题的能力，即基于给定段落回答问题，而段落中没有答案。研究表明，尽管 LLM 倾向于幻觉上下文答案，而非声明无法回答问题，但它们内部具备对问题（不可）回答性的理解。\n\n### [机器人知道它们只是在梦见电子羊吗？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17249)\n- **评估指标（Metrics）：**（幻觉检测）响应级 F1、片段级部分匹配 F1（Span-level Partial Credit Match F1）\n- **数据集（Datasets）：** 有机生成和合成编辑的 CNN DailyMail、ConvFEVER 和 E2E，按片段标注幻觉\n- **备注（Comments）：** 语言模型知道自己在幻觉，我们可以在 LLM 解码过程中的隐藏状态上训练探针来可靠地检测它们。\n\n### [回溯修正减少摘要中的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.16176)\n- **评估指标（Metrics）：** AlignScore、FactCC、BS-Fact、ROUGE-L\n- **数据集（Datasets）：** CNN\u002FDM、XSum、Newsroom\n\n### [语言模型的细粒度幻觉检测与编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.06855)\n- **评估指标（Metrics）：** 精确率（Precision）、召回率（Recall）、F1。\n- **数据集（Datasets）：** 针对各类（事实性）幻觉的自定义细粒度幻觉检测\u002F编辑数据集：实体（Entity）、关系（Relation）、矛盾（Contradictory）、虚构（Invented）、主观（Subjective）、不可验证（Unverifiable）。\n\n### [LLM 作为事实推理器：来自现有基准及更远的洞察](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14540)\n- **评估指标（Metrics）：** 各类错误类型的准确率——正例、日期交换、实体交换、否定句、数字交换、代词交换。\n- **数据集（Datasets）：** 提出 SummEdits，一个 10 领域的不一致性检测基准。\n\n### [评估抽象文本摘要的事实一致性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.12840)\n- **评估指标（Metrics）：** 提出 FactCC，一种衡量抽象文本摘要事实一致性的指标（直觉：如果摘要包含与源文档相同的事实，则其事实一致）\n- **数据集（Datasets）：** CNN\u002FDM 用于生成训练数据；MNLI 和 FEVER 用于训练模型。基于人工的实验用于评估关于 CNN\u002FDM 文章的声明。\n\n### [SummaC: 重新审视基于 NLI（自然语言推理）的摘要不一致性检测模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09525)\n- **评估指标（Metrics）：** 每个数据集配有其专属指标（例如，CoGenSumm 使用基于重排序的度量方法；XSumFaith、SummEval 和 FRANK 提出了多种指标并分析其与人工标注的相关性；等等）——对于 SummaC，作者建议使用平衡准确率（balanced accuracy）。\n- **数据集（Datasets）：** 他们提出了 SummaC（Summary Consistency，摘要一致性），一个包含六个大型不一致性检测数据集的基准：CoGenSumm、XSumFaith、Polytope、FactCC、SummEval 和 FRANK。\n\n### [对话模型中幻觉的起源：是数据集还是模型的问题？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07931)\n- **评估指标（Metrics）：** 专家和非专家标注：部分幻觉（Partial Hallucination）、蕴含（Entailment）、幻觉（Hallucination）、不合作（Uncoop）、通用回复（Generic）（每个类别都有更细粒度的子类——参见例如图 2）——标注遵循 BEGIN 和 VRM 分类体系。\n- **数据集（Datasets）：** 基于知识的对话基准：Wizard of Wikipedia（WoW）、CMU-DoG 和 TopicalChat——这些数据集包含两位说话者之间的对话，目标是在交流特定主题信息的同时，为说话者提供与当前轮次相关的知识片段。\n\n### [通过合成任务减少语言模型的幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.06827)\n- **评估指标（Metrics）：** 多种设置下的幻觉率（原始设置、优化系统消息设置、完整 LLM 权重设置、合成数据设置，或合成数据与参考数据混合设置）；BLEU、ROUGE-1、ROUGE-2、ROUGE-L。\n- **数据集（Datasets）：** 搜索与检索（MS MARCO）、会议摘要（QMSum）、自动化临床报告生成（ACI-Bench）。\n\n### [抽取式摘要的忠实性感知解码策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03278)\n- **评估指标（Metrics）：** ROUGE-L、BERTScore、BS-Fact、FactCC、DAE、QuestEval\n- **数据集（Datasets）：** CNN\u002FDM、XSum\n\n### [KL 散度引导的温度采样](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.01286)\n- **评估指标（Metrics）：** 对话式问答（Conversational QA）：在 MNLI、SNLI、FEVER、PAWS、ScTail 和 VitaminC 上微调的模型。摘要（Summarisation）：在 ANLI 和 XNLI 上微调的模型。\n- **数据集（Datasets）：** 对话上下文中的问题重写（QReCC）、XLSum。\n\n### [研究抽取式摘要中剪枝大语言模型的幻觉现象](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09335)\n- **评估指标（Metrics）：** 幻觉风险指标（HaRiM+）、SummaC、SummaCzs、SummaCconv、幻觉风险比率（HRR）\n- **数据集（Datasets）：** FactCC、Polytope、SummEval、法律合同（Legal Contracts）、RCT\n\n### [问答中的基于实体知识冲突](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.05052)\n- **评估指标（Metrics）：** EM（Exact Match，精确匹配）、记忆比率（Memorisation ratio）。\n- **数据集（Datasets）：** 带答案重叠（AO）和无答案重叠（NAO）的 NQ Dev、NewsQA。\n\n### [TruthX：在真实空间中编辑大语言模型以缓解幻觉](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17811)\n- **评估指标（Metrics）：** TruthfulQA 多项选择任务的 MC1\u002FMC2\u002FMC3 分数；TruthfulQA 开放式生成任务的 %Truth（真实率）、%Info（信息率）、%Truth*Info（真实与信息乘积）；Natural Questions、TriviaQA 和 FACTOR（新闻、专家、维基）的选择准确率。\n- **数据集（Datasets）：** TruthfulQA、Natural Questions、TriviaQA、FACTOR（新闻、专家、维基）\n\n### [问题分解提升模型生成推理的忠实性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.11768)\n- **评估指标（Metrics）：** 准确率（Accuracy）、最终答案截断敏感性（Final Answer Truncation Sensitivity）、最终答案损坏敏感性（Final Answer Corruption Sensitivity）、偏见上下文准确率变化（Biased-Context Accuracy Change）。\n- **数据集（Datasets）：** HotpotQA、OpenbookQA、StrategyQA、TruthfulQA。\n\n### [大语言模型的自相矛盾幻觉：评估、检测与缓解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.15852)\n- **评估指标（Metrics）：** 检测方面：精确率（Precision）、召回率（Recall）、F1 值。缓解方面：自相矛盾移除比率、信息性事实保留比率、困惑度增加量。\n- **数据集（Datasets）：** 自定义开放域文本生成数据集、大语言模型生成的维基百科实体百科文本描述、PopQA。\n\n### [使用语义熵检测大语言模型中的幻觉](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-024-07421-0)\n- **评估指标（Metrics）：** 检测方面：AUROC（受试者工作特征曲线下面积）、AURAC。\n- **数据集（Datasets）：** 问答（QA）：TriviaQA、SQuAD、BioASQ、NQ-Open、SVAMP。FactualBio，本文附带的一个传记生成数据集。\n\n### [CAST：视觉语言模型的跨模态对齐相似性测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.11007)\n- **评估指标（Metrics）：** 提出 CAST，一种简单的自一致性指标，旨在评估多模态模型在跨模态时是否保持一致。该方法分为两个阶段：第一阶段模型生成比较两个输入的相似性\u002F真实陈述，第二阶段模型判断自身输出的真实性。因此，一致的模型应始终将其自身输出评估为真实。\n\n### [语言模型的不确定性量化：一套黑盒、白盒、LLM 评判和集成评分器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.19254)\n- **评估指标（Metrics）：** 黑盒（基于一致性的评分器）：非矛盾概率（non-contradiction probability）、归一化语义负熵（normalized semantic negentropy）、归一化余弦相似度（normalized cosine similarity）、BERTSCore、BLEURT 和精确匹配率。白盒（基于 token 概率的）评分器：最小 token 概率、长度归一化 token 概率。LLM 作为评判（LLM-as-a-Judge）评分器：分类（错误\u002F不确定\u002F正确）。提出了一种新颖的可调集成评分器，它是黑盒、白盒和 LLM-as-a-Judge 评分器的任意组合的加权平均，权重可使用用户提供的评分 LLM 响应集进行调优。\n\n## 领域特定条目\n\n### [Med-HALT：大语言模型的医学领域幻觉测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15343)\n- **评估指标（Metrics）：** 推理幻觉测试（虚假置信度测试、以上皆非测试、虚假问题测试）、记忆幻觉测试（摘要到链接测试、PMID 到标题测试、标题到链接测试、链接到标题测试）；准确率（Accuracy）、逐点评分（Pointwise Score）。\n- **数据集（Datasets）：** Med-HALT：MEDMCQA、Headqa、Medqa USMILE、Medqa（台湾）、Pubmed。\n\n### [面向代码相关少样本学习的基于检索的提示选择](https:\u002F\u002Fpeople.ece.ubc.ca\u002Famesbah\u002Fresources\u002Fpapers\u002Fcedar-icse23.pdf)\n- **评估指标（Metrics）：** 准确率（Accuracy）、合理匹配准确率（Accuracy plausible match）\n- **数据集（Datasets）：** ATLAS 数据集、TFix 数据集\n- **备注（Comments）：** 发表于 ICSE 2023\n\n## 概述、综述与共享任务\n\n- [Mitigating LLM Hallucinations: a multifaceted approach](https:\u002F\u002Famatriain.net\u002Fblog\u002Fhallucinations)（缓解大语言模型幻觉：一种多维度方法）\n- [Siren’s Song in the AI Ocean: A Survey on Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.01219)（AI海洋中的塞壬之歌：大语言模型幻觉综述）\n- [Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629)（自然语言生成中的幻觉综述）\n- [A Survey of Hallucination in Large Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05922)（大型基础模型中的幻觉综述）\n- [A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions](https:\u002F\u002Fgithub.com\u002FLuckyyySTA\u002FAwesome-LLM-hallucination)（大语言模型幻觉综述：原理、分类、挑战与开放问题）\n    - 论文见[此处](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.05232)\n    - 两大类别：*factuality hallucinations*（事实性幻觉）和 *faithfulness hallucinations*（忠实性幻觉）。事实性幻觉强调生成内容与可验证的真实世界事实之间的差异，通常表现为事实不一致或虚构。忠实性幻觉指生成内容与用户指令或输入提供的上下文之间的偏差，以及生成内容内部的自我一致性。\n- [LLM Powered Autonomous Agents](https:\u002F\u002Flilianweng.github.io\u002Fposts\u002F2023-06-23-agent\u002F)（大语言模型驱动的自主智能体）\n- [SemEval-2024 Task-6 - SHROOM, a Shared-task on Hallucinations and Related Observable Overgeneration Mistakes](https:\u002F\u002Fhelsinki-nlp.github.io\u002Fshroom\u002F)（SemEval-2024 任务6 - SHROOM，关于幻觉及相关可观察过度生成错误的共享任务）\n- [llm-hallucination-survey](https:\u002F\u002Fgithub.com\u002FHillZhang1999\u002Fllm-hallucination-survey)\n- [How Do Large Language Models Capture the Ever-changing World Knowledge? A Review of Recent Advances](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.07343)（大语言模型如何捕捉瞬息万变的世界知识？近期进展综述）\n- [The Dawn After the Dark: An Empirical Study on Factuality Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03205)（黑暗后的黎明：大语言模型事实性幻觉的实证研究）\n\n![Huang等人的分类体系](figures\u002Fhuang_taxonomy.png \"Taxonomy\")\n\n## 分类体系\n\n[Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629) 将评估指标分为 *Statistical*（统计类，如ROUGE、BLEU、PARENT、Knowledge F1等）和 *Model-based*（基于模型的）指标。后者进一步细分为以下类别：\n- **Information-Extraction (IE)-based**（基于信息抽取的）：从知识源中检索答案并与生成答案进行比较——可能因IE模型的错误传播而产生问题。\n- **QA-based**（基于问答的）：基于\"若生成内容与源参考事实一致，则相同问题会产生相似答案\"的直觉，衡量生成内容与源参考之间的重叠\u002F一致性。用于评估摘要、对话和data2text生成中的幻觉。由*问题生成*模型和*问题回答*模型组成。\n- **Natural Language Inference (NLI)-based**（基于自然语言推理的）：基于\"只有源知识参考才能蕴含忠实且无幻觉生成中的全部信息\"的理念。\n\n[A Survey of Hallucination in \"Large\" Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05922) 对论文进行标记，涵盖 *detection*（检测）、*mitigation*（缓解）、*tasks*（任务）、*datasets*（数据集）和 *evaluation metrics*（评估指标）。关于文本中的幻觉，该综述按 *LLMs*（大语言模型）、*Multilingual LLMs*（多语言大语言模型）和 *Domain-specific LLMs*（领域特定大语言模型）对论文进行分类。\n\n[The Dawn After the Dark: An Empirical Study on Factuality Hallucination in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03205) 提出了不同类型幻觉的分类体系：实体错误幻觉（Entity-error Hallucination）、关系错误幻觉（Relation-error Hallucination）、不完整幻觉（Incompleteness Hallucination）、过时幻觉（Outdatedness Hallucination）、过度断言幻觉（Overclaim Hallucination）、不可验证幻觉（Unverifiability Hallucination）。\n\n[Internal Consistency and Self-Feedback in Large Language Models: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14507) 提出了一个新视角——**Internal Consistency**（内部一致性），以应对\"增强推理\"和\"缓解幻觉\"。这一视角使我们能够将许多看似无关的工作统一到一个框架中。为提高内部一致性（进而增强推理能力并缓解幻觉），该论文识别了各项工作的共同要素，并将其总结为 Self-Feedback（自我反馈）框架。\n\n该框架包含三个组件：Self-Evaluation（自我评估）、Internal Consistency Signal（内部一致性信号）和 Self-Update（自我更新）。\n\n- **Self-Evaluation**（自我评估）：负责基于模型的语言表达、解码层概率分布和隐藏状态来评估模型的内部一致性。\n- **Internal Consistency Signal**（内部一致性信号）：通过自我评估，我们可以获得数值型、文本型、外部型甚至比较型的信号。\n- **Self-Update**（自我更新）：利用这些信号，我们可以更新模型的表达甚至模型本身，以提高内部一致性。\n\n## 大语言模型中的幻觉测量\n- [AnyScale - Llama 2 is about as factually accurate as GPT-4 for summaries and is 30X cheaper](https:\u002F\u002Fwww.anyscale.com\u002Fblog\u002Fllama-2-is-about-as-factually-accurate-as-gpt-4-for-summaries-and-is-30x-cheaper)\n- [Arthur.ai - Hallucination Experiment](https:\u002F\u002Fwww.arthur.ai\u002Fgap-articles\u002Fhallucination-experiment)\n- [Vectara - Cut the Bull…. Detecting Hallucinations in Large Language Models](https:\u002F\u002Fvectara.com\u002Fcut-the-bull-detecting-hallucinations-in-large-language-models\u002F)\n- [Vectara LLM Hallucination Leaderboard](https:\u002F\u002Fgithub.com\u002Fvectara\u002Fhallucination-leaderboard)\n- [TofuEval: Evaluating Hallucinations of LLMs on Topic-Focused Dialogue Summarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13249)\n- [UQLM: Uncertainty Quantification for Language Models](https:\u002F\u002Fgithub.com\u002Fcvs-health\u002Fuqlm)\n\n\n## 用于测量幻觉的开源模型\n- [MiniCheck Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002FLiyan06\u002FMiniCheck)\n- [AlignScore Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Fyuh-zha\u002FAlignScore)\n- [Google True Teacher Model - HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fgoogle\u002Ft5_11b_trueteacher_and_anli)\n- [Hallucination Evaluation Model - HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fvectara\u002Fhallucination_evaluation_model)\n- [Summac Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Ftingofurro\u002Fsummac)\n- [SCALE Code and Model - GitHub](https:\u002F\u002Fgithub.com\u002Fasappresearch\u002Fscale-score)\n\n## 定义与说明\n\n### 外在幻觉与内在幻觉\n\n[Neural Path Hunter](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08455) 将 *extrinsic hallucination*（外在幻觉）定义为引入新的文本片段、但该片段不对应知识图谱（KG, Knowledge Graph）中有效三元组的表述；将 *intrinsic hallucination*（内在幻觉）定义为误用知识图谱三元组中的主语或宾语、导致两个实体之间不存在直接路径的表述。[Survey of Hallucination in Natural Language Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.03629) 将 *extrinsic hallucination*（外在幻觉）定义为无法从源内容验证的生成输出；将 *intrinsic hallucination*（内在幻觉）定义为与源内容相矛盾的生成输出。\n\n## 引用本仓库\n\n```\n@misc{MinerviniAHD2024,\n  author = {Pasquale Minervini and Aryo Pradipta Gema and others},\n  title = {awesome-hallucination-detection},\n  year = {2024},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection}}\n}\n```","# awesome-hallucination-detection 快速上手指南\n\n> 本仓库为大语言模型幻觉检测领域的精选论文合集，提供系统性的研究资源索引。\n\n---\n\n## 环境准备\n\n### 系统要求\n- **操作系统**: Linux \u002F macOS \u002F Windows (WSL2 推荐)\n- **Python**: 3.8+\n- **Git**: 2.20+\n\n### 前置依赖\n本仓库为资源索引型项目，核心依赖为文献管理工具：\n\n| 工具 | 用途 | 安装命令 |\n|:---|:---|:---|\n| `git` | 克隆仓库 | 系统自带或 `apt install git` |\n| `paperswithcode-client` (可选) | 批量下载论文元数据 | `pip install paperswithcode-client` |\n\n---\n\n## 安装步骤\n\n### 1. 克隆仓库（国内加速）\n\n```bash\n# 使用 GitHub 镜像加速\ngit clone https:\u002F\u002Fghproxy.com\u002Fhttps:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection.git\n\n# 或直接使用 SSH\ngit clone git@github.com:EdinburghNLP\u002Fawesome-hallucination-detection.git\n```\n\n### 2. 进入项目目录\n\n```bash\ncd awesome-hallucination-detection\n```\n\n### 3. 安装可选工具（按需）\n\n```bash\n# 安装 arxiv 论文下载工具\npip install arxiv\n\n# 安装学术搜索 CLI\npip install semanticscholar\n```\n\n---\n\n## 基本使用\n\n### 浏览论文索引\n\n```bash\n# 查看 README 中的论文列表\ncat README.md | grep -A 5 \"### \\[\" | head -50\n```\n\n### 按会议\u002F年份筛选\n\n```bash\n# 提取 NeurIPS 2025 论文\ngrep -B 1 \"NeurIPS 2025\" README.md | grep \"###\"\n\n# 提取 ICML 2025 论文  \ngrep -B 1 \"ICML 2025\" README.md | grep \"###\"\n```\n\n### 批量获取论文 PDF\n\n```python\n# save_papers.py\nimport arxiv, re, json\n\nwith open(\"README.md\") as f:\n    content = f.read()\n\n# 提取 arXiv 链接\narxiv_ids = re.findall(r'arxiv\\.org\u002Fabs\u002F(\\d+\\.\\d+)', content)\n\nfor aid in set(arxiv_ids)[:5]:  # 先下载前5篇测试\n    paper = next(arxiv.Client().results(arxiv.Search(id_list=[aid])))\n    paper.download_pdf(filename=f\"papers\u002F{aid}.pdf\")\n    print(f\"Downloaded: {paper.title}\")\n```\n\n运行：\n```bash\nmkdir -p papers && python save_papers.py\n```\n\n### 按检测方法类型检索\n\n| 方法类型 | 关键词搜索 |\n|:---|:---|\n| 无需训练的方法 | `grep -i \"training-free\" README.md` |\n| 基于激活的检测 | `grep -i \"activation\\|hidden state\" README.md` |\n| 视觉-语言模型 | `grep -i \"LVLM\\|VLM\\|multimodal\" README.md` |\n| 强化学习缓解 | `grep -i \"reinforcement learning\\|RL\" README.md` |\n\n---\n\n## 快速定位核心资源\n\n```bash\n# 查看最新添加的论文（按年份倒序）\ngrep -E \"\\(20(25|24)\" README.md | tail -20\n\n# 查找包含代码\u002F数据集的论文\ngrep -i \"code\\|github\\|dataset\" README.md | head -10\n```\n\n> **提示**: 本仓库持续更新，建议 `git pull` 获取最新论文索引。如需贡献新论文，参考 [PR 模板](https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fpulls) 提交。","某医疗 AI 创业公司正在开发一款辅助诊断系统，需要集成多模态大模型分析医学影像与病历文本，但团队发现模型频繁产生\"幻觉\"——在 X 光片中标注不存在的病灶，或将患者病史张冠李戴。\n\n### 没有 awesome-hallucination-detection 时\n\n- 团队从零开始检索论文，花费 2 周才找到 20 余篇相关研究，遗漏了关键的 VISTA 和 MemVR 等免训练方案，被迫投入 3 个月自研检测模块\n- 工程师反复试错不同评估指标，发现 CHAIR、POPE、HallusionBench 等基准测试的适用场景混淆不清，导致选型失误两次\n- 测试人员手工整理各论文的实验配置，无法快速比对 LLaVA-1.5、Qwen-VL 等架构在不同解码策略下的幻觉抑制效果\n- 产品上线前夜，团队才发现未处理\"模态冲突\"问题——模型易被病历文本中的干扰描述误导，忽视影像真实特征\n\n### 使用 awesome-hallucination-detection 后\n\n- 通过 139 篇分类整理的论文索引，1 天内定位到 GLSim 全局-局部相似度检测框架，直接复现作为产线质检模块，省去 3 个月研发周期\n- 对照仓库中的 Metrics 对照表，明确 POPE 适用于对象级幻觉检测、MMHal-Bench 适合综合评估，测试方案一次成型\n- 利用论文附带的 Datasets 和 Comments 信息，快速筛选出适配医疗影像的免训练方法 VISTA，在 LLaVA-Med 上实现 40% 幻觉降低\n- 提前识别\"模态冲突\"风险，参考 MMMC 数据集设计对抗测试用例，采用强化学习策略训练模型优先信任影像证据\n\nawesome-hallucination-detection 将分散在顶会中的幻觉检测知识转化为结构化工程手册，让团队以天为单位完成原本需要数月的方案选型与落地。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FEdinburghNLP_awesome-hallucination-detection_26849b91.png","EdinburghNLP","Edinburgh NLP","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FEdinburghNLP_1c2aec58.png","The Natural Language Processing Group at the University of Edinburgh",null,"http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fedinburghnlp\u002F","https:\u002F\u002Fgithub.com\u002FEdinburghNLP",1070,86,"2026-04-10T07:04:16","Apache-2.0","","未说明",{"notes":91,"python":89,"dependencies":92},"该仓库为论文列表和资源汇总项目（Awesome List），本身不提供可运行的代码或工具，因此未包含任何运行环境需求信息。仓库主要收集幻觉检测领域的研究论文、数据集和评估指标，内容以文献综述和论文链接为主。如需了解具体论文中方法的运行环境，需查阅各原始论文及其开源代码仓库。",[],[15],[95,96,97],"hallucinations","llms","nlp",4,"2026-03-27T02:49:30.150509","2026-04-11T18:32:43.053423",[102,107,112,117,122],{"id":103,"question_zh":104,"answer_zh":105,"source_url":106},4081,"如何向本仓库贡献新的幻觉检测相关资源或论文？","欢迎通过提交 Pull Request（PR）的方式来贡献内容。维护者建议贡献者直接提交 PR，因为贡献者比任何人都更了解自己的论文或资源。具体步骤：Fork 本仓库 → 在 README.md 中添加您的资源 → 提交 Pull Request。","https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fissues\u002F16",{"id":108,"question_zh":109,"answer_zh":110,"source_url":111},4082,"有哪些其他有用的幻觉检测相关仓库推荐？","推荐关注 Awesome-LLM-hallucination 仓库（https:\u002F\u002Fgithub.com\u002FLuckyyySTA\u002FAwesome-LLM-hallucination），该仓库包含相关的综述预印本，已被维护者添加到本仓库的推荐列表中。","https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fissues\u002F9",{"id":113,"question_zh":114,"answer_zh":115,"source_url":116},4083,"有哪些公开的幻觉检测评测基准和模型可以参考？","推荐以下资源：1) LLM Hallucination Leaderboard（https:\u002F\u002Fgithub.com\u002Fvectara\u002Fhallucination-leaderboard）- 幻觉检测排行榜；2) Hugging Face 模型：vectara\u002Fhallucination_evaluation_model；3) 相关博客文章：https:\u002F\u002Fvectara.com\u002Fcut-the-bull-detecting-hallucinations-in-large-language-models\u002F。该工作曾被《纽约时报》报道（2023年11月）。","https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fissues\u002F4",{"id":118,"question_zh":119,"answer_zh":120,"source_url":121},4084,"README 中的引用年份有误（显示为2014）怎么办？","这是一个已修复的笔误。维护者已在提交 d6aba89 中修复了该问题，将错误的2014年份更正为正确年份。如发现类似问题，欢迎提交 Issue 反馈。","https:\u002F\u002Fgithub.com\u002FEdinburghNLP\u002Fawesome-hallucination-detection\u002Fissues\u002F26",{"id":123,"question_zh":124,"answer_zh":125,"source_url":106},4085,"如何推荐自己的幻觉检测论文加入仓库？","可以直接提交 Pull Request 将论文添加到 README.md 中。以 ICLR 2024 论文 'INSIDE: LLMs' Internal States Retain the Power of Hallucination Detection' 为例，作者通过 Issue 推荐后，维护者建议其直接提交 PR 以便更快合并。",[]]