[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-wq2012--awesome-diarization":3,"tool-wq2012--awesome-diarization":64},[4,17,27,35,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",140436,2,"2026-04-05T23:32:43",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":16},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[13,14,15,43],"视频",{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":23,"last_commit_at":50,"category_tags":51,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 
提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":53,"name":54,"github_repo":55,"description_zh":56,"stars":57,"difficulty_score":23,"last_commit_at":58,"category_tags":59,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,60,43,61,15,62,26,13,63],"数据工具","插件","其他","音频",{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":81,"owner_email":82,"owner_twitter":83,"owner_website":84,"owner_url":85,"languages":83,"stars":86,"forks":87,"last_commit_at":88,"license":89,"difficulty_score":90,"env_os":91,"env_gpu":92,"env_ram":92,"env_deps":93,"category_tags":96,"github_topics":97,"view_count":23,"oss_zip_url":83,"oss_zip_packed_at":83,"status":16,"created_at":105,"updated_at":106,"faqs":107,"releases":128},4188,"wq2012\u002Fawesome-diarization","awesome-diarization","A curated list of awesome Speaker Diarization papers, libraries, datasets, and other resources.","awesome-diarization 是一个专注于“说话人日记”（Speaker Diarization）领域的精选资源库，旨在回答“谁在什么时候说了什么”这一核心语音处理问题。它系统地整理了全球范围内的优质论文、开源代码库、数据集、评估工具及学习材料，帮助从业者快速定位所需技术资源。\n\n该项目主要解决了语音技术领域资源分散、查找困难的问题。说话人日记是会议记录、客服质检和访谈分析的关键前置步骤，但相关研究更新迅速且涉及复杂的深度学习模型。awesome-diarization 
通过分类梳理，将从零基础的综述文章到前沿的大语言模型（LLM）辅助纠错技术，再到具体的聚类算法和音频增强工具，全部汇聚于一处，极大地降低了研究与开发门槛。\n\n这份资源清单特别适合人工智能研究人员、语音算法工程师以及希望深入了解语音分割技术的开发者使用。无论是需要复现最新 SOTA 模型的研究者，还是寻找成熟框架进行二次开发的工程师，都能在此找到对应的解决方案。其独特亮点在于不仅涵盖了传统的监督式学习和端到端神经网络方法，还及时收录了结合大语言模型进行后处理的最新探索，展现了该领域从纯声学特征向语义理解融合的最","awesome-diarization 是一个专注于“说话人日记”（Speaker Diarization）领域的精选资源库，旨在回答“谁在什么时候说了什么”这一核心语音处理问题。它系统地整理了全球范围内的优质论文、开源代码库、数据集、评估工具及学习材料，帮助从业者快速定位所需技术资源。\n\n该项目主要解决了语音技术领域资源分散、查找困难的问题。说话人日记是会议记录、客服质检和访谈分析的关键前置步骤，但相关研究更新迅速且涉及复杂的深度学习模型。awesome-diarization 通过分类梳理，将从零基础的综述文章到前沿的大语言模型（LLM）辅助纠错技术，再到具体的聚类算法和音频增强工具，全部汇聚于一处，极大地降低了研究与开发门槛。\n\n这份资源清单特别适合人工智能研究人员、语音算法工程师以及希望深入了解语音分割技术的开发者使用。无论是需要复现最新 SOTA 模型的研究者，还是寻找成熟框架进行二次开发的工程师，都能在此找到对应的解决方案。其独特亮点在于不仅涵盖了传统的监督式学习和端到端神经网络方法，还及时收录了结合大语言模型进行后处理的最新探索，展现了该领域从纯声学特征向语义理解融合的最新趋势。对于想要系统构建语音分析能力的团队而言，这是一个不可或缺的入门指南与进阶宝库。","# Awesome Speaker Diarization [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome) [![Contribution](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fcontributions-welcome-brightgreen.svg?style=flat)](https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fblob\u002Fmaster\u002FCONTRIBUTING.md)\n\n## Table of contents\n\n* [Overview](#Overview)\n* [Publications](#Publications)\n* [Software](#Software)\n  * [Framework](#Framework)\n  * [Evaluation](#Evaluation)\n  * [Clustering](#Clustering)\n  * [Speaker embedding](#Speaker-embedding)\n  * [Speaker change detection](#Speaker-change-detection)\n  * [Audio feature extraction](#Audio-feature-extraction)\n  * [Audio data augmentation](#Audio-data-augmentation)\n  * [Other sotware](#Other-software)\n* [Datasets](#Datasets)\n  * [Diarization datasets](#Diarization-datasets)\n  * [Speaker embedding training sets](#Speaker-embedding-training-sets)\n  * [Augmentation noise sources](#Augmentation-noise-sources)\n* [Conferences](#Conferences)\n* [Other learning 
materials](#Other-learning-materials)\n  * [Online courses](#Online-courses)\n  * [Books](#Books)\n  * [Tech blogs](#Tech-blogs)\n  * [Video tutorials](#Video-tutorials)\n* [Products](#Products)\n\n## Overview\n\nThis is a curated list of awesome Speaker Diarization papers, libraries, datasets, and other resources.\n\nThe purpose of this repo is to organize the world’s resources for speaker diarization, and make them universally accessible and useful.\n\nTo add items to this page, simply send a pull request. ([contributing guide](CONTRIBUTING.md))\n\n## Publications\n\n### Special topics\n\n#### Review & survey papers\n\n* [A Review of Speaker Diarization: Recent Advances with Deep Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.09624.pdf), 2021\n* [A review on speaker diarization systems and approaches](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fabs\u002Fpii\u002FS0167639312000696), 2012\n* [Speaker diarization: A review of recent research](http:\u002F\u002Fwww.eurecom.fr\u002Ffr\u002Fpublication\u002F3152\u002Fdownload\u002Fmm-publi-3152.pdf), 2010\n\n#### Large language model (LLM)\n\n* [DiarizationLM: Speaker Diarization Post-Processing with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03506), 2024\n* [Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05248), 2023\n* [Lexical speaker error correction: Leveraging language models for speaker diarization error correction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09313), 2023\n\n#### Supervised diarization\n\n* [DiaPer: End-to-End Neural Diarization with Perceiver-Based Attractors](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.04324), 2023\n* [TOLD: A Novel Two-Stage Overlap-Aware Framework for Speaker Diarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05397), 2023\n* [Speaker Overlap-aware Neural Diarization for Multi-party Meeting 
Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10243), 2022\n* [End-to-End Diarization for Variable Number of Speakers with Local-Global Networks and Discriminative Speaker Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.02096), 2021\n* [Supervised online diarization with sample mean loss for multi-domain data](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01266), 2019\n* [Discriminative Neural Clustering for Speaker Diarisation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.09703), 2019\n* [End-to-End Neural Speaker Diarization with Permutation-Free Objectives](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05952), 2019\n* [End-to-End Neural Speaker Diarization with Self-attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06247), 2019\n* [Fully Supervised Speaker Diarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04719), 2018\n\n#### Joint diarization and ASR\n\n* [A Comparative Study on Speaker-attributed Automatic Speech Recognition in Multi-party Meetings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.16834), 2022\n* [Turn-to-Diarize: Online Speaker Diarization Constrained by Transformer Transducer Speaker Turn Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.11641), 2021\n* [Transcribe-to-Diarize: Neural Speaker Diarization for Unlimited Number of Speakers using End-to-End Speaker-Attributed ASR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.03151), 2021\n* [Joint Speech Recognition and Speaker Diarization via Sequence Transduction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05337), 2019\n* [Says who? 
Deep learning models for joint speech recognition, segmentation and diarization](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8462375), 2018\n\n#### Online speaker diarization\n\n* [Speaker Diarization as a Fully Online Bandit Learning Problem in MiniVox](https:\u002F\u002Fproceedings.mlr.press\u002Fv157\u002Flin21c\u002Flin21c.pdf), 2021\n* [Online Speaker Diarization with Relation Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08162), 2020\n* [VoiceID on the Fly: A Speaker Recognition System that Learns from Scratch](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Fpdfs\u002Finterspeech_2020\u002Flin20b_interspeech.pdf), 2020\n\n#### Challenges\n\n* [M2MeT: The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Challenge](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.07393), 2022\n* [The Hitachi-JHU DIHARD III system: Competitive end-to-end neural diarization and x-vector clustering systems combined by DOVER-Lap](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.01363)\n* [Diarization is Hard: Some Experiences and Lessons Learned for the JHU\nTeam in the Inaugural DIHARD Challenge](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Fpdfs\u002Finterspeech_2018\u002Fsell18_interspeech.pdf), 2018\n* [ODESSA at Albayzin Speaker Diarization Challenge 2018](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FIberSPEECH_2018\u002Fpdfs\u002FIberS18_AE-5_Patino.pdf), 2018\n* [Joint Discriminative Embedding Learning, Speech Activity and Overlap Detection for the DIHARD Challenge](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2018\u002Fpdfs\u002F2304.pdf), 2018\n\n#### Audio-Visual Speaker Diarization\n\n* [AVA-AVD: Audio-Visual Speaker Diarization in the Wild](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3503161.3548027), 2022\n* [DyViSE: Dynamic Vision-Guided Speaker Embedding for Audio-Visual Speaker 
Diarization](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9948860), 2022\n* [End-to-End Audio-Visual Neural Speaker Diarization](https:\u002F\u002Fisca-speech.org\u002Farchive\u002Finterspeech_2022\u002Fhe22c_interspeech.html), 2022\n* [MSDWild: Multi-modal Speaker Diarization Dataset in the Wild](https:\u002F\u002Fisca-speech.org\u002Farchive\u002Finterspeech_2022\u002Fliu22t_interspeech.html), 2022\n\n\n\n### Other\n\n#### 2021\n\n* [Overlap-aware low-latency online speaker diarization based on end-to-end local segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.06483)\n* [End-to-end speaker segmentation for overlap-aware resegmentation](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Finterspeech_2021\u002Fbredin21_interspeech.html)\n* [DIVE: End-to-end Speech Diarization via Iterative Speaker Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.13802)\n* [DOVER-Lap: A method for combining overlap-aware diarization outputs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.01997)\n* [Bayesian HMM clustering of x-vector sequences (VBx) in speaker diarization: Theory, implementation and analysis on standard tasks](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0885230821000619)\n* [AISHELL-4: An Open Source Dataset for Speech Enhancement, Separation, Recognition and Speaker Diarization in Conference Scenario](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03603), 2021\n\n#### 2020\n\n* [An End-to-End Speaker Diarization Service for improving Multimedia Content Access](https:\u002F\u002Fnem-initiative.org\u002Fwp-content\u002Fuploads\u002F2020\u002F07\u002F1-4-an_end_to_end_speaker_diarization_service_for_improving_multimedia_content_access.pdf)\n* [Spot the conversation: speaker diarisation in the wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01216)\n* [Speaker Diarization with Region Proposal Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06220)\n* [Target-Speaker Voice Activity 
Detection: a Novel Approach for Multi-Speaker Diarization in a Dinner Party Scenario](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07272)\n\n#### 2019\n\n* [Overlap-aware diarization: resegmentation using neural end-to-end overlapped speech detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11646)\n* [Speaker diarization using latent space clustering in generative adversarial network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11398)\n* [A study of semi-supervised speaker diarization system using gan mixture model](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11416)\n* [Learning deep representations by multilayer bootstrap networks for speaker diarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10969)\n* [Enhancements for Audio-only Diarization Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00082)\n* [LSTM based Similarity Measurement with Spectral Clustering for Speaker Diarization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10393)\n* [Meeting Transcription Using Virtual Microphone Arrays](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fuploads\u002Fprod\u002F2019\u002F05\u002FDenmarkTechReport-5ccb8b095c8f3.pdf)\n* [Speaker diarisation using 2D self-attentive combination of embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.03190)\n* [Speaker Diarization with Lexical Information](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06756)\n\n#### 2018\n\n* [Neural speech turn segmentation and affinity propagation for speaker diarization](https:\u002F\u002Fhal.archives-ouvertes.fr\u002Fhal-01912236\u002F)\n* [Multimodal Speaker Segmentation and Diarization using Lexical and Acoustic Cues via Sequence to Sequence Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.10731)\n* [Joint Speaker Diarization and Recognition Using Convolutional and Recurrent Neural Networks](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8461666)\n\n#### 2017\n\n* [Speaker Diarization with 
LSTM](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.10468)\n* [Speaker diarization using deep neural network embeddings](http:\u002F\u002Fdanielpovey.com\u002Ffiles\u002F2017_icassp_diarization_embeddings.pdf)\n* [Speaker diarization using convolutional neural network for statistics accumulation refinement](https:\u002F\u002Fpdfs.semanticscholar.org\u002F35c4\u002F0fde977932d8a3cd24f5a1724c9dbca8b38d.pdf)\n* [pyannote. metrics: a toolkit for reproducible evaluation, diagnostic, and error analysis of speaker diarization systems](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2017\u002Fpdfs\u002F0411.PDF)\n* [Speaker Change Detection in Broadcast TV using Bidirectional Long Short-Term Memory Networks](https:\u002F\u002Fpdfs.semanticscholar.org\u002Fedff\u002Fb62b32ffcc2b5cc846e26375cb300fac9ecc.pdf)\n* [Speaker Diarization using Deep Recurrent Convolutional Neural Networks for Speaker Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.02840)\n\n#### 2016\n\n* [A Speaker Diarization System for Studying Peer-Led Team Learning Groups](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.07136.pdf)\n\n#### 2015\n\n* [Diarization resegmentation in the factor analysis subspace](https:\u002F\u002Fengineering.jhu.edu\u002Fhltcoe\u002Fwp-content\u002Fuploads\u002Fsites\u002F92\u002F2016\u002F10\u002FSell_Garcia-Romero_2015A.pdf)\n\n#### 2014\n\n* [A study of the cosine distance-based mean shift for telephone speech diarization](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FPatrick_Kenny\u002Fpublication\u002F260661427_A_Study_of_the_Cosine_Distance-Based_Mean_Shift_for_Telephone_Speech_Diarization\u002Flinks\u002F0c96053270d2eaa133000000.pdf)\n* [Speaker diarization with PLDA i-vector scoring and unsupervised calibration](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F7078610)\n* [Artificial neural network features for speaker 
diarization](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F7078608)\n\n#### 2013\n* [Unsupervised methods for speaker diarization: An integrated and iterative approach](http:\u002F\u002Fgroups.csail.mit.edu\u002Fsls\u002Fpublications\u002F2013\u002FShum_IEEE_Oct-2013.pdf)\n\n#### 2011\n\n* [PLDA-based Clustering for Speaker Diarization of Broadcast Streams](https:\u002F\u002Fpdfs.semanticscholar.org\u002F0175\u002Fa752c5c72cadc7c0b899fd15f2f6b93c3335.pdf)\n* [Speaker diarization of meetings based on speaker role n-gram models](https:\u002F\u002Fpublications.idiap.ch\u002Fdownloads\u002Fpapers\u002F2011\u002FValente_ICASSP2011_2011.pdf)\n\n#### 2009\n\n* [Speaker Diarization for Meeting Room Audio](https:\u002F\u002Fwiki.inf.ed.ac.uk\u002Ftwiki\u002Fpub\u002FCSTR\u002FListenSemester2_2009_10\u002Fsun_IS2009_SpDia_meeting.PDF)\n\n#### 2008\n\n* [Stream-based speaker segmentation using speaker factors and eigenvoices](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FPietro_Laface\u002Fpublication\u002F224313019_Stream-based_speaker_segmentation_using_speaker_factors_and_eigenvoices\u002Flinks\u002F5770fe8608ae10de639dc121.pdf)\n\n#### 2006\n\n* [An overview of automatic speaker diarization systems](https:\u002F\u002Falize.univ-avignon.fr\u002Fdoc\u002Fpublis\u002F06_IEEE-TASP_Tranter.pdf)\n* [A spectral clustering approach to speaker diarization](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~hning2\u002Fpapers\u002FNing_spectral.pdf)\n\n## Software\n\n### Framework\n\n| Link | Language | Description |\n| ---- | -------- | ----------- |\n| [FunASR](https:\u002F\u002Fgithub.com\u002Falibaba-damo-academy\u002FFunASR) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Falibaba-damo-academy\u002FFunASR?style=social) | Python & PyTorch | FunASR is an open-source speech toolkit based on PyTorch, which aims at bridging the gap between academic researchs and industrial applications. 
|\n| [MiniVox](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdoerlbh\u002FMiniVox?style=social) | MATLAB | MiniVox is an open-source evaluation system for the online speaker diarization task. |\n| [SpeechBrain](https:\u002F\u002Fgithub.com\u002Fspeechbrain\u002Fspeechbrain) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fspeechbrain\u002Fspeechbrain?style=social) | Python & PyTorch | SpeechBrain is an open-source and all-in-one speech toolkit based on PyTorch. |\n| [SIDEKIT for diarization (s4d)](https:\u002F\u002Fprojets-lium.univ-lemans.fr\u002Fs4d\u002F) | Python | An open source package extension of SIDEKIT for Speaker diarization. |\n| [pyAudioAnalysis](https:\u002F\u002Fgithub.com\u002Ftyiannak\u002FpyAudioAnalysis) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyiannak\u002FpyAudioAnalysis?style=social) | Python | Python Audio Analysis Library: Feature Extraction, Classification, Segmentation and Applications. |\n| [AaltoASR](https:\u002F\u002Fgithub.com\u002Faalto-speech\u002Fspeaker-diarization) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Faalto-speech\u002Fspeaker-diarization?style=social) | Python & Perl | Speaker diarization scripts, based on AaltoASR. |\n| [LIUM SpkDiarization](https:\u002F\u002Fprojets-lium.univ-lemans.fr\u002Fspkdiarization\u002F) | Java | LIUM_SpkDiarization is a software dedicated to speaker diarization (i.e. speaker segmentation and clustering). It is written in Java, and includes the most recent developments in the domain (as of 2013). 
|\n| [kaldi-asr](https:\u002F\u002Fgithub.com\u002Fkaldi-asr\u002Fkaldi\u002Ftree\u002Fmaster\u002Fegs\u002Fcallhome_diarization) [![Build Status](https:\u002F\u002Ftravis-ci.com\u002Fkaldi-asr\u002Fkaldi.svg?branch=master)](https:\u002F\u002Ftravis-ci.com\u002Fkaldi-asr\u002Fkaldi) | Bash | Example scripts for speaker diarization on a portion of CALLHOME used in the 2000 NIST speaker recognition evaluation. |\n| [kaldi-speaker-diarization](https:\u002F\u002Fgithub.com\u002Fcadia-lvl\u002Fkaldi-speaker-diarization) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcadia-lvl\u002Fkaldi-speaker-diarization?style=social) | Bash | Icelandic speaker diarization scripts using kaldi. |\n| [Alize LIA_SpkSeg](https:\u002F\u002Falize.univ-avignon.fr\u002F) | C++ | ALIZÉ is an opensource platform for speaker recognition. LIA_SpkSeg is the tools for speaker diarization. |\n| [pyannote-audio](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-audio) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpyannote\u002Fpyannote-audio?style=social) | Python | Neural building blocks for speaker diarization: speech activity detection, speaker change detection, speaker embedding. |\n| [pyBK](https:\u002F\u002Fgithub.com\u002Fjosepatino\u002FpyBK) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjosepatino\u002FpyBK?style=social) | Python | Speaker diarization using binary key speaker modelling. Computationally light solution that does not require external training data. |\n| [Speaker-Diarization](https:\u002F\u002Fgithub.com\u002Ftaylorlu\u002FSpeaker-Diarization) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftaylorlu\u002FSpeaker-Diarization?style=social) | Python | Speaker diarization using uis-rnn and GhostVLAD. An easier way to support openset speakers. 
|\n| [EEND](https:\u002F\u002Fgithub.com\u002Fhitachi-speech\u002FEEND) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhitachi-speech\u002FEEND?style=social) | Python & Bash & Perl | End-to-End Neural Diarization. |\n| [VBx](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FVBx) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FVBx?style=social) | Python | Variational Bayes HMM over x-vectors diarization. x-vector extractor [recipe](https:\u002F\u002Fgithub.com\u002Fphonexiaresearch\u002FVBx-training-recipe) |\n| [RE-VERB](https:\u002F\u002Fgithub.com\u002Fteam-re-verb\u002FRE-VERB) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fteam-re-verb\u002FRE-VERB?style=social) | Python & JavaScript | RE: VERB is speaker diarization system, it allows the user to send\u002Frecord audio of a conversation and receive timestamps of who spoke when. |\n| [StreamingSpeakerDiarization](https:\u002F\u002Fgithub.com\u002Fjuanmc2005\u002FStreamingSpeakerDiarization\u002F) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjuanmc2005\u002FStreamingSpeakerDiarization?style=social) | Python | Streaming speaker diarization, extends [pyannote.audio](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-audio) to online processing |\n| [simple_diarizer](https:\u002F\u002Fgithub.com\u002Fcvqluu\u002Fsimple_diarizer) | Python | Simplified diarization pipeline using some pretrained models. Made to be a simple as possible to go from an input audio file to diarized segments. 
|\n| [Picovoice Falcon](https:\u002F\u002Fgithub.com\u002FPicovoice\u002Ffalcon) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPicovoice\u002Ffalcon?style=social) | C & Python | A [lightweight, accurate, and fast](https:\u002F\u002Fpicovoice.ai\u002Fdocs\u002Fbenchmark\u002Fspeaker-diarization\u002F#accuracy) speaker diarization engine written in C and available in Python, running on CPU with minimal overhead. |\n| [DiaPer](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FDiaPer) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FDiaPer?style=social) | Python | Pytorch implementation for [DiaPer: End-to-End Neural Diarization with Perceiver-Based Attractors](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2312.04324.pdf) including models pre-trained on free and public data. |\n| [sherpa-onnx](https:\u002F\u002Fgithub.com\u002Fk2-fsa\u002Fsherpa-onnx) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fk2-fsa\u002Fsherpa-onnx?style=social) | C++ & C & `C#` & Dart & Go & Java & JavaScript & Kotlin & Pascal & Python & Rust & Swift | Support speaker diarization, speech recognition, and text-to speech on various platforms with various language bindings. |\n| [FluidAudio](https:\u002F\u002Fgithub.com\u002FFluidInference\u002FFluidAudio) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFluidInference\u002FFluidAudio?style=social) | Swift | A native Swift speaker diarization library for Apple platforms, using CoreML for efficient, real-time audio processing with high accuracy. 
|\n\n\n### Evaluation\n\n| Link | Language | Description |\n| ---- | -------- | ----------- |\n| [pyannote-metrics](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-metrics) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpyannote\u002Fpyannote-metrics?style=social) [![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fpyannote\u002Fpyannote-metrics.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fpyannote\u002Fpyannote-metrics)  | Python| A toolkit for reproducible evaluation, diagnostic, and error analysis of speaker diarization systems. |\n| [SimpleDER](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSimpleDER) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FSimpleDER?style=social) ![Python package](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSimpleDER\u002Fworkflows\u002FPython%20package\u002Fbadge.svg) | Python | A lightweight library to compute Diarization Error Rate (DER). |\n| [DiarizationLM](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Fblob\u002Fmaster\u002FDiarizationLM\u002FREADME.md) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fspeaker-id?style=social) [![Build Status](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Factions\u002Fworkflows\u002Fpython-app-diarizationlm.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Factions\u002Fworkflows\u002Fpython-app-diarizationlm.yml) | Python | Implements Word Error Rate (WER), Word Diarization Error Rate (WDER), and concatenated minimum-permutation Word Error Rate (cpWER). 
|\n| NIST md-eval | Perl | (1) modified [md-eval.pl](http:\u002F\u002Fwww1.icsi.berkeley.edu\u002F~knoxm\u002Fdia\u002F) from [Mary Tai Knox](http:\u002F\u002Fwww1.icsi.berkeley.edu\u002F~knoxm); (2) [md-eval-v21.pl](https:\u002F\u002Fgithub.com\u002Fjitendrab\u002Fbtp\u002Fblob\u002Fmaster\u002Fc_code\u002Fsingle_diag_gaussian_no_viterbi\u002Fmd-eval-v21.pl) from [jitendra](https:\u002F\u002Fgithub.com\u002Fjitendrab); (3) [md-eval-22.pl](https:\u002F\u002Fgithub.com\u002Fnryant\u002Fdscore\u002Fblob\u002Fmaster\u002Fscorelib\u002Fmd-eval-22.pl) from [nryant](https:\u002F\u002Fgithub.com\u002Fnryant) |\n| [dscore](https:\u002F\u002Fgithub.com\u002Fnryant\u002Fdscore) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fnryant\u002Fdscore?style=social) | Python & Perl | Diarization scoring tools. |\n| [Sequence Match Accuracy](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fuis-rnn\u002Fblob\u002Fmaster\u002Fuisrnn\u002Fevals.py) | Python | Match the accuracy of two sequences with Hungarian algorithm. |\n| [spyder](https:\u002F\u002Fgithub.com\u002Fdesh2608\u002Fspyder) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdesh2608\u002Fspyder?style=social) | Python & C++ | Simple Python package for fast DER computation. 
|\n| [CDER](https:\u002F\u002Fgithub.com\u002FSpeechClub\u002FCDER_Metric) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FSpeechClub\u002FCDER_Metric?style=social) | Python | Conversational DER from [The Conversational Short-phrase Speaker Diarization (CSSD) Task: Dataset, Evaluation Metric and Baselines](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.08042) |\n\n### Clustering\n\n| Link | Language | Description |\n| ---- | -------- | ----------- |\n| [uis-rnn](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fuis-rnn) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fuis-rnn?style=social) [![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fgoogle\u002Fuis-rnn.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fgoogle\u002Fuis-rnn) | Python & PyTorch | Google's Unbounded Interleaved-State Recurrent Neural Network (UIS-RNN) algorithm, for Fully Supervised Speaker Diarization. This clustering algorithm is **supervised**. |\n| [uis-rnn-sml](https:\u002F\u002Fgithub.com\u002FDonkeyShot21\u002Fuis-rnn-sml) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FDonkeyShot21\u002Fuis-rnn-sml?style=social) | Python & PyTorch | A variant of UIS-RNN, for the paper Supervised Online Diarization with Sample Mean Loss for Multi-Domain Data. |\n| [DNC](https:\u002F\u002Fgithub.com\u002FFlorianKrey\u002FDNC) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFlorianKrey\u002FDNC?style=social) | Python & ESPnet | Transformer-based Discriminative Neural Clustering (DNC) for Speaker Diarisation. Like UIS-RNN, it is **supervised**. 
|\n| [SpectralCluster](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSpectralCluster) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FSpectralCluster?style=social) [![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FSpectralCluster.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FSpectralCluster) | Python | Spectral clustering with affinity matrix refinement operations, auto-tune, and speaker turn constraints. |\n| [sklearn.cluster](https:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html) [![Build Status]( https:\u002F\u002Fapi.travis-ci.org\u002Fscikit-learn\u002Fscikit-learn.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fscikit-learn\u002Fscikit-learn) | Python | scikit-learn clustering algorithms. |\n| [PLDA](https:\u002F\u002Fgithub.com\u002FRaviSoji\u002Fplda) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FRaviSoji\u002Fplda?style=social) | Python | Probabilistic Linear Discriminant Analysis & classification, written in Python. |\n| [PLDA](https:\u002F\u002Fgithub.com\u002Fmrouvier\u002Fplda) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmrouvier\u002Fplda?style=social) | C++ | Open-source implementation of simplified PLDA (Probabilistic Linear Discriminant Analysis). |\n| [Auto-Tuning Spectral Clustering](https:\u002F\u002Fgithub.com\u002Ftango4j\u002FAuto-Tuning-Spectral-Clustering.git) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftango4j\u002FAuto-Tuning-Spectral-Clustering?style=social) | Python | Auto-tuning Spectral Clustering method that does not need development set or supervised tuning. 
|\n\n\n### Speaker embedding\n\n| Link | Method | Language | Description |\n| ---- | ------ | -------- | ----------- |\n| [resemble-ai\u002FResemblyzer](https:\u002F\u002Fgithub.com\u002Fresemble-ai\u002FResemblyzer) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fresemble-ai\u002FResemblyzer?style=social) | d-vector | Python & PyTorch | PyTorch implementation of generalized end-to-end loss for speaker verification, which can be used for voice cloning and diarization. |\n| [Speaker_Verification](https:\u002F\u002Fgithub.com\u002FJanghyun1230\u002FSpeaker_Verification) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FJanghyun1230\u002FSpeaker_Verification?style=social) | d-vector | Python & TensorFlow | Tensorflow implementation of generalized end-to-end loss for speaker verification. |\n| [PyTorch_Speaker_Verification](https:\u002F\u002Fgithub.com\u002FHarryVolek\u002FPyTorch_Speaker_Verification) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FHarryVolek\u002FPyTorch_Speaker_Verification?style=social) | d-vector | Python & PyTorch | PyTorch implementation of \"Generalized End-to-End Loss for Speaker Verification\" by Wan, Li et al. With UIS-RNN integration. |\n| [Real-Time Voice Cloning](https:\u002F\u002Fgithub.com\u002FCorentinJ\u002FReal-Time-Voice-Cloning) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FCorentinJ\u002FReal-Time-Voice-Cloning?style=social) | d-vector | Python & PyTorch | Implementation of \"Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis\" (SV2TTS) with a vocoder that works in real-time. |\n| [conformer-speaker-encoder](https:\u002F\u002Fhuggingface.co\u002Ftflite-hub\u002Fconformer-speaker-encoder) | d-vector |Python & TFLite | Massively multilingual conformer-based speaker recognition models in TFLite format. 
|\n| [deep-speaker](https:\u002F\u002Fgithub.com\u002Fphilipperemy\u002Fdeep-speaker) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fphilipperemy\u002Fdeep-speaker?style=social) | d-vector |Python & Keras | Third party implementation of the Baidu paper Deep Speaker: an End-to-End Neural Speaker Embedding System. |\n| [x-vector-kaldi-tf](https:\u002F\u002Fgithub.com\u002Fhsn-zeinali\u002Fx-vector-kaldi-tf) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhsn-zeinali\u002Fx-vector-kaldi-tf?style=social) | x-vector | Python & TensorFlow & Perl | Tensorflow implementation of x-vector topology on top of Kaldi recipe. |\n| [kaldi-ivector](https:\u002F\u002Fgithub.com\u002Fidiap\u002Fkaldi-ivector) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fidiap\u002Fkaldi-ivector?style=social) | i-vector | C++ & Perl |  Extension to Kaldi implementing the standard i-vector hyperparameter estimation and i-vector extraction procedure. |\n| [voxceleb-ivector](https:\u002F\u002Fgithub.com\u002Fswshon\u002Fvoxceleb-ivector) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fswshon\u002Fvoxceleb-ivector?style=social) | i-vector |Perl | Voxceleb1 i-vector based speaker recognition system. |\n| [pytorch_xvectors](https:\u002F\u002Fgithub.com\u002Fmanojpamk\u002Fpytorch_xvectors) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmanojpamk\u002Fpytorch_xvectors?style=social) | x-vector | Python & PyTorch | PyTorch implementation of Voxceleb x-vectors. Additionally, includes meta-learning architectures for embedding training. Evaluated with speaker diarization and speaker verification. |\n| [ASVtorch](https:\u002F\u002Fgitlab.com\u002Fville.vestman\u002Fasvtorch) | i-vector | Python & PyTorch | ASVtorch is a toolkit for automatic speaker recognition. 
|\n| [asv-subtools](https:\u002F\u002Fgithub.com\u002FSnowdar\u002Fasv-subtools) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FSnowdar\u002Fasv-subtools?style=social) | i-vector & x-vector | Kaldi & PyTorch | ASV-Subtools is developed based on Pytorch and Kaldi for the task of speaker recognition, language identification, etc. The 'sub' of 'subtools' means that there are many modular tools and the parts constitute the whole. |\n| [WeSpeaker](https:\u002F\u002Fgithub.com\u002Fwenet-e2e\u002Fwespeaker.git) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwenet-e2e\u002Fwespeaker?style=social) | x-vector & r-vector | Python & C++ & PyTorch | WeSpeaker is a research and production oriented speaker verification, recognition and diarization toolkit, which supports very strong recipes with on-the-fly data preparation, model training and evaluation, as well as runtime C++ codes. |\n| [ReDimNet](https:\u002F\u002Fgithub.com\u002FIDRnD\u002FReDimNet) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FIDRnD\u002FReDimNet?style=social) | improved resnet | Pytorch | Neural network architecture presented in the paper [Reshape Dimensions Network for Speaker Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.18223) |\n\n\n### Speaker change detection\n\n| Link  | Language | Description |\n| ----  | -------- | ----------- |\n| [change_detection](https:\u002F\u002Fgithub.com\u002Fyinruiqing\u002Fchange_detection) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fyinruiqing\u002Fchange_detection?style=social) | Python & Keras | Code for Speaker Change Detection in Broadcast TV using Bidirectional Long Short-Term Memory Networks. 
|\n| [tinydiarize](https:\u002F\u002Fgithub.com\u002Fakashmjn\u002Ftinydiarize) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fakashmjn\u002Ftinydiarize?style=social) | Python | Diarization inside OpenAI Whisper decoder |\n\n### Audio feature extraction\n\n| Link  | Language | Description |\n| ----  | -------- | ----------- |\n| [LibROSA](https:\u002F\u002Fgithub.com\u002Flibrosa\u002Flibrosa) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flibrosa\u002Flibrosa?style=social) | Python | Python library for audio and music analysis. https:\u002F\u002Flibrosa.github.io\u002F |\n| [python_speech_features](https:\u002F\u002Fgithub.com\u002Fjameslyons\u002Fpython_speech_features) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjameslyons\u002Fpython_speech_features?style=social) | Python | This library provides common speech features for ASR including MFCCs and filterbank energies. https:\u002F\u002Fpython-speech-features.readthedocs.io\u002Fen\u002Flatest\u002F |\n| [pyAudioAnalysis](https:\u002F\u002Fgithub.com\u002Ftyiannak\u002FpyAudioAnalysis) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyiannak\u002FpyAudioAnalysis?style=social) | Python | Python Audio Analysis Library: Feature Extraction, Classification, Segmentation and Applications. |\n\n### Audio data augmentation\n\n| Link  | Language | Description |\n| ----  | -------- | ----------- |\n| [pyroomacoustics](https:\u002F\u002Fgithub.com\u002FLCAV\u002Fpyroomacoustics) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FLCAV\u002Fpyroomacoustics?style=social) | Python | Pyroomacoustics is a package for audio signal processing for indoor applications. It was developed as a fast prototyping platform for beamforming algorithms in indoor scenarios. 
https:\u002F\u002Fpyroomacoustics.readthedocs.io |\n| [gpuRIR](https:\u002F\u002Fgithub.com\u002FDavidDiazGuerra\u002FgpuRIR) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FDavidDiazGuerra\u002FgpuRIR?style=social) | Python | Python library for Room Impulse Response (RIR) simulation with GPU acceleration |\n| [rir_simulator_python](https:\u002F\u002Fgithub.com\u002Fsunits\u002Frir_simulator_python) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fsunits\u002Frir_simulator_python?style=social) | Python | Room impulse response simulator using python |\n| [WavAugment](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FWavAugment) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ffacebookresearch\u002FWavAugment?style=social) | Python & PyTorch | WavAugment performs data augmentation on audio data. The audio data is represented as pytorch tensors |\n| [EEND_dataprep](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FEEND_dataprep) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FEEND_dataprep?style=social) | Bash & Python | Recipes for generating [simulated conversations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00890) used to train end-to-end diarization models. |\n\n### Other software\n\n| Link | Language | Description |\n| ---- | -------- | ----------- |\n| [VB Diarization](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FVB_diarization) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FVB_diarization?style=social) [![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FVB_diarization.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FVB_diarization) | Python | VB Diarization with Eigenvoice and HMM Priors. 
|\n| [DOVER-Lap](https:\u002F\u002Fgithub.com\u002Fdesh2608\u002Fdover-lap) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdesh2608\u002Fdover-lap?style=social) | Python | Python package for combining diarization system outputs |\n| [Diar-az](https:\u002F\u002Fgithub.com\u002Fcadia-lvl\u002Fdiar-az) | Python | Data formatting tool to support the ruv-di dataset. Kaldi to Gecko to Kaldi and corpus and back |\n\n## Datasets\n\n### Diarization datasets\n\n| Audio | Diarization ground truth | Language | Pricing | Additional information |\n| ----- | ------------------------ | -------- | ------- | ---------------------- |\n| [2000 NIST Speaker Recognition Evaluation](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2001S97) | [Disk-6 (Switchboard)](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Ftree\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FNIST_SRE2000\u002FDisk6_ground_truth), [Disk-8  (CALLHOME)](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Ftree\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FNIST_SRE2000\u002FDisk8_ground_truth) | Multiple | $2400.00 | [Evaluation Plan](https:\u002F\u002Fwww.nist.gov\u002Fsites\u002Fdefault\u002Ffiles\u002Fdocuments\u002F2017\u002F09\u002F26\u002Fspk-2000-plan-v1.0.htm_.pdf) |\n| [2003 NIST Rich Transcription Evaluation Data](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2007S10) | Together with audios | en, ar, zh | $2000.00 | telephone speech, broadcast news |\n| [CALLHOME American English Speech](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC97S42) | [CALLHOME American English Transcripts](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC97T14) | en | $1500.00 + $1000.00| [CH109 whitelist](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Fblob\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FCALLHOME_American_English\u002Fch109_whitelist.txt) |\n| [The ICSI Meeting 
Corpus](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Ficsi\u002F) | Together with audios | en | Free | [License](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Ficsi\u002Flicense.shtml) |\n| [The AMI Meeting Corpus](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Fcorpus\u002F) | Together with audios (need to be processed) | Multiple | Free | [License](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Fcorpus\u002Flicense.shtml) |\n| [Fisher English Training Speech Part 1 Speech](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2004S13) | [Fisher English Training Speech Part 1 Transcripts](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2004T19)| en | $7000.00 + $1000.00 |\n| [Fisher English Training Part 2, Speech](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2005S13) | [Fisher English Training Part 2, Transcripts](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2005T19) | en | $7000.00 + $1000.00 |\n| [VoxConverse](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fvoxconverse) | TBD | TBD | Free | VoxConverse is an audio-visual diarisation dataset consisting of over 50 hours of multispeaker clips of human speech, extracted from YouTube videos |\n| [MiniVox Benchmark](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) | [MiniVox Benchmark](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) | en | Free | MiniVox is an automatic framework to transform any speaker-labelled dataset into continuous speech datastream with episodically revealed label feedbacks. 
|\n| [The AliMeeting Corpus](https:\u002F\u002Fgithub.com\u002Fyufan-aslp\u002FAliMeeting) | Together with audios | zh | Free |  |\n\n### Speaker embedding training sets\n\n| Name | Utterances | Speakers | Language | Pricing | Additional information |\n| ---- | ---------- | -------- | -------- | ------- | ---------------------- |\n| [TIMIT](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC93S1) | 6K+ | 630 | en | $250.00 | Published in 1993, the TIMIT corpus of read speech is one of the earliest speaker recognition datasets. |\n| [VCTK](https:\u002F\u002Fhomepages.inf.ed.ac.uk\u002Fjyamagis\u002Fpage3\u002Fpage58\u002Fpage58.html) | 43K+ | 109 | en | Free | Most were selected from a newspaper plus the Rainbow Passage and an elicitation paragraph intended to identify the speaker's accent. |\n| [LibriSpeech](http:\u002F\u002Fwww.openslr.org\u002F12) | 292K | 2K+ | en | Free | Large-scale (1000 hours) corpus of read English speech. |\n| [Multilingual LibriSpeech (MLS)](http:\u002F\u002Fopenslr.org\u002F94\u002F) | ? | ? | en, de, nl, es, fr, it, pt, pl | Free | Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish. |\n| [LibriVox](https:\u002F\u002Flibrivox.org\u002F) | 180K | 9K+ | Multiple | Free | Free public domain audiobooks. LibriSpeech is a processed subset of LibriVox. Each original unsegmented utterance could be very long. |\n| [VoxCeleb 1&2](http:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fdata\u002Fvoxceleb\u002F) | 1M+ | 7K | Multiple | Free | VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube. |\n| [The Spoken Wikipedia Corpora](https:\u002F\u002Fnats.gitlab.io\u002Fswc\u002F) | 5K | 879 | en, de, nl | Free | Volunteer readers reading Wikipedia articles. 
|\n| [CN-Celeb](http:\u002F\u002Fwww.openslr.org\u002F82\u002F) | 130K+ | 1K | zh | Free | A Free Chinese Speaker Recognition Corpus Released by CSLT@Tsinghua University. |\n| [BookTubeSpeech](https:\u002F\u002Fusers.wpi.edu\u002F~jrwhitehill\u002FBookTubeSpeech\u002Findex.html) | 8K | 8K | en | Free | Audio samples extracted from BookTube videos - videos where people share their opinions on books - from YouTube. The dataset can be downloaded using [BookTubeSpeech-download](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FBookTubeSpeech-download). |\n| [DeepMine](http:\u002F\u002Fdata.deepmine.ir\u002Fen\u002Findex.html) | 540K | 1850 | fa, en | Unknown | A speech database in Persian and English designed to build and evaluate speaker verification, as well as Persian ASR systems. |\n| [NISP-Dataset](https:\u002F\u002Fgithub.com\u002Fiiscleap\u002FNISP-Dataset) | ? | 345 | hi, kn, ml, ta, te (all Indian languages) | Free | This dataset contains speech recordings along with speaker physical parameters (height, weight, ... ) as well as regional information and linguistic information. |\n| [VoxBlink2](https:\u002F\u002Fvoxblink2.github.io\u002F) | 10M | 100k+ | 18 languages (en, pt, es, ru, ar, ...) | CC BY-NC-SA 4.0 | Multilingual dataset from [VoxBlink2: A 100K+ Speaker Recognition Corpus and the Open-Set Speaker-Identification Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11510) |\n\n### Augmentation noise sources\n\n| Name | Utterances | Pricing | Additional information |\n| ---- | ---------- | ------- | ---------------------- |\n| [AudioSet](https:\u002F\u002Fresearch.google.com\u002Faudioset\u002F) | 2M | Free | A large-scale dataset of manually annotated audio events. |\n| [MUSAN](https:\u002F\u002Fwww.openslr.org\u002F17\u002F) | N\u002FA | Free | MUSAN is a corpus of music, speech, and noise recordings. 
|\n\n## Conferences\n\n| Conference\u002FWorkshop | Frequency | Page Limit  | Organization | Blind Review |\n| ------------------- | --------- | ----------  | ------------ | ------------ |\n| ICASSP              | Annual    | 4 + 1 (ref) | IEEE         | No           |\n| InterSpeech         | Annual    | 4 + 1 (ref) | ISCA         | No           |\n| Speaker Odyssey     | Biennial  | 8 + 2 (ref) | ISCA         | No           |\n| SLT                 | Biennial  | 6 + 2 (ref) | IEEE         | Yes          |\n| ASRU                | Biennial  | 6 + 2 (ref) | IEEE         | Yes          |\n| WASPAA              | Biennial  | 4 + 1 (ref) | IEEE         | No           |\n| IJCB                | Annual    | 8       | IEEE & IAPR TC-4 | Yes          |\n\n## Other learning materials\n\n### Online courses\n\n* Course on Udemy: [A Tutorial on Speaker Diarization](https:\u002F\u002Fwww.udemy.com\u002Fcourse\u002Fdiarization\u002F?referralCode=21D7CC0AEABB7FE3680F)\n\n### Books\n\n* [Voice Identity Techniques: From core algorithms to engineering practice (Chinese)](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FVoiceIdentityBook) by Quan Wang, 2020\n\n### Tech blogs\n\n* [Literature Review For Speaker Change Detection](https:\u002F\u002Fhedonistrh.github.io\u002F2018-07-09-Literature-Review-for-Speaker-Change-Detection\u002F)\n  by [Halil Erdoğan](https:\u002F\u002Fgithub.com\u002Fhedonistrh)\n* [Speaker Diarization: Separation of Multiple Speakers in an Audio File](https:\u002F\u002Fmedium.com\u002Fdatadriveninvestor\u002Fspeaker-diarization-22121f1264b1) by [Jaspreet Singh](https:\u002F\u002Fmedium.com\u002F@jaspreetuseducation)\n* [Speaker Diarization with Kaldi](https:\u002F\u002Ftowardsdatascience.com\u002Fspeaker-diarization-with-kaldi-e30301b05cc8) by [Yoav Ramon](https:\u002F\u002Ftowardsdatascience.com\u002F@yoavramon)\n* [Who spoke when! 
How to Build your own Speaker Diarization Module](https:\u002F\u002Fmedium.com\u002Fsaarthi-ai\u002Fwho-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279) by Rahul Saxena\n\n### Video tutorials\n\n* [pyannote audio: neural building blocks for speaker diarization](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=37R_R82lfwA) by Hervé Bredin\n* [Google's Diarization System: Speaker Diarization with LSTM](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=pjxGPZQeeO4) by Google\n* [Fully Supervised Speaker Diarization: Say Goodbye to clustering](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=pGkqwRPzx9U) by Google\n* [Turn-to-Diarize: Online Speaker Diarization Constrained by Transformer Transducer Speaker Turn Detection](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=U79Aw1ky7ag) by Google\n* [Speaker Diarization: Optimal Clustering and Learning Speaker Embeddings](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vcyB8xb1-ys) by Microsoft Research\n* [Robust Speaker Diarization for Meetings: the ICSI system](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kEcUcfLmIS0) by Microsoft Research\n* [【机器之心&博文视点】入门声纹技术｜第二讲：声纹分割聚类与其他应用](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=HE9JW8yKYRk) by Quan Wang\n\n## Products\n\n| Company | Product                                                                                                                                      |\n| ------- |----------------------------------------------------------------------------------------------------------------------------------------------|\n| Google  | [Recorder app](https:\u002F\u002Fsupport.google.com\u002Fpixelphone?p=recorder_speaker_labels)                                                              |\n| Google  | [Google Cloud Speech-to-Text API](https:\u002F\u002Fcloud.google.com\u002Fspeech-to-text\u002Fdocs\u002Fmultiple-voices)                                              |\n| Amazon  | [Amazon Transcribe](https:\u002F\u002Faws.amazon.com\u002Ftranscribe) 
                                                                                      |\n| IBM     | [Watson Speech To Text API](https:\u002F\u002Fwww.ibm.com\u002Fwatson\u002Fservices\u002Fspeech-to-text)                                                              |\n| DeepAffects | [Speaker Diarization API](https:\u002F\u002Fwww.deepaffects.com\u002Fdiarization-api)                                                                       |\n| Alibaba | [Tingwu (听悟)](https:\u002F\u002Ftingwu.aliyuncs.com\u002Ftrans)                                                                                             |\n| Microsoft | [Azure Conversation Transcription API](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fazure\u002Fcognitive-services\u002Fspeech-service\u002Fconversation-transcription) |\n\n## Star History\n\n[![Star History Chart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fwq2012_awesome-diarization_readme_e43ec8f5d3e1.png)](https:\u002F\u002Fstar-history.com\u002F#wq2012\u002Fawesome-diarization&Date)\n","# 优秀的说话人日志技术 [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome) [![贡献](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fcontributions-welcome-brightgreen.svg?style=flat)](https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fblob\u002Fmaster\u002FCONTRIBUTING.md)\n\n## 目录\n\n* [概述](#Overview)\n* [论文](#Publications)\n* [软件](#Software)\n  * [框架](#Framework)\n  * [评估](#Evaluation)\n  * [聚类](#Clustering)\n  * [说话人嵌入](#Speaker-embedding)\n  * [说话人变化检测](#Speaker-change-detection)\n  * [音频特征提取](#Audio-feature-extraction)\n  * [音频数据增强](#Audio-data-augmentation)\n  * [其他软件](#Other-software)\n* [数据集](#Datasets)\n  * [说话人日志数据集](#Diarization-datasets)\n  * [说话人嵌入训练集](#Speaker-embedding-training-sets)\n  * [增强噪声源](#Augmentation-noise-sources)\n* [会议](#Conferences)\n* 
[其他学习资料](#Other-learning-materials)\n  * [在线课程](#Online-courses)\n  * [书籍](#Books)\n  * [技术博客](#Tech-blogs)\n  * [视频教程](#Video-tutorials)\n* [产品](#Products)\n\n## 概述\n\n这是一个精心整理的关于优秀说话人日志技术的论文、库、数据集及其他资源列表。\n\n本仓库旨在整理全球范围内与说话人日志相关的资源，使其易于获取并具有广泛的应用价值。\n\n如需添加内容，请直接提交拉取请求。（[贡献指南](CONTRIBUTING.md)）\n\n## 论文\n\n### 特别专题\n\n#### 综述与调查论文\n\n* [基于深度学习的说话人日志技术最新进展综述](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.09624.pdf), 2021年\n* [说话人日志系统及方法综述](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fabs\u002Fpii\u002FS0167639312000696), 2012年\n* [说话人日志技术：近期研究综述](http:\u002F\u002Fwww.eurecom.fr\u002Ffr\u002Fpublication\u002F3152\u002Fdownload\u002Fmm-publi-3152.pdf), 2010年\n\n#### 大型语言模型（LLM）\n\n* [DiarizationLM：利用大型语言模型进行说话人日志后处理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.03506), 2024年\n* [结合大型语言模型提升说话人日志性能：一种上下文感知的束搜索方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.05248), 2023年\n* [词汇级说话人错误纠正：利用语言模型修正说话人日志中的错误](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09313), 2023年\n\n#### 有监督说话人日志\n\n* [DiaPer：基于Perceiver吸引子的端到端神经网络说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.04324), 2023年\n* [TOLD：一种新颖的双阶段重叠感知说话人日志框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05397), 2023年\n* [面向多人群会议分析的重叠感知神经网络说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10243), 2022年\n* [使用局部-全局网络和判别式说话人嵌入实现可变发言人数的端到端说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.02096), 2021年\n* [针对多领域数据的样本均值损失有监督在线说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.01266), 2019年\n* [用于说话人日志的判别式神经聚类](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.09703), 2019年\n* [无排列目标的端到端神经网络说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.05952), 2019年\n* [带有自注意力机制的端到端神经网络说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.06247), 2019年\n* [完全有监督的说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04719), 2018年\n\n#### 说话人日志与自动语音识别联合\n\n* [多人群会议中基于说话人的自动语音识别对比研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.16834), 2022年\n* [Turn-to-Diarize：基于Transformer 
Transducer说话人轮次检测约束的在线说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.11641), 2021年\n* [Transcribe-to-Diarize：使用端到端说话人归属ASR实现无限数量发言人的神经网络说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.03151), 2021年\n* [通过序列转换实现语音识别与说话人日志联合](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05337), 2019年\n* [是谁在说？用于联合语音识别、分割和说话人日志的深度学习模型](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8462375), 2018年\n\n#### 在线说话人日志\n\n* [MiniVox中的完全在线赌博学习问题形式的说话人日志](https:\u002F\u002Fproceedings.mlr.press\u002Fv157\u002Flin21c\u002Flin21c.pdf), 2021年\n* [基于关系网络的在线说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08162), 2020年\n* [即时VoiceID：从零开始学习的说话人识别系统](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Fpdfs\u002Finterspeech_2020\u002Flin20b_interspeech.pdf), 2020年\n\n#### 竞赛挑战\n\n* [M2MeT：ICASSP 2022多通道多人群会议转录挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.07393), 2022年\n* [日立-JHU DIHARD III系统：由DOVER-Lap结合的竞争性端到端神经网络说话人日志和x-vector聚类系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.01363)\n* [说话人日志真难：JHU团队在首届DIHARD挑战赛中的经验与教训](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Fpdfs\u002Finterspeech_2018\u002Fsell18_interspeech.pdf), 2018年\n* [ODESSA参加2018年Albayzin说话人日志挑战赛](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FIberSPEECH_2018\u002Fpdfs\u002FIberS18_AE-5_Patino.pdf), 2018年\n* [为DIHARD挑战赛联合进行判别式嵌入学习、语音活动与重叠检测](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2018\u002Fpdfs\u002F2304.pdf), 2018年\n\n#### 视听说话人日志\n\n* [AVA-AVD：野外环境下的视听说话人日志](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1145\u002F3503161.3548027), 2022年\n* [DyViSE：动态视觉引导的视听说话人日志嵌入](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9948860), 2022年\n* [端到端视听神经网络说话人日志](https:\u002F\u002Fisca-speech.org\u002Farchive\u002Finterspeech_2022\u002Fhe22c_interspeech.html), 2022年\n* [MSDWild：野外环境下的多模态说话人日志数据集](https:\u002F\u002Fisca-speech.org\u002Farchive\u002Finterspeech_2022\u002Fliu22t_interspeech.html), 2022年\n\n### 其他\n\n#### 2021年\n\n* 
[基于端到端局部分割的重叠感知低延迟在线说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.06483)\n* [用于重叠感知重新分割的端到端说话人分割](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002Finterspeech_2021\u002Fbredin21_interspeech.html)\n* [DIVE：通过迭代说话人嵌入实现端到端语音日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.13802)\n* [DOVER-Lap：一种结合重叠感知日志输出的方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.01997)\n* [在说话人日志中对x-vector序列进行贝叶斯HMM聚类（VBx）：理论、实现及标准任务上的分析](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0885230821000619)\n* [AISHELL-4：会议场景下语音增强、分离、识别与说话人日志的开源数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03603)，2021年\n\n#### 2020年\n\n* [用于改善多媒体内容访问的端到端说话人日志服务](https:\u002F\u002Fnem-initiative.org\u002Fwp-content\u002Fuploads\u002F2020\u002F07\u002F1-4-an_end_to_end_speaker_diarization_service_for_improving_multimedia_content_access.pdf)\n* [发现对话：野外环境下的说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01216)\n* [基于区域提议网络的说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.06220)\n* [目标说话人语音活动检测：一种针对晚宴场景下多说话人日志的新方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07272)\n\n#### 2019年\n\n* [重叠感知的日志：利用神经网络端到端重叠语音检测进行重新分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11646)\n* [利用生成对抗网络中的潜在空间聚类进行说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11398)\n* [使用GAN混合模型的半监督说话人日志系统研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11416)\n* [通过多层自举网络学习深度表示以用于说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10969)\n* [纯音频日志系统的改进](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00082)\n* [基于LSTM的相似性度量与谱聚类用于说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10393)\n* [使用虚拟麦克风阵列进行会议转录](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fuploads\u002Fprod\u002F2019\u002F05\u002FDenmarkTechReport-5ccb8b095c8f3.pdf)\n* [利用嵌入的二维自注意力组合进行说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.03190)\n* [结合词汇信息的说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06756)\n\n#### 2018年\n\n* 
[神经网络语音轮次分割与亲和力传播用于说话人日志](https:\u002F\u002Fhal.archives-ouvertes.fr\u002Fhal-01912236\u002F)\n* [利用序列到序列神经网络结合词汇与声学线索进行多模态说话人分割与日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.10731)\n* [联合使用卷积与循环神经网络进行说话人日志与识别](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8461666)\n\n#### 2017年\n\n* [基于LSTM的说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.10468)\n* [使用深度神经网络嵌入进行说话人日志](http:\u002F\u002Fdanielpovey.com\u002Ffiles\u002F2017_icassp_diarization_embeddings.pdf)\n* [利用卷积神经网络进行统计积累优化的说话人日志](https:\u002F\u002Fpdfs.semanticscholar.org\u002F35c4\u002F0fde977932d8a3cd24f5a1724c9dbca8b38d.pdf)\n* [pyannote.metrics：一个用于可重复评估、诊断及错误分析说话人日志系统的工具包](https:\u002F\u002Fwww.isca-speech.org\u002Farchive\u002FInterspeech_2017\u002Fpdfs\u002F0411.PDF)\n* [利用双向长短期记忆网络进行广播电视中的说话人变化检测](https:\u002F\u002Fpdfs.semanticscholar.org\u002Fedff\u002Fb62b32ffcc2b5cc846e26375cb300fac9ecc.pdf)\n* [使用深度递归卷积神经网络提取说话人嵌入进行说话人日志](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.02840)\n\n#### 2016年\n\n* [用于研究同伴主导团队学习小组的说话人日志系统](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.07136.pdf)\n\n#### 2015年\n\n* [在因子分析子空间中的日志重新分割](https:\u002F\u002Fengineering.jhu.edu\u002Fhltcoe\u002Fwp-content\u002Fuploads\u002Fsites\u002F92\u002F2016\u002F10\u002FSell_Garcia-Romero_2015A.pdf)\n\n#### 2014年\n\n* [基于余弦距离的均值漂移在电话语音日志中的应用研究](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FPatrick_Kenny\u002Fpublication\u002F260661427_A_Study_of_the_Cosine_Distance-Based_Mean_Shift_for_Telephone_Speech_Diarization\u002Flinks\u002F0c96053270d2eaa133000000.pdf)\n* [使用PLDA i-vector评分与无监督校准进行说话人日志](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F7078610)\n* [用于说话人日志的人工神经网络特征](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F7078608)\n\n#### 2013年\n\n* [说话人日志的无监督方法：一种集成与迭代的方法](http:\u002F\u002Fgroups.csail.mit.edu\u002Fsls\u002Fpublications\u002F2013\u002FShum_IEEE_Oct-2013.pdf)\n\n#### 2011年\n\n* 
[基于PLDA的聚类用于广播流的说话人日志](https:\u002F\u002Fpdfs.semanticscholar.org\u002F0175\u002Fa752c5c72cadc7c0b899fd15f2f6b93c3335.pdf)\n* [基于说话人角色n-gram模型的会议说话人日志](https:\u002F\u002Fpublications.idiap.ch\u002Fdownloads\u002Fpapers\u002F2011\u002FValente_ICASSP2011_2011.pdf)\n\n#### 2009年\n\n* [会议室音频的说话人日志](https:\u002F\u002Fwiki.inf.ed.ac.uk\u002Ftwiki\u002Fpub\u002FCSTR\u002FListenSemester2_2009_10\u002Fsun_IS2009_SpDia_meeting.PDF)\n\n#### 2008年\n\n* [基于说话人因子与特征语音的流式说话人分割](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FPietro_Laface\u002Fpublication\u002F224313019_Stream-based_speaker_segmentation_using_speaker_factors_and_eigenvoices\u002Flinks\u002F5770fe8608ae10de639dc121.pdf)\n\n#### 2006年\n\n* [自动说话人日志系统的概述](https:\u002F\u002Falize.univ-avignon.fr\u002Fdoc\u002Fpublis\u002F06_IEEE-TASP_Tranter.pdf)\n* [基于谱聚类的说话人日志方法](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~hning2\u002Fpapers\u002FNing_spectral.pdf)\n\n## 软件\n\n### 框架\n\n| 链接 | 语言 | 描述 |\n| ---- | -------- | ----------- |\n| [FunASR](https:\u002F\u002Fgithub.com\u002Falibaba-damo-academy\u002FFunASR) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Falibaba-damo-academy\u002FFunASR?style=social) | Python & PyTorch | FunASR 是一个基于 PyTorch 的开源语音工具包，旨在弥合学术研究与工业应用之间的差距。 |\n| [MiniVox](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdoerlbh\u002FMiniVox?style=social) | MATLAB | MiniVox 是一个用于在线说话人日志任务的开源评估系统。 |\n| [SpeechBrain](https:\u002F\u002Fgithub.com\u002Fspeechbrain\u002Fspeechbrain) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fspeechbrain\u002Fspeechbrain?style=social) | Python & PyTorch | SpeechBrain 是一个基于 PyTorch 的开源、一体化语音工具包。 |\n| [SIDEKIT for diarization (s4d)](https:\u002F\u002Fprojets-lium.univ-lemans.fr\u002Fs4d\u002F) | Python | SIDEKIT 的开源扩展包，专门用于说话人日志。 |\n| [pyAudioAnalysis](https:\u002F\u002Fgithub.com\u002Ftyiannak\u002FpyAudioAnalysis) ![GitHub 
星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyiannak\u002FpyAudioAnalysis?style=social) | Python | Python 音频分析库：特征提取、分类、分割及应用。 |\n| [AaltoASR](https:\u002F\u002Fgithub.com\u002Faalto-speech\u002Fspeaker-diarization) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Faalto-speech\u002Fspeaker-diarization?style=social) | Python & Perl | 基于 AaltoASR 的说话人日志脚本。 |\n| [LIUM SpkDiarization](https:\u002F\u002Fprojets-lium.univ-lemans.fr\u002Fspkdiarization\u002F) | Java | LIUM_SpkDiarization 是一款专门用于说话人日志（即说话人分割与聚类）的软件，采用 Java 编写，并包含了该领域截至 2013 年的最新研究成果。 |\n| [kaldi-asr](https:\u002F\u002Fgithub.com\u002Fkaldi-asr\u002Fkaldi\u002Ftree\u002Fmaster\u002Fegs\u002Fcallhome_diarization) [![构建状态](https:\u002F\u002Ftravis-ci.com\u002Fkaldi-asr\u002Fkaldi.svg?branch=master)](https:\u002F\u002Ftravis-ci.com\u002Fkaldi-asr\u002Fkaldi) | Bash | 用于 2000 年 NIST 说话人识别评测中 CALLHOME 数据集一部分的说话人日志示例脚本。 |\n| [kaldi-speaker-diarization](https:\u002F\u002Fgithub.com\u002Fcadia-lvl\u002Fkaldi-speaker-diarization) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fcadia-lvl\u002Fkaldi-speaker-diarization?style=social) | Bash | 使用 Kaldi 进行冰岛语说话人日志的脚本。 |\n| [Alize LIA_SpkSeg](https:\u002F\u002Falize.univ-avignon.fr\u002F) | C++ | ALIZÉ 是一个用于说话人识别的开源平台。LIA_SpkSeg 是其中用于说话人日志的工具。 |\n| [pyannote-audio](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-audio) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpyannote\u002Fpyannote-audio?style=social) | Python | 用于说话人日志的神经网络模块：语音活动检测、说话人变化检测、说话人嵌入。 |\n| [pyBK](https:\u002F\u002Fgithub.com\u002Fjosepatino\u002FpyBK) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjosepatino\u002FpyBK?style=social) | Python | 基于二进制关键说话人模型的说话人日志。这是一种计算开销较低、无需外部训练数据的解决方案。 |\n| [Speaker-Diarization](https:\u002F\u002Fgithub.com\u002Ftaylorlu\u002FSpeaker-Diarization) ![GitHub 
星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftaylorlu\u002FSpeaker-Diarization?style=social) | Python | 使用 uis-rnn 和 GhostVLAD 进行说话人日志。一种更易于支持开放集说话人的方法。 |\n| [EEND](https:\u002F\u002Fgithub.com\u002Fhitachi-speech\u002FEEND) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhitachi-speech\u002FEEND?style=social) | Python & Bash & Perl | 端到端神经网络说话人日志。 |\n| [VBx](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FVBx) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FVBx?style=social) | Python | 基于 x-vector 的变分贝叶斯隐马尔可夫模型说话人日志。x-vector 提取器 [配方](https:\u002F\u002Fgithub.com\u002Fphonexiaresearch\u002FVBx-training-recipe) |\n| [RE-VERB](https:\u002F\u002Fgithub.com\u002Fteam-re-verb\u002FRE-VERB) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fteam-re-verb\u002FRE-VERB?style=social) | Python & JavaScript | RE: VERB 是一个说话人日志系统，允许用户上传或录制对话音频，并获取每位发言者的时间戳。 |\n| [StreamingSpeakerDiarization](https:\u002F\u002Fgithub.com\u002Fjuanmc2005\u002FStreamingSpeakerDiarization\u002F) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjuanmc2005\u002FStreamingSpeakerDiarization?style=social) | Python | 流式说话人日志，扩展了 [pyannote.audio](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-audio) 以支持在线处理。 |\n| [simple_diarizer](https:\u002F\u002Fgithub.com\u002Fcvqluu\u002Fsimple_diarizer) | Python | 使用一些预训练模型的简化说话人日志流程。旨在尽可能简单地将输入音频文件转换为已标注的说话人片段。 |\n| [Picovoice Falcon](https:\u002F\u002Fgithub.com\u002FPicovoice\u002Ffalcon) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FPicovoice\u002Ffalcon?style=social) | C & Python | 一个[轻量级、高精度且快速](https:\u002F\u002Fpicovoice.ai\u002Fdocs\u002Fbenchmark\u002Fspeaker-diarization\u002F#accuracy)的说话人日志引擎，用 C 语言编写并提供 Python 接口，在 CPU 上运行时开销极小。 |\n| [DiaPer](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FDiaPer) ![GitHub 
星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FDiaPer?style=social) | Python | DiaPer：基于 Perceiver 的吸引子的端到端神经网络说话人日志的 PyTorch 实现，包含在免费公开数据上预训练的模型。 |\n| [sherpa-onnx](https:\u002F\u002Fgithub.com\u002Fk2-fsa\u002Fsherpa-onnx) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fk2-fsa\u002Fsherpa-onnx?style=social) | C++ & C & `C#` & Dart & Go & Java & JavaScript & Kotlin & Pascal & Python & Rust & Swift | 支持多种平台和语言绑定下的说话人日志、语音识别和文本转语音功能。 |\n| [FluidAudio](https:\u002F\u002Fgithub.com\u002FFluidInference\u002FFluidAudio) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFluidInference\u002FFluidAudio?style=social) | Swift | 一款原生 Swift 的苹果平台说话人日志库，利用 CoreML 实现高效、实时的高精度音频处理。 |\n\n### 评估\n\n| 链接 | 语言 | 描述 |\n| ---- | -------- | ----------- |\n| [pyannote-metrics](https:\u002F\u002Fgithub.com\u002Fpyannote\u002Fpyannote-metrics) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fpyannote\u002Fpyannote-metrics?style=social) [![构建状态](https:\u002F\u002Ftravis-ci.org\u002Fpyannote\u002Fpyannote-metrics.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fpyannote\u002Fpyannote-metrics)  | Python| 用于可重复评估、诊断和错误分析的说话人日志系统工具包。 |\n| [SimpleDER](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSimpleDER) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FSimpleDER?style=social) ![Python 包](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSimpleDER\u002Fworkflows\u002FPython%20package\u002Fbadge.svg) | Python | 一个轻量级库，用于计算日志错误率（DER）。 |\n| [DiarizationLM](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Fblob\u002Fmaster\u002FDiarizationLM\u002FREADME.md) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fspeaker-id?style=social) 
[![构建状态](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Factions\u002Fworkflows\u002Fpython-app-diarizationlm.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Factions\u002Fworkflows\u002Fpython-app-diarizationlm.yml) | Python | 实现了词错误率（WER）、词日志错误率（WDER）以及拼接最小置换词错误率（cpWER）。 |\n| NIST md-eval | Perl | (1) 修改自 [Mary Tai Knox](http:\u002F\u002Fwww1.icsi.berkeley.edu\u002F~knoxm) 的 [md-eval.pl](http:\u002F\u002Fwww1.icsi.berkeley.edu\u002F~knoxm\u002Fdia\u002F)；(2) 来自 [jitendra](https:\u002F\u002Fgithub.com\u002Fjitendrab) 的 [md-eval-v21.pl](https:\u002F\u002Fgithub.com\u002Fjitendrab\u002Fbtp\u002Fblob\u002Fmaster\u002Fc_code\u002Fsingle_diag_gaussian_no_viterbi\u002Fmd-eval-v21.pl)；(3) 来自 [nryant](https:\u002F\u002Fgithub.com\u002Fnryant) 的 [md-eval-22.pl](https:\u002F\u002Fgithub.com\u002Fnryant\u002Fdscore\u002Fblob\u002Fmaster\u002Fscorelib\u002Fmd-eval-22.pl) |\n| [dscore](https:\u002F\u002Fgithub.com\u002Fnryant\u002Fdscore) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fnryant\u002Fdscore?style=social) | Python & Perl | 日志评分工具。 |\n| [Sequence Match Accuracy](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fuis-rnn\u002Fblob\u002Fmaster\u002Fuisrnn\u002Fevals.py) | Python | 使用匈牙利算法匹配两个序列的准确度。 |\n| [spyder](https:\u002F\u002Fgithub.com\u002Fdesh2608\u002Fspyder) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdesh2608\u002Fspyder?style=social) | Python & C++ | 用于快速计算 DER 的简单 Python 包。 |\n| [CDER](https:\u002F\u002Fgithub.com\u002FSpeechClub\u002FCDER_Metric) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FSpeechClub\u002FCDER_Metric?style=social) | Python | 来自论文《会话语段说话人日志任务：数据集、评估指标与基线》（[arXiv:2208.08042](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.08042)）中的会话型 DER。\n\n### 聚类\n\n| 链接 | 语言 | 描述 |\n| ---- | -------- | ----------- |\n| [uis-rnn](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fuis-rnn) ![GitHub 
星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fgoogle\u002Fuis-rnn?style=social) [![构建状态](https:\u002F\u002Ftravis-ci.org\u002Fgoogle\u002Fuis-rnn.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fgoogle\u002Fuis-rnn) | Python & PyTorch | 谷歌的无界交错状态循环神经网络（UIS-RNN）算法，用于完全监督的说话人日志。该聚类算法是**监督式**的。 |\n| [uis-rnn-sml](https:\u002F\u002Fgithub.com\u002FDonkeyShot21\u002Fuis-rnn-sml) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FDonkeyShot21\u002Fuis-rnn-sml?style=social) | Python & PyTorch | UIS-RNN 的一种变体，用于论文《基于样本均值损失的多领域数据在线监督日志》。 |\n| [DNC](https:\u002F\u002Fgithub.com\u002FFlorianKrey\u002FDNC) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FFlorianKrey\u002FDNC?style=social) | Python & ESPnet | 基于 Transformer 的判别性神经聚类（DNC），用于说话人日志。与 UIS-RNN 一样，它也是**监督式**的。 |\n| [SpectralCluster](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FSpectralCluster) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FSpectralCluster?style=social) [![构建状态](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FSpectralCluster.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FSpectralCluster) | Python | 具有亲和矩阵优化操作、自动调优和说话人轮次约束的谱聚类。 |\n| [sklearn.cluster](https:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html) [![构建状态]( https:\u002F\u002Fapi.travis-ci.org\u002Fscikit-learn\u002Fscikit-learn.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fscikit-learn\u002Fscikit-learn) | Python | scikit-learn 中的聚类算法。 |\n| [PLDA](https:\u002F\u002Fgithub.com\u002FRaviSoji\u002Fplda) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FRaviSoji\u002Fplda?style=social) | Python | 概率线性判别分析及分类，用 Python 编写。 |\n| [PLDA](https:\u002F\u002Fgithub.com\u002Fmrouvier\u002Fplda) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmrouvier\u002Fplda?style=social) | C++ | 开源的简化版概率线性判别分析（PLDA）实现。 |\n| [Auto-Tuning Spectral 
Clustering](https:\u002F\u002Fgithub.com\u002Ftango4j\u002FAuto-Tuning-Spectral-Clustering.git) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftango4j\u002FAuto-Tuning-Spectral-Clustering?style=social) | Python | 自动调优的谱聚类方法，无需开发集或监督调优。 |\n\n### 说话人嵌入\n\n| 链接 | 方法 | 语言 | 描述 |\n| ---- | ------ | -------- | ----------- |\n| [resemble-ai\u002FResemblyzer](https:\u002F\u002Fgithub.com\u002Fresemble-ai\u002FResemblyzer) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fresemble-ai\u002FResemblyzer?style=social) | d-vector | Python & PyTorch | 基于 PyTorch 的广义端到端损失的说话人验证实现，可用于语音克隆和角色分离。 |\n| [Speaker_Verification](https:\u002F\u002Fgithub.com\u002FJanghyun1230\u002FSpeaker_Verification) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FJanghyun1230\u002FSpeaker_Verification?style=social) | d-vector | Python & TensorFlow | 基于 TensorFlow 的广义端到端损失的说话人验证实现。 |\n| [PyTorch_Speaker_Verification](https:\u002F\u002Fgithub.com\u002FHarryVolek\u002FPyTorch_Speaker_Verification) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FHarryVolek\u002FPyTorch_Speaker_Verification?style=social) | d-vector | Python & PyTorch | Wan、Li 等人提出的“用于说话人验证的广义端到端损失”的 PyTorch 实现，并集成了 UIS-RNN。 |\n| [Real-Time Voice Cloning](https:\u002F\u002Fgithub.com\u002FCorentinJ\u002FReal-Time-Voice-Cloning) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FCorentinJ\u002FReal-Time-Voice-Cloning?style=social) | d-vector | Python & PyTorch | “从说话人验证到多说话人文本转语音合成的迁移学习”（SV2TTS）的实现，配备可实时工作的声码器。 |\n| [conformer-speaker-encoder](https:\u002F\u002Fhuggingface.co\u002Ftflite-hub\u002Fconformer-speaker-encoder) | d-vector |Python & TFLite | 大规模多语言的基于 Conformer 的说话人识别模型，以 TFLite 格式提供。 |\n| [deep-speaker](https:\u002F\u002Fgithub.com\u002Fphilipperemy\u002Fdeep-speaker) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fphilipperemy\u002Fdeep-speaker?style=social) | d-vector |Python & 
Keras | 第三方实现百度论文《Deep Speaker：一个端到端神经网络说话人嵌入系统》。 |\n| [x-vector-kaldi-tf](https:\u002F\u002Fgithub.com\u002Fhsn-zeinali\u002Fx-vector-kaldi-tf) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fhsn-zeinali\u002Fx-vector-kaldi-tf?style=social) | x-vector | Python & TensorFlow & Perl | 在 Kaldi 流程基础上，使用 TensorFlow 实现 x-vector 拓扑结构。 |\n| [kaldi-ivector](https:\u002F\u002Fgithub.com\u002Fidiap\u002Fkaldi-ivector) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fidiap\u002Fkaldi-ivector?style=social) | i-vector | C++ & Perl | Kaldi 的扩展，实现了标准的 i-vector 超参数估计和提取流程。 |\n| [voxceleb-ivector](https:\u002F\u002Fgithub.com\u002Fswshon\u002Fvoxceleb-ivector) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fswshon\u002Fvoxceleb-ivector?style=social) | i-vector |Perl | 基于 Voxceleb1 i-vector 的说话人识别系统。 |\n| [pytorch_xvectors](https:\u002F\u002Fgithub.com\u002Fmanojpamk\u002Fpytorch_xvectors) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fmanojpamk\u002Fpytorch_xvectors?style=social) | x-vector | Python & PyTorch | Voxceleb x-vectors 的 PyTorch 实现。此外，还包括用于嵌入训练的元学习架构。已通过说话人角色分离和说话人验证进行评估。 |\n| [ASVtorch](https:\u002F\u002Fgitlab.com\u002Fville.vestman\u002Fasvtorch) | i-vector | Python & PyTorch | ASVtorch 是一个自动说话人识别工具包。 |\n| [asv-subtools](https:\u002F\u002Fgithub.com\u002FSnowdar\u002Fasv-subtools) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FSnowdar\u002Fasv-subtools?style=social) | i-vector & x-vector | Kaldi & PyTorch | ASV-Subtools 基于 PyTorch 和 Kaldi 开发，用于说话人识别、语言辨识等任务。“sub”表示该工具包含多个模块化组件，共同构成整体。 |\n| [WeSpeaker](https:\u002F\u002Fgithub.com\u002Fwenet-e2e\u002Fwespeaker.git) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwenet-e2e\u002Fwespeaker?style=social) | x-vector & r-vector | Python & C++ & PyTorch | WeSpeaker 是一个面向研究与生产的说话人验证、识别和角色分离工具包，支持强大的配方，具备实时数据准备、模型训练与评估功能，以及运行时的 C++ 代码。 |\n| 
[ReDimNet](https:\u002F\u002Fgithub.com\u002FIDRnD\u002FReDimNet) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FIDRnD\u002FReDimNet?style=social) | 改进的 ResNet | PyTorch | 论文[用于说话人识别的重塑维度网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.18223)中提出的神经网络架构。 |\n\n\n### 说话人变化检测\n\n| 链接  | 语言 | 描述 |\n| ----  | -------- | ----------- |\n| [change_detection](https:\u002F\u002Fgithub.com\u002Fyinruiqing\u002Fchange_detection) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fyinruiqing\u002Fchange_detection?style=social) | Python & Keras | 使用双向长短期记忆网络进行广播电视中说话人变化检测的代码。 |\n| [tinydiarize](https:\u002F\u002Fgithub.com\u002Fakashmjn\u002Ftinydiarize) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fakashmjn\u002Ftinydiarize?style=social) | Python | 在 OpenAI Whisper 解码器中进行角色分离 |\n\n### 音频特征提取\n\n| 链接  | 语言 | 描述 |\n| ----  | -------- | ----------- |\n| [LibROSA](https:\u002F\u002Fgithub.com\u002Flibrosa\u002Flibrosa) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Flibrosa\u002Flibrosa?style=social) | Python | 用于音频和音乐分析的 Python 库。https:\u002F\u002Flibrosa.github.io\u002F |\n| [python_speech_features](https:\u002F\u002Fgithub.com\u002Fjameslyons\u002Fpython_speech_features) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fjameslyons\u002Fpython_speech_features?style=social) | Python | 该库提供了 ASR 中常用的语音特征，包括 MFCC 和滤波器组能量。https:\u002F\u002Fpython-speech-features.readthedocs.io\u002Fen\u002Flatest\u002F |\n| [pyAudioAnalysis](https:\u002F\u002Fgithub.com\u002Ftyiannak\u002FpyAudioAnalysis) ![GitHub 星标](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ftyiannak\u002FpyAudioAnalysis?style=social) | Python | Python 音频分析库：特征提取、分类、分割及应用。 |\n\n### 音频数据增强\n\n| 链接  | 语言 | 描述 |\n| ----  | -------- | ----------- |\n| [pyroomacoustics](https:\u002F\u002Fgithub.com\u002FLCAV\u002Fpyroomacoustics) ![GitHub 
stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FLCAV\u002Fpyroomacoustics?style=social) | Python | Pyroomacoustics 是一个用于室内应用的音频信号处理包。它被开发为一种在室内场景中快速原型化波束形成算法的平台。https:\u002F\u002Fpyroomacoustics.readthedocs.io |\n| [gpuRIR](https:\u002F\u002Fgithub.com\u002FDavidDiazGuerra\u002FgpuRIR) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FDavidDiazGuerra\u002FgpuRIR?style=social) | Python | 使用 GPU 加速的房间脉冲响应（RIR）仿真 Python 库 |\n| [rir_simulator_python](https:\u002F\u002Fgithub.com\u002Fsunits\u002Frir_simulator_python) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fsunits\u002Frir_simulator_python?style=social) | Python | 使用 Python 的房间脉冲响应模拟器 |\n| [WavAugment](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FWavAugment) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Ffacebookresearch\u002FWavAugment?style=social) | Python & PyTorch | WavAugment 对音频数据进行数据增强。音频数据以 PyTorch 张量的形式表示 |\n| [EEND_dataprep](https:\u002F\u002Fgithub.com\u002FBUTSpeechFIT\u002FEEND_dataprep) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FBUTSpeechFIT\u002FEEND_dataprep?style=social) | Bash & Python | 用于生成[模拟对话](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00890)的配方，这些对话用于训练端到端的说话人分离模型。 |\n\n### 其他软件\n\n| 链接 | 语言 | 描述 |\n| ---- | -------- | ----------- |\n| [VB Diarization](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FVB_diarization) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fwq2012\u002FVB_diarization?style=social) [![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FVB_diarization.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fwq2012\u002FVB_diarization) | Python | 基于特征语音和 HMM 先验的 VB 说话人分离。 |\n| [DOVER-Lap](https:\u002F\u002Fgithub.com\u002Fdesh2608\u002Fdover-lap) ![GitHub stars](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002Fdesh2608\u002Fdover-lap?style=social) | Python | 
用于合并说话人分离系统输出的 Python 包 |\n| [Diar-az](https:\u002F\u002Fgithub.com\u002Fcadia-lvl\u002Fdiar-az) | Python | 数据格式化工具，用于支持 ruv-di 数据集。Kaldi 到 Gecko 再回到 Kaldi 和语料库 |  |\n\n## 数据集\n\n### 说话人分离数据集\n\n| 音频 | 说话人分离真值 | 语言 | 价格 | 其他信息 |\n| ----- | ------------------------ | -------- | ------- | ---------------------- |\n| [2000 年 NIST 说话人识别评估数据](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2001S97) | [光盘 6（Switchboard）](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Ftree\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FNIST_SRE2000\u002FDisk6_ground_truth), [光盘 8（CALLHOME）](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Ftree\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FNIST_SRE2000\u002FDisk8_ground_truth) | 多种 | $2400.00 | [评估计划](https:\u002F\u002Fwww.nist.gov\u002Fsites\u002Fdefault\u002Ffiles\u002Fdocuments\u002F2017\u002F09\u002F26\u002Fspk-2000-plan-v1.0.htm_.pdf) |\n| [2003 年 NIST 丰富转录评估数据](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2007S10) | 连同音频一起 | 英语、阿拉伯语、中文 | $2000.00 | 电话通话、广播新闻 |\n| [CALLHOME 美式英语语音](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC97S42) | [CALLHOME 美式英语转录本](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC97T14) | 英语 | $1500.00 + $1000.00 | [CH109 白名单](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fspeaker-id\u002Fblob\u002Fmaster\u002Fpublications\u002FLstmDiarization\u002Fevaluation\u002FCALLHOME_American_English\u002Fch109_whitelist.txt) |\n| [ICSI 会议语料库](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Ficsi\u002F) | 连同音频一起 | 英语 | 免费 | [许可证](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Ficsi\u002Flicense.shtml) |\n| [AMI 会议语料库](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Fcorpus\u002F) | 连同音频一起（需处理） | 多种 | 免费 | [许可证](http:\u002F\u002Fgroups.inf.ed.ac.uk\u002Fami\u002Fcorpus\u002Flicense.shtml) |\n| [Fisher 英语训练语音第一部分](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2004S13) | [Fisher 
英语训练语音第一部分转录本](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2004T19) | 英语 | $7000.00 + $1000.00 |\n| [Fisher 英语训练第二部分，语音](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2005S13) | [Fisher 英语训练第二部分，转录本](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC2005T19) | 英语 | $7000.00 + $1000.00 |\n| [VoxConverse](https:\u002F\u002Fgithub.com\u002Fjoonson\u002Fvoxconverse) | 待定 | 待定 | 免费 | VoxConverse 是一个视听说话人分离数据集，包含超过 50 小时的人类多说话者视频片段，这些片段来自 YouTube 视频 |\n| [MiniVox 基准测试](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) | [MiniVox 基准测试](https:\u002F\u002Fgithub.com\u002Fdoerlbh\u002FMiniVox) | 英语 | 免费 | MiniVox 是一个自动化框架，可将任何带有说话人标签的数据集转换为连续的语音数据流，并以分段方式提供标签反馈。 |\n| [AliMeeting 语料库](https:\u002F\u002Fgithub.com\u002Fyufan-aslp\u002FAliMeeting) | 连同音频一起 | 中文 | 免费 |  |\n\n### 说话人嵌入训练数据集\n\n| 名称 | 发话次数 | 说话人数量 | 语言 | 价格 | 备注 |\n| ---- | ---------- | -------- | -------- | ------- | ---------------------- |\n| [TIMIT](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002FLDC93S1) | 6K+ | 630 | 英语 | $250.00 | TIMIT语料库于1993年发布，是一份较早的阅读型语音数据集，广泛用于说话人识别研究。 |\n| [VCTK](https:\u002F\u002Fhomepages.inf.ed.ac.uk\u002Fjyamagis\u002Fpage3\u002Fpage58\u002Fpage58.html) | 43K+ | 109 | 英语 | 免费 | 数据主要来自报纸文章、Rainbow Passage以及一段用于识别口音的诱导段落。 |\n| [LibriSpeech](http:\u002F\u002Fwww.openslr.org\u002F12) | 292K | 2K+ | 英语 | 免费 | 大规模（1000小时）的英语阅读语音语料库。 |\n| [多语言 LibriSpeech (MLS)](http:\u002F\u002Fopenslr.org\u002F94\u002F) | ? | ? 
| 英语、德语、荷兰语、西班牙语、法语、意大利语、葡萄牙语、波兰语 | 免费 | 多语言 LibriSpeech 数据集是一个适合语音研究的大规模多语言语料库。该数据集来源于 LibriVox 的有声读物，包含8种语言：英语、德语、荷兰语、西班牙语、法语、意大利语、葡萄牙语和波兰语。 |\n| [LibriVox](https:\u002F\u002Flibrivox.org\u002F) | 180K | 9K+ | 多种语言 | 免费 | 免费的公共领域有声读物。LibriSpeech 是 LibriVox 的一个处理后的子集，原始未分割的发话可能非常长。 |\n| [VoxCeleb 1&2](http:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fdata\u002Fvoxceleb\u002F) | 1M+ | 7K | 多种语言 | 免费 | VoxCeleb 是一个视听数据集，由上传至 YouTube 的访谈视频中截取的短片段组成。 |\n| [Spoken Wikipedia 语料库](https:\u002F\u002Fnats.gitlab.io\u002Fswc\u002F) | 5K | 879 | 英语、德语、荷兰语 | 免费 | 志愿者朗读的维基百科文章。 |\n| [CN-Celeb](http:\u002F\u002Fwww.openslr.org\u002F82\u002F) | 130K+ | 1K | 中文 | 免费 | 清华大学 CSLT 发布的免费中文说话人识别语料库。 |\n| [BookTubeSpeech](https:\u002F\u002Fusers.wpi.edu\u002F~jrwhitehill\u002FBookTubeSpeech\u002Findex.html) | 8K | 8K | 英语 | 免费 | 从 YouTube 上的 BookTube 视频中提取的音频样本——这些视频是人们分享对书籍看法的内容。该数据集可通过 [BookTubeSpeech-download](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FBookTubeSpeech-download) 下载。 |\n| [DeepMine](http:\u002F\u002Fdata.deepmine.ir\u002Fen\u002Findex.html) | 540K | 1850 | 波斯语、英语 | 未知 | 一份波斯语和英语的语音数据库，旨在构建和评估说话人验证系统以及波斯语 ASR 系统。 |\n| [NISP-数据集](https:\u002F\u002Fgithub.com\u002Fiiscleap\u002FNISP-Dataset) | ? 
| 345 | 印地语、卡纳达语、马拉雅拉姆语、泰米尔语、泰卢固语（均为印度语言） | 免费 | 该数据集包含语音录音，以及说话人的身体参数（身高、体重等）、地域信息和语言学信息。 |\n| [VoxBlink2](https:\u002F\u002Fvoxblink2.github.io\u002F) | 10M | 100K+ | 18种语言（英语、葡萄牙语、西班牙语、俄语、阿拉伯语等） | CC BY-NC-SA 4.0 | 来自 [VoxBlink2: 一个拥有10万以上说话人的说话人识别语料库及开放集说话人识别基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.11510) 的多语言数据集。\n\n### 增强用噪声源\n\n| 名称 | 发话次数 | 价格 | 备注 |\n| ---- | ---------- | ------- | ---------------------- |\n| [AudioSet](https:\u002F\u002Fresearch.google.com\u002Faudioset\u002F) | 2M | 免费 | 一个大规模的手动标注音频事件数据集。 |\n| [MUSAN](https:\u002F\u002Fwww.openslr.org\u002F17\u002F) | 不适用 | 免费 | MUSAN 是音乐、语音和噪声录音的语料库。 |\n\n## 会议\n\n| 会议\u002F研讨会 | 频率 | 页数限制 | 主办单位 | 盲审 |\n| ------------------- | --------- | ----------  | ------------ | ------------ |\n| ICASSP              | 每年    | 4 + 1（参考文献） | IEEE         | 否           |\n| InterSpeech         | 每年    | 4 + 1（参考文献） | ISCA         | 否           |\n| Speaker Odyssey     | 每两年  | 8 + 2（参考文献） | ISCA         | 否           |\n| SLT                 | 每两年  | 6 + 2（参考文献） | IEEE         | 是          |\n| ASRU                | 每两年  | 6 + 2（参考文献） | IEEE         | 是          |\n| WASPAA              | 每两年  | 4 + 1（参考文献） | IEEE         | 否           |\n| IJCB                | 每年    | 8       | IEEE & IAPR TC-4 | 是          |\n\n## 其他学习资料\n\n### 在线课程\n\n* Udemy 上的课程：[说话人日志化教程](https:\u002F\u002Fwww.udemy.com\u002Fcourse\u002Fdiarization\u002F?referralCode=21D7CC0AEABB7FE3680F)\n\n### 书籍\n\n* [声音身份技术：从核心算法到工程实践（中文）](https:\u002F\u002Fgithub.com\u002Fwq2012\u002FVoiceIdentityBook) 王权著，2020年出版\n\n### 技术博客\n\n* [说话人变化检测文献综述](https:\u002F\u002Fhedonistrh.github.io\u002F2018-07-09-Literature-Review-for-Speaker-Change-Detection\u002F) 由 [Halil Erdoğan](https:\u002F\u002Fgithub.com\u002Fhedonistrh) 撰写\n* [说话人日志化：分离音频文件中的多个说话人](https:\u002F\u002Fmedium.com\u002Fdatadriveninvestor\u002Fspeaker-diarization-22121f1264b1) 由 [Jaspreet Singh](https:\u002F\u002Fmedium.com\u002F@jaspreetuseducation) 撰写\n* [使用 Kaldi 
进行说话人日志化](https:\u002F\u002Ftowardsdatascience.com\u002Fspeaker-diarization-with-kaldi-e30301b05cc8) 由 [Yoav Ramon](https:\u002F\u002Ftowardsdatascience.com\u002F@yoavramon) 撰写\n* [谁在什么时候说话！如何从零开始构建自己的说话人日志化模块](https:\u002F\u002Fmedium.com\u002Fsaarthi-ai\u002Fwho-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279) 由 Rahul Saxena 撰写\n\n### 视频教程\n\n* [pyannote audio：用于说话人日志化的神经网络构建模块](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=37R_R82lfwA) 由 Hervé Bredin 演讲\n* [谷歌的日志化系统：基于 LSTM 的说话人日志化](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=pjxGPZQeeO4) 由谷歌讲解\n* [完全监督的说话人日志化：告别聚类](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=pGkqwRPzx9U) 由谷歌演示\n* [Turn-to-Diarize：基于 Transformer Transducer 说话人轮次检测约束的在线说话人日志化](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=U79Aw1ky7ag) 由谷歌展示\n* [说话人日志化：最优聚类与说话人嵌入学习](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vcyB8xb1-ys) 由微软研究院讲解\n* [面向会议的鲁棒说话人日志化：ICSI 系统](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kEcUcfLmIS0) 由微软研究院介绍\n* [【机器之心&博文视点】入门声纹技术｜第二讲：声纹分割聚类与其他应用](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=HE9JW8yKYRk) 由王权主讲\n\n## 产品\n\n| 公司       | 产品                                                                                                                                      |\n| ---------- |----------------------------------------------------------------------------------------------------------------------------------------------|\n| Google     | [录音机应用](https:\u002F\u002Fsupport.google.com\u002Fpixelphone?p=recorder_speaker_labels)                                                              |\n| Google     | [Google Cloud 语音转文本 API](https:\u002F\u002Fcloud.google.com\u002Fspeech-to-text\u002Fdocs\u002Fmultiple-voices)                                              |\n| Amazon     | [Amazon Transcribe](https:\u002F\u002Faws.amazon.com\u002Ftranscribe)                                                                                       |\n| IBM        | [Watson 语音转文本 
API](https:\u002F\u002Fwww.ibm.com\u002Fwatson\u002Fservices\u002Fspeech-to-text)                                                              |\n| DeepAffects | [说话人日志化 API](https:\u002F\u002Fwww.deepaffects.com\u002Fdiarization-api)                                                                       |\n| 阿里巴巴   | [听悟](https:\u002F\u002Ftingwu.aliyuncs.com\u002Ftrans)                                                                                             |\n| 微软       | [Azure 对话转录 API](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fazure\u002Fcognitive-services\u002Fspeech-service\u002Fconversation-transcription) |\n\n## 星标历史\n\n[![星标历史图表](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fwq2012_awesome-diarization_readme_e43ec8f5d3e1.png)](https:\u002F\u002Fstar-history.com\u002F#wq2012\u002Fawesome-diarization&Date)","# awesome-diarization 快速上手指南\n\n**注意**：`awesome-diarization` 并非一个可直接安装的软件库或工具包，而是一个**精选资源列表**（Awesome List），汇集了说话人日志（Speaker Diarization）领域的论文、开源框架、数据集和学习资料。\n\n要开始使用相关技术，您需要从该列表推荐的 **Software -> Framework** 类别中选择具体的开源项目（如 `pyannote-audio`、`NVIDIA NeMo` 或 `Kaldi`）进行安装和使用。以下指南以目前社区最流行、易于上手的 **pyannote-audio** 为例，展示如何基于此资源列表开启说话人日志任务。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**：Linux (推荐 Ubuntu 20.04+) 或 macOS。Windows 用户建议使用 WSL2。\n*   **Python 版本**：Python 3.8 - 3.10（具体版本需参照所选框架的要求，pyannote 通常推荐 3.9+）。\n*   **硬件依赖**：\n    *   **GPU**：强烈建议配备 NVIDIA GPU 并安装 CUDA 驱动，以加速深度学习模型的推理和训练。\n    *   **CPU**：若无 GPU，仅能用于小规模测试，速度较慢。\n*   **前置依赖**：\n    *   `pip` 包管理工具\n    *   `git` 版本控制工具\n    *   `ffmpeg` (用于音频处理)\n\n**安装 ffmpeg (Ubuntu\u002FDebian):**\n```bash\nsudo apt update\nsudo apt install ffmpeg\n```\n\n## 安装步骤\n\n由于 `awesome-diarization` 是资源列表，我们在此安装其推荐的代表性框架 `pyannote-audio`。\n\n1.  **创建虚拟环境**（推荐）：\n    ```bash\n    python -m venv diarization-env\n    source diarization-env\u002Fbin\u002Factivate\n    ```\n\n2.  
**安装 pyannote-audio**：\n    直接使用 pip 安装最新版本。\n    ```bash\n    pip install pyannote-audio\n    ```\n\n    *注：若国内下载速度慢，可使用清华源加速：*\n    ```bash\n    pip install pyannote-audio -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n    ```\n\n3.  **获取模型权限（重要）**：\n    `pyannote-audio` 的预训练模型托管在 Hugging Face 上，需要接受用户协议并获取 Token。\n    *   访问 [pyannote 官方模型页面](https:\u002F\u002Fhuggingface.co\u002Fpyannote\u002Fspeaker-diarization-3.1) 和 [分割模型页面](https:\u002F\u002Fhuggingface.co\u002Fpyannote\u002Fsegmentation-3.0)。\n    *   登录 Hugging Face 账号，点击 \"Agree\" 接受协议。\n    *   在 [Access Tokens 设置页](https:\u002F\u002Fhuggingface.co\u002Fsettings\u002Ftokens) 创建一个新的 Read 权限 Token。\n    *   将 Token 保存到本地环境变量或 `.env` 文件中。\n\n## 基本使用\n\n以下是一个最简单的 Python 脚本示例，展示如何使用安装好的框架对音频文件进行说话人分离。\n\n1.  **准备音频文件**：确保你有一个名为 `input_audio.wav` 的音频文件。\n\n2.  **编写代码**：创建 `diarize.py` 文件。\n\n    ```python\n    from pyannote.audio import Pipeline\n    from huggingface_hub import login\n\n    # 1. 登录 Hugging Face (替换为你的实际 Token)\n    # 建议将 token 放入环境变量 HF_TOKEN 中，避免硬编码\n    login(token=\"YOUR_HUGGING_FACE_TOKEN\")\n\n    # 2. 加载预训练的说话人日志流水线\n    # 使用最新的 3.1 版本模型\n    pipeline = Pipeline.from_pretrained(\"pyannote\u002Fspeaker-diarization-3.1\")\n\n    # pipeline 默认在 CPU 上运行；如需使用 GPU，请确保已安装 CUDA 版 torch，然后 import torch 并取消下一行的注释\n    # pipeline.to(torch.device(\"cuda\")) \n\n    # 3. 执行说话人日志\n    audio_file = \"input_audio.wav\"\n    diarization = pipeline(audio_file)\n\n    # 4. 输出结果\n    print(f\"{'START':\u003C10} | {'END':\u003C10} | {'SPEAKER'}\")\n    for turn, _, speaker in diarization.itertracks(yield_label=True):\n        print(f\"{turn.start:\u003C10.2f} | {turn.end:\u003C10.2f} | {speaker}\")\n    ```\n\n3.  
**运行脚本**：\n    ```bash\n    python diarize.py\n    ```\n\n**输出示例**：\n```text\nSTART      | END        | SPEAKER\n0.00       | 3.50       | SPEAKER_00\n3.50       | 8.20       | SPEAKER_01\n8.20       | 12.00      | SPEAKER_00\n```\n\n**下一步建议**：\n回到 `awesome-diarization` 仓库的 **Software** 和 **Publications** 章节，探索更多针对特定场景（如在线说话人日志、重叠语音处理、多模态说话人日志）的先进框架和最新论文。","某智能会议助手团队正在开发自动会议纪要功能，需要从长达数小时的多方通话录音中精准区分不同发言者并生成带说话人标签的文本。\n\n### 没有 awesome-diarization 时\n- **资源搜集低效**：开发人员需花费数周在各大论文库和 GitHub 中盲目搜索，难以辨别哪些说话人分离（Diarization）算法最适合当前业务场景。\n- **技术选型困难**：面对重叠说话、噪声干扰等复杂情况，缺乏权威的评测框架和对比数据，导致模型选择全靠“试错”，极易踩坑。\n- **数据准备繁琐**：找不到高质量的专业数据集和噪声增强源，训练数据匮乏，导致模型在真实会议场景下的泛化能力极差。\n- **前沿技术脱节**：难以及时获取结合大语言模型（LLM）进行后处理纠错的最新研究成果，产品智能化程度停滞不前。\n\n### 使用 awesome-diarization 后\n- **一站式资源导航**：团队直接利用其分类清晰的清单，快速锁定了适合多方会议的 SOTA（最先进）框架和专用数据集，研发启动时间缩短 80%。\n- **科学评估决策**：参考列表中提供的评测工具和聚类算法对比，迅速确定了抗重叠说话能力最强的方案，避免了无效的模型训练。\n- **数据增强便捷**：直接复用推荐的噪声源和数据增强工具，显著提升了模型在嘈杂环境下的鲁棒性，准确率大幅提升。\n- **紧跟技术潮流**：通过收录的 LLM 相关最新论文，团队成功引入了基于上下文的说话人纠错机制，使会议纪要的可读性达到商用标准。\n\nawesome-diarization 通过系统化整理全球顶尖资源，将原本分散且高门槛的技术探索过程转化为高效、可落地的工程实践，极大加速了语音智能产品的迭代周期。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fwq2012_awesome-diarization_e068fde7.png","wq2012","Quan Wang","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fwq2012_08d25f5f.png","Senior Staff Software Engineer & Tech Lead Manager @ Google DeepMind • Textbook Author • Instructor @ Udemy •  IEEE Senior Member","Google","New York","quanw@google.com",null,"https:\u002F\u002Fwangquan.me\u002F","https:\u002F\u002Fgithub.com\u002Fwq2012",1858,239,"2026-04-04T06:44:23","Apache-2.0",5,"","未说明",{"notes":94,"python":92,"dependencies":95},"该仓库（awesome-diarization）是一个 curated list（精选列表），主要收集了说话人日志领域的论文、软件框架、数据集和学习资源链接，其本身不是一个可直接运行的软件工具或代码库。因此，README 中未包含具体的操作系统、硬件配置、Python 版本或依赖库等运行环境需求。用户需根据列表中引用的具体软件项目（如 pyannote.audio, NVIDIA NeMo 
等）查阅其各自的文档以获取运行要求。",[],[13,63],[98,99,100,101,102,103,104],"speaker-diarization","awesome","awesome-list","machine-learning","speech-recognition","speech-processing","deep-learning","2026-03-27T02:49:30.150509","2026-04-06T14:03:48.945673",[108,113,118,123],{"id":109,"question_zh":110,"answer_zh":111,"source_url":112},19090,"训练中文语音说话人日志（Speaker Diarization）模型需要多少数据？","具体数据量取决于多个因素，包括说话人嵌入模型的选择、数据类型（领域、声学条件等）、是否使用预训练模型，以及训练数据与目标领域的一致性。但在一般工业应用中，通常认为拥有 10,000 名（10K）训练说话人的数据是一个基准线。","https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fissues\u002F18",{"id":114,"question_zh":115,"answer_zh":116,"source_url":117},19089,"如何向该仓库贡献代码或技术博客文章？应该放在哪个章节？","任何代码资源应提交到 \"Software\"（软件）部分，您可以根据代码的具体用途选择相应的子类别。任何在线文章或技术博客应提交到 \"Tech blogs\"（技术博客）部分。您可以在提交 Pull Request (PR) 时说明您建议的位置，或者询问维护者是否有更合适的归类。","https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fissues\u002F36",{"id":119,"question_zh":120,"answer_zh":121,"source_url":122},19091,"像 Nvidia 的 Multi-scale Speaker Diarization 或 TitaNet 这样的重要论文，可以添加到列表中吗？","可以。维护者确认这些是非常优秀的工作。如果您发现类似的高质量论文（例如在 Papers with Code 排行榜上找到的），欢迎直接通过 Pull Request (PR) 将它们添加到论文列表中。","https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fissues\u002F30",{"id":124,"question_zh":125,"answer_zh":126,"source_url":127},19092,"为什么无法访问提到的排行榜（Leaderboard）或 stateoftheart.ai 网站？","外部网站（如 https:\u002F\u002Fwww.stateoftheart.ai\u002F）如果出现宕机或无法访问的情况，仓库维护者通常无法直接修复或干预。这属于第三方服务的问题，用户只能等待对方恢复服务。","https:\u002F\u002Fgithub.com\u002Fwq2012\u002Fawesome-diarization\u002Fissues\u002F3",[]]