[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-coqui-ai--TTS-papers":3,"tool-coqui-ai--TTS-papers":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",157379,2,"2026-04-15T23:32:42",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 
人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":76,"owner_twitter":76,"owner_website":76,"owner_url":77,"languages":76,"stars":78,"forks":79,"last_commit_at":80,"license":81,"difficulty_score":82,"env_os":83,"env_gpu":84,"env_ram":84,"env_deps":85,"category_tags":88,"github_topics":91,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":97,"updated_at":98,"faqs":99,"releases":100},7907,"coqui-ai\u002FTTS-papers","TTS-papers","🐸 collection of TTS papers","TTS-papers 是一个专注于语音合成（TTS）领域的开源论文合集，旨在为研究者和开发者提供一份系统化的学术资源导航。面对 TTS 技术快速迭代、相关文献浩如烟海的现状，它有效解决了科研人员难以高效追踪前沿算法、对比不同模型优劣的痛点。\n\n该资源特别适合从事语音技术研究的学者、AI 工程师以及希望深入理解 TTS 原理的高级开发者使用。与普通工具不同，TTS-papers 不仅罗列了从经典的 Tacotron 系列到 FastSpeech、FlowTron 等主流模型的论文链接，还包含了部分论文的扩展解读与技术要点分析。其独特亮点在于涵盖了音素对齐、半监督训练、对抗生成网络（GAN）在 TTS 中的应用以及歌唱合成等细分方向，并简要指出了各模型的核心机制（如 FlowTron 的逆自回归流架构）与性能特征。通过这份清单，用户可以快速定位关键文献，洞察技术演进脉络，从而加速实验复现或新模型的研发进程。","(Feel free to suggest changes)\n# Papers\n- Merging Phoneme and Char representations: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.07240.pdf\n- Tacotron Transfer Learning : https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.06508.pdf\n- Phoneme Timing From Attention: https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?tp=&arnumber=8683827\n- Semi-Supervised Training to Improve Data Efficiency in End-to-End Speech Synthesis - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.10128.pdf\n- Listening while Speaking: Speech Chain by Deep Learning - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.04879.pdf\n- Generelized End-to-End Loss for Speaker Verification: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10467.pdf\n- Es-Tacotron2: Multi-Task Tacotron 2 with Pre-Trained Estimated Network for Reducing the Over-Smoothness Problem: https:\u002F\u002Fwww.mdpi.com\u002F2078-2489\u002F10\u002F4\u002F131\u002Fpdf\n\t- Against Over-Smoothness\n- FastSpeech: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.09263.pdf\n- Learning Singing From Speech: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.10128.pdf\n- TTS-GAN: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.11646.pdf\n    - they use duration and linguistic features for en2en TTS.\n    - Close to WaveNet performance.\n- DurIAN: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.01700.pdf\n    - Duration aware Tacotron\n- MelNet: 
https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01083\n- AlignTTS: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.01950.pdf\n- Unsupervised Speech Decomposition via Triple Information Bottleneck\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11284.pdf\n    - https:\u002F\u002Fanonymous0818.github.io\u002F\n- FlowTron: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.05957.pdf\n    - Inverse Autoregressive Flow on a Tacotron-like architecture\n    - WaveGlow as vocoder.\n    - Speech style embedding with Mixture of Gaussian model.\n    - Model is large and heavier than vanilla Tacotron.\n    - MOS values are slightly better than the public Tacotron implementation.\n- Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.08969.pdf\n\n### Expansive Summaries\n\u003Cdetails>\n\u003Csummary> End-to-End Adversarial Text-to-Speech: http:\u002F\u002Farxiv.org\u002Fabs\u002F2006.03575 (Click to Expand)\u003C\u002Fsummary>\n\n  - End-to-end feed-forward TTS learning.\n  - Character alignment is done with a separate aligner module.\n  - The aligner predicts the length of each character.\n\t\t- The center location of a char is found wrt the total length of the previous characters.\n\t\t- Char positions are interpolated with a Gaussian window wrt the real audio length.\n\t- Audio output is computed in the mu-law domain. (I don't have a reasoning for this)\n\t- Use only 2-sec audio windows for training.\n\t- The GAN-TTS generator is used to produce the audio signal.\n\t- RWD is used as an audio-level discriminator.\n\t- MelD: They use the BigGAN-deep architecture as a spectrogram-level discriminator, regarding the problem as image reconstruction.\n\t- Spectrogram loss\n\t\t- Using only adversarial feedback is not enough to learn the char alignments. They use a spectrogram loss b\u002Fw predicted spectrograms and ground-truth specs.\n\t\t- Note that the model predicts audio signals. Spectrograms above are computed from the generated audio.\n\t\t- Dynamic Time Warping is used to compute a minimal-cost alignment b\u002Fw generated spectrograms and ground-truth.\n\t\t- It involves a dynamic programming approach to find a minimal-cost alignment.\n\t- An aligner length loss is used to penalize the aligner for predicting a length different from the real audio length.\n\t- They train the model with a multi-speaker dataset but report results on the best-performing speaker.\n\t- Ablation study importance of each component: (LengthLoss and SpectrogramLoss) > RWD > MelD > Phonemes > MultiSpeakerDataset.\n\t- My 2 cents: It is a feed-forward model which provides end-to-end speech synthesis with no need to train a separate vocoder model. However, it is a very complicated model with a lot of hyperparameters and implementation details. Also, the final result is not close to the state of the art. 
I think we need to find specific algorithms for learning character alignments, which would reduce the need to tune a combination of different algorithms.\n\t  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_123a53ffee63.png\" width=\"50%\">\n\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> FastSpeech 2: http:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04558 (Click to Expand)\u003C\u002Fsummary>\n\n  - Use phoneme durations generated by [MFA](https:\u002F\u002Fmontreal-forced-aligner.readthedocs.io\u002Fen\u002Flatest\u002Fintroduction.html) as labels to train a length regulator.\n  - They use frame-level F0 and L2 spectrogram norms (Variance Information) as additional features.\n  - A variance predictor module predicts the variance information at inference time.\n  - Ablation study result improvements: model \u003C model + L2_norm \u003C model + L2_norm + F0\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_c41d05fa0325.png)\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Glow-TTS: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11129.pdf (Click to Expand)\u003C\u002Fsummary>\n\n  - Uses Monotonic Alignment Search to learn the alignment b\u002Fw text and spectrogram.\n  - This alignment is used to train a Duration Predictor to be used at inference.\n  - The Encoder maps each character to a Gaussian Distribution.\n  - The Decoder maps each spectrogram frame to a latent vector using Normalizing Flow (Glow Layers).\n  - Encoder and Decoder outputs are aligned with MAS.\n  - At each iteration, first the most probable alignment is found by MAS, and this alignment is used to update model parameters.\n  - A duration predictor is trained to predict the number of spectrogram frames for each character.\n  - At inference, only the duration predictor is used instead of MAS.\n  - The Encoder has the architecture of the TTS transformer with 2 updates.\n  - Instead of absolute positional encoding, they use relative positional encoding.\n  - They also use a residual connection for the Encoder Prenet.\n  - The Decoder has the same architecture as the Glow model.\n  - They train both single- and multi-speaker models.\n  - It is shown experimentally that Glow-TTS is more robust against long sentences compared to the original Tacotron2.\n  - 15x faster than Tacotron2 at inference.\n  - My 2 cents: Their samples do not sound as natural as Tacotron's. I believe normal attention models still generate more natural speech since the attention learns to map characters to model outputs directly. 
However, using Glow-TTS might be a good alternative for hard datasets.\n  - Samples: https:\u002F\u002Fgithub.com\u002Fjaywalnut310\u002Fglow-tts\n  - Repository: https:\u002F\u002Fgithub.com\u002Fjaywalnut310\u002Fglow-tts\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_74d519931a4b.png)\n\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Non-Autoregressive Neural Text-to-Speech: http:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08459 (Click to Expand)\u003C\u002Fsummary>\n\n   - A derivation of Deep Voice 3 model using non-causal convolutional layers.\n   - Teacher-Student paradigm to train annon-autoregressive student with multiple attention blocks from an autoregressive teacher model.\n   - The teacher is used to generate text-to-spectrogram alignments to be used by the student model.\n   - The model is trained with two loss functions for attention alignment and spectrogram generation.\n   - Multi attention blocks refine the attention alignment layer by layer.\n   - The student uses dot-product attention with query, key and value vectors. The query is only positinal encoding vectors. The key and the value are the encoder outputs.\n   - Proposed model is heavily tied to the positional encoding which also relies on different constant values.\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_fe385f1cbbe1.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Double Decoder Consistency: https:\u002F\u002Ferogol.com\u002Fsolving-attention-problems-of-tts-models-with-double-decoder-consistency (Click to Expand)\u003C\u002Fsummary>\n\n   - The model uses a Tacotron like architecture but with 2 decoders and a postnet.\n   - DDC uses two synchronous decoders using different reduction rates.\n   - The decoders use different reduction rates thus they compute outputs in different granularities and learn different aspects of the input data.\n   - The model uses the consistency between these two decoders to increase robustness of learned text-to-spectrogram alignment.\n   - The model also applies a refinement to the final decoder output by applying the postnet iteratively multiple times.\n   - DDC uses Batch Normalization in the prenet module and drops Dropout layers.\n   - DDC uses gradual training to reduce the total training time.\n   - We use a Multi-Band Melgan Generator as a vocoder trained with Multiple Random Window Discriminators differently than the original work.\n   - We are able to train a DDC model only in 2 days with a single GPU and the final model is able to generate faster than real-time speech on a CPU.\n  Demo page: https:\u002F\u002Ferogol.github.io\u002Fddc-samples\u002F\n  Code: https:\u002F\u002Fgithub.com\u002Fmozilla\u002FTTS\n  ![image](https:\u002F\u002Ferogol.com\u002Fwp-content\u002Fuploads\u002F2020\u002F06\u002FDDC_overview-1536x1220.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Parallel Tacotron2: http:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14574 (Click to Expand)\u003C\u002Fsummary>\n\n   - Does not require external duration information.\n   - Solves the alignment issues between the real and ground-truth spectrograms by Soft-DTW loss.\n   - Predicted durations are converted to alignment by a learned conversion function, rather than a Length Regulator, to solve rounding issues.\n   - Learns an attention map over \"Token Boundary Grids\" which are computed from predicted durations.\n   - Decoder is built on 6 \"light-weight Convolutions\" blocks.\n   - A 
VAE is used to project input spectrograms to latent features and merged with the characterr embeddings as an input to the network.\n   - Soft-DTW is computationally intensive since it computes pairwise difference for all the spectrogram frames. They contrain it with a certain diagonal window to reduce the overhead.\n   - The final duration objective is the sum of Duration Loss, VAE loss and Spectrogram Loss.\n   - They only use proprietary datasets for the experiments 😦.\n   - Achieves the same MOS with the Tacotron2 model and outperforms ParallelTacotron.\n   - **Demo page**: https:\u002F\u002Fgoogle.github.io\u002Ftacotron\u002Fpublications\u002Fparallel_tacotron_2\u002Findex.html\n   - **Code**: No code so far\n  \u003Cimg src=\"https:\u002F\u002Fuser-images.githubusercontent.com\u002F1402048\u002F113508025-017eb180-954e-11eb-8cc5-c7dc87945bac.png\" data-canonical-src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_eb5c5741b6a9.png\"  height=\"800\"\u002F>\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> WaveGrad2: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.09660.pdf (Click to Expand)\u003C\u002Fsummary>\n\n- It computes the raw waveform directly from a phoneme sequence.\n- A Tacotron2 like encoder model is used to compute a hidden representation from phonemes.\n- Non-Attentive Tacotron like soft duration predictor to align the hidden represenatation with the output.\n- They expand the hidden representation with the predicted durations and sample a certain window to convert to a waveform.\n- They explored different window sizes netween 64 and 256 frames corresponding to 0.8 and 3.2 secs of speech. They found that the larger is the better.\n- **Demo page**: Nothing so far\n- **Code**: No code so far\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_03b699fd24a9.png\"  height=\"450\"\u002F>\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_40519ffebf46.png\"  height=\"450\"\u002F>\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n## Multi-Speaker Papers\n- Training Multi-Speaker Neural Text-to-Speech Systems using Speaker-Imbalanced Speech Corpora - https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00771\n- Deep Voice 2 - https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F6889-deep-voice-2-multi-speaker-neural-text-to-speech.pdf\n- Sample Efficient Adaptive TTS - https:\u002F\u002Fopenreview.net\u002Fpdf?id=rkzjUoAcFX\n\t- WaveNet + Speaker Embedding approach\n- Voice Loop - https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06588\n- MODELING MULTI-SPEAKER LATENT SPACE TO IMPROVE NEURAL TTS QUICK ENROLLING NEW SPEAKER AND ENHANCING PREMIUM VOICE - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05253.pdf\n- Transfer learning from speaker verification to multispeaker text-to-speech synthesis - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.04558.pdf\n- Fitting new speakers based on a short untranscribed sample - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.06984.pdf\n- Generalized end-to-end loss for speaker verification - https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.10467\n\n### Expansive Summaries\n\u003Cdetails>\n\u003Csummary> Semi-supervised Learning for Multi-speaker Text-to-speech Synthesis Using Discrete Speech Representation: http:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08024 \u003C\u002Fsummary>\n\n   - Train a multi-speaker TTS model with only an hour long paired data (text-to-voice alignment) 
and more unpaired (only voide) data.\n   - It learns a code book with each code word corresponds to a single phoneme.\n   - The code-book is aligned to phonemes using the paired data and CTC algorithm.\n   - This code book functions like a proxy to implicitly estimate the phoneme sequence of the unpaired data.\n   - They stack Tacotron2 model on top to perform TTS using the code word embeddings generated by the initial part of the model.\n   - They beat the benchmark methods in 1hr long paired data setting.\n   - They don't report full paired data results.\n   - They don't have a good ablation study which could be interesting to see how different parts of the model contribute to the performance.\n   - They use Griffin-Lim as a vocoder thus there is space for improvement.\n\n  Demo page: https:\u002F\u002Fttaoretw.github.io\u002Fmultispkr-semi-tts\u002Fdemo.html \u003Cbr>\n  Code: https:\u002F\u002Fgithub.com\u002FttaoREtw\u002Fsemi-tts\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_3a64ecfc12eb.png)\n\u003C\u002Fdetails>\n\u003Cdetails>\n\u003Csummary> Attentron: Few-shot Text-to-Speech Exploiting Attention-based Variable Length Embedding: https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08484 \u003C\u002Fsummary>\n\n   - Use two encoders to learn speaker depended features.\n   - Coarse encoder learns a global speaker embedding vector based on provided reference spectrograms.\n   - Fine encoder learns a variable length embedding keeping the temporal dimention in cooperation with a attention module.\n   - The attention selects important reference spectrogram frames to synthesize target speech.\n   - Pre-train the model with a single speaker dataset first (LJSpeech for 30k iters.)\n   - Fine-tune the model with a multi-speaker dataset. (VCTK for 70k iters.)\n   - It achieves slightly better metrics in comparison to using x-vectors from speaker classification model and VAE based reference audio encoder.\n\n\n  Demo page: https:\u002F\u002Fhyperconnect.github.io\u002FAttentron\u002F \u003Cbr>\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_7eb18e8c28f4.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_cd07cfce5f07.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Towards Universal Text-to-Speech: http:\u002F\u002Fwww.interspeech2020.org\u002Fuploadfile\u002Fpdf\u002FWed-3-4-3.pdf \u003C\u002Fsummary>\n\n   - A framework for a sequence to sequence multi-lingual TTS\n   - The model is trained with a very large, highly unbalanced dataset.\n   - The model is able to learn a new language with 6 minutes and a new speaker with 20 seconds of data after the initial training.\n   - The model architecture is a Transformer based Encoder-Decoder network with a Speaker Network and a Language Network for the speaker and language conditinoning. The outputs of these networks are concatenated to the Encoder output.\n   - The conditioning networks take a one-hot vector representing the speaker or language ID and projects it to a conditioning representation. \n   - They use a WaveNet vocoder for converting predicted Mel-Spectrograms to the waveform output. \n   - They use language depended phonemes inputs that are not shared among languages. \n   - They sample each batch based on the inverse frequency of each language in the dataset. 
Thus each training batch has a uniform distribution over languages, alleviating the language imbalance in the training dataset.\n   - For learning new speakers\u002Flanguages, they fine-tune the Encoder-Decoder model with the conditioning networks. They don’t train the WaveNet model.\n   - They use 1250 hours professional recordings from 50 languages for the training. \n   - They use 16khz sampling rate for all the audio samples and trim silences at the beginning and the end of each clip. \n   - They use 4 V100 GPUs for training but they don’t mention how long they trained the model. \n   - The results show that single speaker models are better than the proposed approach in MOS metric.\n   - Also using conditioning networks is important for the long-tail languages in the dataset as they improve the MOS metric for them but impair the performance for the high-resource languages. \n   - When they add a new speaker, they observe that using more than 5 minutes of data degrades the model performance. They claim that since these recordings are not as clean as the original recordings, using more of them affects the model’s general performance. \n   - The multi-lingual model is able to train with only 6 minutes of data for new speakers and languages whereas a single speaker model requires 3 hours to train and cannot even attain similar MOS values as the 6 minutes multi-lingual model. \n\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_9903781892b1.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_797abca26546.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> AdaSpeech: Adaptive Text to Speech for Custom Voice: https:\u002F\u002Fopenreview.net\u002Fpdf?id=Drynvt7gg4L \u003C\u002Fsummary>\n\n   - They proposed a system that can adapt to different input acoustic properties of users and it uses minimum number of parameters to achieve this. \n   - The main architecture is based on FastSpeech2 model that uses Pitch and Variance predictors to learn the finer granularities of the input speech. \n   - They use 3 additional conditioning networks. \n   - Utterance level. It takes mel-spectrogram of the reference speech as input. \n   - Phoneme level. It takes phoneme level mel-spectrograms as input and computes phoneme-level conditioning vectors. Phoneme-level  mel-spectrograms are computed by taking the average spectrogram frame in the duration of each phoneme. \n   - Phoneme level 2. It takes phoneme encoder outputs as inputs. This differs from the network above by just using the phoneme information without seeing the spectrograms. \n   - All these conditioning networks and the back-bone FastSpeech2 uses Layer Normalisation layers. \n   - Conditional layer normalisation. They propose fine-tuning only the scale and bias parameters of each layer normalisation layer when the model is fine-tuned for a new speaker. They train a speaker conditioning module for each Layer Norm layer that outputs a scale and a bias values. (They use one speaker conditioning module per Transformer block.)\n   - It means that you only store the Speaker Conditioning module for each new speaker and predict the scale and bias values at inference as you keep the rest of the model the same.  
\n   - In the experiments, they pre-train the model on the LibriTTS dataset and fine-tune it with VCTK and LJSpeech.\n   - The results show that using Conditional Layer Normalisation performs better than their 2 baselines, which use only speaker embedding and decoder network fine-tuning. \n   - Their ablation study shows that the most significant part of the model is the “Phoneme level” network, followed by Conditional Layer Normalisation and the “Utterance level” network, in that order.\n   - One important downside of the paper is that there is almost no comparison with the literature, which makes the results harder to assess objectively. \n\n  Demo page: https:\u002F\u002Fspeechresearch.github.io\u002Fadaspeech\u002F \u003Cbr>\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_cc749ae05156.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_e2f6e2cdc7e5.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_daf5d7b223a4.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_2bab4a01d3f1.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_5b846808cb77.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_ef267ee104b5.png)\n\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n## Attention\n- Location-Relative Attention Mechanisms for Robust Long-Form Speech Synthesis - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.10288.pdf\n\n______________________________________________________________________\n\n## Vocoders\n- MelGAN: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.06711.pdf\n- ParallelWaveGAN: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.11480.pdf\n    - Multi-scale STFT loss\n    - ~1M model parameters (very small)\n    - Slightly worse than WaveRNN\n- Improving FFTNet\n    - https:\u002F\u002Fwww.okamotocamera.com\u002Fslt_2018.pdf\n- FFTNet\n    - https:\u002F\u002Fgfx.cs.princeton.edu\u002Fpubs\u002FJin_2018_FAR\u002Fclips\u002Fclips.php\n    - https:\u002F\u002Fgfx.cs.princeton.edu\u002Fpubs\u002FJin_2018_FAR\u002Ffftnet-jin2018.pdf\n- SPEECH WAVEFORM RECONSTRUCTION USING CONVOLUTIONAL NEURAL NETWORKS WITH NOISE AND PERIODIC INPUTS\n    - 150.162.46.34:8080\u002Ficassp2019\u002FICASSP2019\u002Fpdfs\u002F0007045.pdf\n- Towards Achieving Robust Universal Vocoding\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.06292.pdf\n- LPCNet\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.11846.pdf\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.11686.pdf\n- ExciteNet\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.04769v3.pdf\n- GELP: GAN-Excited Linear Prediction for Speech Synthesis from Mel-spectrogram\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.03976v3.pdf\n- High Fidelity Speech Synthesis with Adversarial Networks: https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11646\n    - GAN-TTS, end-to-end speech synthesis\n    - Uses duration and linguistic features\n    - Duration and acoustic features are predicted by additional models.\n    - Random Window Discriminator: Ingests not the whole voice sample but random windows.\n    - Multiple RWDs. Some conditional and some unconditional. 
(conditioned on input features)\n    - Punchline: Use randomly sampled windows with different window sizes for D.\n    - Shared results sound mechanical, which shows the limits of non-neural acoustic features.\n- Multi-Band MelGAN: https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05106\n    - Uses PWGAN losses instead of the feature-matching loss.\n    - Using a larger receptive field boosts model performance significantly.\n    - Generator pretraining for 200k iters.\n    - Multi-band voice signal prediction. The output is the summation of 4 different band predictions with PQMF synthesis filters.\n    - The multi-band model has 1.9M parameters (quite small).\n    - Claimed to be 7x faster than MelGAN.\n    - On a Chinese dataset: MOS 4.22\n - WaveGlow: https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00002\n \t- Very large model (268M parameters)\n\t- Hard to train since on a 12GB GPU it can only take a batch size of 1.\n\t- Real-time inference due to the use of convolutions.\n\t- Based on Invertible Normalizing Flow. (Great tutorial: https:\u002F\u002Fblog.evjang.com\u002F2018\u002F01\u002Fnf1.html)\n\t- The model learns an invertible mapping of audio samples to mel-spectrograms with a Max Likelihood loss.\n\t- At inference the network runs in the reverse direction and the given mel-specs are converted to audio samples.\n\t- Training has been done using 8 Nvidia V100s with 32GB RAM, batch size 24. (Expensive)\n\n - SqueezeWave: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.05685.pdf, code: https:\u002F\u002Fgithub.com\u002Ftianrengao\u002FSqueezeWave\n \t- ~5-13x faster than real-time\n\t- WaveGlow redundancies: long audio samples, upsampled mel-specs, large channel dimensions in the WN function.\n\t- Fixes: more but shorter audio samples as input (L=2000, C=8 vs L=64, C=256).\n\t- L=64 matches the mel-spec resolution, so no upsampling is necessary.\n\t- Uses depth-wise separable convolutions in the WN modules.\n\t- Uses regular convolution instead of dilated since the audio samples are shorter.\n\t- Does not split module outputs into residual and network outputs, assuming these vectors are almost identical.\n\t- Training has been done using a Titan RTX 24GB with batch size 96 for 600k iterations.\n\t- MOS on LJSpeech: WaveGlow - 4.57, SqueezeWave (L=128 C=256) - 4.07 and SqueezeWave (L=64 C=256) - 3.77\n\t- The smallest model generates 21K samples per second on a Raspi3.\n\n\u003Cdetails>\n\u003Csummary>WaveGrad: https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.00713.pdf \u003C\u002Fsummary>\n\n  - It is based on Probability Diffusion and Langevin Dynamics.\n  - The base idea is to learn a function that maps a known distribution to the target data distribution iteratively.\n  - They report a 0.2 real-time factor on a GPU, but CPU performance is not shared.\n  - In the example code linked below, the author reports that the model converges after 2 days of training on a single GPU.\n  - MOS scores in the paper are not comprehensive enough but show comparable performance to known models like WaveRNN and WaveNet.\n\n  Code: https:\u002F\u002Fgithub.com\u002Fivanvovk\u002FWaveGrad\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_b9f725a7c464.png)\n\u003C\u002Fdetails>\n\n\n\n# From the Internet (Blogs, Videos, etc.)\n\n## Videos\n### Paper Discussion\n- Tacotron 2: https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=2iarxxm-v9w\n\n### Talks\n- Talk on Pushing the Frontier of Neural Text-to-Speech, by Xu Tan, 2021, https:\u002F\u002Fyoutu.be\u002FMA8PCvmr8B0\n- Talk on Generative Model-Based Text-to-Speech Synthesis, by Heiga Zen, 
2017\n\t- Video: https:\u002F\u002Fyoutu.be\u002FnsrSrYtKkT8\n\t- Slide: https:\u002F\u002Fresearch.google.com\u002Fpubs\u002Fpub45882.html\n- Tutorials on Neural Parametric Text-to-Speech Synthesis at ISCA Odyessy 2020, by Xin Wang, 2020\n\t- Video: https:\u002F\u002Fyoutu.be\u002FWCe7SYcDzAI\n\t- Slide: http:\u002F\u002Ftonywangx.github.io\u002Fslide.html#dec-2020\n- ISCA Speech Processing Course on Neural vocoders, 2022\n\t- Basic components of neural vocoders: https:\u002F\u002Fyoutu.be\u002FM833q5I-ZYs\n\t- Deep generative models for speech compression (LPCNet): https:\u002F\u002Fyoutu.be\u002F7KsnFx3pLgw\n\t- Neural auto-regressive, source-filter and glottal vocoders: https:\u002F\u002Fyoutu.be\u002FgPrmxdberX0\n\t\t- Slide: http:\u002F\u002Ftonywangx.github.io\u002Fslide.html#jul-2020\n- Speech synthesis from neural decoding of spoken sentences | AISC : https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=MNDtMDPmnMo\n- Generative Text-to-Speech Synthesis : https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=j4mVEAnKiNg\n- SPEECH SYNTHESIS FOR THE GAMING INDUSTRY : https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=aOHAYe4A-2Q\n\n### General\n- Modern Text-to-Speech Systems Review : https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8rXLSc-ZcRY\n\n## Jupyter notebooks\n- Tutorials on Selected Neural Vocoders: https:\u002F\u002Fgithub.com\u002Fnii-yamagishilab\u002Fproject-NN-Pytorch-scripts\u002Ftree\u002Fmaster\u002Ftutorials\u002Fb1_neural_vocoder\n\n## Blogs\n- Text to Speech Deep Learning Architectures : http:\u002F\u002Fwww.erogol.com\u002Ftext-speech-deep-learning-architectures\u002F\n","（欢迎提出修改建议）\n# 论文\n- 合并音素和字符表示：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.07240.pdf\n- Tacotron 迁移学习：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.06508.pdf\n- 从注意力机制中提取音素时序信息：https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?tp=&arnumber=8683827\n- 半监督训练以提高端到端语音合成的数据效率：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.10128.pdf\n- 边说边听：基于深度学习的语音链：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.04879.pdf\n- 面向说话人验证的广义端到端损失函数：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10467.pdf\n- Es-Tacotron2：带有预训练估计网络的多任务Tacotron 2，用于减少过度平滑问题：https:\u002F\u002Fwww.mdpi.com\u002F2078-2489\u002F10\u002F4\u002F131\u002Fpdf\n\t- 针对过度平滑问题\n- FastSpeech：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.09263.pdf\n- 从语音中学习歌唱：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1912.10128.pdf\n- TTS-GAN：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.11646.pdf\n    - 他们使用持续时间和语言学特征进行英-英TTS。\n    - 性能接近WaveNet。\n- DurIAN：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1909.01700.pdf\n    - 持续时间感知的Tacotron\n- MelNet：https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01083\n- AlignTTS：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.01950.pdf\n- 基于三重信息瓶颈的无监督语音分解\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.11284.pdf\n    - https:\u002F\u002Fanonymous0818.github.io\u002F\n- FlowTron：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.05957.pdf\n    - 在类似Tacotron的架构上使用逆自回归流\n    - 使用WaveGlow作为声码器。\n    - 采用高斯混合模型进行语音风格嵌入。\n    - 模型比普通的Tacotron更大更重。\n    - MOS评分略高于公开的Tacotron实现。\n- 基于深度卷积网络和引导式注意力的高效可训练文本转语音系统：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.08969.pdf\n\n### 扩展摘要\n\u003Cdetails>\n\u003Csummary> 端到端对抗性文本转语音：http:\u002F\u002Farxiv.org\u002Fabs\u002F2006.03575（点击展开）\u003C\u002Fsummary>\n\n  - 端到端前馈式TTS学习。\n  - 字符对齐是通过一个单独的对齐模块完成的。\n  - 对齐模块预测每个字符的长度。\n\t\t- 根据前面所有字符的总长度确定当前字符的中心位置。\n\t\t- 根据实际音频长度，用高斯窗口插值计算字符的位置。\n\t- 音频输出是在mu-law域中计算的。（对此我没有明确的理由）\n\t- 训练时仅使用2秒的音频窗口。\n\t- 使用GAN-TTS生成器来产生音频信号。\n\t- RWD被用作音频级别的判别器。\n\t- 
MelD：他们使用BigGAN-deep架构作为频谱级别的判别器，将问题视为图像重建。\n\t- 频谱损失\n\t\t- 仅靠对抗性反馈不足以学习字符对齐。因此，他们使用预测频谱与真实频谱之间的频谱损失。\n\t\t- 需要注意的是，模型预测的是音频信号，上述频谱是从生成的音频中计算出来的。\n\t\t- 使用动态时间规整来计算生成频谱与真实频谱之间的最小代价对齐。\n\t\t- 这涉及一种动态规划方法，以找到最小代价的对齐方式。\n\t- 对齐模块的长度损失用于惩罚对齐模块预测的长度与实际音频长度不符的情况。\n\t- 他们使用多说话人数据集训练模型，但报告结果时只针对表现最好的说话人。\n\t- 各组件的重要性排序：（长度损失和频谱损失） > RWD > MelD > 音素 > 多说话人数据集。\n\t- 我的看法：这是一个前馈模型，可以提供端到端的语音合成，而无需单独训练声码器模型。然而，它是一个非常复杂的模型，有许多超参数和实现细节。此外，最终结果也远未达到最先进水平。我认为我们需要找到专门用于学习字符对齐的算法，这样就可以减少对多种不同算法组合进行调优的需求。\n\t  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_123a53ffee63.png\" width=\"50%\">\n\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Fast Speech2：http:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04558（点击展开）\u003C\u002Fsummary>\n\n  - 使用由[MFA](https:\u002F\u002Fmontreal-forced-aligner.readthedocs.io\u002Fen\u002Flatest\u002Fintroduction.html)生成的音素持续时间作为标签，来训练长度调节器。\n  - 他们还使用帧级F0和L2谱范数（方差信息）作为附加特征。\n  - 方差预测模块在推理时会预测这些方差信息。\n  - 消融实验结果显示改进效果：模型 \u003C 模型 + L2范数 \u003C 模型 + L2范数 + F0\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_c41d05fa0325.png)\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> Glow-TTS：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.11129.pdf（点击展开）\u003C\u002Fsummary>\n\n  - 使用单调对齐搜索来学习文本与频谱之间的对齐关系。\n  - 这种对齐关系用于训练持续时间预测器，以便在推理时使用。\n  - 编码器将每个字符映射到一个高斯分布。\n  - 解码器使用归一化流（Glow层）将每个频谱帧映射到潜在向量。\n  - 编码器和解码器的输出通过MAS进行对齐。\n  - 在每次迭代中，首先通过MAS找到最可能的对齐方式，然后利用该对齐方式更新模型参数。\n  - 训练一个持续时间预测器，用于预测每个字符对应的频谱帧数。\n  - 在推理时，不再使用MAS，而是直接使用持续时间预测器。\n  - 编码器采用了TTS变压器的架构，并进行了两次改进。\n  - 他们没有使用绝对位置编码，而是使用相对位置编码。\n  - 编码器的Prenet部分也加入了残差连接。\n  - 解码器的架构与Glow模型相同。\n  - 他们同时训练单说话人和多说话人模型。\n  - 实验表明，与原始Tacotron2相比，Glow-TTS在处理长句子时更加稳健。\n  - 推理速度比Tacotron2快15倍。\n  - 我的看法：他们的样本听起来不如Tacotron自然。我认为传统的注意力模型仍然能够生成更自然的语音，因为注意力机制可以直接学习将字符映射到模型输出。不过，对于一些困难的数据集来说，Glow-TTS可能是一个不错的替代方案。\n  - 样本：https:\u002F\u002Fgithub.com\u002Fjaywalnut310\u002Fglow-tts\n  - 仓库：https:\u002F\u002Fgithub.com\u002Fjaywalnut310\u002Fglow-tts\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_74d519931a4b.png)\n\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> 非自回归神经网络文本转语音：http:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08459（点击展开）\u003C\u002Fsummary>\n\n- 一种使用非因果卷积层的Deep Voice 3模型变体。\n   - 教师-学生范式，利用来自自回归教师模型的多个注意力块来训练非自回归的学生模型。\n   - 教师模型用于生成文本到频谱图的对齐信息，供学生模型使用。\n   - 模型采用两种损失函数进行训练：一种用于注意力对齐，另一种用于频谱图生成。\n   - 多个注意力块逐层细化注意力对齐。\n   - 学生模型使用点积注意力机制，包含查询、键和值向量。查询仅由位置编码向量构成，而键和值则来自编码器的输出。\n   - 提出的模型与位置编码紧密相关，且依赖于不同的常数值。\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_fe385f1cbbe1.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> 双解码器一致性：https:\u002F\u002Ferogol.com\u002Fsolving-attention-problems-of-tts-models-with-double-decoder-consistency（点击展开）\u003C\u002Fsummary>\n\n   - 该模型采用类似Tacotron的架构，但配备了两个解码器和一个后网络。\n   - DDC使用两个同步的解码器，分别采用不同的下采样率。\n   - 由于解码器的下采样率不同，它们以不同的粒度计算输出，并学习输入数据的不同方面。\n   - 模型利用这两个解码器之间的一致性来提高所学文本到频谱图对齐的鲁棒性。\n   - 模型还通过多次迭代应用后网络对最终解码器的输出进行精炼。\n   - DDC在预网络模块中使用批归一化，并去掉了Dropout层。\n   - DDC采用渐进式训练方法，以缩短总训练时间。\n   - 我们使用多频段MelGAN生成器作为声码器，并采用多个随机窗口判别器进行训练，这与原始工作有所不同。\n   - 我们仅用一台GPU在两天内就完成了DDC模型的训练，最终模型能够在CPU上以超实时速度生成语音。\n  演示页面：https:\u002F\u002Ferogol.github.io\u002Fddc-samples\u002F\n  代码：https:\u002F\u002Fgithub.com\u002Fmozilla\u002FTTS\n  ![image](https:\u002F\u002Ferogol.com\u002Fwp-content\u002Fuploads\u002F2020\u002F06\u002FDDC_overview-1536x1220.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> 
并行Tacotron2：http:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14574（点击展开）\u003C\u002Fsummary>\n\n   - 不需要外部持续时间信息。\n   - 通过Soft-DTW损失解决真实频谱图与地面真值频谱图之间的对齐问题。\n   - 预测的持续时间通过一个学习得到的转换函数而非长度调节器转化为对齐关系，以解决四舍五入问题。\n   - 模型基于从预测持续时间计算出的“标记边界网格”学习注意力图。\n   - 解码器由6个“轻量级卷积”块组成。\n   - 使用VAE将输入频谱图投影到潜在特征，并与字符嵌入合并作为网络的输入。\n   - Soft-DTW计算量较大，因为它需要计算所有频谱帧之间的两两差异。为降低开销，他们限制在一个特定的对角线窗口内进行计算。\n   - 最终的优化目标是持续时间损失、VAE损失和频谱图损失的总和。\n   - 他们仅使用专有数据集进行实验 😦。\n   - 在MOS评分上与Tacotron2模型持平，并优于ParallelTacotron。\n   - **演示页面**：https:\u002F\u002Fgoogle.github.io\u002Ftacotron\u002Fpublications\u002Fparallel_tacotron_2\u002Findex.html\n   - **代码**：目前尚无公开代码\n  \u003Cimg src=\"https:\u002F\u002Fuser-images.githubusercontent.com\u002F1402048\u002F113508025-017eb180-954e-11eb-8cc5-c7dc87945bac.png\" data-canonical-src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_eb5c5741b6a9.png\"  height=\"800\"\u002F>\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> WaveGrad2：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.09660.pdf（点击展开）\u003C\u002Fsummary>\n\n- 它直接从音素序列计算原始波形。\n- 使用类似Tacotron2的编码器模型从音素中计算隐藏表示。\n- 类似无注意力Tacotron的软持续时间预测器，用于将隐藏表示与输出对齐。\n- 他们根据预测的持续时间扩展隐藏表示，并采样一定窗口将其转换为波形。\n- 他们探索了不同窗口大小，范围从64到256帧，对应0.8至3.2秒的语音。结果表明，窗口越大效果越好。\n- **演示页面**：暂无\n- **代码**：暂无\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_03b699fd24a9.png\"  height=\"450\"\u002F>\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_40519ffebf46.png\"  height=\"450\"\u002F>\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n\n\n## 多说话人论文\n- 使用说话人不平衡语音语料库训练多说话人神经文本转语音系统 - https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00771\n- Deep Voice 2 - https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F6889-deep-voice-2-multi-speaker-neural-text-to-speech.pdf\n- 样本高效的自适应TTS - https:\u002F\u002Fopenreview.net\u002Fpdf?id=rkzjUoAcFX\n\t- WaveNet + 说话人嵌入方法\n- Voice Loop - https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06588\n- 建模多说话人潜在空间以改进神经TTS：快速录入新说话人并提升优质语音质量 - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05253.pdf\n- 从说话人验证到多说话人文本转语音合成的迁移学习 - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.04558.pdf\n- 基于短篇未转录样本拟合新说话人 - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.06984.pdf\n- 通用端到端说话人验证损失 - https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.10467\n\n### 扩展摘要\n\u003Cdetails>\n\u003Csummary> 基于离散语音表示的半监督学习用于多说话人文本转语音合成：http:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08024 \u003C\u002Fsummary>\n\n   - 仅使用一小时的配对数据（文本到语音对齐）以及更多的非配对数据（仅有语音）来训练多说话人TTS模型。\n   - 模型学习一个代码本，每个代码词对应一个音素。\n   - 通过配对数据和CTC算法将代码本与音素对齐。\n   - 这个代码本充当代理，隐式估计非配对数据中的音素序列。\n   - 他们在前面部分的基础上叠加Tacotron2模型，利用代码词嵌入执行TTS。\n   - 在仅有一小时配对数据的情况下，他们的表现超过了基准方法。\n   - 他们没有报告完整配对数据下的结果。\n   - 缺乏充分的消融实验，无法清晰了解模型各部分对性能的贡献。\n   - 他们使用Griffin-Lim作为声码器，因此仍有改进空间。\n\n演示页面：https:\u002F\u002Fttaoretw.github.io\u002Fmultispkr-semi-tts\u002Fdemo.html \u003Cbr>\n  代码：https:\u002F\u002Fgithub.com\u002FttaoREtw\u002Fsemi-tts\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_3a64ecfc12eb.png)\n\u003C\u002Fdetails>\n\u003Cdetails>\n\u003Csummary> Attentron：基于注意力机制的可变长度嵌入的少样本文本到语音合成：https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08484 \u003C\u002Fsummary>\n\n   - 使用两个编码器来学习说话人相关的特征。\n   - 粗粒度编码器根据提供的参考声谱图学习全局说话人嵌入向量。\n   - 细粒度编码器与注意力模块协同工作，学习保留时间维度的可变长度嵌入。\n   - 注意力机制会选择重要的参考声谱帧来合成目标语音。\n   - 首先使用单说话人数据集对模型进行预训练（LJSpeech，3万次迭代）。\n   - 然后使用多说话人数据集对模型进行微调。（VCTK，7万次迭代）。\n   - 
与使用说话人分类模型中的x-vector以及基于VAE的参考音频编码器相比，该方法在指标上略胜一筹。\n\n\n  演示页面：https:\u002F\u002Fhyperconnect.github.io\u002FAttentron\u002F \u003Cbr>\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_7eb18e8c28f4.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_cd07cfce5f07.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> 向通用文本到语音合成迈进：http:\u002F\u002Fwww.interspeech2020.org\u002Fuploadfile\u002Fpdf\u002FWed-3-4-3.pdf \u003C\u002Fsummary>\n\n   - 一种用于多语言序列到序列文本到语音合成的框架。\n   - 该模型是在一个非常庞大且高度不平衡的数据集上训练的。\n   - 在初始训练之后，该模型仅需6分钟的新语言数据和20秒的新说话人数据即可学会新的语言或说话人。\n   - 模型架构是一个基于Transformer的编码器-解码器网络，包含说话人网络和语言网络，用于说话人和语言条件的调节。这些网络的输出会与编码器的输出拼接在一起。\n   - 条件网络接收表示说话人或语言ID的独热向量，并将其映射为条件表示。\n   - 它们使用WaveNet声码器将预测的梅尔频谱图转换为波形输出。\n   - 他们采用语言依赖的音素输入，不同语言之间不共享音素。\n   - 他们在每个批次中根据数据集中各语言出现频率的倒数进行采样，从而确保每个训练批次在语言分布上是均匀的，以缓解训练数据集中语言不平衡的问题。\n   - 对于新说话人或新语言的学习，他们只对编码器-解码器模型及其条件网络进行微调，而不重新训练WaveNet模型。\n   - 他们使用来自50种语言的1250小时专业录音作为训练数据。\n   - 所有音频样本均采用16kHz采样率，并裁剪掉每个片段开头和结尾的静音部分。\n   - 他们使用4块V100显卡进行训练，但并未提及训练持续了多长时间。\n   - 实验结果表明，在MOS评分方面，单说话人模型的表现优于所提出的方案。\n   - 此外，对于数据集中资源较少的语言来说，使用条件网络非常重要，因为它们可以提升这些语言的MOS评分，但却会降低高资源语言的表现。\n   - 当添加新说话人时，他们发现使用超过5分钟的录音反而会降低模型性能。他们认为，由于这些录音的质量不如原始录音，因此使用更多这样的数据会影响模型的整体表现。\n   - 多语言模型只需6分钟的新数据即可训练出新的说话人或语言，而单说话人模型则需要3个小时的训练，且即使如此也无法达到与6分钟多语言模型相同的MOS值。\n\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_9903781892b1.png)\n  ![image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_797abca26546.png)\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary> AdaSpeech：自适应文本到语音合成，用于定制声音：https:\u002F\u002Fopenreview.net\u002Fpdf?id=Drynvt7gg4L \u003C\u002Fsummary>\n\n   - 他们提出了一种能够适应用户不同输入声学特性的系统，并且仅使用最少数量的参数即可实现这一功能。\n   - 主要架构基于FastSpeech2模型，该模型利用音高和方差预测器来学习输入语音的更细微特征。\n   - 他们额外使用了3个条件网络。\n   - 句子级网络：以参考语音的梅尔频谱图为输入。\n   - 音素级网络：以音素级别的梅尔频谱图为输入，计算音素级别的条件向量。音素级别的梅尔频谱图是通过取每个音素持续时间内平均的频谱帧来计算的。\n   - 音素级2：以音素编码器的输出为输入。这与上述网络的不同之处在于，它仅使用音素信息，而不查看频谱图。\n   - 所有这些条件网络以及基础的FastSpeech2模型都使用层归一化层。\n   - 条件层归一化：他们提出，当模型针对新说话人进行微调时，只需微调每一层归一化的尺度和偏置参数。他们为每个层归一化层训练了一个说话人条件模块，该模块会输出尺度和偏置值。（他们为每个Transformer块使用一个说话人条件模块。）\n   - 这意味着，对于每个新说话人，你只需要存储对应的说话人条件模块，并在推理时预测尺度和偏置值，而模型的其余部分保持不变。\n   - 在实验中，他们首先在LibriTTS数据集上对模型进行预训练，然后用VCTK和LJSpeech数据集对其进行微调。\n   - 结果显示，使用条件层归一化的方法优于他们的两个基线方法，即仅使用说话人嵌入和微调解码器网络的方法。\n   - 消融实验表明，模型中最重要的部分依次是“音素级”网络、条件层归一化以及“句子级”网络。\n   - 该论文的一个重要缺点是几乎没有与现有文献的比较，这使得结果难以客观评估。\n\n演示页面：https:\u002F\u002Fspeechresearch.github.io\u002Fadaspeech\u002F \u003Cbr>\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_cc749ae05156.png)\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_e2f6e2cdc7e5.png)\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_daf5d7b223a4.png)\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_2bab4a01d3f1.png)\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_5b846808cb77.png)\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_ef267ee104b5.png)\n\n\u003C\u002Fdetails>\n\n______________________________________________________________________\n\n\n\n## 注意事项\n- 用于鲁棒长语音合成的位置相关注意力机制 - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.10288.pdf\n\n______________________________________________________________________\n\n## 声码器\n- MelGAN: 
https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.06711.pdf\n- ParallelWaveGAN: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1910.11480.pdf\n    - 多尺度STFT损失\n    - 约100万模型参数（非常小）\n    - 性能略逊于WaveRNN\n- 改进的FFTNet\n    - https:\u002F\u002Fwww.okamotocamera.com\u002Fslt_2018.pdfF\n    - https:\u002F\u002Fwww.okamotocamera.com\u002Fslt_2018.pdf\n- FFTnet\n    - https:\u002F\u002Fgfx.cs.princeton.edu\u002Fpubs\u002FJin_2018_FAR\u002Fclips\u002Fclips.php\n    - https:\u002F\u002Fgfx.cs.princeton.edu\u002Fpubs\u002FJin_2018_FAR\u002Ffftnet-jin2018.pdf\n- 使用带有噪声和周期性输入的卷积神经网络进行语音波形重建\n    - 150.162.46.34:8080\u002Ficassp2019\u002FICASSP2019\u002Fpdfs\u002F0007045.pdf\n- 朝着实现鲁棒通用声码器的目标迈进\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.06292.pdf\n- LPCNet\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.11846.pdf\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.11686.pdf\n- ExciteNet\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.04769v3.pdf\n- GELP：基于生成对抗网络激励的线性预测语音合成，从梅尔频谱图生成语音\n    - https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.03976v3.pdf\n- 使用对抗网络实现高保真语音合成：https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11646\n    - GAN-TTS，端到端语音合成\n    - 使用持续时间和语言特征\n    - 持续时间和声学特征由额外的模型预测。\n    - 随机窗口判别器：不输入整个语音样本，而是随机窗口。\n    - 多个RWD。有些是有条件的，有些是无条件的。（根据输入特征进行条件化）\n    - 关键点：使用不同大小的随机窗口作为D。\n    - 共享的结果听起来很机械，这表明非神经声学特征的局限性。\n- 多频段MelGAN：https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05106\n    - 使用PWGAN损失代替特征匹配损失。\n    - 使用更大的感受野显著提升了模型性能。\n    - 生成器预训练20万次迭代。\n    - 多频段语音信号预测。输出是4个不同频段预测结果与PQMF合成滤波器的叠加。\n    - 多频段模型有190万个参数（相当小）。\n    - 声称比MelGAN快7倍\n    - 在中文数据集上：MOS 4.22\n - WaveGLow：https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.00002\n \t- 模型非常庞大（2.68亿参数）\n\t- 训练困难，因为在12GB显存的GPU上只能使用1的批量大小。\n\t- 由于使用了卷积，可以实现实时推理。\n\t- 基于可逆归一化流。（精彩教程 https:\u002F\u002Fblog.evjang.com\u002F2018\u002F01\u002Fnf1.html\n)\n\t- 模型通过最大似然损失学习音频样本到梅尔频谱图的可逆映射。\n\t- 推理时网络反向运行，将梅尔频谱图转换为音频样本。\n\t- 训练是在8块Nvidia V100显卡上进行的，每张卡32GB内存，批量大小为24。（成本高昂）\n\n - SqueezeWave：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2001.05685.pdf，代码：https:\u002F\u002Fgithub.com\u002Ftianrengao\u002FSqueezeWave\n \t- 比实时快约5-13倍\n\t- WaveGlow的冗余：长音频样本、上采样梅尔频谱图、WN函数中较大的通道维度。\n\t- 解决方案：输入更多但更短的音频样本，（L=2000，C=8 vs L=64，C=256）\n\t- L=64与梅尔频谱分辨率一致，因此无需上采样。\n\t- 在WN模块中使用深度可分离卷积。\n\t- 由于音频样本较短，使用普通卷积而不是扩张卷积。\n\t- 不再将模块输出分为残差和网络输出，假设这两个向量几乎相同。\n\t- 训练是在Titan RTX 24GB显卡上以96的批量大小进行了60万次迭代。\n\t- MOS在LJSpeech上的表现：WaveGLow - 4.57，SqueezeWave（L=128 C=256）- 4.07，SqueezeWave（L=64 C=256）- 3.77\n\t- 最小的模型在Raspi3上每秒处理2.1万个样本。\n\n\u003Cdetails>\n\u003Csummary>WaveGrad：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2009.00713.pdf \u003C\u002Fsummary>\n\n  - 它基于概率扩散和朗之万动力学\n  - 核心思想是逐步学习一个函数，将已知分布映射到目标数据分布。\n  - 他们报告在GPU上实现了0.2倍实时速度，但未提及CPU性能。\n  - 在下面的示例代码中，作者报告该模型在单个GPU上训练两天后即可收敛。\n  - 论文中的MOS评分不够全面，但显示出与WaveRNN和WaveNet等知名模型相当的性能。\n\n  代码：https:\u002F\u002Fgithub.com\u002Fivanvovk\u002FWaveGrad\n  ![图片](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_readme_b9f725a7c464.png)\n\u003C\u002Fdetails>\n\n\n\n# 来自互联网（博客、视频等）\n\n## 视频\n### 论文讨论\n- Tacotron 2 : https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=2iarxxm-v9w\n\n### 演讲\n- 关于推动神经文本转语音前沿的演讲，徐坦，2021年，https:\u002F\u002Fyoutu.be\u002FMA8PCvmr8B0\n- 关于基于生成模型的文本转语音合成的演讲，Heiga Zen，2017年\n\t- 视频：https:\u002F\u002Fyoutu.be\u002FnsrSrYtKkT8\n\t- 幻灯片：https:\u002F\u002Fresearch.google.com\u002Fpubs\u002Fpub45882.html\n- ISCA Odyessy 2020关于神经参数化文本转语音合成的教程，Xin Wang，2020年\n\t- 视频：https:\u002F\u002Fyoutu.be\u002FWCe7SYcDzAI\n\t- 幻灯片：http:\u002F\u002Ftonywangx.github.io\u002Fslide.html#dec-2020\n- 
ISCA语音处理课程关于神经声码器，2022年\n\t- 神经声码器的基本组件：https:\u002F\u002Fyoutu.be\u002FM833q5I-ZYs\n\t- 用于语音压缩的深度生成模型（LPCNet）：https:\u002F\u002Fyoutu.be\u002F7KsnFx3pLgw\n\t- 神经自回归、源滤波器和声门声码器：https:\u002F\u002Fyoutu.be\u002FgPrmxdberX0\n\t\t- 幻灯片：http:\u002F\u002Ftonywangx.github.io\u002Fslide.html#jul-2020\n- 从口语句子的神经解码中合成语音 | AISC：https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=MNDtMDPmnMo\n- 生成式文本转语音合成：https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=j4mVEAnKiNg\n- 游戏行业的语音合成：https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=aOHAYe4A-2Q\n\n### 一般\n- 现代文本转语音系统综述：https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8rXLSc-ZcRY\n\n## Jupyter笔记本\n- 选定神经声码器的教程：https:\u002F\u002Fgithub.com\u002Fnii-yamagishilab\u002Fproject-NN-Pytorch-scripts\u002Ftree\u002Fmaster\u002Ftutorials\u002Fb1_neural_vocoder\n\n## 博客\n- 文本转语音深度学习架构：http:\u002F\u002Fwww.erogol.com\u002Ftext-speech-deep-learning-architectures\u002F","# TTS-papers 快速上手指南\n\n**工具简介**：\nTTS-papers 并非一个可直接运行的软件库，而是一个由社区维护的**语音合成（TTS）领域核心论文清单与解读合集**。它收录了从 Tacotron、FastSpeech 到 Glow-TTS、Parallel Tacotron 等主流模型的论文链接、核心创新点总结及优缺点分析。本指南旨在帮助开发者利用该资源快速定位技术路线并获取官方代码实现。\n\n## 1. 环境准备\n\n由于本项目主要是文档和链接集合，无需安装特定的运行时环境。但为了复现清单中提到的模型，建议准备以下基础开发环境：\n\n*   **操作系统**：Linux (推荐 Ubuntu 20.04+) 或 macOS\n*   **编程语言**：Python 3.8+\n*   **深度学习框架**：PyTorch 或 TensorFlow (具体取决于你选择的论文对应实现)\n*   **硬件要求**：\n    *   训练：建议使用 NVIDIA GPU (显存 16GB+ 为佳)\n    *   推理：CPU 或任意 GPU 均可\n*   **前置依赖**：\n    *   Git (用于克隆相关代码库)\n    *   CUDA Toolkit (如需 GPU 加速)\n\n## 2. 获取资源与安装\n\nTTS-papers 本身是一个 GitHub 仓库，主要作为索引使用。你需要克隆该仓库以查看详细的论文解读，并根据解读中的链接去获取具体模型的代码。\n\n### 克隆项目仓库\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Ferogol\u002FTTS-papers.git\ncd TTS-papers\n```\n\n### 查找目标模型代码\n在 `README.md` 中找到你感兴趣的模型（例如 **Glow-TTS** 或 **FastSpeech2**），点击对应的 \"Repository\" 或 \"Code\" 链接跳转至官方实现仓库。\n\n*示例：若选择 Glow-TTS*\n```bash\n# 跳转到官方实现仓库 (以 Glow-TTS 为例)\ngit clone https:\u002F\u002Fgithub.com\u002Fjaywalnut310\u002Fglow-tts.git\ncd glow-tts\n\n# 安装该模型所需的特定依赖\npip install -r requirements.txt\n```\n\n> **提示**：国内开发者若遇到 GitHub 克隆速度慢的问题，可使用国内镜像源加速：\n> ```bash\n> git clone https:\u002F\u002Fgitee.com\u002Fmirrors\u002Fglow-tts.git  # 需确认具体镜像地址，或使用通用加速代理\n> # 或者配置 pip 国内源\n> pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 3. 
基本使用\n\n由于 TTS-papers 是论文清单，\"使用\"该工具的核心流程是：**阅读摘要 -> 确定模型 -> 运行对应代码**。以下是基于清单中热门模型 **Glow-TTS** 的最小化使用示例。\n\n### 步骤一：数据准备\n确保你拥有整理好的音频数据集（通常为 `.wav` 文件）和对应的文本标注文件（`.txt` 或 `.json`）。\n\n### 步骤二：模型训练\n进入具体模型的代码目录，执行训练脚本。以下命令为典型 PyTorch TTS 项目的训练方式：\n\n```bash\n# 配置数据路径并开始训练\npython train.py --config_path config.json --data_path .\u002Fmy_dataset\n```\n\n### 步骤三：语音合成（推理）\n训练完成后，使用检查点文件生成语音：\n\n```bash\n# 输入文本并生成音频\npython synthesize.py --text \"你好，这是由 Glow-TTS 生成的语音。\" --checkpoint_path logs\u002Fglow_tts\u002Fbest_model.pth --output_dir .\u002Foutputs\n```\n\n### 如何利用 TTS-papers 进行选型？\n在阅读 `README.md` 时，关注以下关键指标以决定使用哪个模型：\n*   **实时率 (RTF)**：如 **Glow-TTS** 比 Tacotron2 快 15 倍，适合低延迟场景。\n*   **音质 (MOS)**：如 **FlowTron** 的 MOS 略优于标准 Tacotron，但模型更大。\n*   **数据需求**：如 **Semi-Supervised Training** 相关论文适合数据量少的场景。\n*   **对齐稳定性**：如 **Double Decoder Consistency (DDC)** 专门解决注意力对齐失败问题。\n\n通过查阅该清单中的 \"Expansive Summaries\" 部分，你可以快速了解各算法的优缺点（例如：*FastSpeech2 引入了音高和能量预测以提升表现*），从而避免盲目尝试。","某语音合成初创团队的算法工程师正在研发一款支持多情感表达的有声书朗读系统，急需寻找能解决发音过度平滑和对齐不准问题的前沿方案。\n\n### 没有 TTS-papers 时\n- **文献检索效率低下**：工程师需在 arXiv、IEEE 等多个平台手动搜索关键词，耗时数天才能凑齐关于\"Phoneme Timing\"或\"Over-Smoothness\"的零散论文。\n- **技术选型盲目试错**：面对 Tacotron2 的过度平滑问题，因缺乏如 Es-Tacotron2 或 DurIAN 等针对性方案的指引，团队只能盲目复述基础模型，反复调整超参数却收效甚微。\n- **关键细节容易遗漏**：在复现 FlowTron 等复杂模型时，由于未参考 README 中关于“高斯混合模型嵌入”或“判别器架构”的详细摘要，导致模型训练收敛困难或音质不达标。\n- **前沿趋势把握滞后**：难以快速识别出 FastSpeech 的前馈架构优势或端到端对抗生成（GAN-TTS）的最新进展，致使产品技术路线落后于行业主流。\n\n### 使用 TTS-papers 后\n- **一站式精准获取**：直接通过 TTS-papers 索引定位到《Es-Tacotron2》和《AlignTTS》等核心论文，将文献调研周期从数天缩短至几小时。\n- **痛点方案直击要害**：依据库中对“减少过度平滑”和“时长感知”的明确标注，迅速引入预训练估计网络和时长控制机制，显著提升了语音的自然度和节奏感。\n- **实现细节清晰透明**：利用库中提供的扩展摘要（Expansive Summaries），快速掌握 GAN-TTS 中字符对齐的损失函数设计及动态时间规整（DTW）的具体用法，大幅降低复现门槛。\n- **技术迭代有的放矢**：基于整理的模型对比（如 MOS 值差异和模型大小分析），团队果断放弃笨重的旧架构，转向更高效的前馈模型，加速了产品落地进程。\n\nTTS-papers 通过结构化整理前沿论文与实战要点，将语音合成研发从“大海捞针”转变为“按图索骥”，极大提升了算法迭代的准确率与速度。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoqui-ai_TTS-papers_72241656.png","coqui-ai","coqui","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fcoqui-ai_1a73797a.png","Coqui, a startup providing open speech tech for everyone 🐸",null,"https:\u002F\u002Fgithub.com\u002Fcoqui-ai",726,79,"2026-04-12T21:38:45","MPL-2.0",5,"","未说明",{"notes":86,"python":84,"dependencies":87},"该仓库（TTS-papers）并非一个可运行的 AI 工具或代码库，而是一个语音合成（TTS）领域相关学术论文的整理列表。README 内容主要包含论文标题、链接以及对部分论文（如 Glow-TTS, FastSpeech2, Parallel Tacotron2 等）的技术摘要和评论。文中提到的'Code'链接均指向其他独立的项目仓库（如 mozilla\u002FTTS, jaywalnut310\u002Fglow-tts），本仓库本身不包含模型代码、训练脚本或推理环境，因此没有具体的操作系统、GPU、内存、Python 版本或依赖库要求。",[],[14,89,90],"音频","其他",[92,93,94,72,95,96],"speech","papers","tts","deep-learning","research-paper","2026-03-27T02:49:30.150509","2026-04-16T08:12:33.203327",[],[]]