[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-linto-ai--whisper-timestamped":3,"tool-linto-ai--whisper-timestamped":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",151314,2,"2026-04-11T23:32:58",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 
人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":76,"owner_website":78,"owner_url":79,"languages":80,"stars":89,"forks":90,"last_commit_at":91,"license":92,"difficulty_score":10,"env_os":93,"env_gpu":94,"env_ram":95,"env_deps":96,"category_tags":107,"github_topics":109,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":130,"updated_at":131,"faqs":132,"releases":160},6850,"linto-ai\u002Fwhisper-timestamped","whisper-timestamped","Multilingual Automatic Speech Recognition with word-level timestamps and confidence","whisper-timestamped 是一款基于 OpenAI Whisper 模型的增强工具，专为多语言自动语音识别设计。它核心解决了原生 Whisper 模型只能提供粗略片段级时间戳、无法精准定位单词起止时间的痛点，能够输出精确到单词级别的时间标记，并为每个单词和语音片段赋予置信度评分。\n\n该工具特别适合需要高精度字幕制作、语音数据分析的开发者、研究人员以及内容创作者使用。其独特技术亮点在于利用动态时间规整（DTW）算法处理交叉注意力权重，无需额外的推理步骤即可在解码过程中实时完成单词对齐，既保证了精度又有效控制了内存占用，使其能流畅处理长音频文件。此外，whisper-timestamped 还集成了语音活动检测（VAD）功能，能有效过滤静音干扰，减少模型在无声段产生“幻觉”文本的情况。作为 openai-whisper 的扩展包，它兼容现有版本并提供了 Python 接口与命令行工具，让获取带时间戳的高质量转录结果变得更加简单高效。","# whisper-timestamped\n\nMultilingual Automatic Speech Recognition with word-level timestamps and confidence.\n\n* [Description](#description)\n   * [Notes on other approaches](#notes-on-other-approaches)\n* [Installation](#installation)\n   * [First installation](#first-installation)\n      * [Additional packages that might be needed](#additional-packages-that-might-be-needed)\n      * [Docker](#docker)\n   * [Light installation for CPU](#light-installation-for-cpu)\n   * [Upgrade to the latest version](#upgrade-to-the-latest-version)\n* [Usage](#usage)\n   * [Python](#python)\n   * [Command line](#command-line)\n   * [Utility Functions](#utility-functions)\n   * [Plot of word alignment](#plot-of-word-alignment)\n   * [Example output](#example-output)\n* [API Reference](#api-reference)\n   * [Main Transcription Function](#main-transcription-function)\n   * [Utility Functions](#utility-functions-1)\n   * [File Writing Functions](#file-writing-functions)\n* [Options that may improve results](#options-that-may-improve-results)\n   * [Accurate Whisper transcription](#accurate-whisper-transcription)\n   * [Running Voice Activity Detection (VAD) before sending to Whisper](#running-voice-activity-detection-vad-before-sending-to-whisper)\n   * [Detecting disfluencies](#detecting-disfluencies)\n* 
[Acknowledgments\u002FSupport](#acknowledgments-support)\n* [Citations](#citations)\n\n## Description\n\n[Whisper](https:\u002F\u002Fopenai.com\u002Fblog\u002Fwhisper\u002F) is a set of multi-lingual, robust speech recognition models trained by OpenAI that achieve state-of-the-art results in many languages. Whisper models were trained to predict approximate timestamps on speech segments (most of the time with 1-second accuracy), but they cannot originally predict word timestamps. This repository proposes an implementation to **predict word timestamps and provide a more accurate estimation of speech segments when transcribing with Whisper models**.\nBesides, a confidence score is assigned to each word and each segment.\n\nThe approach is based on Dynamic Time Warping (DTW) applied to cross-attention weights, as demonstrated by [this notebook by Jong Wook Kim](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper\u002Fblob\u002Ff82bc59f5ea234d4b97fb2860842ed38519f7e65\u002Fnotebooks\u002FMultilingual_ASR.ipynb). There are some additions to this notebook:\n* The start\u002Fend estimation is more accurate.\n* Confidence scores are assigned to each word.\n* **If possible (without beam search...)**, no additional inference steps are required to predict word timestamps (word alignment is done on the fly after each speech segment is decoded).\n* Special care has been taken regarding memory usage: `whisper-timestamped` is able to process long files with little additional memory compared to the regular use of the Whisper model.\n\n`whisper-timestamped` is an extension of the [`openai-whisper`](https:\u002F\u002Fpypi.org\u002Fproject\u002Fwhisper-openai\u002F) Python package and is meant to be compatible with any version of `openai-whisper`.\nIt provides more efficient\u002Faccurate word timestamps, along with those additional features:\n* Voice Activity Detection (VAD) can be run before applying Whisper model,\n  to avoid hallucinations due to errors in the training data (for instance, predicting \"Thanks you for watching!\" on pure silence).\n  Several VAD methods are available: silero (default), auditok, auditok:v3.1\n* When the language is not specified, the language probabilities are provided among the outputs.\n\n**Disclaimer: Please note that this extension is intended for experimental purposes and may significantly impact performance. We are not responsible for any issues or inefficiencies that arise from its use.**\n\n### Notes on other approaches\n\nAn alternative relevant approach to recovering word-level timestamps involves using wav2vec models that predict characters, as successfully implemented in [whisperX](https:\u002F\u002Fgithub.com\u002Fm-bain\u002FwhisperX). However, these approaches have several drawbacks that are not present in approaches based on cross-attention weights such as `whisper_timestamped`. These drawbacks include:\n* The need to find one wav2vec model per language to support, which does not scale well with the multi-lingual capabilities of Whisper.\n* The need to handle (at least) one additional neural network (wav2vec model), which consumes memory.\n* The need to normalize characters in Whisper transcription to match the character set of the wav2vec model. This involves awkward language-dependent conversions, such as converting numbers to words (\"2\" -> \"two\"), symbols to words (\"%\" -> \"percent\", \"€\" -> \"euro(s)\")...\n* The lack of robustness around speech disfluencies (fillers, hesitations, repeated words...) 
that are usually removed by Whisper.\n\nAn alternative approach that does not require an additional model is to look at the probabilities of timestamp tokens estimated by the Whisper model after each (sub)word token is predicted. This was implemented, for instance, in whisper.cpp and stable-ts. However, this approach lacks robustness because Whisper models have not been trained to output meaningful timestamps after each word. Whisper models tend to predict timestamps only after a certain number of words have been predicted (typically at the end of a sentence), and the probability distribution of timestamps outside this condition may be inaccurate. In practice, these methods can produce results that are totally out-of-sync on some periods of time (we observed this especially when there is jingle music). Also, the timestamp precision of Whisper models tends to be rounded to 1 second (as in many video subtitles), which is too inaccurate for words, and reaching better accuracy is tricky.\n\n## Installation\n\n### First installation\n\nRequirements:\n* `python3` (version higher or equal to 3.7, at least 3.9 is recommended)\n* `ffmpeg` (see instructions for installation on the [whisper repository](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper))\n\nYou can install `whisper-timestamped` either by using pip:\n```bash\npip3 install whisper-timestamped\n```\n\nor by cloning this repository and running installation:\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\npython3 setup.py install\n```\n\n#### Additional packages that might be needed\n\nIf you want to plot alignment between audio timestamps and words (as in [this section](#plot-of-word-alignment)), you also need matplotlib:\n```bash\npip3 install matplotlib\n```\n\nIf you want to use VAD option (Voice Activity Detection before running Whisper model), you also need torchaudio and onnxruntime:\n```bash\npip3 install onnxruntime torchaudio\n```\n\nIf you want to use finetuned Whisper models from the Hugging Face Hub, you also need transformers:\n```bash\npip3 install transformers\n```\n\n#### Docker\n\nA docker image of about 9GB can be built using:\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\ndocker build -t whisper_timestamped:latest .\n```\n\n### Light installation for CPU\n\nIf you don't have a GPU (or don't want to use it), then you don't need to install the CUDA dependencies. 
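Once the CPU-only wheels below are installed, a quick sanity check (a minimal sketch, assuming the standard +cpu wheels from the PyTorch index) confirms that no CUDA build slipped in:\n\n```python\nimport torch\n\n# The CPU-only wheels report a version ending in +cpu and no usable CUDA device\nprint(torch.__version__)          # e.g. 1.13.1+cpu\nprint(torch.cuda.is_available())  # expected: False with the CPU-only install\n```\n\n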
You should then just install a light version of torch **before** installing whisper-timestamped, for instance as follows:\n```bash\npip3 install \\\n     torch==1.13.1+cpu \\\n     torchaudio==0.13.1+cpu \\\n     -f https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Ftorch_stable.html\n```\n\nA specific docker image of about 3.5GB can also be built using:\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\ndocker build -t whisper_timestamped_cpu:latest -f Dockerfile.cpu .\n```\n\n### Upgrade to the latest version\n\nWhen using pip, the library can be updated to the latest version using:\n```\npip3 install --upgrade --no-deps --force-reinstall git+https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\n```\n\nA specific version of `openai-whisper` can be used by running, for example:\n```bash\npip3 install openai-whisper==20230124\n```\n\n## Usage\n\n### Python\n\nIn Python, you can use the function `whisper_timestamped.transcribe()`, which is similar to the function `whisper.transcribe()`:\n```python\nimport whisper_timestamped\nhelp(whisper_timestamped.transcribe)\n```\nThe main difference with `whisper.transcribe()` is that the output will include a key `\"words\"` for all segments, with each word's start and end position. Note that words include attached punctuation. See the example [below](#example-output).\n\nBesides, the default decoding options are different to favour efficient decoding (greedy decoding instead of beam search, and no temperature sampling fallback). To have the same defaults as in `whisper`, use ```beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)```.\n\nThere are also additional options related to word alignment.\n\nIn general, if you import `whisper_timestamped` instead of `whisper` in your Python script and use `transcribe(model, ...)` instead of `model.transcribe(...)`, it should do the job:\n```python\nimport whisper_timestamped as whisper\n\naudio = whisper.load_audio(\"AUDIO.wav\")\n\nmodel = whisper.load_model(\"tiny\", device=\"cpu\")\n\nresult = whisper.transcribe(model, audio, language=\"fr\")\n\nimport json\nprint(json.dumps(result, indent = 2, ensure_ascii = False))\n```\n\nNote that you can use a finetuned Whisper model from HuggingFace or a local folder by using the `load_model` method of `whisper_timestamped`. For instance, if you want to use [whisper-large-v2-nob](https:\u002F\u002Fhuggingface.co\u002FNbAiLab\u002Fwhisper-large-v2-nob), you can simply do the following:\n```python\nimport whisper_timestamped as whisper\n\nmodel = whisper.load_model(\"NbAiLab\u002Fwhisper-large-v2-nob\", device=\"cpu\")\n\n# ...\n```\n\n### Command line\n\nYou can also use `whisper_timestamped` on the command line, similarly to `whisper`. See help with:\n```bash\nwhisper_timestamped --help\n```\n\nThe main differences with `whisper` CLI are:\n* Output files:\n  * The output JSON contains word timestamps and confidence scores. 
See example [below](#example-output).\n  * There is an additional CSV output format.\n  * For SRT, VTT, TSV formats, there will be additional files saved with word timestamps.\n* Some default options are different:\n  * By default, no output folder is set: Use `--output_dir .` for Whisper default.\n  * By default, there is no verbose: Use `--verbose True` for Whisper default.\n  * By default, beam search decoding and temperature sampling fallback are disabled, to favour an efficient decoding.\n    To set the same as Whisper default, you can use `--accurate` (which is an alias for ```--beam_size 5 --temperature_increment_on_fallback 0.2 --best_of 5```).\n* There are some additional specific options:\n  \u003C!-- * `--efficient` to use a faster greedy decoding (without beam search neither several sampling at each step),\n  which enables a special path where word timestamps are computed on the fly (no need to run inference twice).\n  Note that transcription results might be significantly worse on challenging audios with this option. -->\n  * `--compute_confidence` to enable\u002Fdisable the computation of confidence scores for each word.\n  * `--punctuations_with_words` to decide whether punctuation marks should be included or not with preceding words.\n\nAn example command to process several files using the `tiny` model and output the results in the current folder, as would be done by default with whisper, is as follows:\n```\nwhisper_timestamped audio1.flac audio2.mp3 audio3.wav --model tiny --output_dir .\n```\n\nNote that you can use a fine-tuned Whisper model from HuggingFace or a local folder. For instance, if you want to use the [whisper-large-v2-nob](https:\u002F\u002Fhuggingface.co\u002FNbAiLab\u002Fwhisper-large-v2-nob) model, you can simply do the following:\n```\nwhisper_timestamped --model NbAiLab\u002Fwhisper-large-v2-nob \u003C...>\n```\n\n### Utility Functions\n\nIn addition to the main `transcribe` function, whisper-timestamped provides some utility functions:\n\n#### `remove_non_speech`\n\nRemove non-speech segments from audio using Voice Activity Detection (VAD).\n\n```python\nfrom whisper_timestamped import remove_non_speech\n\naudio_speech, segments, convert_timestamps = remove_non_speech(audio, vad=\"silero\")\n```\n\n#### `load_model`\n\nLoad a Whisper model from a given name or path, including support for fine-tuned models from HuggingFace.\n\n```python\nfrom whisper_timestamped import load_model\n\nmodel = load_model(\"NbAiLab\u002Fwhisper-large-v2-nob\", device=\"cpu\")\n```\n\n### Plot of word alignment\n\nNote that you can use the `plot_word_alignment` option of the `whisper_timestamped.transcribe()` Python function or the `--plot` option of the `whisper_timestamped` CLI to see the word alignment for each segment.\n\n![Example alignement](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_f41f3efe6335.png)\n\n* The upper plot represents the transformation of cross-attention weights used for alignment with Dynamic Time Warping. 
The abscissa represents time, and the ordinate represents the predicted tokens, with special timestamp tokens at the beginning and end, and (sub)words and punctuation in the middle.\n* The lower plot is an MFCC representation of the input signal (features used by Whisper, based on Mel-frequency cepstrum).\n* The vertical dotted red lines show where the word boundaries are found (with punctuation marks \"glued\" to the previous word).\n\n### Example output\n\nThe output of `whisper_timestamped.transcribe()` function is a python dictionary,\nwhich can be viewed in JSON format using the CLI.\n\nThe JSON schema can be seen in [tests\u002Fjson_schema.json](tests\u002Fjson_schema.json).\n\nHere is an example output:\n```bash\nwhisper_timestamped AUDIO_FILE.wav --model tiny --language fr\n```\n```json\n{\n  \"text\": \" Bonjour! Est-ce que vous allez bien?\",\n  \"segments\": [\n    {\n      \"id\": 0,\n      \"seek\": 0,\n      \"start\": 0.5,\n      \"end\": 1.2,\n      \"text\": \" Bonjour!\",\n      \"tokens\": [ 25431, 2298 ],\n      \"temperature\": 0.0,\n      \"avg_logprob\": -0.6674491882324218,\n      \"compression_ratio\": 0.8181818181818182,\n      \"no_speech_prob\": 0.10241222381591797,\n      \"confidence\": 0.51,\n      \"words\": [\n        {\n          \"text\": \"Bonjour!\",\n          \"start\": 0.5,\n          \"end\": 1.2,\n          \"confidence\": 0.51\n        }\n      ]\n    },\n    {\n      \"id\": 1,\n      \"seek\": 200,\n      \"start\": 2.02,\n      \"end\": 4.48,\n      \"text\": \" Est-ce que vous allez bien?\",\n      \"tokens\": [ 50364, 4410, 12, 384, 631, 2630, 18146, 3610, 2506, 50464 ],\n      \"temperature\": 0.0,\n      \"avg_logprob\": -0.43492694334550336,\n      \"compression_ratio\": 0.7714285714285715,\n      \"no_speech_prob\": 0.06502953916788101,\n      \"confidence\": 0.595,\n      \"words\": [\n        {\n          \"text\": \"Est-ce\",\n          \"start\": 2.02,\n          \"end\": 3.78,\n          \"confidence\": 0.441\n        },\n        {\n          \"text\": \"que\",\n          \"start\": 3.78,\n          \"end\": 3.84,\n          \"confidence\": 0.948\n        },\n        {\n          \"text\": \"vous\",\n          \"start\": 3.84,\n          \"end\": 4.0,\n          \"confidence\": 0.935\n        },\n        {\n          \"text\": \"allez\",\n          \"start\": 4.0,\n          \"end\": 4.14,\n          \"confidence\": 0.347\n        },\n        {\n          \"text\": \"bien?\",\n          \"start\": 4.14,\n          \"end\": 4.48,\n          \"confidence\": 0.998\n        }\n      ]\n    }\n  ],\n  \"language\": \"fr\"\n}\n```\nIf the language is not specified (e.g. 
without option `--language fr` in the CLI) you will find an additional key with the language probabilities:\n```json\n{\n  ...\n  \"language\": \"fr\",\n  \"language_probs\": {\n    \"en\": 0.027954353019595146,\n    \"zh\": 0.02743500843644142,\n    ...\n    \"fr\": 0.9196318984031677,\n    ...\n    \"su\": 3.0119704064190955e-08,\n    \"yue\": 2.2565967810805887e-05\n  }\n}\n```\n\n## API Reference\n\n### Main Transcription Function\n\n#### `transcribe_timestamped(model, audio, **kwargs)`\n\nTranscribe audio using a Whisper model and compute word-level timestamps.\n\n##### Parameters:\n\n- `model`: Whisper model instance\n  The Whisper model to use for transcription.\n\n- `audio`: Union[str, np.ndarray, torch.Tensor]\n  The path to the audio file to transcribe, or the audio waveform as a NumPy array or PyTorch tensor.\n\n- `language`: str, optional (default: None)\n  The language of the audio. If None, language detection will be performed.\n\n- `task`: str, default \"transcribe\"\n  The task to perform: either \"transcribe\" for speech recognition or \"translate\" for translation to English.\n\n- `vad`: Union[bool, str, List[Tuple[float, float]]], optional (default: False)\n  Whether to use Voice Activity Detection (VAD) to remove non-speech segments. Can be:\n  - True\u002FFalse: Enable\u002Fdisable VAD (uses Silero VAD by default)\n  - \"silero\": Use Silero VAD\n  - \"auditok\": Use Auditok VAD\n  - List of (start, end) timestamps: Explicitly specify speech segments\n\n- `detect_disfluencies`: bool, default False\n  Whether to detect and mark disfluencies (hesitations, filler words, etc.) in the transcription.\n\n- `trust_whisper_timestamps`: bool, default True\n  Whether to rely on Whisper's timestamps for initial segment positions.\n\n- `compute_word_confidence`: bool, default True\n  Whether to compute confidence scores for words.\n\n- `include_punctuation_in_confidence`: bool, default False\n  Whether to include punctuation probability when computing word confidence.\n\n- `refine_whisper_precision`: float, default 0.5\n  How much to refine Whisper segment positions, in seconds. Must be a multiple of 0.02.\n\n- `min_word_duration`: float, default 0.02\n  Minimum duration of a word, in seconds.\n\n- `plot_word_alignment`: bool or str, default False\n  Whether to plot the word alignment for each segment. If a string, save the plot to the given file.\n\n- `word_alignement_most_top_layers`: int, optional (default: None)\n  Number of top layers to use for word alignment. If None, use all layers.\n\n- `remove_empty_words`: bool, default False\n  Whether to remove words with no duration occurring at the end of segments.\n\n- `naive_approach`: bool, default False\n  Force the naive approach of decoding twice (once for transcription, once for alignment).\n\n- `use_backend_timestamps`: bool, default False\n  Whether to use word timestamps provided by the backend (openai-whisper or transformers), instead of the ones computed by more complex heuristics of whisper-timestamped.\n\n- `temperature`: Union[float, List[float]], default 0.0\n  Temperature for sampling. 
Can be a single value or a list for fallback temperatures.\n\n- `compression_ratio_threshold`: float, default 2.4\n  If the gzip compression ratio is above this value, treat the decoding as failed.\n\n- `logprob_threshold`: float, default -1.0\n  If the average log probability is below this value, treat the decoding as failed.\n\n- `no_speech_threshold`: float, default 0.6\n  Probability threshold for \u003C|nospeech|> tokens.\n\n- `condition_on_previous_text`: bool, default True\n  Whether to provide the previous output as a prompt for the next window.\n\n- `initial_prompt`: str, optional (default: None)\n  Optional text to provide as a prompt for the first window.\n\n- `suppress_tokens`: str, default \"-1\"\n  Comma-separated list of token ids to suppress during sampling.\n\n- `fp16`: bool, optional (default: None)\n  Whether to perform inference in fp16 precision.\n\n- `verbose`: bool or None, default False\n  Whether to display the text being decoded to the console. If True, displays all details. If False, displays minimal details. If None, does not display anything.\n\n##### Returns:\n\nA dictionary containing:\n- `text`: str - The full transcription text\n- `segments`: List[dict] - List of segment dictionaries, each containing:\n  - `id`: int - Segment ID\n  - `seek`: int - Start position in the audio file (in samples)\n  - `start`: float - Start time of the segment (in seconds)\n  - `end`: float - End time of the segment (in seconds)\n  - `text`: str - Transcribed text for the segment\n  - `tokens`: List[int] - Token IDs for the segment\n  - `temperature`: float - Temperature used for this segment\n  - `avg_logprob`: float - Average log probability of the segment\n  - `compression_ratio`: float - Compression ratio of the segment\n  - `no_speech_prob`: float - Probability of no speech in the segment\n  - `confidence`: float - Confidence score for the segment\n  - `words`: List[dict] - List of word dictionaries, each containing:\n    - `start`: float - Start time of the word (in seconds)\n    - `end`: float - End time of the word (in seconds)\n    - `text`: str - The word text\n    - `confidence`: float - Confidence score for the word (if computed)\n- `language`: str - Detected or specified language\n- `language_probs`: dict - Language detection probabilities (if applicable)\n\n##### Exceptions:\n\n- `RuntimeError`: If the VAD method is not properly installed or configured.\n- `ValueError`: If the `refine_whisper_precision` is not a positive multiple of 0.02.\n- `AssertionError`: If the audio duration is shorter than expected or if there are inconsistencies in the number of segments.\n\n##### Notes:\n\n- The function uses the Whisper model to transcribe the audio and then performs additional processing to generate word-level timestamps and confidence scores.\n- The VAD feature can significantly improve transcription accuracy by removing non-speech segments, but it requires additional dependencies (e.g., torchaudio and onnxruntime for Silero VAD).\n- The `naive_approach` parameter can be useful for debugging or when dealing with specific audio characteristics, but it may be slower than the default approach.\n- When `use_efficient_by_default` is True, some parameters like `best_of`, `beam_size`, and `temperature_increment_on_fallback` are set to None by default for more efficient processing.\n- The function supports both OpenAI Whisper and Transformers backends, which can be specified when loading the model.\n\n### Utility Functions\n\n#### `remove_non_speech(audio, **kwargs)`\n\nRemove 
non-speech segments from audio using Voice Activity Detection (VAD).\n\n##### Parameters:\n\n- `audio`: torch.Tensor\n  Audio data as a PyTorch tensor.\n\n- `use_sample`: bool, default False\n  If True, return start and end times in samples instead of seconds.\n\n- `min_speech_duration`: float, default 0.1\n  Minimum duration of a speech segment in seconds.\n\n- `min_silence_duration`: float, default 1\n  Minimum duration of a silence segment in seconds.\n\n- `dilatation`: float, default 0.5\n  How much to enlarge each speech segment detected by VAD, in seconds.\n\n- `sample_rate`: int, default 16000\n  Sample rate of the audio.\n\n- `method`: str or List[Tuple[float, float]], default \"silero\"\n  VAD method to use. Can be \"silero\", \"auditok\", or a list of timestamps.\n\n- `avoid_empty_speech`: bool, default False\n  If True, avoid returning an empty speech segment.\n\n- `plot`: Union[bool, str], default False\n  If True, plot the VAD results. If a string, save the plot to the given file.\n\n##### Returns:\n\nA tuple containing:\n1. torch.Tensor: Audio with non-speech segments removed\n2. List[Tuple[float, float]]: List of (start, end) timestamps for speech segments\n3. Callable: Function to convert timestamps from the new audio to the original audio\n\n##### Exceptions:\n\n- `ImportError`: If the required VAD library (e.g., auditok) is not installed.\n- `ValueError`: If an invalid VAD method is specified.\n\n##### Notes:\n\n- This function is particularly useful for improving transcription accuracy by removing silence and non-speech segments from the audio before processing.\n- The choice of VAD method can affect the accuracy and speed of the non-speech removal process.\n\n#### `load_model(name, device=None, backend=\"openai-whisper\", download_root=None, in_memory=False)`\n\nLoad a Whisper model from a given name or path.\n\n##### Parameters:\n\n- `name`: str\n  Name of the model or path to the model. Can be:\n  - OpenAI Whisper identifier: \"large-v3\", \"medium.en\", etc.\n  - HuggingFace identifier: \"openai\u002Fwhisper-large-v3\", \"distil-whisper\u002Fdistil-large-v2\", etc.\n  - File name: \"path\u002Fto\u002Fmodel.pt\", \"path\u002Fto\u002Fmodel.ckpt\", \"path\u002Fto\u002Fmodel.bin\"\n  - Folder name: \"path\u002Fto\u002Ffolder\"\n\n- `device`: Union[str, torch.device], optional (default: None)\n  Device to use. If None, use CUDA if available, otherwise CPU.\n\n- `backend`: str, default \"openai-whisper\"\n  Backend to use. Either \"transformers\" or \"openai-whisper\".\n\n- `download_root`: str, optional (default: None)\n  Root folder to download the model to. 
If None, use the default download root.\n\n- `in_memory`: bool, default False\n  Whether to preload the model weights into host memory.\n\n##### Returns:\n\nThe loaded Whisper model.\n\n##### Exceptions:\n\n- `ValueError`: If an invalid backend is specified.\n- `ImportError`: If the transformers library is not installed when using the \"transformers\" backend.\n- `RuntimeError`: If the model cannot be found or downloaded from the specified source.\n- `OSError`: If there are issues reading the model file or accessing the specified path.\n\n##### Notes:\n\n- When using a local model file, ensure that the file format is compatible with the selected backend.\n- For HuggingFace models, an internet connection may be required to download the model if it's not already cached locally.\n- The function supports loading both OpenAI Whisper and Transformers models, providing flexibility in model selection.\n\n#### `get_alignment_heads(model, max_top_layer=3)`\n\nGet the alignment heads for the given model.\n\n##### Parameters:\n\n- `model`: Whisper model instance\n  The Whisper model for which to retrieve alignment heads.\n\n- `max_top_layer`: int, default 3\n  Maximum number of top layers to consider for alignment heads.\n\n##### Returns:\n\nA sparse tensor representing the alignment heads.\n\n##### Notes:\n\n- This function is used internally to optimize the word alignment process.\n- The alignment heads are model-specific and are used to improve the accuracy of word-level timestamps.\n\n### File Writing Functions\n\nThe following functions are available for writing transcripts to various file formats:\n\n#### `write_csv(transcript, file, sep=\",\", text_first=True, format_timestamps=None, header=False)`\n\nWrite transcript data to a CSV file.\n\n##### Parameters:\n\n- `transcript`: List[dict]\n  List of transcript segment dictionaries.\n\n- `file`: file-like object\n  File to write the CSV data to.\n\n- `sep`: str, default \",\"\n  Separator to use in the CSV file.\n\n- `text_first`: bool, default True\n  If True, write text column before start\u002Fend times.\n\n- `format_timestamps`: Callable, optional (default: None)\n  Function to format timestamp values.\n\n- `header`: Union[bool, List[str]], default False\n  If True, write default header. 
If a list, use as custom header.\n\n##### Exceptions:\n\n- `IOError`: If there are issues writing to the specified file.\n- `ValueError`: If the transcript data is not in the expected format.\n\n##### Notes:\n\n- This function is useful for exporting transcription results in a tabular format for further analysis or processing.\n- The `format_timestamps` parameter allows for custom formatting of timestamp values, which can be helpful for specific use cases or data analysis requirements.\n\n#### `write_srt(transcript, file)`\n\nWrite transcript data to an SRT (SubRip Subtitle) file.\n\n##### Parameters:\n\n- `transcript`: List[dict]\n  List of transcript segment dictionaries.\n\n- `file`: file-like object\n  File to write the SRT data to.\n\n##### Exceptions:\n\n- `IOError`: If there are issues writing to the specified file.\n- `ValueError`: If the transcript data is not in the expected format.\n\n##### Notes:\n\n- SRT is a widely supported subtitle format, making this function useful for creating subtitles for videos based on the transcription.\n\n#### `write_vtt(transcript, file)`\n\nWrite transcript data to a VTT (WebVTT) file.\n\n##### Parameters:\n\n- `transcript`: List[dict]\n  List of transcript segment dictionaries.\n\n- `file`: file-like object\n  File to write the VTT data to.\n\n##### Exceptions:\n\n- `IOError`: If there are issues writing to the specified file.\n- `ValueError`: If the transcript data is not in the expected format.\n\n##### Notes:\n\n- WebVTT is a W3C standard for displaying timed text in connection with HTML5, making this function useful for web-based applications.\n\n#### `write_tsv(transcript, file)`\n\nWrite transcript data to a TSV (Tab-Separated Values) file.\n\n##### Parameters:\n\n- `transcript`: List[dict]\n  List of transcript segment dictionaries.\n\n- `file`: file-like object\n  File to write the TSV data to.\n\n##### Exceptions:\n\n- `IOError`: If there are issues writing to the specified file.\n- `ValueError`: If the transcript data is not in the expected format.\n\n##### Notes:\n\n- TSV files are useful for importing transcription data into spreadsheet applications or other data analysis tools.\n\n## Options that may improve results\n\nHere are some options that are not enabled by default but might improve results.\n\n### Accurate Whisper transcription\n\nAs mentioned earlier, some decoding options are disabled by default to offer better efficiency. However, this can impact the quality of the transcription. To run with the options that have the best chance of providing a good transcription, use the following options.\n* In Python:\n```python\nresults = whisper_timestamped.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), ...)\n```\n* On the command line:\n```bash\nwhisper_timestamped --accurate ...\n```\n\n### Running Voice Activity Detection (VAD) before sending to Whisper\n\nWhisper models can \"hallucinate\" text when given a segment without speech. This can be avoided by running VAD and gluing speech segments together before transcribing with the Whisper model. 
This is possible with `whisper-timestamped`.\n* In Python:\n```python\nresults = whisper_timestamped.transcribe(model, audio, vad=True, ...)\n```\n* On the command line:\n```bash\nwhisper_timestamped --vad True ...\n```\n\nBy default, the VAD method used is [silero](https:\u002F\u002Fgithub.com\u002Fsnakers4\u002Fsilero-vad).\nBut other methods are available, such as earlier versions of silero, or [auditok](https:\u002F\u002Fgithub.com\u002Famsehili\u002Fauditok).\nThose methods were introduced because latest versions of silero VAD can have a lot of false alarms on some audios (speech detected on silence).\n* In Python:\n```python\nresults = whisper_timestamped.transcribe(model, audio, vad=\"silero:v3.1\", ...)\nresults = whisper_timestamped.transcribe(model, audio, vad=\"auditok\", ...)\n```\n* On the command line:\n```bash\nwhisper_timestamped --vad silero:v3.1 ...\nwhisper_timestamped --vad auditok ...\n```\n\nIn order to watch the VAD results, you can use the `--plot` option of the `whisper_timestamped` CLI,\nor the `plot_word_alignment` option of the `whisper_timestamped.transcribe()` Python function.\nIt will show the VAD results on the input audio signal as following (x-axis is time in seconds):\n| **vad=\"silero:v4.0\"** | **vad=\"silero:v3.1\"** | **vad=\"auditok\"** |\n| :---: | :---: | :---: |\n| ![Example VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_58ec1d813e1c.png) | ![Example VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_5263c9b8fd31.png)  | ![Example VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_70b2995c8200.png) |\n\n### Detecting disfluencies\n\nWhisper models tend to remove speech disfluencies (filler words, hesitations, repetitions, etc.). Without precautions, the disfluencies that are not transcribed will affect the timestamp of the following word: the timestamp of the beginning of the word will actually be the timestamp of the beginning of the disfluencies. 
`whisper-timestamped` can have some heuristics to avoid this.\n* In Python:\n```python\nresults = whisper_timestamped.transcribe(model, audio, detect_disfluencies=True, ...)\n```\n* On the command line:\n```bash\nwhisper_timestamped --detect_disfluencies True ...\n```\n**Important:** Note that when using these options, possible disfluencies will appear in the transcription as a special \"`[*]`\" word.\n\n## Acknowledgments\u002FSupport\n\nwhisper-timestamped was coded mainly by [Jérôme Louradour](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjeronymous).\nIt is based on the following libraries:\n* [whisper](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper): Whisper speech recognition (License MIT).\n* [dtw-python](https:\u002F\u002Fpypi.org\u002Fproject\u002Fdtw-python): Dynamic Time Warping (License GPL v3).\n\nPlease consider supporting the development of this library by buying me a coffee:\n\n\u003Ca href=\"https:\u002F\u002Fbuymeacoffee.com\u002Fjeronymous\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Fcdn.buymeacoffee.com\u002Fbuttons\u002Fdefault-orange.png\" alt=\"Buy Me A Coffee\" height=\"41\" width=\"174\">\u003C\u002Fa>\n\n## Citations\nIf you use this in your research, please cite the repo:\n\n```bibtex\n@misc{lintoai2023whispertimestamped,\n  title={whisper-timestamped},\n  author={Louradour, J{\\'e}r{\\^o}me},\n  journal={GitHub repository},\n  year={2023},\n  publisher={GitHub},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped}}\n}\n```\n\nas well as the OpenAI Whisper paper:\n\n```bibtex\n@article{radford2022robust,\n  title={Robust speech recognition via large-scale weak supervision},\n  author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},\n  journal={arXiv preprint arXiv:2212.04356},\n  year={2022}\n}\n```\n\nand this paper for Dynamic-Time-Warping:\n\n```bibtex\n@article{JSSv031i07,\n  title={Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package},\n  author={Giorgino, Toni},\n  journal={Journal of Statistical Software},\n  year={2009},\n  volume={31},\n  number={7},\n  doi={10.18637\u002Fjss.v031.i07}\n}\n```\n","# whisper-timestamped\n\n多语言自动语音识别，提供词级时间戳和置信度。\n\n* [描述](#description)\n   * [关于其他方法的说明](#notes-on-other-approaches)\n* [安装](#installation)\n   * [首次安装](#first-installation)\n      * [可能需要的额外包](#additional-packages-that-might-be-needed)\n      * [Docker](#docker)\n   * [轻量级 CPU 安装](#light-installation-for-cpu)\n   * [升级到最新版本](#upgrade-to-the-latest-version)\n* [使用](#usage)\n   * [Python](#python)\n   * [命令行](#command-line)\n   * [实用函数](#utility-functions)\n   * [词对齐图](#plot-of-word-alignment)\n   * [示例输出](#example-output)\n* [API 参考](#api-reference)\n   * [主转录函数](#main-transcription-function)\n   * [实用函数](#utility-functions-1)\n   * [文件写入函数](#file-writing-functions)\n* [可能提升结果的选项](#options-that-may-improve-results)\n   * [精确的 Whisper 转录](#accurate-whisper-transcription)\n   * [在发送至 Whisper 之前运行语音活动检测 (VAD)](#running-voice-activity-detection-vad-before-sending-to-whisper)\n   * [检测口吃现象](#detecting-disfluencies)\n* [致谢\u002F支持](#acknowledgments-support)\n* [引用](#citations)\n\n## 描述\n\n[Whisper](https:\u002F\u002Fopenai.com\u002Fblog\u002Fwhisper\u002F) 是由 OpenAI 训练的一系列多语言、鲁棒的语音识别模型，在多种语言中均取得了最先进的效果。Whisper 模型被训练为能够预测语音片段的大致时间戳（大多数情况下精度为 1 秒），但其本身无法预测词级时间戳。本仓库提出了一种实现方案，旨在 **在使用 Whisper 模型进行转录时预测词级时间戳，并更准确地估计语音片段**。此外，每个词和每个片段都会被赋予一个置信度分数。\n\n该方法基于动态时间规整（DTW）应用于交叉注意力权重，正如 [Jong Wook Kim 
的这个笔记本](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper\u002Fblob\u002Ff82bc59f5ea234d4b97fb2860842ed38519f7e65\u002Fnotebooks\u002FMultilingual_ASR.ipynb) 所展示的那样。在此基础上，我们做了一些改进：\n* 开始和结束时间的估计更加准确。\n* 为每个词分配置信度分数。\n* **如果可能的话（无需束搜索...）**，无需额外的推理步骤即可预测词级时间戳（词对齐会在每个语音片段解码后实时完成）。\n* 我们特别关注内存使用：与常规使用 Whisper 模型相比，`whisper-timestamped` 在处理长文件时仅需少量额外内存。\n\n`whisper-timestamped` 是 [`openai-whisper`](https:\u002F\u002Fpypi.org\u002Fproject\u002Fwhisper-openai\u002F) Python 包的扩展，旨在与 `openai-whisper` 的任何版本兼容。它提供了更高效、更准确的词级时间戳，以及以下附加功能：\n* 可以在应用 Whisper 模型之前运行语音活动检测 (VAD)，以避免因训练数据中的错误而导致的“幻觉”现象（例如，在完全静音的情况下预测出“感谢观看！”）。目前提供多种 VAD 方法：silero（默认）、auditok 和 auditok:v3.1。\n* 当未指定语言时，输出中会提供各语言的概率分布。\n\n**免责声明：请注意，此扩展仅用于实验目的，可能会显著影响性能。对于因使用该扩展而产生的任何问题或效率低下，我们概不负责。**\n\n### 关于其他方法的说明\n\n另一种相关的恢复词级时间戳的方法是使用能够预测字符的 wav2vec 模型，如 [whisperX](https:\u002F\u002Fgithub.com\u002Fm-bain\u002FwhisperX) 中成功实现的那样。然而，这些方法存在一些缺点，而基于交叉注意力权重的方法（如 `whisper_timestamped`）则不存在这些问题。这些缺点包括：\n* 需要为每种支持的语言找到一个对应的 wav2vec 模型，这与 Whisper 的多语言能力并不相称。\n* 需要额外处理至少一个神经网络模型（wav2vec 模型），从而占用更多内存。\n* 需要将 Whisper 转录中的字符标准化，以匹配 wav2vec 模型的字符集。这通常涉及繁琐的语言相关转换，例如将数字转换为文字（“2” -> “two”）、符号转换为文字（“%” -> “percent”，“€” -> “euro(s)”）等。\n* 对于语音中的不流畅现象（填充词、犹豫、重复词等）缺乏鲁棒性，而这些内容通常会被 Whisper 去除。\n\n另一种不需要额外模型的方法是查看 Whisper 模型在每个（子）词标记被预测后所估计的时间戳概率。例如，whisper.cpp 和 stable-ts 就采用了这种方法。然而，这种方法缺乏鲁棒性，因为 Whisper 模型并未被训练为在每个词之后都输出有意义的时间戳。Whisper 模型往往只会在预测一定数量的词之后才开始预测时间戳（通常是在句末），而在这种条件之外，时间戳的概率分布可能并不准确。实际上，这些方法在某些时间段内可能会产生完全不同步的结果（我们尤其在有广告音乐时观察到了这种情况）。此外，Whisper 模型的时间戳精度通常四舍五入到 1 秒（类似于许多视频字幕），这对于词级时间戳来说过于粗糙，要达到更高的精度也颇具挑战。\n\n## 安装\n\n### 首次安装\n\n要求：\n* `python3`（版本不低于 3.7，建议使用 3.9 或更高版本）\n* `ffmpeg`（请参阅 [Whisper 仓库](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper) 中的安装说明）\n\n您可以通过 pip 安装 `whisper-timestamped`：\n```bash\npip3 install whisper-timestamped\n```\n\n或者通过克隆本仓库并运行安装：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\npython3 setup.py install\n```\n\n#### 可能需要的额外包\n\n如果您想绘制音频时间戳与单词之间的对齐图（如 [本节](#plot-of-word-alignment) 所示），还需要安装 matplotlib：\n```bash\npip3 install matplotlib\n```\n\n如果您想使用 VAD 选项（在运行 Whisper 模型之前进行语音活动检测），还需要安装 torchaudio 和 onnxruntime：\n```bash\npip3 install onnxruntime torchaudio\n```\n\n如果您想使用来自 Hugging Face Hub 的微调 Whisper 模型，还需要安装 transformers：\n```bash\npip3 install transformers\n```\n\n#### Docker\n\n可以使用以下命令构建一个约 9GB 的 Docker 镜像：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\ndocker build -t whisper_timestamped:latest .\n```\n\n### 适用于 CPU 的轻量级安装\n\n如果您没有 GPU（或不想使用 GPU），则无需安装 CUDA 相关依赖。此时，您只需在安装 whisper-timestamped 之前，先安装一个轻量级的 PyTorch 版本，例如：\n\n```bash\npip3 install \\\n     torch==1.13.1+cpu \\\n     torchaudio==0.13.1+cpu \\\n     -f https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Ftorch_stable.html\n```\n\n此外，也可以通过以下命令构建一个约 3.5GB 的专用 Docker 镜像：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\ndocker build -t whisper_timestamped_cpu:latest -f Dockerfile.cpu .\n```\n\n### 升级到最新版本\n\n使用 pip 安装时，可以通过以下命令将库更新至最新版本：\n\n```bash\npip3 install --upgrade --no-deps --force-reinstall git+https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\n```\n\n若需使用特定版本的 `openai-whisper`，可以运行如下命令：\n\n```bash\npip3 install openai-whisper==20230124\n```\n\n## 使用方法\n\n### Python\n\n在 Python 中，您可以使用 `whisper_timestamped.transcribe()` 函数，其用法与 `whisper.transcribe()` 类似：\n\n```python\nimport 
whisper_timestamped\nhelp(whisper_timestamped.transcribe)\n```\n\n与 `whisper.transcribe()` 的主要区别在于，输出结果中会包含一个 `\"words\"` 键，用于记录每个片段中每个单词的起始和结束时间。请注意，单词中会包含标点符号。示例请参见下文中的 [示例输出]。\n\n此外，默认的解码选项也有所不同，以优先考虑高效解码（采用贪婪解码而非束搜索，并且不启用温度采样回退）。若希望与 `whisper` 的默认设置一致，可使用 `beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)`。\n\n此外，还提供了一些与词对齐相关的附加选项。\n\n通常情况下，只需在 Python 脚本中将 `whisper` 替换为 `whisper_timestamped`，并将 `model.transcribe(...)` 替换为 `transcribe(model, ...)`，即可正常工作：\n\n```python\nimport whisper_timestamped as whisper\n\naudio = whisper.load_audio(\"AUDIO.wav\")\n\nmodel = whisper.load_model(\"tiny\", device=\"cpu\")\n\nresult = whisper.transcribe(model, audio, language=\"fr\")\n\nimport json\nprint(json.dumps(result, indent = 2, ensure_ascii = False))\n```\n\n需要注意的是，您可以使用 Hugging Face 上的微调 Whisper 模型，或者从本地文件夹加载模型，只需调用 `whisper_timestamped` 的 `load_model` 方法即可。例如，若要使用 [whisper-large-v2-nob](https:\u002F\u002Fhuggingface.co\u002FNbAiLab\u002Fwhisper-large-v2-nob)，可以直接执行以下操作：\n\n```python\nimport whisper_timestamped as whisper\n\nmodel = whisper.load_model(\"NbAiLab\u002Fwhisper-large-v2-nob\", device=\"cpu\")\n\n# ...\n```\n\n### 命令行工具\n\n您也可以在命令行中使用 `whisper_timestamped`，其用法与 `whisper` 类似。可通过以下命令查看帮助信息：\n\n```bash\nwhisper_timestamped --help\n```\n\n与 `whisper` 命令行工具的主要区别包括：\n\n* 输出文件：\n  * 输出的 JSON 文件包含单词的时间戳和置信度分数。示例请参见下文中的 [示例输出]。\n  * 新增了 CSV 格式的输出。\n  * 对于 SRT、VTT 和 TSV 格式，还会额外保存包含单词时间戳的文件。\n* 默认选项有所不同：\n  * 默认情况下未设置输出目录：如需与 `whisper` 默认行为一致，可使用 `--output_dir .`。\n  * 默认情况下不启用详细模式：如需与 `whisper` 默认行为一致，可使用 `--verbose True`。\n  * 默认禁用束搜索解码和温度采样回退，以提高解码效率。若希望与 `whisper` 默认设置相同，可使用 `--accurate` 选项（该选项等价于 `--beam_size 5 --temperature_increment_on_fallback 0.2 --best_of 5`）。\n* 提供了一些额外的特定选项：\n  \u003C!-- * `--efficient` 可启用更快的贪婪解码（不使用束搜索，也不在每一步进行多次采样），从而实现一种特殊路径，在此路径下单词时间戳可实时计算，无需进行两次推理。不过，对于较为复杂的音频，使用该选项可能会导致转录结果显著下降。 -->\n  * `--compute_confidence` 可用于开启或关闭对每个单词置信度分数的计算。\n  * `--punctuations_with_words` 可决定标点符号是否应与前一个单词合并显示。\n\n以下是一个使用 `tiny` 模型处理多个文件，并将结果输出到当前目录的示例命令，其行为与 `whisper` 的默认设置一致：\n\n```bash\nwhisper_timestamped audio1.flac audio2.mp3 audio3.wav --model tiny --output_dir .\n```\n\n需要注意的是，您也可以使用来自 Hugging Face 或本地文件夹的微调 Whisper 模型。例如，若要使用 [whisper-large-v2-nob](https:\u002F\u002Fhuggingface.co\u002FNbAiLab\u002Fwhisper-large-v2-nob) 模型，只需执行以下命令：\n\n```bash\nwhisper_timestamped --model NbAiLab\u002Fwhisper-large-v2-nob \u003C...>\n```\n\n### 工具函数\n\n除了主函数 `transcribe` 外，whisper-timestamped 还提供了一些实用工具函数：\n\n#### `remove_non_speech`\n\n使用语音活动检测（VAD）去除音频中的非语音部分。\n\n```python\nfrom whisper_timestamped import remove_non_speech\n\naudio_speech, segments, convert_timestamps = remove_non_speech(audio, vad=\"silero\")\n```\n\n#### `load_model`\n\n从指定名称或路径加载 Whisper 模型，支持从 Hugging Face 加载微调模型。\n\n```python\nfrom whisper_timestamped import load_model\n\nmodel = load_model(\"NbAiLab\u002Fwhisper-large-v2-nob\", device=\"cpu\")\n```\n\n### 词对齐图示\n\n您还可以使用 `whisper_timestamped.transcribe()` Python 函数中的 `plot_word_alignment` 选项，或 `whisper_timestamped` 命令行工具中的 `--plot` 选项，来查看每个片段的词对齐情况。\n\n![词对齐示例](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_f41f3efe6335.png)\n\n* 上方图表展示了用于对齐的交叉注意力权重经过动态时间规整（DTW）后的变换结果。横轴表示时间，纵轴表示预测的标记，其中开头和结尾是特殊的时间戳标记，中间则是（子）单词和标点符号。\n* 下方图表是输入信号的 MFCC 表示（Whisper 使用的基于梅尔频率倒谱的特征）。\n* 纵向的红色虚线标示出了单词边界的位置（标点符号被“粘”在前一个单词上）。\n\n### 示例输出\n\n`whisper_timestamped.transcribe()` 函数的输出是一个 Python 字典，可以通过命令行工具以 JSON 格式查看。\n\nJSON 模式的定义可在 [tests\u002Fjson_schema.json](tests\u002Fjson_schema.json) 
中找到。\n\n以下是示例输出：\n```bash\nwhisper_timestamped AUDIO_FILE.wav --model tiny --language fr\n```\n```json\n{\n  \"text\": \"Bonjour! Est-ce que vous allez bien?\",\n  \"segments\": [\n    {\n      \"id\": 0,\n      \"seek\": 0,\n      \"start\": 0.5,\n      \"end\": 1.2,\n      \"text\": \" Bonjour!\",\n      \"tokens\": [ 25431, 2298 ],\n      \"temperature\": 0.0,\n      \"avg_logprob\": -0.6674491882324218,\n      \"compression_ratio\": 0.8181818181818182,\n      \"no_speech_prob\": 0.10241222381591797,\n      \"confidence\": 0.51,\n      \"words\": [\n        {\n          \"text\": \"Bonjour!\",\n          \"start\": 0.5,\n          \"end\": 1.2,\n          \"confidence\": 0.51\n        }\n      ]\n    },\n    {\n      \"id\": 1,\n      \"seek\": 200,\n      \"start\": 2.02,\n      \"end\": 4.48,\n      \"text\": \" Est-ce que vous allez bien?\",\n      \"tokens\": [ 50364, 4410, 12, 384, 631, 2630, 18146, 3610, 2506, 50464 ],\n      \"temperature\": 0.0,\n      \"avg_logprob\": -0.43492694334550336,\n      \"compression_ratio\": 0.7714285714285715,\n      \"no_speech_prob\": 0.06502953916788101,\n      \"confidence\": 0.595,\n      \"words\": [\n        {\n          \"text\": \"Est-ce\",\n          \"start\": 2.02,\n          \"end\": 3.78,\n          \"confidence\": 0.441\n        },\n        {\n          \"text\": \"que\",\n          \"start\": 3.78,\n          \"end\": 3.84,\n          \"confidence\": 0.948\n        },\n        {\n          \"text\": \"vous\",\n          \"start\": 3.84,\n          \"end\": 4.0,\n          \"confidence\": 0.935\n        },\n        {\n          \"text\": \"allez\",\n          \"start\": 4.0,\n          \"end\": 4.14,\n          \"confidence\": 0.347\n        },\n        {\n          \"text\": \"bien?\",\n          \"start\": 4.14,\n          \"end\": 4.48,\n          \"confidence\": 0.998\n        }\n      ]\n    }\n  ],\n  \"language\": \"fr\"\n}\n```\n\n如果未指定语言（例如在命令行中没有 `--language fr` 选项），则会多出一个包含语言概率的键：\n```json\n{\n  ...\n  \"language\": \"fr\",\n  \"language_probs\": {\n    \"en\": 0.027954353019595146,\n    \"zh\": 0.02743500843644142,\n    ...\n    \"fr\": 0.9196318984031677,\n    ...\n    \"su\": 3.0119704064190955e-08,\n    \"yue\": 2.2565967810805887e-05\n  }\n}\n```\n\n## API 参考\n\n### 主要转录功能\n\n#### `transcribe_timestamped(model, audio, **kwargs)`\n\n使用 Whisper 模型对音频进行转录，并计算词级别的时间戳。\n\n##### 参数：\n\n- `model`: Whisper 模型实例\n  用于转录的 Whisper 模型。\n\n- `audio`: Union[str, np.ndarray, torch.Tensor]\n  要转录的音频文件路径，或作为 NumPy 数组或 PyTorch 张量的音频波形。\n\n- `language`: str, 可选（默认：None）\n  音频的语言。如果为 None，则会进行语言检测。\n\n- `task`: str，默认为 \"transcribe\"\n  要执行的任务：可以是“transcribe”（语音识别）或“translate”（翻译成英语）。\n\n- `vad`: Union[bool, str, List[Tuple[float, float]]], 可选（默认：False）\n  是否使用语音活动检测（VAD）来去除非语音段。可取值如下：\n  - True\u002FFalse：启用\u002F禁用 VAD（默认使用 Silero VAD）\n  - “silero”：使用 Silero VAD\n  - “auditok”：使用 Auditok VAD\n  - (开始, 结束) 时间戳列表：显式指定语音段\n\n- `detect_disfluencies`: bool，默认为 False\n  是否在转录中检测并标记口吃、停顿词等不流畅现象。\n\n- `trust_whisper_timestamps`: bool，默认为 True\n  是否依赖 Whisper 的时间戳来确定初始片段位置。\n\n- `compute_word_confidence`: bool，默认为 True\n  是否计算单词的置信度分数。\n\n- `include_punctuation_in_confidence`: bool，默认为 False\n  在计算单词置信度时，是否包含标点符号的概率。\n\n- `refine_whisper_precision`: float，默认为 0.5\n  以秒为单位，调整 Whisper 片段位置的精度。必须是 0.02 的倍数。\n\n- `min_word_duration`: float，默认为 0.02\n  单词的最小持续时间，以秒为单位。\n\n- `plot_word_alignment`: bool 或 str，默认为 False\n  是否绘制每个片段的词对齐图。如果是字符串，则将图表保存到指定文件。\n\n- `word_alignement_most_top_layers`: int，可选（默认：None）\n  用于词对齐的顶层数量。如果为 None，则使用所有层。\n\n- 
`remove_empty_words`: bool，默认为 False\n  是否移除出现在片段末尾且持续时间为零的空单词。\n\n- `naive_approach`: bool，默认为 False\n  强制采用朴素方法，即解码两次（一次用于转录，一次用于对齐）。\n\n- `use_backend_timestamps`: bool，默认为 False\n  是否使用后端提供的词时间戳（openai-whisper 或 transformers），而不是由 whisper-timestamped 的复杂启发式算法计算的时间戳。\n\n- `temperature`: Union[float, List[float]], 默认为 0.0\n  采样温度。可以是单个值，也可以是用于回退的温度列表。\n\n- `compression_ratio_threshold`: float，默认为 2.4\n  如果 gzip 压缩比高于此值，则认为解码失败。\n\n- `logprob_threshold`: float，默认为 -1.0\n  如果平均对数概率低于此值，则认为解码失败。\n\n- `no_speech_threshold`: float，默认为 0.6\n  \u003C|nospeech|> 令牌的概率阈值。\n\n- `condition_on_previous_text`: bool，默认为 True\n  是否将前一个输出作为下一个窗口的提示。\n\n- `initial_prompt`: str，可选（默认：None）\n  可选文本，用作第一个窗口的提示。\n\n- `suppress_tokens`: str，默认为 \"-1\"\n  采样过程中要抑制的令牌 ID 列表，以逗号分隔。\n\n- `fp16`: bool，可选（默认：None）\n  是否以 fp16 精度进行推理。\n\n- `verbose`: bool 或 None，默认为 False\n  是否将正在解码的文本显示在控制台上。如果为 True，则显示所有详细信息；如果为 False，则显示最少信息；如果为 None，则不显示任何内容。\n\n##### 返回值：\n\n一个包含以下内容的字典：\n- `text`: str - 完整的转录文本\n- `segments`: List[dict] - 片段字典列表，每个字典包含：\n  - `id`: int - 片段 ID\n  - `seek`: int - 音频文件中的起始位置（以样本为单位）\n  - `start`: float - 片段的开始时间（以秒为单位）\n  - `end`: float - 片段的结束时间（以秒为单位）\n  - `text`: str - 片段的转录文本\n  - `tokens`: List[int] - 片段的令牌 ID 列表\n  - `temperature`: float - 该片段使用的温度\n  - `avg_logprob`: float - 片段的平均对数概率\n  - `compression_ratio`: float - 片段的压缩比\n  - `no_speech_prob`: float - 片段中无语音的概率\n  - `confidence`: float - 片段的置信度评分\n  - `words`: List[dict] - 词字典列表，每个字典包含：\n    - `start`: float - 词的开始时间（以秒为单位）\n    - `end`: float - 词的结束时间（以秒为单位）\n    - `text`: str - 词的内容\n    - `confidence`: float - 词的置信度评分（如果已计算）\n- `language`: str - 检测到的或指定的语言\n- `language_probs`: dict - 语言检测概率（如果适用）\n\n##### 异常：\n\n- `RuntimeError`: 如果 VAD 方法未正确安装或配置。\n- `ValueError`: 如果 `refine_whisper_precision` 不是 0.02 的正整数倍。\n- `AssertionError`: 如果音频时长短于预期，或片段数量存在不一致。\n\n##### 注意事项：\n\n- 该函数使用 Whisper 模型对音频进行转录，然后进行额外处理以生成词级别的时间戳和置信度分数。\n- VAD 功能可以通过去除非语音段来显著提高转录准确性，但需要额外的依赖项（例如，使用 Silero VAD 时需要 torchaudio 和 onnxruntime）。\n- `naive_approach` 参数在调试或处理特定音频特征时可能有用，但其速度可能慢于默认方法。\n- 当 `use_efficient_by_default` 为 True 时，一些参数如 `best_of`、`beam_size` 和 `temperature_increment_on_fallback` 默认设置为 None，以提高处理效率。\n- 该函数支持 OpenAI Whisper 和 Transformers 后端，可在加载模型时指定。\n\n### 工具函数\n\n#### `remove_non_speech(audio, **kwargs)`\n\n使用语音活动检测（VAD）从音频中移除非语音段。\n\n##### 参数：\n\n- `audio`: torch.Tensor\n  音频数据，以 PyTorch 张量形式提供。\n\n- `use_sample`: bool, 默认 False\n  如果为 True，则返回以样本为单位的开始和结束时间，而非秒。\n\n- `min_speech_duration`: float, 默认 0.1\n  语音段的最小持续时间，单位为秒。\n\n- `min_silence_duration`: float, 默认 1\n  静音段的最小持续时间，单位为秒。\n\n- `dilatation`: float, 默认 0.5\n  VAD 检测到的每个语音段将被扩展多少秒。\n\n- `sample_rate`: int, 默认 16000\n  音频的采样率。\n\n- `method`: str 或 List[Tuple[float, float]], 默认 \"silero\"\n  要使用的 VAD 方法。可以是 \"silero\"、\"auditok\"，或一个时间戳列表。\n\n- `avoid_empty_speech`: bool, 默认 False\n  如果为 True，则避免返回空的语音段。\n\n- `plot`: Union[bool, str], 默认 False\n  如果为 True，则绘制 VAD 结果；如果为字符串，则将图表保存到指定文件。\n\n##### 返回值：\n\n一个包含以下内容的元组：\n1. torch.Tensor: 移除非语音段后的音频。\n2. List[Tuple[float, float]]: 语音段的 (开始, 结束) 时间戳列表。\n3. 
Callable: 将新音频中的时间戳转换为原始音频时间戳的函数。\n\n##### 异常：\n\n- `ImportError`: 如果未安装所需的 VAD 库（例如 auditok）。\n- `ValueError`: 如果指定了无效的 VAD 方法。\n\n##### 注释：\n\n- 此函数特别适用于在处理音频之前移除静音和非语音部分，从而提高转录的准确性。\n- VAD 方法的选择会影响移除非语音部分的准确性和速度。\n\n#### `load_model(name, device=None, backend=\"openai-whisper\", download_root=None, in_memory=False)`\n\n从给定名称或路径加载 Whisper 模型。\n\n##### 参数：\n\n- `name`: str\n  模型名称或路径。可以是：\n  - OpenAI Whisper 标识符：“large-v3”、“medium.en”等。\n  - HuggingFace 标识符：“openai\u002Fwhisper-large-v3”、“distil-whisper\u002Fdistil-large-v2”等。\n  - 文件名：“path\u002Fto\u002Fmodel.pt”、“path\u002Fto\u002Fmodel.ckpt”、“path\u002Fto\u002Fmodel.bin”。\n  - 文件夹名：“path\u002Fto\u002Ffolder”。\n\n- `device`: Union[str, torch.device], 可选（默认：None）\n  使用的设备。如果为 None，则优先使用 CUDA，否则使用 CPU。\n\n- `backend`: str, 默认 “openai-whisper”\n  使用的后端。可以是 “transformers” 或 “openai-whisper”。\n\n- `download_root`: str, 可选（默认：None）\n  下载模型的根目录。如果为 None，则使用默认下载根目录。\n\n- `in_memory`: bool, 默认 False\n  是否将模型权重预加载到主机内存中。\n\n##### 返回值：\n\n加载的 Whisper 模型。\n\n##### 异常：\n\n- `ValueError`: 如果指定了无效的后端。\n- `ImportError`: 如果使用 “transformers” 后端时未安装 transformers 库。\n- `RuntimeError`: 如果无法从指定源找到或下载模型。\n- `OSError`: 如果读取模型文件或访问指定路径时出现问题。\n\n##### 注释：\n\n- 使用本地模型文件时，请确保文件格式与所选后端兼容。\n- 对于 HuggingFace 模型，如果尚未缓存到本地，可能需要互联网连接来下载模型。\n- 该函数支持加载 OpenAI Whisper 和 Transformers 模型，提供了灵活的模型选择方式。\n\n#### `get_alignment_heads(model, max_top_layer=3)`\n\n获取给定模型的对齐头。\n\n##### 参数：\n\n- `model`: Whisper 模型实例\n  需要检索对齐头的 Whisper 模型。\n\n- `max_top_layer`: int, 默认 3\n  考虑对齐头的最大顶层层数。\n\n##### 返回值：\n\n表示对齐头的稀疏张量。\n\n##### 注释：\n\n- 此函数在内部用于优化单词对齐过程。\n- 对齐头是特定于模型的，用于提高单词级时间戳的准确性。\n\n### 文件写入函数\n\n以下函数可用于将转录文本写入各种文件格式：\n\n#### `write_csv(transcript, file, sep=\",\", text_first=True, format_timestamps=None, header=False)`\n\n将转录数据写入 CSV 文件。\n\n##### 参数：\n\n- `transcript`: List[dict]\n  转录片段字典列表。\n\n- `file`: 文件对象\n  用于写入 CSV 数据的文件。\n\n- `sep`: str, 默认 \",\"\n  CSV 文件中使用的分隔符。\n\n- `text_first`: bool, 默认 True\n  如果为 True，则先写文本列，再写开始\u002F结束时间。\n\n- `format_timestamps`: Callable, 可选（默认：None）\n  用于格式化时间戳值的函数。\n\n- `header`: Union[bool, List[str]], 默认 False\n  如果为 True，则写入默认标题；如果是列表，则使用自定义标题。\n\n##### 异常：\n\n- `IOError`: 如果写入指定文件时出现问题。\n- `ValueError`: 如果转录数据格式不符合预期。\n\n##### 注释：\n\n- 该函数适用于将转录结果以表格形式导出，以便进一步分析或处理。\n- `format_timestamps` 参数允许自定义时间戳格式，这在特定用例或数据分析需求中非常有用。\n\n#### `write_srt(transcript, file)`\n\n将转录数据写入 SRT（SubRip 字幕）文件。\n\n##### 参数：\n\n- `transcript`: List[dict]\n  转录片段字典列表。\n\n- `file`: 文件对象\n  用于写入 SRT 数据的文件。\n\n##### 异常：\n\n- `IOError`: 如果写入指定文件时出现问题。\n- `ValueError`: 如果转录数据格式不符合预期。\n\n##### 注释：\n\n- SRT 是一种广泛支持的字幕格式，因此此函数非常适合根据转录内容为视频创建字幕。\n\n#### `write_vtt(transcript, file)`\n\n将转录数据写入 VTT（WebVTT）文件。\n\n##### 参数：\n\n- `transcript`: List[dict]\n  转录片段字典列表。\n\n- `file`: 文件对象\n  用于写入 VTT 数据的文件。\n\n##### 异常：\n\n- `IOError`: 如果写入指定文件时出现问题。\n- `ValueError`: 如果转录数据格式不符合预期。\n\n##### 注释：\n\n- WebVTT 是 W3C 关于在 HTML5 中显示定时文本的标准，因此该函数对于基于 Web 的应用非常有用。\n\n#### `write_tsv(transcript, file)`\n\n将转录数据写入 TSV（制表符分隔值）文件。\n\n##### 参数：\n\n- `transcript`: List[dict]\n  转录片段字典列表。\n\n- `file`: 文件对象\n  用于写入 TSV 数据的文件。\n\n##### 异常：\n\n- `IOError`: 如果写入指定文件时出现问题。\n- `ValueError`: 如果转录数据格式不符合预期。\n\n##### 注释：\n\n- TSV 文件便于将转录数据导入电子表格或其他数据分析工具中。\n\n## 可能提升结果的选项\n\n以下是一些默认未启用但可能提升结果的选项。\n\n### 准确的 Whisper 转录\n\n如前所述，为了提高效率，某些解码选项默认是禁用的。然而，这可能会影响转录的质量。为了使用最有可能提供高质量转录的选项，请使用以下设置。\n* 在 Python 中：\n```python\nresults = whisper_timestamped.transcribe(model, audio, beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), ...)\n```\n* 在命令行中：\n```bash\nwhisper_timestamped --accurate 
...\n```\n\n### 在发送给 Whisper 之前运行语音活动检测 (VAD)\n\n当 Whisper 模型接收到没有语音的部分时，可能会“幻觉”出文本内容。为了避免这种情况，可以在使用 Whisper 模型进行转录之前，先运行 VAD 并将语音片段拼接在一起。这可以通过 `whisper-timestamped` 实现。\n* 在 Python 中：\n```python\nresults = whisper_timestamped.transcribe(model, audio, vad=True, ...)\n```\n* 在命令行中：\n```bash\nwhisper_timestamped --vad True ...\n```\n\n默认情况下，使用的 VAD 方法是 [silero](https:\u002F\u002Fgithub.com\u002Fsnakers4\u002Fsilero-vad)。\n但也可以选择其他方法，例如较早版本的 silero，或者 [auditok](https:\u002F\u002Fgithub.com\u002Famsehili\u002Fauditok)。\n引入这些方法的原因是，最新版本的 silero VAD 在某些音频上会产生大量误报（在静音中检测到语音）。\n* 在 Python 中：\n```python\nresults = whisper_timestamped.transcribe(model, audio, vad=\"silero:v3.1\", ...)\nresults = whisper_timestamped.transcribe(model, audio, vad=\"auditok\", ...)\n```\n* 在命令行中：\n```bash\nwhisper_timestamped --vad silero:v3.1 ...\nwhisper_timestamped --vad auditok ...\n```\n\n为了查看 VAD 的结果，可以使用 `whisper_timestamped` 命令行工具的 `--plot` 选项，\n或者使用 `whisper_timestamped.transcribe()` Python 函数的 `plot_word_alignment` 选项。\n它会以如下方式显示输入音频信号上的 VAD 结果（横轴为时间，单位：秒）：\n| **vad=\"silero:v4.0\"** | **vad=\"silero:v3.1\"** | **vad=\"auditok\"** |\n| :---: | :---: | :---: |\n| ![示例 VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_58ec1d813e1c.png) | ![示例 VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_5263c9b8fd31.png)  | ![示例 VAD](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_readme_70b2995c8200.png) |\n\n### 检测口吃现象\n\nWhisper 模型倾向于去除语音中的口吃现象（填充词、犹豫、重复等）。如果不采取预防措施，未被转录的口吃部分会影响后续单词的时间戳：该单词的起始时间实际上会是口吃部分的起始时间。`whisper-timestamped` 可以采用一些启发式方法来避免这种情况。\n* 在 Python 中：\n```python\nresults = whisper_timestamped.transcribe(model, audio, detect_disfluencies=True, ...)\n```\n* 在命令行中：\n```bash\nwhisper_timestamped --detect_disfluencies True ...\n```\n**重要提示：** 请注意，使用这些选项时，可能出现的口吃现象会在转录中以特殊的 \"`[*]`\" 单词形式出现。\n\n## 致谢\u002F支持\n\nwhisper-timestamped 主要由 [Jérôme Louradour](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjeronymous) 编写。\n它基于以下库：\n* [whisper](https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper)：Whisper 语音识别（MIT 许可证）。\n* [dtw-python](https:\u002F\u002Fpypi.org\u002Fproject\u002Fdtw-python)：动态时间规整（GPL v3 许可证）。\n\n如果您愿意支持本库的开发，欢迎请我喝杯咖啡：\n\n\u003Ca href=\"https:\u002F\u002Fbuymeacoffee.com\u002Fjeronymous\" target=\"_blank\">\u003Cimg src=\"https:\u002F\u002Fcdn.buymeacoffee.com\u002Fbuttons\u002Fdefault-orange.png\" alt=\"Buy Me A Coffee\" height=\"41\" width=\"174\">\u003C\u002Fa>\n\n## 引用\n如果您在研究中使用了本项目，请引用该仓库：\n\n```bibtex\n@misc{lintoai2023whispertimestamped,\n  title={whisper-timestamped},\n  author={Louradour, J{\\'e}r{\\^o}me},\n  journal={GitHub repository},\n  year={2023},\n  publisher={GitHub},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped}}\n}\n```\n\n同时请引用 OpenAI 的 Whisper 论文：\n\n```bibtex\n@article{radford2022robust,\n  title={Robust speech recognition via large-scale weak supervision},\n  author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},\n  journal={arXiv preprint arXiv:2212.04356},\n  year={2022}\n}\n```\n\n以及关于动态时间规整的论文：\n\n```bibtex\n@article{JSSv031i07,\n  title={Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package},\n  author={Giorgino, Toni},\n  journal={Journal of Statistical Software},\n  year={2009},\n  volume={31},\n  number={7},\n  doi={10.18637\u002Fjss.v031.i07}\n}\n```","# whisper-timestamped 快速上手指南\n\n`whisper-timestamped` 是 
OpenAI Whisper 的扩展工具，专为多语言自动语音识别（ASR）设计。它在保留 Whisper 强大识别能力的同时，增加了**单词级时间戳**和**置信度评分**功能，特别适合需要精准字幕对齐的场景。\n\n## 环境准备\n\n在开始之前，请确保您的系统满足以下要求：\n\n*   **操作系统**: Linux, macOS 或 Windows\n*   **Python 版本**: Python 3.7+ (推荐 3.9 及以上)\n*   **核心依赖**: `ffmpeg` (用于音频处理)\n    *   **Ubuntu\u002FDebian**: `sudo apt update && sudo apt install ffmpeg`\n    *   **macOS**: `brew install ffmpeg`\n    *   **Windows**: 下载构建包并配置环境变量，或通过 `conda install -c conda-forge ffmpeg` 安装\n*   **可选依赖**:\n    *   若需绘制时间轴对齐图：`matplotlib`\n    *   若需使用语音活动检测 (VAD) 以减少幻觉：`onnxruntime`, `torchaudio`\n\n> **国内加速建议**：安装 Python 包时，推荐使用清华或阿里镜像源以提升下载速度。\n> 例如：`pip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple \u003C包名>`\n\n## 安装步骤\n\n### 1. 标准安装（推荐 GPU 用户）\n\n直接使用 pip 安装最新版本：\n\n```bash\npip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple whisper-timestamped\n```\n\n或者从源码安装：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\ncd whisper-timestamped\u002F\npython3 setup.py install\n```\n\n### 2. 轻量级安装（仅 CPU 用户）\n\n如果您没有 GPU 或不想使用 CUDA，请先安装 CPU 版本的 PyTorch，再安装本工具，以避免下载巨大的 CUDA 依赖包：\n\n```bash\npip3 install torch==1.13.1+cpu torchaudio==0.13.1+cpu -f https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Ftorch_stable.html\npip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple whisper-timestamped\n```\n\n### 3. 安装可选功能组件\n\n根据需求安装额外功能包：\n\n```bash\n# 绘图支持\npip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple matplotlib\n\n# VAD (语音活动检测) 支持\npip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple onnxruntime torchaudio\n\n# HuggingFace 微调模型支持\npip3 install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple transformers\n```\n\n## 基本使用\n\n### 方式一：Python 脚本调用\n\n这是最灵活的使用方式。只需将导入语句从 `import whisper` 改为 `import whisper_timestamped as whisper`，即可直接获得单词级时间戳。\n\n```python\nimport whisper_timestamped as whisper\nimport json\n\n# 1. 加载音频文件\naudio = whisper.load_audio(\"AUDIO.wav\")\n\n# 2. 加载模型 (支持 'tiny', 'base', 'small', 'medium', 'large' 或 HuggingFace 模型路径)\n# device=\"cuda\" 可启用显卡加速，若无显卡则使用 \"cpu\"\nmodel = whisper.load_model(\"tiny\", device=\"cpu\")\n\n# 3. 执行转录\n# language 指定语言代码 (如 \"zh\", \"en\", \"fr\")，不指定则自动检测\nresult = whisper.transcribe(model, audio, language=\"zh\")\n\n# 4. 
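# (Optional, illustrative) The options below are documented elsewhere in this document: enabling VAD\n# and beam-search decoding trades speed for robustness and requires the optional VAD dependencies.\n# result = whisper.transcribe(model, audio, language=\"zh\", vad=True, beam_size=5, best_of=5)\n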
输出结果 (包含 words 字段，内含每个单词的 start, end 时间和 confidence 置信度)\nprint(json.dumps(result, indent=2, ensure_ascii=False))\n```\n\n**关键特性说明：**\n*   输出结果中的 `segments` 列表下会新增 `words` 字段，包含每个单词的精确起止时间。\n*   默认采用贪心解码（greedy decoding）以提高效率。若需更高精度（类似原生 Whisper 默认行为），可添加参数 `beam_size=5, best_of=5`。\n*   支持直接加载 HuggingFace 上的微调模型，例如：`whisper.load_model(\"NbAiLab\u002Fwhisper-large-v2-nob\")`。\n\n### 方式二：命令行工具 (CLI)\n\n命令行用法与原生 `whisper` 命令高度兼容，但输出文件会包含更丰富的时间戳信息。\n\n**基础命令示例：**\n\n```bash\nwhisper_timestamped audio_file.mp3 --model tiny --language zh --output_dir .\n```\n\n**常用参数说明：**\n*   `--model`: 选择模型大小 (tiny, base, small, medium, large) 或 HuggingFace 模型 ID。\n*   `--language`: 指定源语言代码（如 `zh` 代表中文）。\n*   `--output_dir`: 输出文件夹路径。\n*   `--accurate`: 启用高精度模式（等价于 `--beam_size 5 --best_of 5` 等参数），牺牲速度换取更好效果。\n*   `--compute_confidence`: 开启\u002F关闭单词置信度计算。\n\n**输出文件格式增强：**\n*   **JSON**: 包含完整的单词级时间戳和置信度。\n*   **SRT\u002FVTT\u002FTSV**: 除了常规字幕文件外，还会生成带有单词级时间戳的额外文件。\n*   **CSV**: 新增 CSV 格式输出支持。","某跨国法律团队正在处理长达数小时的英文庭审录音，需要将其转化为带有精确时间戳的逐字稿，以便律师能快速定位并引用关键证词。\n\n### 没有 whisper-timestamped 时\n- **时间定位模糊**：原生 Whisper 模型仅能提供秒级精度的段落的时间戳，律师无法直接定位到具体的单词，查找“反对”或特定证据描述时需反复拖拽进度条试听。\n- **人工校对耗时**：由于缺乏单词级的置信度评分，团队必须人工逐句核对转录内容，难以快速识别因口音或背景噪音导致的错误识别。\n- **静音段幻觉干扰**：在录音中的长时间沉默或低噪片段，模型容易产生“幻觉”，凭空生成如“谢谢观看”等无关文本，误导案情分析。\n- **多语言混合困难**：庭审中偶尔夹杂西班牙语证词，未指定语言时难以判断识别结果的可靠性，增加了后续验证成本。\n\n### 使用 whisper-timestamped 后\n- **单词级精准定位**：利用动态时间规整（DTW）技术，whisper-timestamped 为每个单词生成精确的起止时间，律师点击文字即可瞬间跳转至对应音频位置。\n- **智能风险预警**：每个单词都附带置信度分数，低分词汇自动高亮，团队可优先复核这些高风险片段，将校对效率提升 50% 以上。\n- **消除静音幻觉**：内置的语音活动检测（VAD）功能在转录前过滤纯静音段，彻底杜绝了在无声背景中生成虚假文本的问题。\n- **语言概率透明化**：在未预设语言的情况下，输出结果包含语言概率分布，帮助团队准确判断混合语言片段的识别可信度。\n\nwhisper-timestamped 通过将粗糙的语音转写升级为带置信度评估的单词级时间轴，让海量音频数据的检索与分析从“大海捞针”变为“精准打击”。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flinto-ai_whisper-timestamped_f41f3efe.png","linto-ai","linto.ai","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Flinto-ai_d20c1cbf.jpg","Your Open Source end-to-end platform for voice-operated solutions",null,"hello@linto.ai","https:\u002F\u002Flinto.ai","https:\u002F\u002Fgithub.com\u002Flinto-ai",[81,85],{"name":82,"color":83,"percentage":84},"Python","#3572A5",99.5,{"name":86,"color":87,"percentage":88},"Dockerfile","#384d54",0.5,2794,209,"2026-04-10T04:52:40","AGPL-3.0","Linux, macOS, Windows","非必需。支持 CPU 运行（需安装 torch CPU 版本）；若使用 GPU，需安装 CUDA 依赖（具体版本未说明，通常匹配 PyTorch 版本）。","未说明（文中提到针对长文件进行了内存优化，相比常规 Whisper 占用额外内存很少）",{"notes":97,"python":98,"dependencies":99},"1. 必须预先安装 ffmpeg。2. 若无 GPU 或不想使用，需先手动安装 CPU 版本的 torch 和 torchaudio，再安装本工具。3. 提供 Docker 镜像（完整版约 9GB，CPU 版约 3.5GB）。4. 默认使用贪心解码以提高效率，若需更高精度可开启 beam search。5. 
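A consolidated command-line sketch combining options documented in this guide; the audio file name is a placeholder and only flags that appear elsewhere in this document are used:\n\n```bash\n# High-accuracy decoding plus VAD pre-filtering for a Chinese recording, results written to the current directory\nwhisper_timestamped audio_file.mp3 --model small --language zh --accurate --vad True --output_dir .\n```\n\n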
支持在运行 Whisper 前进行语音活动检测 (VAD) 以减少幻觉。","3.7+ (推荐 3.9+)",[100,101,102,103,104,105,106],"openai-whisper","torch","torchaudio","ffmpeg","onnxruntime (可选，用于 VAD)","matplotlib (可选，用于绘图)","transformers (可选，用于 HuggingFace 模型)",[35,108,14],"音频",[110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129],"deep-learning","speech","speech-recognition","speech-to-text","asr","machine-learning","python","python3","pytorch","attention-is-all-you-need","attention-mechanism","attention-model","attention-network","attention-seq2seq","attention-visualization","multilingual-models","speaker-diarization","speech-processing","transformers","whisper","2026-03-27T02:49:30.150509","2026-04-12T16:52:57.723581",[133,138,143,148,152,156],{"id":134,"question_zh":135,"answer_zh":136,"source_url":137},30900,"遇到 'Inconsistent number of segments'（片段数量不一致）的 AssertionError 报错怎么办？","该错误通常是由于旧版本中的边界情况导致的。请升级 whisper-timestamped 到最新版本（如 1.15.0 或更高），该问题在新版本中已修复。如果使用的是音乐等复杂音频，建议不要使用 'tiny' 模型，并尝试非贪婪解码以获得更好的结果。","https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\u002Fissues\u002F64",{"id":139,"question_zh":140,"answer_zh":141,"source_url":142},30901,"在几乎完全静音的音频开头出现了幻觉文字（Hallucinated words），如何解决？","这是因为默认的 VAD（语音活动检测）方法可能不够灵敏。从 1.14.1 版本开始，您可以指定不同的 VAD 方法。建议在命令中添加参数 `--vad=\"auditok\"` 或使用旧版 Silero 模型，例如 `--vad=\"silero:3.1\"`，以更好地处理静音片段。","https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\u002Fissues\u002F74",{"id":144,"question_zh":145,"answer_zh":146,"source_url":147},30902,"遇到 'AssertionError: Got inconsistent logprobs length' 报错该如何解决？","这是一个已知问题，已在版本 1.7.0 中修复。请升级库到 1.7.0 或更高版本。注意：新版本默认选项不同，运行时间可能更长但转录结果更准确。如果您追求速度且对精度要求不高，可以添加 `--efficient` 参数来加速运行。","https:\u002F\u002Fgithub.com\u002Flinto-ai\u002Fwhisper-timestamped\u002Fissues\u002F12",{"id":149,"question_zh":150,"answer_zh":151,"source_url":137},30903,"为什么连续的时间戳片段之间不连贯，中间有间隔？","这是正常现象，因为片段之间可能存在静音。如果您发现对齐质量较低，可能是因为使用了微调过的模型但没有适配相应的对齐头（alignment heads）。如果遇到严重问题，建议提供音频文件和具体的运行命令以便进一步调查。",{"id":153,"question_zh":154,"answer_zh":155,"source_url":147},30904,"如何在保证转录精度的同时提高运行速度？","新版本默认设置倾向于更高的精度，因此运行较慢。如果您可以接受一定的精度损失以换取更快的速度，可以在运行命令中添加 `--efficient` 参数。",{"id":157,"question_zh":158,"answer_zh":159,"source_url":137},30905,"转录音乐音频时效果很差，只识别出少量单词，有什么建议？","转录音乐对模型来说具有挑战性。部分原因是使用了较小的模型（如 'tiny'）以及默认的贪婪解码策略。建议尝试更大的模型（如 'medium' 或 
'large'），并避免使用默认的贪婪解码，以提高音乐内容的识别率。",[161,165,169,173,177,181,185,189,193,197,201,205,209,213],{"id":162,"version":163,"summary_zh":76,"released_at":164},222697,"v1.15.9","2025-09-09T07:04:36",{"id":166,"version":167,"summary_zh":76,"released_at":168},222698,"v1.15.8","2024-11-25T10:18:29",{"id":170,"version":171,"summary_zh":76,"released_at":172},222699,"v1.15.7","2024-11-25T03:47:42",{"id":174,"version":175,"summary_zh":76,"released_at":176},222700,"v1.15.6","2024-11-04T06:48:58",{"id":178,"version":179,"summary_zh":76,"released_at":180},222701,"v1.15.5","2024-10-30T08:16:49",{"id":182,"version":183,"summary_zh":76,"released_at":184},222702,"v1.15.4","2024-04-22T18:01:53",{"id":186,"version":187,"summary_zh":76,"released_at":188},222703,"v1.15.3","2024-03-14T15:54:31",{"id":190,"version":191,"summary_zh":76,"released_at":192},222704,"v1.15.2","2024-03-11T16:27:47",{"id":194,"version":195,"summary_zh":76,"released_at":196},222705,"v1.15.1","2024-03-03T20:09:40",{"id":198,"version":199,"summary_zh":76,"released_at":200},222706,"v1.15.0","2024-02-25T15:49:11",{"id":202,"version":203,"summary_zh":76,"released_at":204},222707,"v1.14.4","2024-01-15T17:57:37",{"id":206,"version":207,"summary_zh":76,"released_at":208},222708,"v1.14.3","2024-01-08T08:29:22",{"id":210,"version":211,"summary_zh":76,"released_at":212},222709,"v1.14.2","2023-12-22T04:32:43",{"id":214,"version":215,"summary_zh":76,"released_at":216},222710,"v1.14.1","2023-12-08T18:29:04"]