[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-NLPOptimize--flash-tokenizer":3,"tool-NLPOptimize--flash-tokenizer":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":80,"owner_email":81,"owner_twitter":80,"owner_website":82,"owner_url":83,"languages":84,"stars":108,"forks":109,"last_commit_at":110,"license":80,"difficulty_score":111,"env_os":112,"env_gpu":113,"env_ram":114,"env_deps":115,"category_tags":119,"github_topics":120,"view_count":23,"oss_zip_url":80,"oss_zip_packed_at":80,"status":16,"created_at":135,"updated_at":136,"faqs":137,"releases":138},2226,"NLPOptimize\u002Fflash-tokenizer","flash-tokenizer","EFFICIENT AND OPTIMIZED TOKENIZER ENGINE FOR LLM INFERENCE SERVING","FlashTokenizer 是一款专为大语言模型（LLM）推理服务打造的高性能分词引擎。它基于 C++ 开发，完美兼容 BERT 分词逻辑，旨在解决现有主流分词工具在速度与资源消耗上的瓶颈。\n\n在实际应用中，许多开发者发现 Hugging Face 的 Python 版分词器速度较慢，而其他加速方案往往依赖庞大的深度学习框架（如 PaddlePaddle）或存在精度损失。FlashTokenizer 的出现正是为了填补这一空白：它在保持与官方实现完全一致的准确率前提下，运行速度比标准的 `BertTokenizerFast` 快达 10 倍，被誉为“全球最快的 CPU 分词库”。\n\n这款工具特别适合需要高吞吐、低延迟的 AI 后端工程师、模型部署专家以及 NLP 研究人员。其独特的技术亮点在于纯 C++ 底层优化，无需安装重型依赖库，仅通过 pip 即可轻松集成到 Python 项目中。此外，它还支持多语言环境，并因其卓越的性能入选了 2025 年顶级 C++ NLP 项目榜单。如果你正在构建对响应时间敏感的大模型应用，FlashTokenizer 能显著提升预处理效率，让推理流程更加流畅高效。","FlashTokenizer 是一款专为大语言模型（LLM）推理服务打造的高性能分词引擎。它基于 C++ 开发，完美兼容 BERT 分词逻辑，旨在解决现有主流分词工具在速度与资源消耗上的瓶颈。\n\n在实际应用中，许多开发者发现 Hugging Face 的 Python 版分词器速度较慢，而其他加速方案往往依赖庞大的深度学习框架（如 PaddlePaddle）或存在精度损失。FlashTokenizer 的出现正是为了填补这一空白：它在保持与官方实现完全一致的准确率前提下，运行速度比标准的 `BertTokenizerFast` 快达 10 倍，被誉为“全球最快的 CPU 分词库”。\n\n这款工具特别适合需要高吞吐、低延迟的 AI 后端工程师、模型部署专家以及 NLP 研究人员。其独特的技术亮点在于纯 C++ 底层优化，无需安装重型依赖库，仅通过 pip 即可轻松集成到 Python 项目中。此外，它还支持多语言环境，并因其卓越的性能入选了 2025 年顶级 C++ NLP 项目榜单。如果你正在构建对响应时间敏感的大模型应用，FlashTokenizer 能显著提升预处理效率，让推理流程更加流畅高效。","\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Fblob\u002Fmain\u002Fassets\u002FFlashTokenizer_main_dark.png?raw=true\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_3e044c598d04.png\" width=60%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\u003Ch1 align=\"center\">\nThe world's fastest CPU tokenizer library!\n\u003C\u002Fh1>\n\nFlashTokenizer has been selected as one of Libhunt's [Top 11 
C++ NLP Projects](https:\u002F\u002Fwww.libhunt.com\u002Fl\u002Fcpp\u002Ftopic\u002Fnlp).\n\n\u003Cdetails>\n  \u003Csummary>2025 Top 11 C++ NLP Projects\u003C\u002Fsummary>\n\n![Top11C++NLP-Projects](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_42247351e74b.png)\n  \n\u003C\u002Fdetails>\n\n\n[![PyPI Downloads](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_fce6aff22562.png)](https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fflash-tokenizer)\n\n* https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fflash-tokenizer\n\n* * *\n\n### 🇨🇳 [简体中文](.\u002FREADME.zh-CN.md) 🇰🇷[한국어](.\u002FREADME.ko-KR.md) 🇯🇵[日本語](.\u002FREADME.ja-JP.md)\n\n\n## EFFICIENT AND OPTIMIZED TOKENIZER ENGINE FOR LLM INFERENCE SERVING\n\n[FlashTokenizer](https:\u002F\u002Fpypi.org\u002Fproject\u002Fflash-tokenizer\u002F) is a **high-performance tokenizer implementation in C++ of the BertTokenizer used for LLM inference**. It has the highest speed and accuracy of any tokenizer, such as [FlashAttention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention) and [FlashInfer](https:\u002F\u002Fgithub.com\u002Fflashinfer-ai\u002Fflashinfer), and is **10 times faster** than `BertTokenizerFast` in transformers.\n\n## 💚 Contributors\n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n      \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fjhcnode\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_e32c0c729236.png\" width=\"80px;\" alt=\"jhcnode\"\u002F>\u003Cbr \u002F>\n        \u003Csub>\u003Cb>@jhcnode\u003C\u002Fb>\u003C\u002Fsub>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n      \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fchristurnbull\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_37ac4d05ce8b.png\" width=\"80px;\" alt=\"christurnbull\"\u002F>\u003Cbr \u002F>\n        \u003Csub>\u003Cb>@christurnbull\u003C\u002Fb>\u003C\u002Fsub>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n\n## Performance Benchmark Demo Video\n[![Video Label](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_1eaf7d8a4741.jpg)](https:\u002F\u002Fyoutu.be\u002Fa_sTiAXeSE0)\n\n\n\n\n\n> [!NOTE]  \n>\n> ### Why?\n> - We need a tokenizer that is faster, more accurate, and easier to use than [Huggingface's BertTokenizerFast](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fblob\u002Fmain\u002Fsrc\u002Ftransformers\u002Fmodels\u002Fbert\u002Ftokenization_bert_fast.py). ([link1](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F75595699\u002Fhuggingfaces-berttokenizerfast-is-between-39000-and-258300-times-slower-than-ex), [link2](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP\u002Fissues\u002F8565), [link3](https:\u002F\u002Fblog.csdn.net\u002Fxhw205\u002Farticle\u002Fdetails\u002F129578988))\n> - [PaddleNLP's BertTokenizerFast](https:\u002F\u002Fpaddlenlp.readthedocs.io\u002Fen\u002Fstable\u002F_modules\u002Fpaddlenlp\u002Fexperimental\u002Ffaster_tokenizer.html) achieves a 1.2x performance improvement by implementing [Huggingface's Rust version](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftokenizers) in `C++`.  
However, using it requires installing both the massive [PaddlePaddle](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddle) and [PaddleNLP](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP) packages.\n> - [Tensorflow-text's FastBertTokenizer](https:\u002F\u002Fwww.tensorflow.org\u002Ftext\u002Fapi_docs\u002Fpython\u002Ftext\u002FFastBertTokenizer) actually demonstrates slower performance in comparison.\n> - [Microsoft's Blingfire](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire) **takes over 8 hours** to train on custom data and shows relatively lower accuracy.\n> - [RAPIDS' cuDF](https:\u002F\u002Fgithub.com\u002Frapidsai\u002Fcudf) provides a GPU-based BertTokenizer, but it suffers from accuracy issues.\n> - Unfortunately, [FastBertTokenizer](https:\u002F\u002Fgithub.com\u002Fgeorg-jung\u002FFastBertTokenizer) and [BertTokenizers](https:\u002F\u002Fgithub.com\u002FNMZivkovic\u002FBertTokenizers) are developed in `C#` and cannot be used in `Python`.\n>\n> - This is why we developed `FlashTokenizer`. It can be easily installed via `pip` and is **developed in C++ for straightforward maintenance**. Plus, it guarantees extremely fast speeds. We've created an implementation that's faster than Blingfire and easier to use. FlashTokenizer is implemented using the **LinMax Tokenizer** proposed in [Fast WordPiece Tokenization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15524), enabling tokenization in linear time. Finally, it supports **parallel processing at the C++ level for batch encoding**, delivering outstanding speed.\n>\n\n\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Fblob\u002Fmain\u002Fassets\u002FBanner_dark.png?raw=true\">\n    \u003Cimg alt=\"Banner\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_43d3a5413cdb.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\n\u003Cp>\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=MacOS_build\">\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=Windows_build\">\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=Linux_build\">\n\u003C\u002Fp>\u003Cbr>\n\n* * *\n\n### FlashTokenizer includes the following core features\n\n> [!TIP]\n> \n>  * Implemented in C++17.\n>     * **MacOS**: `clang++`.\n>     * **Windows**: `Visual Studio 2022`.\n>     * **Ubuntu**: `g++`. \n>\n> * Equally fast in Python via pybind11.\n> * Support for parallel processing at the C++ level using OPENMP.\n>     \n\n\n\n## News\n\n> [!IMPORTANT]  \n> **[Apr 02 2025]**\n> - Added performance benchmarking code.\n> - Performance benchmarking is conducted using Python, and required packages can be installed via [setup.sh](.\u002Fperftest\u002Fsetup.sh).  \n> - A minor performance improvement has been achieved by adding the `tokenize_early_stop` feature to `BasicTokenizer`.  
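\n> - As a minimal illustration, a wall-clock timing sketch for such a benchmark might look like the following (the corpus path is a placeholder, and `BertTokenizerFlash` is the public class shown in the Sample section):\n>\n> ```python\n> import time\n> from flash_tokenizer import BertTokenizerFlash\n>\n> # Load once, then time single-text tokenization over the whole corpus.\n> tokenizer = BertTokenizerFlash.from_pretrained('bert-base-multilingual-cased')\n> with open('corpus.txt', encoding='utf-8') as f:  # placeholder corpus\n>     texts = f.read().splitlines()\n> t0 = time.perf_counter()\n> for text in texts:\n>     tokenizer.tokenize(text)\n> print(f'{len(texts)} texts in {time.perf_counter() - t0:.4f}s')\n> ```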
\n> - [OpenMP](https:\u002F\u002Fwww.openmp.org\u002F) demonstrated better performance than `std::thread` across Windows, Linux, and macOS, so we've switched exclusively to OpenMP.\n> \n> **[Mar 31 2025]**\n> - Modified to provide pre-built whl files for each OS.\n>\n> **[Mar 22 2025]**\n>\n> - Added [DFA](https:\u002F\u002Fblog.cloudflare.com\u002Fpt-br\u002Fmaking-waf-ai-models-go-brr\u002F#:~:text=We%20can%20also%20tune%20Aho,settings%20based%20on%20this%20recommendation) to AC Trie.\n>\n> **[Mar 21 2025]**\n> - Improved tokenizer accuracy.\n>\n> **[Mar 19 2025]** \n> - Memory reduction and slight performance improvement by applying LinMaxMatching from the [Aho–Corasick](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FAho%E2%80%93Corasick_algorithm) algorithm.\n> - Improved branch pipelining of all functions and force-inline applied.\n> - Removed unnecessary operations of `WordpieceTokenizer(Backward)`.\n> - Optimized all functions to compute their results directly; this proved faster than caching in every case except the [Bloom filter](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBloom_filter).\n> - `punctuation`, `control`, and `whitespace` are defined as constexprs in advance and used as Bloom filters.\n> - Reduced unnecessary memory allocation with statistical memory profiling.\n> - In ✨FlashTokenizer✨, `bert-base-uncased` can process **35K** texts per second on a single core, with an approximate processing time of **28µs** per text.\n>\n> **[Mar 18 2025]** \n> - Improvements to the accuracy of the BasicTokenizer have improved the overall accuracy and, in particular, produce more accurate results for Unicode input.\n>\n> **[Mar 14 2025]** \n> - The performance of the `WordPieceTokenizer` and `WordPieceBackwordTokenizer` has been improved using [Trie](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FTrie), which was introduced in [Fast WordPiece Tokenization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15524).\n> - Using `FastPoolAllocator` in `std::list` improves performance in SingleEncoding, but it is not thread-safe, so `std::list\u003Cstd::string>` is used as is in BatchEncoding. In BatchEncoding, `OPENMP` is completely removed and only `std::thread` is used.\n>\n> **[Mar 10 2025]** \n> - Performance improvements through faster token mapping with robin_hood and memory copy minimization with **std::list**.\n>\n>\n> #### Token Ids Map Table Performance Test.\n>\n> The token-to-id map uses `robin_hood::unordered_flat_map\u003Cstd::string, int>`, the fastest of the maps tested.\n>\n> **[Mar 09 2025]** Completed development of flash-tokenizer for BertTokenizer.\n\n\n\n## 1. Installation\n\n### Requirements\n * `Windows(AMD64)`, `MacOS(ARM64)`, `Ubuntu(x86-64)`.\n * `g++` \u002F `clang++` \u002F `MSVC`.\n * `python 3.8 ~ 3.13`.\n\n### Install from [PIP](https:\u002F\u002Fpypi.org\u002Fproject\u002Fflash-tokenizer\u002F)\n\n\n\nOn Windows, you need to install [vc_redist.x64.exe](https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Freleases\u002Fdownload\u002FPackages\u002FVC_redist.x64.exe).\n```bash\n# Windows\npip install -U flash-tokenizer\n```\n```bash\n# Linux\npip install -U flash-tokenizer\n```\n```bash\n# MacOS\npip install -U flash-tokenizer\n```\n\n### Install from Source\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\ncd flash-tokenizer\u002Fprj\npip install .\n```\n\n
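\nAfter installation, a quick smoke test confirms the wheel imports and runs; this sketch uses only `BertTokenizerFlash.from_pretrained` and `tokenize`, the same calls demonstrated in the sample below:\n\n```python\nfrom flash_tokenizer import BertTokenizerFlash\n\n# Resolve the vocabulary by model name, exactly as in the sample below.\ntokenizer = BertTokenizerFlash.from_pretrained('bert-base-multilingual-cased')\nprint(tokenizer.tokenize('FlashTokenizer smoke test'))\n```\n\n\n## 2. 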
Sample\n\n```python\nfrom flash_tokenizer import BertTokenizerFlash\nfrom transformers import BertTokenizer\n\ntitles = [\n    '绝不能放弃，世界上没有失败，只有放弃。',\n    'is there any doubt about it \"None whatsoever\"',\n    \"세상 어떤 짐승이 이를 드러내고 사냥을 해? 약한 짐승이나 몸을 부풀리지, 진짜 짐승은 누구보다 침착하지.\",\n    'そのように二番目に死を偽装して生き残るようになったイタドリがどうして初めて見る自分をこんなに気遣ってくれるのかと尋ねると「私が大切にする人たちがあなたを大切にするから」と答えては'\n]\n\ntokenizer1 = BertTokenizerFlash.from_pretrained('bert-base-multilingual-cased')\ntokenizer2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased')\n\ncorrect = 0\nfor title in titles:\n    print(title)\n    tokens1 = tokenizer1.tokenize(title)\n    tokens2 = tokenizer2.tokenize(title)\n    ids1 = tokenizer1(title, max_length=512, padding=\"longest\").input_ids[0]\n    ids2 = tokenizer2(title, max_length=512, padding=\"longest\", return_tensors=\"np\").input_ids[0].tolist()\n    if tokens1 == tokens2 and ids1 == ids2:\n        correct += 1\n        print(\"Accept!\")\n    else:\n        print(\"Wrong Answer\")\n    print(ids1)\n    print(ids2)\n    print()\n\nprint(f'Accuracy: {correct * 100.0 \u002F len(titles):.2f}%')\n```\n\n```\n绝不能放弃，世界上没有失败，只有放弃。\nAccept!\n[101, 6346, 2080, 6546, 4284, 3704, 10064, 2087, 5621, 2078, 4917, 4461, 3204, 7480, 10064, 2751, 4461, 4284, 3704, 1882, 102]\n[101, 6346, 2080, 6546, 4284, 3704, 10064, 2087, 5621, 2078, 4917, 4461, 3204, 7480, 10064, 2751, 4461, 4284, 3704, 1882, 102]\n\nis there any doubt about it \"None whatsoever\"\nAccept!\n[101, 10124, 11155, 11178, 86697, 10978, 10271, 107, 86481, 12976, 11669, 23433, 107, 102]\n[101, 10124, 11155, 11178, 86697, 10978, 10271, 107, 86481, 12976, 11669, 23433, 107, 102]\n\n세상 어떤 짐승이 이를 드러내고 사냥을 해? 약한 짐승이나 몸을 부풀리지, 진짜 짐승은 누구보다 침착하지.\nAccept!\n[101, 9435, 14871, 55910, 9710, 48210, 10739, 35756, 9113, 30873, 31605, 11664, 9405, 118729, 10622, 9960, 136, 9539, 11102, 9710, 48210, 43739, 9288, 10622, 9365, 119407, 12692, 12508, 117, 9708, 119235, 9710, 48210, 10892, 9032, 17196, 80001, 9783, 119248, 23665, 119, 102]\n[101, 9435, 14871, 55910, 9710, 48210, 10739, 35756, 9113, 30873, 31605, 11664, 9405, 118729, 10622, 9960, 136, 9539, 11102, 9710, 48210, 43739, 9288, 10622, 9365, 119407, 12692, 12508, 117, 9708, 119235, 9710, 48210, 10892, 9032, 17196, 80001, 9783, 119248, 23665, 119, 102]\n\nそのように二番目に死を偽装して生き残るようになったイタドリがどうして初めて見る自分をこんなに気遣ってくれるのかと尋ねると「私が大切にする人たちがあなたを大切にするから」と答えては\nAccept!\n[101, 11332, 24273, 2150, 5632, 5755, 1943, 4805, 1980, 2371, 7104, 11592, 5600, 1913, 4814, 1975, 27969, 15970, 21462, 15713, 21612, 10898, 56910, 22526, 22267, 2547, 19945, 7143, 1975, 6621, 2534, 1980, 28442, 60907, 11312, 4854, 7770, 14813, 18825, 58174, 75191, 11662, 3456, 1945, 100812, 1890, 5949, 1912, 3197, 2535, 84543, 2179, 78776, 111787, 22946, 20058, 11377, 3197, 2535, 84543, 16867, 1891, 1940, 6076, 27144, 11588, 102]\n[101, 11332, 24273, 2150, 5632, 5755, 1943, 4805, 1980, 2371, 7104, 11592, 5600, 1913, 4814, 1975, 27969, 15970, 21462, 15713, 21612, 10898, 56910, 22526, 22267, 2547, 19945, 7143, 1975, 6621, 2534, 1980, 28442, 60907, 11312, 4854, 7770, 14813, 18825, 58174, 75191, 11662, 3456, 1945, 100812, 1890, 5949, 1912, 3197, 2535, 84543, 2179, 78776, 111787, 22946, 20058, 11377, 3197, 2535, 84543, 16867, 1891, 1940, 6076, 27144, 11588, 102]\n\nAccuracy: 100.00%\n```\n\n## 3. 
Other Implementations\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Flogos_dark.png\">\n    \u003Cimg alt=\"Banner\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_bac5076d3438.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\nMost [BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)-based models use the [WordPiece Tokenizer](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fja\u002F\u002Fpubs\u002Farchive\u002F37842.pdf), whose code can be found [here](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Fblob\u002Fmaster\u002Ftokenization.py).\n(A straightforward implementation from Huggingface can be found [here](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fblob\u002Fmain\u002Fsrc\u002Ftransformers\u002Fmodels\u002Fbert\u002Ftokenization_bert.py).)\n\nSince the BertTokenizer is a CPU-intensive algorithm, inference can be a bottleneck, and unoptimized tokenizers can be severely slow. A good example is the [BidirectionalWordpieceTokenizer](https:\u002F\u002Fgithub.com\u002Fsnunlp\u002FKR-BERT\u002Fblob\u002Fmaster\u002Fkrbert_tensorflow\u002Ftokenization_ranked.py) introduced in [KR-BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.03979). Most of the code is the same, but the algorithm additionally traverses sub-tokens backwards and keeps whichever traversal yields the higher-ranked match. The paper claims accuracy improvements, but other quantitative metrics are hard to find, the gains are not significant, and the tokenizer is seriously slowed down.\n\n* transformers (Rust Impl, PyO3)\n* paddlenlp (C++ Impl, pybind)\n* tensorflow-text (C++ Impl, pybind)\n* blingfire (C++ Impl, Native binary call)\n\nMost developers will either use `transformers.BertTokenizer` or `transformers.AutoTokenizer`, but using `AutoTokenizer` will return `transformers.BertTokenizerFast`.\n\nNaturally, it's faster than BertTokenizer, but the results aren't exactly the same, which means you're already giving up 100% accuracy starting with the tokenizer.\n\nBertTokenizer is not only provided by transformers. [PaddleNLP](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP) and [tensorflow-text](https:\u002F\u002Fwww.tensorflow.org\u002Ftext) also provide BertTokenizer.\n\nThen there's [Blingfire](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire), which was developed by Microsoft and has effectively been abandoned.\n\nPaddleNLP requires PaddlePaddle and provides tokenizer functionality starting with version 3.0rc. You can install it as follows:\n\n```bash\n##### Install PaddlePaddle, PaddleNLP\npython -m pip install paddlepaddle==3.0.0b1 -i https:\u002F\u002Fwww.paddlepaddle.org.cn\u002Fpackages\u002Fstable\u002Fcpu\u002F\npip install --upgrade paddlenlp==3.0.0b3\n##### Install transformers\npip install transformers==4.47.1\n##### Install tf-text\npip install tensorflow-text==2.18.1\n##### Install blingfire\npip install blingfire\n```\n\n\nWith the exception of blingfire, vocab.txt is all you need to run the tokenizer right away; a toy sketch of the matching loop follows below. 
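\n\nTo make the WordPiece mechanics concrete, here is a deliberately naive greedy longest-match (MaxMatch) sketch driven by nothing but a vocab file. It is illustrative only: the quadratic textbook loop, not FlashTokenizer's linear-time LinMax implementation, and the vocab path is a placeholder:\n\n```python\n# Naive WordPiece MaxMatch: repeatedly take the longest vocabulary entry\n# matching at the current position; continuations are prefixed with '##'.\ndef load_vocab(path):\n    with open(path, encoding='utf-8') as f:\n        return {line.strip(): i for i, line in enumerate(f)}\n\ndef wordpiece(word, vocab, unk='[UNK]'):\n    tokens, start = [], 0\n    while start \u003C len(word):\n        end = len(word)\n        while end > start:\n            piece = word[start:end]\n            if start > 0:\n                piece = '##' + piece\n            if piece in vocab:  # longest match at this position\n                tokens.append(piece)\n                break\n            end -= 1\n        else:  # no sub-piece matched at this position\n            return [unk]\n        start = end\n    return tokens\n\nvocab = load_vocab('vocab.txt')  # placeholder path to a BERT vocab file\nprint(wordpiece('tokenization', vocab))\n```\n\nEither way, a plain vocab.txt is the only asset required.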
\n(blingfire also requires only vocab.txt and can be used after 8 hours of learning).\n\nThe implementations we'll look at in detail are `PaddleNLP's BertTokenizerFast` and `blingfire`.\n\n* `blingfire`: Uses a [Deterministic Finite State Machine (DFSM)](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire\u002Fblob\u002Fmaster\u002Fdoc\u002FBling_Fire_Tokenizer_Algorithms.pdf) to eliminate one linear scan and unnecessary comparisons, resulting in a time of O(n), which is impressive.\n  * **Advantages**: **5-10x faster than other implementations**.\n  * **Disadvantages**: Long training time (8 hours) and lower accuracy than other implementations. (+Difficult to get help due to de facto development hiatus).\n* `PaddleNLP`: As shown in the experiments below, PaddleNLP is always faster than BertTokenizerFast (HF) to the same number of decimal places, and is always faster on any OS, whether X86 or Arm.\n  * **Advantages**:  **Internal implementation is in C++** Compared to `transformers.BertTokenizerFast` implemented in Rust, it is 1.2x faster while outputting exactly the same values.\n    * You can't specify `pt(pytorch tensor)` in `return_tensors`, but this is not a problem.\n  * **Disadvantages**: none, other than the need to install PaddlePaddle and PaddleNLP.\n\n## 4. Performance test\n\n\n### 4.1 Performance test (Single text encoding)\n\nAccuracy is the result of measuring [google's BertTokenizerFast](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Fblob\u002Fmaster\u002Ftokenization.py) as a baseline. If even one of the `input_ids` is incorrect, the answer is considered incorrect.\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Fcomp_speed_dark.png\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_61609d99e453.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Fcomp_accuracy_dark.png\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_95e3fea8f02b.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\n### Tokenizer Performance Comparison \n\n#### [google-bert\u002Fbert-base-cased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-cased)\n\n| Tokenizer                      | Elapsed Time | texts     | Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerFast(Huggingface) | 84.3700s     | 1,000,000 | 99.9226% |\n| BertTokenizerFast(PaddleNLP)   | 75.6551s     | 1,000,000 | 99.9226% |\n| FastBertTokenizer(Tensorflow)  | 219.1259s    | 1,000,000 | 99.9160% |\n| Blingfire                      | 13.6183s     | 1,000,000 | 99.8991% |\n| **FlashBertTokenizer**             | 8.1968s      | 1,000,000 | 99.8216% |\n\n#### [google-bert\u002Fbert-base-uncased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-uncased)\n\n| Tokenizer                      |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerFast(Huggingface) |       91.7882s | 1,000,000 |   99.9326% |\n| BertTokenizerFast(PaddleNLP)   |       83.6839s | 1,000,000 |   99.9326% |\n| FastBertTokenizer(Tensorflow)  |      204.2240s | 1,000,000 |   
99.1379% |\n| Blingfire                      |       13.2374s | 1,000,000 |   99.8588% |\n| **FlashBertTokenizer**             |        7.6313s | 1,000,000 |   99.6884% |\n\n#### [google-bert\u002Fbert-base-multilingual-cased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-multilingual-cased)\n\n\n\n| Tokenizer                      | Elapsed Time | texts     | Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerFast(Huggingface) | 212.1570s    | 2,000,000 | 99.7964% |\n| BertTokenizerFast(PaddleNLP)   | 193.9921s    | 2,000,000 | 99.7964% |\n| FastBertTokenizer(Tensorflow)  | 394.1574s    | 2,000,000 | 99.7892% |\n| Blingfire                      | 38.9013s     | 2,000,000 | 99.9780% |\n| **FlashBertTokenizer**             | 20.4570s     | 2,000,000 | 99.8970% |\n\n\n#### [beomi\u002Fkcbert-base](https:\u002F\u002Fgithub.com\u002FBeomi\u002FKcBERT)\n\n| Tokenizer                      |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerFast(Huggingface) |       52.5744s | 1,000,000 |   99.6754% |\n| BertTokenizerFast(PaddleNLP)   |       44.8943s | 1,000,000 |   99.6754% |\n| FastBertTokenizer(Tensorflow)  |      198.0270s | 1,000,000 |   99.6639% |\n| Blingfire                      |       13.0701s | 1,000,000 |   99.9434% |\n| **FlashBertTokenizer**             |        5.2601s | 1,000,000 |   99.9484% |\n\n\n| Tokenizer                      |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------|-----------|------------|\n| **FlashBertTokenizer**             |        5.1875s | 1,000,001 |   99.9484% |\n| Blingfire                      |       13.2783s | 1,000,001 |   99.9435% |\n| rust_tokenizers(guillaume-be)  |       16.6308s | 1,000,001 |   99.9829% |\n| BertTokenizerFast(PaddleNLP)   |       44.5476s | 1,000,001 |   99.6754% |\n| BertTokenizerFast(Huggingface) |       53.2525s | 1,000,001 |   99.6754% |\n| FastBertTokenizer(Tensorflow)  |      202.1633s | 1,000,001 |   99.6639% |\n\n#### [microsoft\u002Fllmlingua-2-bert-base-multilingual-cased-meetingbank](https:\u002F\u002Fhuggingface.co\u002Fmicrosoft\u002Fllmlingua-2-bert-base-multilingual-cased-meetingbank)\n\n| Tokenizer                      |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerFast(Huggingface) |      208.8858s | 2,000,000 |   99.7964% |\n| BertTokenizerFast(PaddleNLP)   |      192.6593s | 2,000,000 |   99.7964% |\n| FastBertTokenizer(Tensorflow)  |      413.2010s | 2,000,000 |   99.7892% |\n| Blingfire                      |       39.3765s | 2,000,000 |   99.9780% |\n| **FlashBertTokenizer**             |       22.8820s | 2,000,000 |   99.8970% |\n\n| Tokenizer                      |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------|-----------|------------|\n| **FlashBertTokenizer**             |       22.0901s | 2,000,001 |   99.8971% |\n| Blingfire                      |       37.9836s | 2,000,001 |   99.9780% |\n| rust_tokenizers(guillaume-be)  |       98.0366s | 2,000,001 |   99.9976% |\n| BertTokenizerFast(PaddleNLP)   |      208.6889s | 2,000,001 |   99.7964% |\n| BertTokenizerFast(Huggingface) |      219.2644s | 2,000,001 |   99.7964% |\n| FastBertTokenizer(Tensorflow)  |      413.9725s | 2,000,001 |   99.7892% |\n\n#### 
[KR-BERT](https:\u002F\u002Fgithub.com\u002Fsnunlp\u002FKR-BERT)\n\n\n| Tokenizer                                    |   Elapsed Time |     texts |   Accuracy |\n|--------------------------------|----------------:|-----------:|------------:|\n| BertTokenizerBidirectional(KR-BERT Original) |      128.3320s | 1,000,000 |  100.0000% |\n| **FlashBertTokenizer(Bidirectional)**                           |       10.4492s | 1,000,000 |   99.9631% |\n\n\n\n```mermaid\n%%{ init: { \"er\" : { \"layoutDirection\" : \"LR\" } } }%%\nerDiagram\n    Text ||--o{ Preprocess : tokenize\n    Preprocess o{--|| Inference : memcpy_h2d\n    Inference o{--|| Postprocess : memcpy_d2h\n```\n\n\n\n\n\n## 6. Compatibility\n\nFlashBertTokenizer can be used with any framework. CUDA version compatibility for each framework is also important for fast inference of LLMs.\n\n * [PyTorch](https:\u002F\u002Fpytorch.org\u002F) no longer supports installation using conda.\n * [ONNXRUNTIME](https:\u002F\u002Fonnxruntime.ai\u002Fdocs\u002Fexecution-providers\u002FCUDA-ExecutionProvider.html#cuda-12x) is separated by CUDA version.\n * PyTorch is also looking to ditch the older CUDA 12.x releases in favor of the newer CUDA 12.8. However, the trend is to keep CUDA 11.8 in all frameworks.\n   * CUDA 12.x was made for the newest GPUs, Hopper and Blackwell, and on GPUs like Volta, CUDA 11.8 is faster than CUDA 12.x.\n\n\n\n| DL Framework | Version | OS | CPU | CUDA 11.8 | CUDA 12.3 | CUDA 12.4 | CUDA 12.6 | CUDA 12.8 |\n| ------------ | ------- | -- | --- | --------- | --------- | --------- | --------- | --------- |\n| PyTorch | 2.6 | Linux, Windows | ⚪ | ⚪ | ❌ | ⚪ | ⚪ | ❌ |\n| PyTorch | 2.7 | Linux, Windows | ⚪ | ⚪ | ❌ | ❌ | ⚪ | ⚪ |\n| ONNXRUNTIME(11) | 1.20.x | Linux, Windows | ⚪ | ⚪ | ❌ | ❌ | ❌ | ❌ |\n| ONNXRUNTIME(12) | 1.20.x | Linux, Windows | ⚪ | ❌ | ⚪ | ⚪ | ⚪ | ⚪ |\n| PaddlePaddle | 3.0-beta | Linux, Windows | ⚪ | ⚪ | ❌ | ❌ | ❌ | ❌ |\n\n\n## 7. GPU Tokenizer\n\nHere is an example of installing and running cuDF from [Run State of the Art NLP Workloads at Scale with RAPIDS, HuggingFace, and Dask](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Frun-state-of-the-art-nlp-workloads-at-scale-with-rapids-huggingface-and-dask\u002F#:~:text=,and%20then%20used%20in%20subsequent).\n*(It's incredibly fast)*\n\nYou can run the WordPiece Tokenizer on GPUs with [rapids(cudf)](https:\u002F\u002Fdocs.rapids.ai\u002F).\n * [Implementation](https:\u002F\u002Fgithub.com\u002Frapidsai\u002Fcudf\u002Fblob\u002F0e99ec3ec15b8b0ebe68bd884c7d22d600e9259e\u002Fpython\u002Fcudf\u002Fcudf\u002Fcore\u002Fwordpiece_tokenize.py#L10)\n * [Example](https:\u002F\u002Fgithub.com\u002Frapidsai\u002Fcudf\u002Fblob\u002F0e99ec3ec15b8b0ebe68bd884c7d22d600e9259e\u002Fpython\u002Fcudf\u002Fcudf\u002Ftests\u002Ftext\u002Ftest_subword_tokenizer.py#L244)\n\nAs you can see in [how to install rapids](https:\u002F\u002Fdocs.rapids.ai\u002Finstall\u002F), it only supports Linux and its CUDA version is not the same as other frameworks, so [docker](https:\u002F\u002Fhub.docker.com\u002Fr\u002Frapidsai\u002Fbase) is the best choice. It is faster than the CPU for batch processing but slower than the CPU for streaming processing.\n\nThere are good example codes and explanations in the [blog](https:\u002F\u002Fdeveloper.nvidia.com\u002Fblog\u002Frun-state-of-the-art-nlp-workloads-at-scale-with-rapids-huggingface-and-dask\u002F#:~:text=,and%20then%20used%20in%20subsequent). 
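\n\nFor orientation, here is a sketch of what the end-to-end cuDF flow can look like. The `SubwordTokenizer` class is the one in the cuDF code linked above, but treat the exact argument names as an assumption, and note that voc_hash.txt is produced by the conversion step shown just below:\n\n```python\nimport cudf\nfrom cudf.core.subword_tokenizer import SubwordTokenizer\n\n# 'voc_hash.txt' comes from the hash_vocab conversion shown below.\ntokenizer = SubwordTokenizer('voc_hash.txt', do_lower_case=True)\ndata = cudf.Series(['is there any doubt about it'])\nout = tokenizer(data, max_length=64, max_num_rows=len(data),\n                padding='max_length', return_tensors='cp',\n                add_special_tokens=True)\nprint(out['input_ids'].shape)  # (rows, max_length), resident on the GPU\n```\n\n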
To use cuDF, you must first convert vocab.txt to [hash_vocab](https:\u002F\u002Fgithub.com\u002Frapidsai\u002Fcudf\u002Fblob\u002Fbranch-25.06\u002Fpython\u002Fcudf\u002Fcudf\u002Futils\u002Fhash_vocab_utils.py) as shown below. The problem is that the hash_vocab function cannot convert multilingual. Therefore, the WordpieceTokenizer of cuDF cannot be used if there are any characters other than English\u002FChinese in the vocab.\n\n```python\nimport cudf\nfrom cudf.utils.hash_vocab_utils import hash_vocab\nhash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt')\n```\n\n\n\n\n\n## TODO\n\n- [x] [BidirectionalWordPieceTokenizer](https:\u002F\u002Fgithub.com\u002Fsnunlp\u002FKR-BERT\u002Fblob\u002Fmaster\u002Fkrbert_tensorflow\u002Ftokenization_ranked.py)\n- [x] BatchEncoder with Multithreading. \n- [x] Replace `std::list` to `boost::intrusive::list`.\n- [x] ~~[MaxMatch-Dropout: Subword Regularization for WordPiece](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.04126) Option.~~\n- [x] Use stack memory for reduce memory allocation. (C-Style, [alloca](https:\u002F\u002Fman7.org\u002Flinux\u002Fman-pages\u002Fman3\u002Falloca.3.html), [_alloca](https:\u002F\u002Flearn.microsoft.com\u002Fko-kr\u002Fcpp\u002Fc-runtime-library\u002Freference\u002Falloca?view=msvc-170))\n- [x] ~~Support for parallel processing option for single encode.~~\n- [ ] `circle.ai`\n  - [ ] Implement distribution of compiled wheel packages for installation.\n- [ ] SIMD\n- [ ] ~~CUDA Version.~~\n\n\n\n## Acknowledgement\n\nFlashTokenizer is inspired by [FlashAttention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention), [FlashInfer](https:\u002F\u002Fgithub.com\u002Fflashinfer-ai\u002Fflashinfer), [FastBertTokenizer](https:\u002F\u002Fgithub.com\u002Fgeorg-jung\u002FFastBertTokenizer) and [tokenizers-cpp](https:\u002F\u002Fgithub.com\u002Fmlc-ai\u002Ftokenizers-cpp) projects.\n\n\n\n## Performance comparison\n\n* **WordPiece**\n  * 📒  [huggingface\u002Ftokenizers (Rust)](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftokenizers)\n    * Rust implementation of transformers.BertTokenizerFast, provided as a Python package.\n    * 🔵 **Provided as a Python package.**\n  * 🔥 [FastBertTokenizer (C#)](https:\u002F\u002Ffastberttokenizer.gjung.com)\n    * It demonstrates incredibly fast performance, but accuracy significantly decreases for non-English queries.\n  * ❌ [BertTokenizers (C#)](https:\u002F\u002Fgithub.com\u002FNMZivkovic\u002FBertTokenizers)\n    * It can be confirmed from [FastBertTokenizer (C#) VS BertTokenizers (C#)](https:\u002F\u002Fgithub.com\u002Fgeorg-jung\u002FFastBertTokenizer\u002Ftree\u002Fmaster?tab=readme-ov-file#comparison-to-berttokenizers) that `FastBertTokenizer(C#)` is faster.\n  * 🔥 [rust-tokenizers (Rust)](https:\u002F\u002Fgithub.com\u002Fguillaume-be\u002Frust-tokenizers)\n    * Slower than BertTokenizerFlash and Blingfire but faster and more accurate than other implementations.\n    * 🔵 **Provided as a Python package.**\n  * ❌ [tokenizers-cpp (C++)](https:\u002F\u002Fgithub.com\u002Fmlc-ai\u002Ftokenizers-cpp)\n    * `tokenizer-cpp` is a wrapper around SentencePiece and HuggingFace's Rust implementation, so performance benchmarking is meaningless.\n  * ❌ [bertTokenizer (Java)](https:\u002F\u002Fgithub.com\u002Fankiteciitkgp\u002FbertTokenizer)\n    * Java is not covered.\n  * ✅ [ZhuoruLin\u002Ffast-wordpiece (Rust)](https:\u002F\u002Fgithub.com\u002FZhuoruLin\u002Ffast-wordpiece)\n    * A Rust implementation using LinMaxMatching, runnable only in Rust, and expected to be no faster 
than the C++ implementation.\n  * ❌ [huggingface_tokenizer_cpp (C++)](https:\u002F\u002Fgithub.com\u002FSorrow321\u002Fhuggingface_tokenizer_cpp)\n    * Very slow due to naive C++ implementation.\n  * ❌ [SeanLee97\u002FBertWordPieceTokenizer.jl (Julia)](https:\u002F\u002Fgithub.com\u002FSeanLee97\u002FBertWordPieceTokenizer.jl)\n    * Julia is not covered.\n* **BPE**\n  * https:\u002F\u002Fgithub.com\u002Fopenai\u002Ftiktoken\n* **SentencePiece**\n  * [google\u002Fsentencepiece (C++)](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fsentencepiece)\n\n\n\n## ⭐ History\n\n\u003Ca href=\"https:\u002F\u002Fwww.star-history.com\u002F#NLPOptimize\u002Fflash-tokenizer&Date\">\n\n \u003Cpicture>\n   \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png&theme=dark\" \u002F>\n   \u003Csource media=\"(prefers-color-scheme: light)\" srcset=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png\" \u002F>\n   \u003Cimg alt=\"Star History Chart\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png\" \u002F>\n \u003C\u002Fpicture>\n\u003C\u002Fa>\n\n\n## References\n\n* https:\u002F\u002Fmedium.com\u002F@techhara\u002Fwhich-bert-tokenizer-is-faster-b832aa978b46\n* https:\u002F\u002Fmedium.com\u002F@atharv6f_47401\u002Fwordpiece-tokenization-a-bpe-variant-73cc48865cbf\n* https:\u002F\u002Fwww.restack.io\u002Fp\u002Ftransformer-models-bert-answer-fast-berttokenizerfast-cat-ai\n* https:\u002F\u002Fmedium.com\u002F@anmolkohli\u002Fmy-notes-on-bert-tokenizer-and-model-98dc22d0b64\n* https:\u002F\u002Fnocomplexity.com\u002Fdocuments\u002Ffossml\u002Fnlpframeworks.html\n* https:\u002F\u002Fgithub.com\u002Fmartinus\u002Frobin-hood-hashing\n* https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15524\n* https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fhighway\n\n\n* * *\n\nBlingfire2: https:\u002F\u002Fgithub.com\u002Flfoppiano\u002FBlingfire\u002F\nhttps:\u002F\u002Fpypi.org\u002Fproject\u002Fblingfire2\u002F\n\nhttps:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire\u002Fblob\u002Fmaster\u002Fnuget\u002Flib\u002Fruntimes\u002Fosx-arm64\u002Fnative\u002Flibblingfiretokdll.dylib","\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\"https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Fblob\u002Fmain\u002Fassets\u002FFlashTokenizer_main_dark.png?raw=true\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_3e044c598d04.png\" width=60%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\u003Ch1 align=\"center\">\n全球最快的CPU分词库！\n\u003C\u002Fh1>\n\nFlashTokenizer已被Libhunt评选为【C++自然语言处理十大项目】之一。\n\n\u003Cdetails>\n  \u003Csummary>2025年C++自然语言处理十大项目\u003C\u002Fsummary>\n\n![Top11C++NLP-Projects](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_42247351e74b.png)\n  \n\u003C\u002Fdetails>\n\n\n[![PyPI下载量](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_fce6aff22562.png)](https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fflash-tokenizer)\n\n* https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fflash-tokenizer\n\n* * *\n\n### 🇨🇳 [简体中文](.\u002FREADME.zh-CN.md) 🇰🇷[한국어](.\u002FREADME.ko-KR.md) 🇯🇵[日本語](.\u002FREADME.ja-JP.md)\n\n\n## 
面向大模型推理服务的高效优化分词引擎\n\n[FlashTokenizer](https:\u002F\u002Fpypi.org\u002Fproject\u002Fflash-tokenizer\u002F)是**基于BertTokenizer的高性能C++实现，专为大模型推理设计**。它在速度和准确性上均超越了其他类似工具，例如[FlashAttention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention)和[FlashInfer](https:\u002F\u002Fgithub.com\u002Fflashinfer-ai\u002Fflashinfer)，并且相比transformers中的`BertTokenizerFast`快**10倍**。\n\n## 💚 贡献者\n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n      \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fjhcnode\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_e32c0c729236.png\" width=\"80px;\" alt=\"jhcnode\"\u002F>\u003Cbr \u002F>\n        \u003Csub>\u003Cb>@jhcnode\u003C\u002Fb>\u003C\u002Fsub>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n      \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fchristurnbull\">\n        \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_37ac4d05ce8b.png\" width=\"80px;\" alt=\"christurnbull\"\u002F>\u003Cbr \u002F>\n        \u003Csub>\u003Cb>@christurnbull\u003C\u002Fb>\u003C\u002Fsub>\n      \u003C\u002Fa>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n\n## 性能基准测试演示视频\n[![视频封面](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_1eaf7d8a4741.jpg)](https:\u002F\u002Fyoutu.be\u002Fa_sTiAXeSE0)\n\n\n\n\n\n> [!NOTE]  \n>\n> ### 为什么？\n> - 我们需要一种比[Huggingface的BertTokenizerFast](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fblob\u002Fmain\u002Fsrc\u002Ftransformers\u002Fmodels\u002Fbert\u002Ftokenization_bert_fast.py)更快、更准确且更易用的分词器。（[链接1](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F75595699\u002Fhuggingfaces-berttokenizerfast-is-between-39000-and-258300-times-slower-than-ex), [链接2](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP\u002Fissues\u002F8565), [链接3](https:\u002F\u002Fblog.csdn.net\u002Fxhw205\u002Farticle\u002Fdetails\u002F129578988)）\n> - [PaddleNLP的BertTokenizerFast](https:\u002F\u002Fpaddlenlp.readthedocs.io\u002Fen\u002Fstable\u002F_modules\u002Fpaddlenlp\u002Fexperimental\u002Ffaster_tokenizer.html)通过将[Huggingface的Rust版本](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftokenizers)移植到`C++`中，实现了1.2倍的性能提升。然而，使用它需要同时安装庞大的[PaddlePaddle](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddle)和[PaddleNLP](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP)包。\n> - [Tensorflow-text的FastBertTokenizer](https:\u002F\u002Fwww.tensorflow.org\u002Ftext\u002Fapi_docs\u002Fpython\u002Ftext\u002FFastBertTokenizer)的实际表现反而更慢。\n> - [微软的Blingfire](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire)在自定义数据上训练**需要超过8小时**，且准确性相对较低。\n> - [Rapid的cuDF](https:\u002F\u002Fgithub.com\u002Frapidsai\u002Fcudf)提供了基于GPU的BertTokenizer，但存在准确性问题。\n> - 不幸的是，[FastBertTokenizer](https:\u002F\u002Fgithub.com\u002Fgeorg-jung\u002FFastBertTokenizer)和[BertTokenizers](https:\u002F\u002Fgithub.com\u002FNMZivkovic\u002FBertTokenizers)是用`C#`开发的，无法在`Python`中使用。\n>\n> - 这就是我们开发`FlashTokenizer`的原因。它可以通过`pip`轻松安装，并且**采用C++编写，便于维护**。此外，它还能保证极高的速度。我们实现了一种比Blingfire更快、更易用的方案。FlashTokenizer基于[快速WordPiece分词法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15524)中提出的**LinMax分词器**，能够在线性时间内完成分词。最后，它支持**C++级别的并行处理以进行批量编码**，从而提供卓越的速度。\n>\n\n\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" 
srcset=\"https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Fblob\u002Fmain\u002Fassets\u002FBanner_dark.png?raw=true\">\n    \u003Cimg alt=\"Banner\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_43d3a5413cdb.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\n\u003Cp>\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=MacOS_build\">\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=Windows_build\">\n\u003Cimg align=\"left\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fsuccess-0B86F1?style=flat&logo=python&logoColor=white&label=Linux_build\">\n\u003C\u002Fp>\u003Cbr>\n\n* * *\n\n### FlashTokenizer包含以下核心特性\n\n> [!TIP]\n> \n>  * 基于C++17实现。\n>     * **MacOS**: `clang++`。\n>     * **Windows**: `Visual Studio 2022`。\n>     * **Ubuntu**: `g++`。\n>\n> * 通过pybind11同样可在Python中实现高速运行。\n> * 支持使用OPENMP在C++层面进行并行处理。\n>\n\n## 新闻\n\n> [!IMPORTANT]  \n> **[2025年4月2日]**\n> - 添加性能基准测试代码\n> - 性能基准测试使用 Python 进行，所需包可通过 [setup.sh](.\u002Fperftest\u002Fsetup.sh) 安装。  \n> - 通过在 `BasicTokenizer` 中添加 `tokenize_early_stop` 功能，实现了小幅性能提升。  \n> - 在 Windows、Linux 和 macOS 上，[OpenMP](https:\u002F\u002Fwww.openmp.org\u002F) 的性能均优于 `std::thread`，因此我们已完全切换至 OpenMP。\n> \n> **[2025年3月31日]**\n> - 修改为提供各操作系统的预编译 whl 文件。\n>\n> **[2025年3月22日]**\n>\n> - 向 AC Trie 添加了 [DFA](https:\u002F\u002Fblog.cloudflare.com\u002Fpt-br\u002Fmaking-waf-ai-models-go-brr\u002F#:~:text=We%20can%20also%20tune%20Aho,settings%20based%20on%20this%20recommendation)。\n>\n> **[2025年3月21日]**\n> - 提升分词器准确性\n>\n> **[2025年3月19日]** \n> - 通过应用来自 [Aho–Corasick](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FAho%E2%80%93Corasick_algorithm) 算法的 LinMaxMatching，减少了内存占用并略微提升了性能。\n> - 改进了所有函数的分支流水线，并强制内联。\n> - 移除了 `WordpieceTokenizer(Backward)` 中不必要的操作。\n> - 优化所有函数，使其运行速度比使用缓存更快，仅除 [Bloom filter](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBloom_filter) 外。\n> - 将 `标点符号`、`控制字符` 和 `空白字符` 预先定义为 constexpr，并用作 Bloom 过滤器。\n> - 通过统计内存分析减少不必要的内存分配。\n> - 在 ✨FlashTokenizer✨ 中，`bert-base-uncased` 在单核上每秒可处理 **35K** 条文本，平均每条文本的处理时间约为 **28ns**。\n>\n> **[2025年3月18日]** \n> - BasicTokenizer 准确性的提升改善了整体准确性，尤其在 Unicode 输入时表现更为精准。\n>\n> **[2025年3月14日]** \n> - 使用 [Trie](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FTrie) 改进了 `WordPieceTokenizer` 和 `WordPieceBackwordTokenizer` 的性能，该技术源自 [Fast WordPiece Tokenization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15524)。\n> - 在 `SingleEncoding` 中使用 `FastPoolAllocator` 于 `std::list` 可提升性能，但其不具备线程安全性，因此在 `BatchEncoding` 中仍直接使用 `std::list\u003Cstd::string>`。在批量编码中，完全移除了 `OPENMP`，仅使用 `std::thread`。\n>\n> **[2025年3月10日]** \n> - 通过 robin_hood 实现更快的标记映射，以及利用 **std::list** 最小化内存拷贝，从而提升了性能。\n>\n>\n> #### 标记 ID 映射表性能测试。\n>\n> 标记与 ID 映射采用了最快的 `robin_hood::unordered_flat_map\u003Cstd::string, int>`。\n>\n> **[2025年3月9日]** 完成了 BertTokenizer 的 flash-tokenizer 开发。\n\n\n\n## 1. 
安装\n\n### 要求\n * `Windows(AMD64)`, `MacOS(ARM64)`, `Ubuntu(x86-64)` 。\n * `g++` \u002F `clang++` \u002F `MSVC`。\n * `python 3.8 ~ 3.13`。\n\n### 从 [PIP](https:\u002F\u002Fpypi.org\u002Fproject\u002Fflash-tokenizer\u002F) 安装\n\n\n\n在 Windows 上，您需要安装 [vc_redist.x64.exe](https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\u002Freleases\u002Fdownload\u002FPackages\u002FVC_redist.x64.exe)。\n```bash\n# Windows\npip install -U flash-tokenizer\n```\n```bash\n# Linux\npip install -U flash-tokenizer\n```\n```bash\n# MacOS\npip install -U flash-tokenizer\n```\n\n### 从源码安装\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FNLPOptimize\u002Fflash-tokenizer\ncd flash-tokenizer\u002Fprj\npip install .\n```\n\n\n## 2. 示例\n\n```python\nfrom flash_tokenizer import BertTokenizerFlash\nfrom transformers import BertTokenizer\n\ntitles = [\n    '绝不能放弃，世界上没有失败，只有放弃。',\n    'is there any doubt about it \"None whatsoever\"',\n    \"세상 어떤 짐승이 이를 드러내고 사냥을 해? 약한 짐승이나 몸을 부풀리지, 진짜 짐승은 누구보다 침착하지.\",\n    'そのように二番目に死を偽装して生き残るようになったイタドリがどうして初めて見る自分をこんなに気遣ってくれるのかと尋ねると「私が大切にする人たちがあなたを大切にするから」と答えては'\n]\n\ntokenizer1 = BertTokenizerFlash.from_pretrained('bert-base-multilingual-cased')\ntokenizer2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased')\n\ncorrect = 0\nfor title in titles:\n    print(title)\n    tokens1 = tokenizer1.tokenize(title)\n    tokens2 = tokenizer2.tokenize(title)\n    ids1 = tokenizer1(title, max_length=512, padding=\"longest\").input_ids[0]\n    ids2 = tokenizer2(title, max_length=512, padding=\"longest\", return_tensors=\"np\").input_ids[0].tolist()\n    if tokens1 == tokens2 and ids1 == ids2:\n        correct += 1\n        print(\"Accept!\")\n    else:\n        print(\"Wrong Answer\")\n    print(ids1)\n    print(ids2)\n    print()\n\nprint(f'Accuracy: {correct * 100.0 \u002F len(titles):.2f}%')\n```\n\n```\n绝不能放弃，世界上没有失败，只有放弃。\nAccept!\n[101, 6346, 2080, 6546, 4284, 3704, 10064, 2087, 5621, 2078, 4917, 4461, 3204, 7480, 10064, 2751, 4461, 4284, 3704, 1882, 102]\n[101, 6346, 2080, 6546, 4284, 3704, 10064, 2087, 5621, 2078, 4917, 4461, 3204, 7480, 10064, 2751, 4461, 4284, 3704, 1882, 102]\n\nis there any doubt about it \"None whatsoever\"\nAccept!\n[101, 10124, 11155, 11178, 86697, 10978, 10271, 107, 86481, 12976, 11669, 23433, 107, 102]\n[101, 10124, 11155, 11178, 86697, 10978, 10271, 107, 86481, 12976, 11669, 23433, 107, 102]\n\n세상 어떤 짐승이 이를 드러내고 사냥을 해? 
약한 짐승이나 몸을 부풀리지, 진짜 짐승은 누구보다 침착하지.\nAccept!\n[101, 9435, 14871, 55910, 9710, 48210, 10739, 35756, 9113, 30873, 31605, 11664, 9405, 118729, 10622, 9960, 136, 9539, 11102, 9710, 48210, 43739, 9288, 10622, 9365, 119407, 12692, 12508, 117, 9708, 119235, 9710, 48210, 10892, 9032, 17196, 80001, 9783, 119248, 23665, 119, 102]\n[101, 9435, 14871, 55910, 9710, 48210, 10739, 35756, 9113, 30873, 31605, 11664, 9405, 118729, 10622, 9960, 136, 9539, 11102, 9710, 48210, 43739, 9288, 10622, 9365, 119407, 12692, 12508, 117, 9708, 119235, 9710, 48210, 10892, 9032, 17196, 80001, 9783, 119248, 23665, 119, 102]\n\nそのように二番目に死を偽装して生き残るようになったイタドリがどうして初めて見る自分をこんなに気遣ってくれるのかと尋ねると「私が大切にする人たちがあなたを大切にするから」と答えては\nAccept!\n[101, 11332, 24273, 2150, 5632, 5755, 1943, 4805, 1980, 2371, 7104, 11592, 5600, 1913, 4814, 1975, 27969, 15970, 21462, 15713, 21612, 10898, 56910, 22526, 22267, 2547, 19945, 7143, 1975, 6621, 2534, 1980, 28442, 60907, 11312, 4854, 7770, 14813, 18825, 58174, 75191, 11662, 3456, 1945, 100812, 1890, 5949, 1912, 3197, 2535, 84543, 2179, 78776, 111787, 22946, 20058, 11377, 3197, 2535, 84543, 16867, 1891, 1940, 6076, 27144, 11588, 102]\n[101, 11332, 24273, 2150, 5632, 5755, 1943, 4805, 1980, 2371, 7104, 11592, 5600, 1913, 4814, 1975, 27969, 15970, 21462, 15713, 21612, 10898, 56910, 22526, 22267, 2547, 19945, 7143, 1975, 6621, 2534, 1980, 28442, 60907, 11312, 4854, 7770, 14813, 18825, 58174, 75191, 11662, 3456, 1945, 100812, 1890, 5949, 1912, 3197, 2535, 84543, 2179, 78776, 111787, 22946, 20058, 11377, 3197, 2535, 84543, 16867, 1891, 1940, 6076, 27144, 11588, 102]\n\nAccuracy: 100.00%\n```\n\n## 3. 其他实现\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Flogos_dark.png\">\n    \u003Cimg alt=\"Banner\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_bac5076d3438.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\n大多数基于[BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04805)的模型都使用[WordPiece分词器](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fja\u002F\u002Fpubs\u002Farchive\u002F37842.pdf)，其代码可以在[这里](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Fblob\u002Fmaster\u002Ftokenization.py)找到。\n（Hugging Face 的一个简单实现可以在[这里](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fblob\u002Fmain\u002Fsrc\u002Ftransformers\u002Fmodels\u002Fbert\u002Ftokenization_bert.py)找到）。\n\n由于BertTokenizer是一种CPU密集型算法，推理可能会成为瓶颈，未优化的分词器速度会非常慢。一个很好的例子是[KR-BERT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.03979)中引入的[双向Wordpiece分词器](https:\u002F\u002Fgithub.com\u002Fsnunlp\u002FKR-BERT\u002Fblob\u002Fmaster\u002Fkrbert_tensorflow\u002Ftokenization_ranked.py)。大部分代码相同，但该算法还会反向遍历子词，并保留两次遍历中排名更高的匹配结果。论文声称准确率有所提升，但很难找到其他量化指标，且准确率的提升并不显著，同时分词器的速度也严重下降。\n\n* transformers（Rust实现，PyO3）\n* paddlenlp（C++实现，pybind）\n* tensorflow-text（C++实现，pybind）\n* blingfire（C++实现，原生二进制调用）\n\n大多数开发者要么使用`transformers.BertTokenizer`，要么使用`transformers.AutoTokenizer`，但使用`AutoTokenizer`时会返回`transformers.BertTokenizerFast`。\n\n当然，它比BertTokenizer快，但结果并不完全相同，这意味着从分词阶段开始，你就已经放弃了100%的准确性。\n\nBertTokenizer并不仅由transformers提供。[PaddleNLP](https:\u002F\u002Fgithub.com\u002FPaddlePaddle\u002FPaddleNLP)和[tensorflow-text](https:\u002F\u002Fwww.tensorflow.org\u002Ftext)也提供了BertTokenizer。\n\n此外还有由微软开发、目前处于废弃状态的[Blingfire](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire)。\n\nPaddleNLP需要PaddlePaddle支持，并从3.0rc版本开始提供分词功能。你可以按如下方式安装：\n\n```bash\n##### 安装PaddlePaddle、PaddleNLP\npython -m pip install paddlepaddle==3.0.0b1 -i https:\u002F\u002Fwww.paddlepaddle.org.cn\u002Fpackages\u002Fstable\u002Fcpu\u002F\n
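# PaddleNLP 自 3.0rc 版本起才提供上述分词功能（见上文），这里安装对应的 3.0.0b3 预发布版\npip install --upgrade 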
paddlenlp==3.0.0b3\n##### 安装transformers\npip install transformers==4.47.1\n##### 安装tf-text\npip install tensorflow-text==2.18.1\n##### 安装blingfire\npip install blingfire\n```\n\n\n除blingfire外，只需vocab.txt文件即可立即运行分词器。\n（blingfire同样只需要vocab.txt，在学习8小时后即可使用）。\n\n我们将详细探讨的实现是`PaddleNLP的BertTokenizerFast`和`blingfire`。\n\n* `blingfire`：采用[确定性有限状态机（DFSM）](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FBlingFire\u002Fblob\u002Fmaster\u002Fdoc\u002FBling_Fire_Tokenizer_Algorithms.pdf)，消除了单次线性扫描和不必要的比较操作，时间复杂度为O(n)，表现相当出色。\n  * **优点**：比其他实现快**5到10倍**。\n  * **缺点**：训练时间较长（8小时），且准确率低于其他实现。（+由于事实上已停止开发，难以获得支持）。\n* `PaddleNLP`：如下面的实验所示，PaddleNLP在小数点后相同位数的情况下，始终比HF的BertTokenizerFast更快，并且无论是在X86还是Arm架构上，它在任何操作系统上都表现得更快。\n  * **优点**：**内部实现为C++**，相比由Rust实现的`transformers.BertTokenizerFast`，在输出完全相同结果的同时，速度提升了1.2倍。\n    * 虽然不能在`return_tensors`中指定`pt(pytorch tensor)`，但这并不是问题。\n  * **缺点**：除了需要安装PaddlePaddle和PaddleNLP之外，没有其他缺点。\n\n## 4. 性能测试\n\n\n### 4.1 性能测试（单文本编码）\n\n准确率是以[谷歌的BertTokenizerFast](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Fblob\u002Fmaster\u002Ftokenization.py)作为基准进行测量的结果。只要有一个`input_ids`不正确，就被视为错误答案。\n\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Fcomp_speed_dark.png\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_61609d99e453.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n  \u003Cpicture>\n    \u003Csource media=\"(prefers-color-scheme: dark)\" srcset=\".\u002Fassets\u002Fcomp_accuracy_dark.png\">\n    \u003Cimg alt=\"FlashTokenizer\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FNLPOptimize_flash-tokenizer_readme_95e3fea8f02b.png\" width=100%>\n  \u003C\u002Fpicture>\n\u003C\u002Fp>\n\n### 分词器性能对比\n\n#### [google-bert\u002Fbert-base-cased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-cased)\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerFast(Huggingface) | 84.3700秒  | 1,000,000  | 99.9226%   |\n| BertTokenizerFast(PaddleNLP)   | 75.6551秒  | 1,000,000  | 99.9226%   |\n| FastBertTokenizer(Tensorflow)  | 219.1259秒 | 1,000,000  | 99.9160%   |\n| Blingfire                      | 13.6183秒  | 1,000,000  | 99.8991%   |\n| **FlashBertTokenizer**             | 8.1968秒   | 1,000,000  | 99.8216%   |\n\n#### [google-bert\u002Fbert-base-uncased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-uncased)\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerFast(Huggingface) | 91.7882秒  | 1,000,000  | 99.9326%   |\n| BertTokenizerFast(PaddleNLP)   | 83.6839秒  | 1,000,000  | 99.9326%   |\n| FastBertTokenizer(Tensorflow)  | 204.2240秒 | 1,000,000  | 99.1379%   |\n| Blingfire                      | 13.2374秒  | 1,000,000  | 99.8588%   |\n| **FlashBertTokenizer**             | 7.6313秒   | 1,000,000  | 99.6884%   |\n\n#### [google-bert\u002Fbert-base-multilingual-cased](https:\u002F\u002Fhuggingface.co\u002Fgoogle-bert\u002Fbert-base-multilingual-cased)\n\n\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerFast(Huggingface) | 212.1570秒 | 2,000,000  | 99.7964%   |\n| BertTokenizerFast(PaddleNLP)   | 193.9921秒 | 2,000,000  | 
99.7964%   |\n| FastBertTokenizer(Tensorflow)  | 394.1574秒 | 2,000,000  | 99.7892%   |\n| Blingfire                      | 38.9013秒   | 2,000,000  | 99.9780%   |\n| **FlashBertTokenizer**             | 20.4570秒  | 2,000,000  | 99.8970%   |\n\n\n#### [beomi\u002Fkcbert-base](https:\u002F\u002Fgithub.com\u002FBeomi\u002FKcBERT)\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerFast(Huggingface) | 52.5744秒  | 1,000,000  | 99.6754%   |\n| BertTokenizerFast(PaddleNLP)   | 44.8943秒  | 1,000,000  | 99.6754%   |\n| FastBertTokenizer(Tensorflow)  | 198.0270秒 | 1,000,000  | 99.6639%   |\n| Blingfire                      | 13.0701秒  | 1,000,000  | 99.9434%   |\n| **FlashBertTokenizer**             | 5.2601秒   | 1,000,000  | 99.9484%   |\n\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| **FlashBertTokenizer**             | 5.1875秒   | 1,000,001  | 99.9484%   |\n| Blingfire                      | 13.2783秒  | 1,000,001  | 99.9435%   |\n| rust_tokenizers(guillaume-be)  | 16.6308秒  | 1,000,001  | 99.9829%   |\n| BertTokenizerFast(PaddleNLP)   | 44.5476秒  | 1,000,001  | 99.6754%   |\n| BertTokenizerFast(Huggingface) | 53.2525秒  | 1,000,001  | 99.6754%   |\n| FastBertTokenizer(Tensorflow)  | 202.1633秒 | 1,000,001  | 99.6639%   |\n\n#### [microsoft\u002Fllmlingua-2-bert-base-multilingual-cased-meetingbank](https:\u002F\u002Fhuggingface.co\u002Fmicrosoft\u002Fllmlingua-2-bert-base-multilingual-cased-meetingbank)\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerFast(Huggingface) | 208.8858秒 | 2,000,000  | 99.7964%   |\n| BertTokenizerFast(PaddleNLP)   | 192.6593秒 | 2,000,000  | 99.7964%   |\n| FastBertTokenizer(Tensorflow)  | 413.2010秒 | 2,000,000  | 99.7892%   |\n| Blingfire                      | 39.3765秒  | 2,000,000  | 99.9780%   |\n| **FlashBertTokenizer**             | 22.8820秒  | 2,000,000  | 99.8970%   |\n\n| 分词器                      | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| **FlashBertTokenizer**             | 22.0901秒  | 2,000,001  | 99.8971%   |\n| Blingfire                      | 37.9836秒  | 2,000,001  | 99.9780%   |\n| rust_tokenizers(guillaume-be)  | 98.0366秒  | 2,000,001  | 99.9976%   |\n| BertTokenizerFast(PaddleNLP)   | 208.6889秒 | 2,000,001  | 99.7964%   |\n| BertTokenizerFast(Huggingface) | 219.2644秒 | 2,000,001  | 99.7964%   |\n| FastBertTokenizer(Tensorflow)  | 413.9725秒 | 2,000,001  | 99.7892%   |\n\n#### [KR-BERT](https:\u002F\u002Fgithub.com\u002Fsnunlp\u002FKR-BERT)\n\n\n| 分词器                                    | 耗时       | 文本数量   | 准确率     |\n|--------------------------------|--------------|------------|------------|\n| BertTokenizerBidirectional(KR-BERT Original) | 128.3320秒 | 1,000,000  | 100.0000%  |\n| **FlashBertTokenizer(Bidirectional)**                           | 10.4492秒  | 1,000,000  | 99.9631%   |\n\n\n\n```mermaid\n%%{ init: { \"er\" : { \"layoutDirection\" : \"LR\" } } }%%\nerDiagram\n    Text ||--o{ Preprocess : tokenize\n    Preprocess o{--|| Inference : memcpy_h2d\n    Inference o{--|| Postprocess : memcpy_d2h\n```\n\n\n\n\n\n## 6. 
## 6. Compatibility

FlashBertTokenizer can be used with any framework. Each framework's CUDA-version compatibility also matters for fast LLM inference.

* [PyTorch](https://pytorch.org/) no longer supports installation via conda.
* [ONNXRUNTIME](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#cuda-12x) ships separate builds per CUDA version.
* PyTorch is also considering dropping older CUDA 12.x builds in favor of the newer CUDA 12.8; for now, though, the common trend across frameworks is to keep supporting CUDA 11.8.
  * CUDA 12.x targets the latest GPUs (Hopper and Blackwell); on older GPUs such as Volta, CUDA 11.8 is actually faster than CUDA 12.x.

| DL Framework    | Version  | OS             | CPU | CUDA 11.8 | CUDA 12.3 | CUDA 12.4 | CUDA 12.6 | CUDA 12.8 |
|-----------------|----------|----------------|-----|-----------|-----------|-----------|-----------|-----------|
| PyTorch         | 2.6      | Linux, Windows | ⚪  | ⚪        | ❌        | ⚪        | ⚪        | ❌        |
| PyTorch         | 2.7      | Linux, Windows | ⚪  | ⚪        | ❌        | ❌        | ⚪        | ⚪        |
| ONNXRUNTIME(11) | 1.20.x   | Linux, Windows | ⚪  | ⚪        | ❌        | ❌        | ❌        | ❌        |
| ONNXRUNTIME(12) | 1.20.x   | Linux, Windows | ⚪  | ❌        | ⚪        | ⚪        | ⚪        | ⚪        |
| PaddlePaddle    | 3.0-beta | Linux, Windows | ⚪  | ⚪        | ❌        | ❌        | ❌        | ❌        |

## 7. GPU Tokenizer

You can run a WordPiece tokenizer on the GPU with [rapids (cudf)](https://docs.rapids.ai/):

* [Implementation](https://github.com/rapidsai/cudf/blob/0e99ec3ec15b8b0ebe68bd884c7d22d600e9259e/python/cudf/cudf/core/wordpiece_tokenize.py#L10)
* [Example](https://github.com/rapidsai/cudf/blob/0e99ec3ec15b8b0ebe68bd884c7d22d600e9259e/python/cudf/cudf/tests/text/test_subword_tokenizer.py#L244)

As the [RAPIDS install guide](https://docs.rapids.ai/install/) shows, it supports only Linux and pins CUDA versions differently from the other frameworks, so [docker](https://hub.docker.com/r/rapidsai/base) is the best option. For batch workloads the GPU is faster than the CPU, but for streaming workloads it is slower.

[Run State-of-the-Art NLP Workloads at Scale with RAPIDS, HuggingFace, and Dask](https://developer.nvidia.com/blog/run-state-of-the-art-nlp-workloads-at-scale-with-rapids-huggingface-and-dask/) shows how to install and run cuDF, with good example code and explanations; it is very fast. To use cuDF you must first convert vocab.txt with [hash_vocab](https://github.com/rapidsai/cudf/blob/branch-25.06/python/cudf/cudf/utils/hash_vocab_utils.py), as shown below; a GPU tokenization sketch follows the conversion. The catch is that the hash_vocab function cannot handle multilingual vocabularies, so if the vocabulary contains characters other than English or Chinese, cuDF's WordpieceTokenizer cannot be used.

```python
import cudf
from cudf.utils.hash_vocab_utils import hash_vocab

# Convert a plain-text BERT vocabulary into the hashed format cuDF expects.
hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt')
```
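Building on the conversion above, the following is a rough sketch of GPU tokenization with the hashed vocabulary. It is modeled on cuDF's documented `SubwordTokenizer` (as used in the RAPIDS blog linked earlier), not on anything in this repository, so treat the parameters as assumptions.

```python
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

# Load the hashed vocabulary produced by hash_vocab() above.
tokenizer = SubwordTokenizer('voc_hash.txt', do_lower_case=False)

strings = cudf.Series(["The quick brown fox", "jumps over the lazy dog"])

# Tokenize entirely on the GPU; 'cp' returns CuPy arrays that stay on-device.
output = tokenizer(strings,
                   max_length=64,
                   max_num_rows=len(strings),
                   padding="max_length",
                   return_tensors="cp",
                   truncation=True)
print(output["input_ids"].shape)
```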
## TODO

- [x] [Bidirectional WordPieceTokenizer](https://github.com/snunlp/KR-BERT/blob/master/krbert_tensorflow/tokenization_ranked.py)
- [x] Batch encoder backed by a thread pool.
- [x] Replace `std::list` with `boost::intrusive::list`.
- [x] ~~[MaxMatch-Dropout: Sub-word Regularization for WordPiece](https://arxiv.org/abs/2209.04126) option.~~
- [x] Use stack memory to reduce allocations (C style: [alloca](https://man7.org/linux/man-pages/man3/alloca.3.html), [_alloca](https://learn.microsoft.com/ko-kr/cpp/c-runtime-library/reference/alloca?view=msvc-170)).
- [x] ~~Parallel-processing option for single-text encoding.~~
- [ ] `circle.ai`
  - [ ] Distribute precompiled wheels for easier installation.
- [ ] SIMD
- [ ] ~~CUDA version.~~

## Acknowledgements

FlashTokenizer is inspired by projects such as [FlashAttention](https://github.com/Dao-AILab/flash-attention), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [FastBertTokenizer](https://github.com/georg-jung/FastBertTokenizer), and [tokenizers-cpp](https://github.com/mlc-ai/tokenizers-cpp).

## Performance Comparison with Other Implementations

* **WordPiece**
  * 📒 [huggingface/tokenizers (Rust)](https://github.com/huggingface/tokenizers)
    * The Rust implementation behind transformers.BertTokenizerFast.
    * 🔵 **Available as a Python package.**
  * 🔥 [FastBertTokenizer (C#)](https://fastberttokenizer.gjung.com)
    * Extremely fast, but accuracy drops sharply on non-English input.
  * ❌ [BertTokenizers (C#)](https://github.com/NMZivkovic/BertTokenizers)
    * [FastBertTokenizer (C#) VS BertTokenizers (C#)](https://github.com/georg-jung/FastBertTokenizer/tree/master?tab=readme-ov-file#comparison-to-berttokenizers) confirms that `FastBertTokenizer(C#)` is faster.
  * 🔥 [rust-tokenizers (Rust)](https://github.com/guillaume-be/rust-tokenizers)
    * Slower than BertTokenizerFlash and Blingfire, but faster and more accurate than the remaining implementations.
    * 🔵 **Available as a Python package.**
  * ❌ [tokenizers-cpp (C++)](https://github.com/mlc-ai/tokenizers-cpp)
    * `tokenizers-cpp` merely wraps SentencePiece and HuggingFace's Rust implementation, so benchmarking it is not meaningful.
  * ❌ [bertTokenizer (Java)](https://github.com/ankiteciitkgp/bertTokenizer)
    * Java is out of scope here.
  * ✅ [ZhuoruLin/fast-wordpiece (Rust)](https://github.com/ZhuoruLin/fast-wordpiece)
    * A Rust implementation of LinMaxMatching; it runs only from Rust and is unlikely to beat a C++ implementation.
  * ❌ [huggingface_tokenizer_cpp (C++)](https://github.com/Sorrow321/huggingface_tokenizer_cpp)
    * Very slow due to a naive C++ implementation.
  * ❌ [SeanLee97/BertWordPieceTokenizer.jl (Julia)](https://github.com/SeanLee97/BertWordPieceTokenizer.jl)
    * Julia is out of scope here.
* **BPE**
  * [openai/tiktoken](https://github.com/openai/tiktoken)
* **SentencePiece**
  * [google/sentencepiece (C++)](https://github.com/google/sentencepiece)

## ⭐ Star History

<a href="https://www.star-history.com/#NLPOptimize/flash-tokenizer&Date">
 <picture>
   <source media="(prefers-color-scheme: dark)" srcset="https://oss.gittoolsai.com/images/NLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png&theme=dark" />
   <source media="(prefers-color-scheme: light)" srcset="https://oss.gittoolsai.com/images/NLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png" />
   <img alt="Star History Chart" src="https://oss.gittoolsai.com/images/NLPOptimize_flash-tokenizer_readme_b1661b3ccb16.png" />
 </picture>
</a>

## References

* https://medium.com/@techhara/which-bert-tokenizer-is-faster-b832aa978b46
* https://medium.com/@atharv6f_47401/wordpiece-tokenization-a-bpe-variant-73cc48865cbf
* https://www.restack.io/p/transformer-models-bert-answer-fast-berttokenizerfast-cat-ai
* https://medium.com/@anmolkohli/my-notes-on-bert-tokenizer-and-model-98dc22d0b64
* https://nocomplexity.com/documents/fossml/nlpframeworks.html
* https://github.com/martinus/robin-hood-hashing
* https://arxiv.org/abs/2012.15524
* https://github.com/google/highway
* * *

Blingfire2:

* https://github.com/lfoppiano/Blingfire/
* https://pypi.org/project/blingfire2/
* https://github.com/microsoft/BlingFire/blob/master/nuget/lib/runtimes/osx-arm64/native/libblingfiretokdll.dylib

# FlashTokenizer Quick Start Guide

FlashTokenizer is a high-performance C++ tokenizer engine built for LLM inference serving. It is a blazing-fast implementation of `BertTokenizer`, **10x faster** than Hugging Face's `BertTokenizerFast` while maintaining fully consistent accuracy.

## Prerequisites

Before you start, make sure your development environment meets the following requirements:

*   **Operating system**:
    *   Windows (AMD64)
    *   macOS (ARM64 / x86_64)
    *   Linux / Ubuntu (x86-64)
*   **Python version**: 3.8 ~ 3.13
*   **Compiler dependencies** (source installs only):
    *   Windows: Visual Studio 2022 (install the [VC++ redistributable](https://github.com/NLPOptimize/flash-tokenizer/releases/download/Packages/VC_redist.x64.exe))
    *   macOS: clang++
    *   Linux: g++

## Installation

### Option 1: Install via pip (recommended)

Install the prebuilt package directly with pip; this works for most users.

```bash
pip install -U flash-tokenizer
```

> **Note**: If downloads are slow in mainland China, point pip at a mirror such as Tsinghua's:
> ```bash
> pip install -U flash-tokenizer -i https://pypi.tuna.tsinghua.edu.cn/simple
> ```

### Option 2: Install from source

If you need the latest features or plan to modify the code, clone and install from GitHub.

```bash
git clone https://github.com/NLPOptimize/flash-tokenizer
cd flash-tokenizer/prj
pip install .
```

## Basic Usage

FlashTokenizer's API closely mirrors Hugging Face `transformers`; swapping the imported class is enough to switch over.

### Example

The example below loads a pretrained model, tokenizes a few texts, and verifies that the results match the original `BertTokenizer`; a timing sketch follows it.

```python
from flash_tokenizer import BertTokenizerFlash
from transformers import BertTokenizer

# Test texts (Chinese, English, Korean, Japanese)
titles = [
    '绝不能放弃，世界上没有失败，只有放弃。',
    'is there any doubt about it "None whatsoever"',
    "세상 어떤 짐승이 이를 드러내고 사냥을 해? 약한 짐승이나 몸을 부풀리지, 진짜 짐승은 누구보다 침착하지.",
    'そのように二番目に死を偽装して生き残るようになったイタドリがどうして初めて見る自分をこんなに気遣ってくれるのかと尋ねると「私が大切にする人たちがあなたを大切にするから」と答えては'
]

# Initialize FlashTokenizer (the fast implementation)
tokenizer1 = BertTokenizerFlash.from_pretrained('bert-base-multilingual-cased')

# Initialize the original BertTokenizer (for comparison)
tokenizer2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

correct = 0
for title in titles:
    print(f"Text: {title}")

    # Tokenize
    tokens1 = tokenizer1.tokenize(title)
    tokens2 = tokenizer2.tokenize(title)

    # Encode to input IDs
    ids1 = tokenizer1(title, max_length=512, padding="longest").input_ids[0]
    ids2 = tokenizer2(title, max_length=512, padding="longest", return_tensors="np").input_ids[0].tolist()

    # Check that the results agree
    if tokens1 == tokens2 and ids1 == ids2:
        correct += 1
        print("✅ Accept! (results match)")
    else:
        print("❌ Wrong Answer")

    print(f"Flash IDs: {ids1}")
    print()

print(f'Accuracy: {correct * 100.0 / len(titles):.2f}%')
```
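To see the speedup on your own data, a rough timing harness like the following works. It is our sketch, not part of the guide, and it reuses `titles`, `tokenizer1`, and `tokenizer2` from the example above.

```python
import time

def throughput(tokenizer, texts):
    """Return texts-per-second for a tokenizer over a list of texts."""
    start = time.perf_counter()
    for text in texts:
        tokenizer(text, max_length=512, padding="longest")
    return len(texts) / (time.perf_counter() - start)

sample = titles * 1000  # repeat the multilingual titles for a stabler estimate
print(f"FlashTokenizer: {throughput(tokenizer1, sample):,.0f} texts/s")
print(f"BertTokenizer:  {throughput(tokenizer2, sample):,.0f} texts/s")
```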
### Core Advantages
*   **Fast inference**: roughly 35,000 texts per second on a single core (measured on `bert-base-uncased`).
*   **Drop-in compatibility**: output matches the official Hugging Face tokenizer exactly.
*   **Parallel batching**: C++-level OpenMP parallelism makes batch encoding highly efficient.

# Use Case: Real-Time LLM Customer Service

A large e-commerce customer-service team is deploying an LLM-based real-time QA system that must handle thousands of user inquiries per second at peak.

### Without flash-tokenizer
- **High response latency**: preprocessing with the default `BertTokenizerFast` pins the CPU, pushing time-to-first-token past 800 ms; users notice the lag.
- **Severe concurrency bottleneck**: slow tokenization caps the inference service's throughput, so requests queue up and time out during sales events.
- **Bloated dependencies**: alternative high-performance options (such as PaddleNLP) pull in multi-GB deep-learning frameworks, inflating container images and deployment complexity.
- **Wasted compute spend**: extra CPU nodes must be provisioned just to compensate for slow tokenization, driving up cloud costs.

### With flash-tokenizer
- **Very low latency**: C++-level optimization speeds up tokenization by 10x or more, cutting time-to-first-token to under 80 ms for a near-instant feel.
- **Multiplied throughput**: per-node QPS rises sharply, absorbing peak promotional traffic without elaborate load-balancing schemes.
- **Lightweight deployment**: a single `pip` install with no heavyweight training framework shrinks the image by 90% and keeps operations nimble.
- **Lower costs**: at the same traffic level, roughly 60% fewer CPU cores are needed, directly reducing infrastructure spend.

By removing the tokenization bottleneck from the LLM inference path, flash-tokenizer delivers industrial-grade real-time responsiveness at a much lower compute cost.

* * *

## Project Metadata

* **Repository**: NLPOptimize/flash-tokenizer · https://github.com/NLPOptimize
* **Owner**: NLP Optimize ("The world's fastest CPU tokenizer library!") · springnode@gmail.com · https://medium.com/@springnode/flashtokenizer-the-worlds-fastest-cpu-tokenizer-70fd4ed8a6f0
* **Languages**: C++ 95.4%, Python 4.1%, CMake 0.3%, Shell 0.1%, Batchfile 0.1%, C 0.0%
* **Stars / Forks**: 513 / 9 · last commit 2026-04-03
* **OS**: Linux, macOS, Windows
* **GPU**: not required; runs entirely on the CPU
* **RAM**: not specified
* **Python**: 3.8 ~ 3.13
* **Dependency notes**: Windows users must additionally install the VC++ runtime (vc_redist.x64.exe). The tool is a high-performance BertTokenizer implementation in C++17, exposed to Python via pybind11 and parallelized with OpenMP. No heavyweight deep-learning framework (PyTorch, TensorFlow, or PaddlePaddle) is required.
* **Categories**: Language Model, Development Framework, Image
* **Topics**: deep-learning, nlp, python, berttokenizer, flash, wordpiece, wordpiece-tokenization, cpp, cpp17, tokenizer, bert, huggingface, pybind11, trie

### Releases

| Tag            | Date       | Notes |
|----------------|------------|-------|
| pretrained_bpe | 2025-05-25 |       |
| pretrained     | 2025-04-02 | ⚠️ The configuration file is downloaded from here by the `from_pretrained` method. |
| Packages       | 2025-03-31 | `VC_redist.x64.exe`: latest Microsoft Visual C++ Redistributable. |
| Dataset        | 2025-03-10 | ⚠️ `texts_en.all.parquet`, `texts_ko.all.parquet`, `texts_cn.all.parquet`, `texts_multilingual.all.parquet` |