[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-plasticityai--magnitude":3,"tool-plasticityai--magnitude":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":76,"owner_website":78,"owner_url":79,"languages":80,"stars":89,"forks":90,"last_commit_at":91,"license":92,"difficulty_score":93,"env_os":94,"env_gpu":95,"env_ram":96,"env_deps":97,"category_tags":105,"github_topics":107,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":122,"updated_at":123,"faqs":124,"releases":155},9033,"plasticityai\u002Fmagnitude","magnitude","A fast, efficient universal vector embedding utility package.","Magnitude 是一款专为机器学习设计的高性能向量嵌入工具库，由 Plasticity 团队开发。它提供了一套高效的 Python 接口和专用的向量存储文件格式，旨在帮助开发者快速加载、查询和使用词向量等嵌入数据。\n\n在自然语言处理等任务中，传统工具（如 Gensim）在处理大规模预训练模型时，往往面临加载速度慢、内存占用高或查询效率低的问题。Magnitude 通过优化的底层存储结构和算法，显著提升了向量检索速度，并支持流式加载超大模型，有效解决了这些性能瓶颈。此外，它还具备独特的“未登录词”处理能力，即使遇到拼写错误或训练集中不存在的词汇，也能智能生成近似向量，增强了模型的鲁棒性。\n\n这款工具非常适合人工智能研究员、数据科学家以及后端工程师使用。无论是需要快速验证想法的学术研究者，还是致力于构建高并发生产系统的开发人员，都能从中受益。Magnitude 不仅兼容 Keras、PyTorch 
等主流深度学习框架，还支持通过 HTTP 远程流式传输模型，让资源受限的环境也能轻松调用巨型嵌入模型。如果你正在寻找一个比传统方案更轻量、更快速的向量管理方案，Magnitude 值得尝试。","\u003Cdiv align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fplasticityai_magnitude_readme_45e13ec22f36.png\" alt=\"magnitude\" height=\"50\">\u003C\u002Fdiv>\n\n## \u003Cdiv align=\"center\">Magnitude: a fast, simple vector embedding utility library\u003Cbr \u002F>\u003Cbr \u002F>[![pipeline status](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fbadges\u002Fmaster\u002Fpipeline.svg)](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fcommits\u002Fmaster)&nbsp;&nbsp;&nbsp;[![Build Status](https:\u002F\u002Ftravis-ci.org\u002Fplasticityai\u002Fmagnitude.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fplasticityai\u002Fmagnitude)&nbsp;&nbsp;&nbsp;[![Build status](https:\u002F\u002Fci.appveyor.com\u002Fapi\u002Fprojects\u002Fstatus\u002F72lwh2g7a9ddbnt2\u002Fbranch\u002Fmaster?svg=true)](https:\u002F\u002Fci.appveyor.com\u002Fproject\u002Fplasticity-admin\u002Fmagnitude\u002Fbranch\u002Fmaster)\u003Cbr\u002F>[![PyPI version](https:\u002F\u002Fbadge.fury.io\u002Fpy\u002Fpymagnitude.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fpymagnitude\u002F)&nbsp;&nbsp;&nbsp;[![license](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002Fmashape\u002Fapistatus.svg?maxAge=2592000)](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fblob\u002Fmaster\u002FLICENSE.txt)&nbsp;&nbsp;&nbsp;[![Python 
version](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fpyversions\u002Fpymagnitude.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fpymagnitude\u002F)&nbsp;&nbsp;&nbsp;&nbsp;[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F122715432.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F122715432)&nbsp;&nbsp;&nbsp;&nbsp;[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-1810.11190-%23B41A1A.svg)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.11190)\u003C\u002Fdiv>\nA feature-packed Python package and vector storage file format for utilizing vector embeddings in machine learning models in a fast, efficient, and simple manner developed by [Plasticity](https:\u002F\u002Fwww.plasticity.ai\u002F). It is primarily intended to be a simpler \u002F faster alternative to [Gensim](https:\u002F\u002Fradimrehurek.com\u002Fgensim\u002F), but can be used as a generic key-vector store for domains outside NLP. It offers unique features like [out-of-vocabulary lookups](#advanced-out-of-vocabulary-keys) and [streaming of large models over HTTP](#remote-streaming-over-http). 
Published in our paper at [EMNLP 2018](http:\u002F\u002Faclweb.org\u002Fanthology\u002FD18-2021) and available on [arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.11190).\n\n## Table of Contents\n- [Installation](#installation)\n- [Motivation](#motivation)\n- [Benchmarks and Features](#benchmarks-and-features)\n- [Pre-converted Magnitude Formats of Popular Embeddings Models](#pre-converted-magnitude-formats-of-popular-embeddings-models)\n- [Using the Library](#using-the-library)\n    * [Constructing a Magnitude Object](#constructing-a-magnitude-object)\n    * [Querying](#querying)\n    * [Basic Out-of-Vocabulary Keys](#basic-out-of-vocabulary-keys)\n    * [Advanced Out-of-Vocabulary Keys](#advanced-out-of-vocabulary-keys)\n        + [Handling Misspellings and Typos](#handling-misspellings-and-typos)\n    * [Concatenation of Multiple Models](#concatenation-of-multiple-models)\n    * [Additional Featurization (Parts of Speech, etc.)](#additional-featurization-parts-of-speech-etc)\n    * [Using Magnitude with a ML library](#using-magnitude-with-a-ml-library)\n        + [Keras](#keras)\n        + [PyTorch](#pytorch)\n        + [TFLearn](#tflearn)\n    * [Utils](#utils)\n- [Concurrency and Parallelism](#concurrency-and-parallelism)\n- [File Format and Converter](#file-format-and-converter)\n- [Remote Loading](#remote-loading)\n- [Remote Streaming over HTTP](#remote-streaming-over-http)\n- [Other Documentation](#other-documentation)\n- [Other Languages](#other-languages)\n- [Other Programming Languages](#other-programming-languages)\n- [Other Domains](#other-domains)\n- [Contributing](#contributing)\n- [Roadmap](#roadmap)\n- [Other Notable Projects](#other-notable-projects)\n- [Citing this Repository](#citing-this-repository)\n- [LICENSE and Attribution](#license-and-attribution)\n\n## Installation\nYou can install this package with `pip`:\n```python\npip install pymagnitude # Python 2.7\npip3 install pymagnitude # Python 3\n```\n\nGoogle Colaboratory has some 
dependency issues with installing Magnitude due to conflicting dependencies. You can use the following snippet to install Magnitude on Google Colaboratory:\n```bash\n# Install Magnitude on Google Colab\n! echo \"Installing Magnitude.... (please wait, can take a while)\"\n! (curl https:\u002F\u002Fraw.githubusercontent.com\u002Fplasticityai\u002Fmagnitude\u002Fmaster\u002Finstall-colab.sh | \u002Fbin\u002Fbash 1>\u002Fdev\u002Fnull 2>\u002Fdev\u002Fnull)\n! echo \"Done installing Magnitude.\"\n```\n\n## Motivation\nVector space embedding models have become increasingly common in machine learning and traditionally have been popular for natural language processing applications. A fast, lightweight tool to consume these large vector space embedding models efficiently is lacking.\n\nThe Magnitude file format (`.magnitude`) for vector embeddings is intended to be a more efficient universal vector embedding format that allows for lazy-loading for faster cold starts in development, LRU memory caching for performance in production, multiple key queries, direct featurization to the inputs for a neural network, performant similiarity calculations, and other nice to have features for edge cases like handling out-of-vocabulary keys or misspelled keys and concatenating multiple vector models together. It also is intended to work with large vector models that may not fit in memory.\n\nIt uses [SQLite](http:\u002F\u002Fwww.sqlite.org), a fast, popular embedded database, as its underlying data store. It uses indexes for fast key lookups as well as uses memory mapping, SIMD instructions, and spatial indexing for fast similarity search in the vector space off-disk with good memory performance even between multiple processes. 
Moreover, memory maps are cached between runs so even after closing a process, speed improvements are reaped.\n\n## Benchmarks and Features\n\n| **Metric**                                                                                                                                            | **Magnitude Light**   | **Magnitude Medium** | **Magnitude Heavy** | **Magnitude [Stream](#remote-streaming-over-http)**    |\n| ----------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------: | :------------------: | :-----------------: | :----------------------------------------------------: |\n| Initial load time                                                                                                                                     | **0.7210s**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 7.7550s                                                |\n| Cold single key query                                                                                                                                 | **0.0001s**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.6437s                                                |\n| Warm single key query \u003Cbr \u002F>\u003Csup>*(same key as cold query)*\u003C\u002Fsup>                                                                                     | **0.00004s**          | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | **0.0004s**                                            |\n| Cold multiple key query \u003Cbr \u002F>\u003Csup>*(n=25)*\u003C\u002Fsup>                                                                                                     | **0.0442s**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.7753s                                                
|\n| Warm multiple key query \u003Cbr \u002F>\u003Csup>*(n=25) (same keys as cold query)*\u003C\u002Fsup>                                                                           | **0.00004s**          | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | **0.0001s**                                            |\n| First `most_similar` search query \u003Cbr \u002F>\u003Csup>*(n=10) (worst case)*\u003C\u002Fsup>                                                                              | 247.05s               | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| First `most_similar` search query \u003Cbr \u002F>\u003Csup>*(n=10) (average case) (w\u002F disk persistent cache)*\u003C\u002Fsup>                                                 | **1.8217s**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| Subsequent `most_similar` search \u003Cbr \u002F>\u003Csup>*(n=10) (different key than first query)*\u003C\u002Fsup>                                                           | **0.2434s**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| Warm subsequent `most_similar` search \u003Cbr \u002F>\u003Csup>*(n=10) (same key as first query)*\u003C\u002Fsup>                                                             | **0.00004s**          | **0.00004s**         | **0.00004s**        | -                                                      |\n| First `most_similar_approx` search query \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0) (worst case)*\u003C\u002Fsup>                                                           | N\u002FA                   | N\u002FA                  | **29.610s**         | -                                                      |\n| First 
`most_similar_approx` search query \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0) (average case) (w\u002F disk persistent cache)*\u003C\u002Fsup>                              | N\u002FA                   | N\u002FA                  | **0.9155s**         | -                                                      |\n| Subsequent `most_similar_approx` search \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0) (different key than first query)*\u003C\u002Fsup>                                        | N\u002FA                   | N\u002FA                  | **0.1873s**         | -                                                      |\n| Subsequent `most_similar_approx` search \u003Cbr \u002F>\u003Csup>*(n=10, effort=0.1) (different key than first query)*\u003C\u002Fsup>                                        | N\u002FA                   | N\u002FA                  | **0.0199s**         | -                                                      |\n| Warm subsequent `most_similar_approx` search \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0) (same key as first query)*\u003C\u002Fsup>                                          | N\u002FA                   | N\u002FA                  | **0.00004s**        | -                                                      |\n| File size                                                                                                                                             | 4.21GB                | 5.29GB               | 10.74GB             | **0.00GB**                                             |\n| Process memory (RAM) utilization                                                                                                                      | **18KB**              | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.71MB                                                 |\n| Process memory (RAM) utilization after 100 key queries                                                                                           
     | **168KB**             | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.91MB                                                 |\n| Process memory (RAM) utilization after 100 key queries + similarity search                                                                            | **342KB**\u003Csup>2\u003C\u002Fsup> | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> |                                                        |\n| Integrity checks and tests                                                                                                                            | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Universal format between word2vec (`.txt`, `.bin`), GloVe (`.txt`), fastText (`.vec`), and ELMo (`.hdf5`) with converter utility                      | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Simple, Pythonic interface                                                                                                                            | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Few dependencies                                                                                                                                      | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Support for larger than memory models                                                                                                                 | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Lazy loading whenever possible for speed and performance                                               
                                               | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Optimized for `threading` and `multiprocessing`                                                                                                       | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Bulk and multiple key lookup with padding, truncation, placeholder, and featurization support                                                         | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Concatenting multiple vector models together                                                                                                          | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Basic out-of-vocabulary key lookup \u003Cbr \u002F>\u003Csup>(character n-gram feature hashing)\u003C\u002Fsup>                                                                | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| Advanced out-of-vocabulary key lookup with support for misspellings \u003Cbr \u002F>\u003Csup>(character n-gram feature hashing to similar in-vocabulary keys)\u003C\u002Fsup> | ❌                     | ✅                    | ✅                   | ✅                                                      |\n| Approximate most similar search with an [annoy](#other-notable-projects) index                                                                        | ❌                     | ❌                    | ✅                   | ✅                                                      |\n| Built-in training for new models                                                      
                                                                | ❌                     | ❌                    | ❌                   | ❌                                                      |\n\n\n\n\u003Csup>1: *same value as previous column*\u003C\u002Fsup>\u003Cbr \u002F>\n\u003Csup>2: *uses `mmap` to read from disk, so the OS will still allocate pages of memory when memory is available, but it can be shared between processes and isn't managed within each process for extremely large files which is a performance win*\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\\*: All [benchmarks](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fblob\u002Fmaster\u002Ftests\u002Fbenchmark.py) were performed on the Google News pre-trained word vectors (`GoogleNews-vectors-negative300.bin`) with a MacBook Pro (Retina, 15-inch, Mid 2014) 2.2GHz quad-core Intel Core i7 @ 16GB RAM on SSD over an average of trials where feasible.\u003C\u002Fsup>\n\n## Pre-converted Magnitude Formats of Popular Embeddings Models\n\nPopular embedding models have been pre-converted to the `.magnitude` format for immmediate download and usage:\n\n| **Contributor**                                                         | **Data**                                                        | **Light**\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(basic support for out-of-vocabulary keys)\u003C\u002Fsup>                                                                                                                                                                                                                                                                                                | **Medium**\u003Cbr\u002F>\u003Ci>(recommended)\u003C\u002Fi>\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(advanced support for out-of-vocabulary keys)\u003C\u002Fsup>                                                                                                                                                                   
                                                                                                        | **Heavy**\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(advanced support for out-of-vocabulary keys and faster `most_similar_approx`)\u003C\u002Fsup>                                                                                                                                                                                                                                                                |\n| :---------------------------------------------------------------------: | :-------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:                         | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |\n| Google - [word2vec](https:\u002F\u002Fcode.google.com\u002Farchive\u002Fp\u002Fword2vec\u002F)        | Google News 100B                                                | 
[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Flight\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                                                                          | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fmedium\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                                                                                 | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                                                                              |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Wikipedia 2014 + Gigaword 5 6B                                  | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.300d.magnitude)                                             | 
[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.300d.magnitude)                                                 | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.300d.magnitude)                                                 |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Wikipedia 2014 + Gigaword 5 6B \u003Cbr \u002F>(lemmatized by Plasticity) | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.300d.magnitude) | 
[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.300d.magnitude)     | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.300d.magnitude)     |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Common Crawl 840B                                               | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.840B.300d.magnitude)                                                                                                                                                                                                                                                                                            | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.840B.300d.magnitude)                                                                                                                                                                                                                                                                                                   | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.840B.300d.magnitude)             
                                                                                                                                                                                                                                                                                   |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Twitter 27B                                                     | [25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.200d.magnitude)           | [25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.200d.magnitude)               | [25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.200d.magnitude)               |\n| Facebook - [fastText](https:\u002F\u002Ffasttext.cc\u002Fdocs\u002Fen\u002Fenglish-vectors.html) | English Wikipedia 2017 16B                             
         | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Flight\u002Fwiki-news-300d-1M.magnitude)                                                                                                                                                                                                                                                                                       | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fmedium\u002Fwiki-news-300d-1M.magnitude)                                                                                                                                                                                                                                                                                              | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fheavy\u002Fwiki-news-300d-1M.magnitude)                                                                                                                                                                                                                                                                                           |\n| Facebook - [fastText](https:\u002F\u002Ffasttext.cc\u002Fdocs\u002Fen\u002Fenglish-vectors.html) | English Wikipedia 2017 + subword 16B                            | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Flight\u002Fwiki-news-300d-1M-subword.magnitude)                                                                                                                                                                                                                                                                               | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fmedium\u002Fwiki-news-300d-1M-subword.magnitude)                                                                                                                                                  
                                                                                                                                    | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fheavy\u002Fwiki-news-300d-1M-subword.magnitude)                                                                                                                                                                                                                                                                                   |\n| Facebook - [fastText](https:\u002F\u002Ffasttext.cc\u002Fdocs\u002Fen\u002Fenglish-vectors.html) | Common Crawl 600B                                               | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Flight\u002Fcrawl-300d-2M.magnitude)                                                                                                                                                                                                                                                                                           | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fmedium\u002Fcrawl-300d-2M.magnitude)                                                                                                                                                                                                                                                                                                  | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Ffasttext\u002Fheavy\u002Fcrawl-300d-2M.magnitude)                                                                                                                                                                                                                                                                                               |\n| AI2 - [AllenNLP ELMo](https:\u002F\u002Fallennlp.org\u002Felmo)                        | [ELMo Models](ELMo.md)                    
                      | [ELMo Models](ELMo.md)                                                                                                                                                                                                                                                                                                                                                  | [ELMo Models](ELMo.md)                                                                                                                                                                                                                                                                                                                                                          | [ELMo Models](ELMo.md)                                                                                                                                                                                                                                                                                                                                                      |\n| Google - [BERT](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert)                | [Coming Soon...](#roadmap)                                      | [Coming Soon...](#roadmap)                                                                                                                                                                                                                                                                                                                                              | [Coming Soon...](#roadmap)                                                                                                                                                                                                                                                                                                                                       
               | [Coming Soon...](#roadmap)                                                                                                                                                                                                                                                                                                                                                  |\n\n\nThere are instructions [below](#file-format-and-converter) for converting any `.bin`, `.txt`, `.vec`, `.hdf5` file to a `.magnitude` file.\n\n## Using the Library\n\n### Constructing a Magnitude Object\n\nYou can create a Magnitude object like so:\n```python\nfrom pymagnitude import *\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002Fvectors.magnitude\")\n```\n\nIf needed, and included for convenience, you can also open a `.bin`, `.txt`, `.vec`, `.hdf5` file directly with Magnitude. This is, however, less efficient and very slow for large models as it will convert the file to a `.magnitude` file on the first run into a temporary directory. The temporary directory is not guaranteed to persist and does not persist when your computer reboots. You should [pre-convert `.bin`, `.txt`, `.vec`, `.hdf5` files with `python -m pymagnitude.converter`](#file-format-and-converter) typically for faster speeds, but this feature is useful for one-off use-cases. A warning will be generated when instantiating a Magnitude object directly with a `.bin`, `.txt`, `.vec`, `.hdf5`. You can supress warnings by setting the  `supress_warnings` argument in the constructor to `True`.\n\n---------------\n\n* \u003Csup>By default, lazy loading is enabled. 
You can pass in an optional `lazy_loading` argument to the constructor with the value `-1` to disable lazy-loading and pre-load all vectors into memory (a la Gensim), `0` (default) to enable lazy-loading with an unbounded in-memory LRU cache, or an integer greater than zero `X` to enable lazy-loading with an LRU cache that holds the `X` most recently used vectors in memory.\u003C\u002Fsup> \n* \u003Csup>If you want the data for the `most_similar` functions to be pre-loaded eagerly on initialization, set `eager` to `True`.\u003C\u002Fsup>\n* \u003Csup>Note, even when `lazy_loading` is set to `-1` or `eager` is set to `True` data will be pre-loaded into memory in a background thread to prevent the constructor from blocking for a few minutes for large models. If you really want blocking behavior, you can pass `True` to the `blocking` argument.\u003C\u002Fsup>\n* \u003Csup>By default, [unit-length normalized](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FUnit_vector) vectors are returned unless you are loading an ELMo model. Set the optional argument `normalized` to `False` if you wish to receive the raw non-normalized vectors instead.\u003C\u002Fsup>\n* \u003Csup>By default, NumPy arrays are returned for queries. Set the optional argument `use_numpy` to `False` if you wish to receive Python lists instead.\u003C\u002Fsup>\n* \u003Csup>By default, querying for keys is case-sensitive. Set the optional argument `case_insensitive` to `True` if you wish to perform case-insensitive searches.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can include the `pad_to_length` argument which will specify the length all examples should be padded to if passing in multiple examples. 
Any examples that are longer than the pad length will be truncated.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can set the `truncate_left` argument to `True` if you want the beginning of the list of keys in each example to be truncated instead of the end in case it is longer than `pad_to_length` when specified.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can set the `pad_left` argument to `True` if you want the padding to appear at the beginning versus the end (which is the default).\u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `placeholders` argument, which will increase the dimensions of each vector by a `placeholders` amount, zero-padding those extra dimensions. This is useful, if you plan to add other values and information to the vectors and want the space for that pre-allocated in the vectors for efficiency.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `language` argument with an [ISO 639-1 Language Code](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FList_of_ISO_639-1_codes), which, if you are using Magnitude for word vectors, will ensure the library respects stemming and other language-specific features for that language. The default is `en` for English. You can also pass in `None` if you are not using Magnitude for word vectors. \u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `dtype` argument which will let you control the data type of the NumPy arrays returned by Magnitude.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `devices` argument which will let you control the usage of GPUs when the underlying model supports GPU usage. 
This argument should be a list of integers, where each integer represents the GPU device number (`0`, `1`, etc.).\u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `temp_dir` argument which will let you control the location of the temporary directory Magnitude will use.\u003C\u002Fsup>\n* \u003Csup>Optionally, you can pass in the `log` argument which will have Magnitude log progress to standard error when slow operations are taking place.\u003C\u002Fsup>\n\n### Querying\n\nYou can query the total number of vectors in the file like so:\n```python\nlen(vectors)\n```\n\n---------------\n\nYou can query the dimensions of the vectors like so: \n```python\nvectors.dim\n```\n\n---------------\n\nYou can check if a key is in the vocabulary like so: \n```python\n\"cat\" in vectors\n```\n\n---------------\n\nYou can iterate through all keys and vectors like so:\n```python\nfor key, vector in vectors:\n  ...\n```\n\n---------------\n\nYou can query for the vector of a key like so: \n```python\nvectors.query(\"cat\")\n```\n\n---------------\n\nYou can index for the n-th key and vector like so:\n```python\nvectors[42]\n```\n\n---------------\n\nYou can query for the vector of multiple keys like so: \n```python\nvectors.query([\"I\", \"read\", \"a\", \"book\"])\n```\nA 2D array (keys by vectors) will be returned.\n\n---------------\n\nYou can query for the vector of multiple examples like so: \n```python\nvectors.query([[\"I\", \"read\", \"a\", \"book\"], [\"I\", \"read\", \"a\", \"magazine\"]])\n```\nA 3D array (examples by keys by vectors) will be returned. 
If `pad_to_length` is not specified, and the size of each example is uneven, they will be padded to the length of the longest example.\n\n---------------\n\nYou can index for the keys and vectors of multiple indices like so:\n```python\nvectors[:42] # slice notation\nvectors[42, 1337, 2001] # tuple notation\n```\n\n---------------\n\nYou can query the distance of two or multiple keys like so:\n```python\nvectors.distance(\"cat\", \"dog\")\nvectors.distance(\"cat\", [\"dog\", \"tiger\"])\n```\n\n---------------\n\nYou can query the similarity of two or multiple keys like so:\n```python\nvectors.similarity(\"cat\", \"dog\")\nvectors.similarity(\"cat\", [\"dog\", \"tiger\"])\n```\n\n---------------\n\nYou can query for the most similar key out of a list of keys to a given key like so:\n```python\nvectors.most_similar_to_given(\"cat\", [\"dog\", \"television\", \"laptop\"]) # dog\n```\n\n---------------\n\nYou can query for which key doesn't match a list of keys to a given key like so:\n```python\nvectors.doesnt_match([\"breakfast\", \"cereal\", \"dinner\", \"lunch\"]) # cereal\n```\n\n---------------\n\nYou can query for the most similar (nearest neighbors) keys like so: \n```python\nvectors.most_similar(\"cat\", topn = 100) # Most similar by key\nvectors.most_similar(vectors.query(\"cat\"), topn = 100) # Most similar by vector\n```\nOptionally, you can pass a `min_similarity` argument to `most_similar`. 
Values from [-1.0-1.0] are valid.\n\n---------------\n\nYou can also query for the most similar keys giving positive and negative examples (which, incidentally, solves analogies) like so: \n```python\nvectors.most_similar(positive = [\"woman\", \"king\"], negative = [\"man\"]) # queen\n```\n\n---------------\n\nSimilar to `vectors.most_similar`, a `vectors.most_similar_cosmul` function exists that uses the 3CosMul function from [Levy and Goldberg](http:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FW14-1618):\n```python\nvectors.most_similar_cosmul(positive = [\"woman\", \"king\"], negative = [\"man\"]) # queen\n```\n\n---------------\n\nYou can also query for the most similar keys using an approximate nearest neighbors index which is much faster, but doesn't guarantee the exact answer: \n```python\nvectors.most_similar_approx(\"cat\")\nvectors.most_similar_approx(positive = [\"woman\", \"king\"], negative = [\"man\"])\n```\nOptionally, you can pass an `effort` argument with values between [0.0-1.0] to the `most_similar_approx` function which will give you runtime trade-off. 
The default value for `effort` is 1.0 which will take the longest, but will give the most accurate result.\n\n---------------\n\nYou can query for all keys closer to a key than another key is like so:\n```python\nvectors.closer_than(\"cat\", \"rabbit\") # [\"dog\", ...]\n```\n\n---------------\n\nYou can access all of the underlying vectors in the model in a large `numpy.memmap` array of size (`len(vectors) x vectors.emb_dim`) like so:\n\n```python\nvectors.get_vectors_mmap()\n```\n\n---------------\n\nYou can clean up all associated resources, open files, and database connections like so:\n```python\nvectors.close()\n```\n\n### Basic Out-of-Vocabulary Keys\n\nFor word vector representations, handling out-of-vocabulary keys is important to handling new words not in the trained model, handling misspellings and typos, and making models trained on the word vector representations more robust in general.\n\nOut-of-vocabulary keys are handled by assigning them a random vector value. However, the randomness is deterministic. So if the *same* out-of-vocabulary key is encountered twice, it will be assigned the same random vector value for the sake of being able to train on those out-of-vocabulary keys. Moreover, if two out-of-vocabulary keys share similar character n-grams (\"uberx\", \"uberxl\") they will be placed close to each other even if they are both not in the vocabulary:\n\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"uberx\" in vectors # False\n\"uberxl\" in vectors # False\nvectors.query(\"uberx\") # array([ 5.07109939e-02, -7.08248823e-02, -2.74812328e-02, ... ])\nvectors.query(\"uberxl\") # array([ 0.04734962, -0.08237578, -0.0333479, -0.00229564, ... 
])\nvectors.similarity(\"uberx\", \"uberxl\") # 0.955000000200815\n```\n\n### Advanced Out-of-Vocabulary Keys\n\nIf using a Magnitude file with advanced out-of-vocabulary support (Medium or Heavy), out-of-vocabulary keys will also be embedded close to similar keys (determined by string similarity) that *are in* the vocabulary:\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"uberx\" in vectors # False\n\"uberification\" in vectors # False\n\"uber\" in vectors # True\nvectors.similarity(\"uberx\", \"uber\") # 0.7383483267618451\nvectors.similarity(\"uberification\", \"uber\") # 0.745452837882727\n```\n\n#### Handling Misspellings and Typos\nThis also makes Magnitude robust to a lot of spelling errors:\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"missispi\" in vectors # False\nvectors.similarity(\"missispi\", \"mississippi\") # 0.35961736624824003\n\"discrimnatory\" in vectors # False\nvectors.similarity(\"discrimnatory\", \"discriminatory\") # 0.8309152561753461\n\"hiiiiiiiiii\" in vectors # False\nvectors.similarity(\"hiiiiiiiiii\", \"hi\") # 0.7069775034853861\n```\n\nCharacter n-grams are used to create this effect for out-of-vocabulary keys. 
The inspiration for this feature was taken from Facebook AI Research's [Enriching Word Vectors with Subword Information](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1607.04606.pdf), but instead of utilizing character n-grams at train time, character n-grams are used at inference so the effect can be somewhat replicated (but not perfectly replicated) in older models that were not trained with character n-grams like word2vec and GloVe.\n\n### Concatenation of Multiple Models\nOptionally, you can combine vectors from multiple models to feed stronger information into a machine learning model like so:\n```python\nfrom pymagnitude import *\nword2vec = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\nglove = Magnitude(\"\u002Fpath\u002Fto\u002Fglove.6B.50d.magnitude\")\nvectors = Magnitude(word2vec, glove) # concatenate word2vec with glove\nvectors.query(\"cat\") # returns 350-dimensional NumPy array ('cat' from word2vec concatenated with 'cat' from glove)\nvectors.query((\"cat\", \"cats\")) # returns 350-dimensional NumPy array ('cat' from word2vec concatenated with 'cats' from glove)\n```\n\nYou can concatenate more than two vector models, simply by passing more arguments to constructor.\n\n### Additional Featurization (Parts of Speech, etc.)\nYou can automatically create vectors from additional features you may have such as parts of speech, syntax dependency information, or any other information using the `FeaturizerMagnitude` class:\n\n```python\nfrom pymagnitude import *\npos_vectors = FeaturizerMagnitude(100, namespace = \"PartsOfSpeech\")\npos_vectors.dim # 4 - number of dims automatically determined by Magnitude from 100\npos_vectors.query(\"NN\") # - array([ 0.08040417, -0.71705252,  0.61228951,  0.32322192]) \npos_vectors.query(\"JJ\") # - array([-0.11681135,  0.10259253,  0.8841201 , -0.44063763])\npos_vectors.query(\"NN\") # - array([ 0.08040417, -0.71705252,  0.61228951,  0.32322192]) (deterministic hashing so the same value is 
returned every time for the same key)\ndependency_vectors = FeaturizerMagnitude(100, namespace = \"SyntaxDependencies\")\ndependency_vectors.dim # 4 - number of dims automatically determined by Magnitude from 100\ndependency_vectors.query(\"nsubj\") # - array([-0.81043793,  0.55401352, -0.10838071,  0.15656626])\ndependency_vectors.query(\"prep\") # - array([-0.30862918, -0.44487267, -0.0054573 , -0.84071788])\n```\n\nMagnitude will use the [feature hashing trick](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FFeature_hashing) internally to directly use the hash of the feature value to create a unique vector for that feature value.\n\nThe first argument to `FeaturizerMagnitude` should be an approximate upper-bound on the number of values for the feature. Since there are \u003C 100 [parts of speech tags](https:\u002F\u002Fwww.ling.upenn.edu\u002Fcourses\u002FFall_2003\u002Fling001\u002Fpenn_treebank_pos.html) and \u003C 100 [syntax dependencies](http:\u002F\u002Funiversaldependencies.org\u002Fu\u002Fdep\u002Fall.html), we choose 100 for both in the example above. The value chosen will determine how many dimensions Magnitude will automatically assign to the particular `FeaturizerMagnitude` object to reduce the chance of a hash collision. The `namespace` argument can be any string that describes your additional feature. 
It is optional, but highly recommended.\n\nYou can then concatenate these features for use with a standard Magnitude object:\n```python\nfrom pymagnitude import *\nword2vec = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\npos_vectors = FeaturizerMagnitude(100, namespace = \"PartsOfSpeech\")\ndependency_vectors = FeaturizerMagnitude(100, namespace = \"SyntaxDependencies\")\nvectors = Magnitude(word2vec, pos_vectors, dependency_vectors) # concatenate word2vec with pos and dependencies\nvectors.query([\n    (\"I\", \"PRP\", \"nsubj\"), \n    (\"saw\", \"VBD\", \"ROOT\"), \n    (\"a\", \"DT\", \"det\"), \n    (\"cat\", \"NN\", \"dobj\"), \n    (\".\",  \".\", \"punct\")\n  ]) # array of size 5 x (300 + 4 + 4) or 5 x 308\n\n# Or get a unique vector for every 'buffalo' in:\n# \"Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo\"\n# (https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBuffalo_buffalo_Buffalo_buffalo_buffalo_buffalo_Buffalo_buffalo)\nvectors.query([\n    (\"Buffalo\", \"JJ\", \"amod\"), \n    (\"buffalo\", \"NNS\", \"nsubj\"), \n    (\"Buffalo\", \"JJ\", \"amod\"), \n    (\"buffalo\", \"NNS\", \"nsubj\"), \n    (\"buffalo\",  \"VBP\", \"rcmod\"),\n    (\"buffalo\",  \"VB\", \"ROOT\"),\n    (\"Buffalo\",  \"JJ\", \"amod\"),\n    (\"buffalo\",  \"NNS\", \"dobj\")\n  ]) # array of size 8 x (300 + 4 + 4) or 8 x 308\n\n```\n\nA machine learning model, given this output, now has access to parts of speech information and syntax dependency information instead of just word vector information. In this case, this additional information can give neural networks stronger signal for semantic information and reduce the need for training data.\n\n### Using Magnitude with a ML library\nMagnitude makes it very easy to quickly build and iterate on models that need to use vector representations by taking care of a lot of pre-processing code to convert a dataset of text (or keys) into vectors. 
Moreover, it can make these models more robust to [out-of-vocabulary words](#advanced-out-of-vocabulary-keys) and [misspellings](#handling-misspellings-and-typos).\n\nThere is example code available using Magnitude to build an intent classification model for the [ATIS (Airline Travel Information Systems) dataset](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002Fdocs\u002FLDC93S4B\u002Fcorpus.html) ([Train](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fdata\u002Fatis\u002Fatis-intent-train.txt)\u002F[Test](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fdata\u002Fatis\u002Fatis-intent-test.txt)), used for chatbots or conversational interfaces, in a few popular machine learning libraries below.\n\n#### Keras\nYou can access a guide for using Magnitude with Keras (which supports TensorFlow, Theano, CNTK) at this [Google Colaboratory Python notebook](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4).\n\n#### PyTorch\n*The PyTorch guide is coming soon.*\n\n#### TFLearn\n*The TFLearn guide is coming soon.*\n\n### Utils\n\nYou can use the `MagnitudeUtils` class for convenient access to functions that may be useful when creating machine learning models.\n\nYou can import MagnitudeUtils like so:\n```python\n  from pymagnitude import MagnitudeUtils\n```\n\nYou can download a Magnitude model from a remote source like so:\n```python\n  vecs = Magnitude(MagnitudeUtils.download_model('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300'))\n```\n\nBy default, `download_model` will download files from `http:\u002F\u002Fmagnitude.plasticity.ai` to a `~\u002F.magnitude` folder created automatically. If the file has already been downloaded, it will not be downloaded again. You can change the directory of the local download folder using the optional `download_dir` argument. 
You can change the domain from which models will be downloaded with the optional `remote_path` argument.\n\nYou can create a batch generator for `X` and `y` data with `batchify`, like so:\n```python\n  X = [.3, .2, .7, .8, .1]\n  y = [0, 0, 1, 1, 0]\n  batch_gen = MagnitudeUtils.batchify(X, y, 2)\n  for X_batch, y_batch in batch_gen:\n    print(X_batch, y_batch)\n  # Returns:\n  # 1st loop: X_batch = [.3, .2], y_batch = [0, 0]\n  # 2nd loop: X_batch = [.7, .8], y_batch = [1, 1]\n  # 3rd loop: X_batch = [.1], y_batch = [0]\n  # next loop: repeats infinitely...\n```\n\nYou can encode class labels to integers and back with `class_encoding`, like so:\n```python\n  add_class, class_to_int, int_to_class = MagnitudeUtils.class_encoding()\n  add_class(\"cat\") # Returns: 0\n  add_class(\"dog\") # Returns: 1\n  add_class(\"cat\") # Returns: 0\n  class_to_int(\"dog\") # Returns: 1\n  class_to_int(\"cat\") # Returns: 0\n  int_to_class(1) # Returns: \"dog\"\n  int_to_class(0) # Returns: \"cat\"\n```\n\nYou can convert categorical data with class integers to one-hot NumPy arrays with `to_categorical`, like so:\n```python\n  y = [1, 5, 2]\n  MagnitudeUtils.to_categorical(y, num_classes = 6) # num_classes is optional\n  # Returns: \n  # array([[0., 1., 0., 0., 0., 0.] \n  #       [0., 0., 0., 0., 0., 1.] \n  #       [0., 0., 1., 0., 0., 0.]])\n```\n\nYou can convert from one-hot NumPy arrays back to a 1D NumPy array of class integers with `from_categorical`, like so:\n```python\n  y_c = [[0., 1., 0., 0., 0., 0.],\n         [0., 0., 0., 0., 0., 1.]]\n  MagnitudeUtils.from_categorical(y_c)\n  # Returns: \n  # array([1., 5.])\n```\n\n## Concurrency and Parallelism\nThe library is thread safe (it uses a different connection to the underlying store per thread), is read-only, and it never writes to the file. 
Because of the light-memory usage, you can also run it in multiple processes (or use `multiprocessing`) with different address spaces without having to duplicate the data in-memory like with other libraries and without having to create a multi-process shared variable since data is read off-disk and each process keeps its own LRU memory cache. For heavier functions, like `most_similar` a shared memory mapped file is created to share memory between processes.\n\n## File Format and Converter\nThe Magnitude package uses the `.magnitude` file format instead of `.bin`, `.txt`, `.vec`, or `.hdf5` as with other vector models like word2vec, GloVe, fastText, and ELMo. There is an included command-line utility for converting word2vec, GloVe, fastText, and ELMo files to Magnitude files.\n\nYou can convert them like so:\n```bash\npython -m pymagnitude.converter -i \u003CPATH TO FILE TO BE CONVERTED> -o \u003COUTPUT PATH FOR MAGNITUDE FILE>\n```\n\nThe input format will automatically be determined by the extension \u002F the contents of the input file. You should only need to perform this conversion once for a model. After converting, the Magnitude file format is static and it will not be modified or written to make concurrent read access safe.\n\nThe flags for  `pymagnitude.converter` are specified below:\n* You can pass in the `-h` flag for help and to list all flags.\n* You can use the `-p \u003CPRECISION>` flag to specify the decimal precision to retain (selecting a lower number will create smaller files). The actual underlying values are stored as integers instead of floats so this is essentially [quantization](https:\u002F\u002Fwww.tensorflow.org\u002Fperformance\u002Fquantization) for smaller model footprints.\n* You can add an approximate nearest neighbors index to the file (increases size) with the `-a` flag which will enable the use of the `most_similar_approx` function. 
The `-t \u003CTREES>` flag controls the number of trees in the approximate nearest neighbors index (higher is more accurate) when used in conjunction with the `-a` flag (if not supplied, the number of trees is automatically determined).\n* You can pass the `-s` flag to disable adding subword information to the file (which will make the file smaller), but disable advanced out-of-vocabulary key support.\n* If converting a model that has no vocabulary like ELMo, you can pass the `-v` flag along with the path to another Magnitude file you would like to take the vocabulary from.\n\nOptionally, you can bulk convert many files by passing an input folder and output folder instead of an input file and output file. All `.txt`, `.bin`, `.vec`, `.hdf5` files in the input folder will be converted to `.magnitude` files in the output folder. The output folder must exist before a bulk conversion operation.\n\n## Remote Loading\nYou can instruct Magnitude to download and open a model from Magnitude's remote repository instead of a local file path. The file will automatically be downloaded locally on the first run to `~\u002F.magnitude\u002F` and subsequently skip the download if the file already exists locally.\n\n```python\n  vecs = Magnitude('http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude') # full url\n  vecs = Magnitude('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300') # or, use the shorthand for the url\n```\n\nFor more control over the remote download domain and local download directory, see how to use [`MagnitudeUtils.download_model`](#utils).\n\n## Remote Streaming over HTTP\n\nMagnitude models are generally large files (multiple GB) that take up a lot of disk space, even though the `.magnitude` format makes it fast to utilize the vectors. Magnitude has an option to stream these large files over HTTP. 
\nThis is explicitly different from the [remote loading feature](#remote-loading), in that the model doesn't even need to be downloaded at all. You can begin querying models immediately with no disk space used at all. \n\n\n```python\n  vecs = Magnitude('http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude', stream=True) # full url\n  vecs = Magnitude('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300', stream=True) # or, use the shorthand for the url\n\n  vecs.query(\"king\") # Returns: the vector for \"king\" quickly, even with no local model file downloaded\n```\n\nYou can play around with a demo of this in a [Google Colaboratory Python Notebook](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1zkPhoNM1NvbTmEk9gr0Jnt8hONrca1Fv).\n\nThis feature is extremely useful if your computing environment is resource constrained (low RAM and low disk space), you want to experiment quickly with vectors without downloading and setting up large model files, or you are training a small model.\nWhile there is some added network latency since the data is being streamed, Magnitude will still use an in-memory cache as specified by the [`lazy_loading`](#constructing-a-magnitude-object) constructor parameter. Since languages generally have a [Zipf-ian distribution](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FZipf%27s_law), the network latency should largely not be an issue after the cache is warmed after being queried a small number of times.\n\nThey will be queried directly off a static HTTP web server using [HTTP Range Request](https:\u002F\u002Fdeveloper.mozilla.org\u002Fen-US\u002Fdocs\u002FWeb\u002FHTTP\u002FRange_requests) headers. All Magnitude methods support streaming, however, `most_similar` and `most_similar_approx`\nmay be slow as they are not optimized for streaming [yet](#roadmap). 
You can see how this streaming mode [performs currently in the benchmarks](#benchmarks-and-features), however, it will get faster as we [optimize it in the future](#roadmap)!\n\n## Other Documentation\nOther documentation is not available at this time. See the source file directly (it is well commented) if you need more information about a method's arguments or want to see all supported features.\n\n## Other Languages\nCurrently, we only provide English word vector models on this page pre-converted to the `.magnitude` format. You can, however, still use Magnitude with word vectors of other languages. Facebook has trained their [fastText vectors for many different languages](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FfastText\u002Fblob\u002Fmaster\u002Fpretrained-vectors.md). You can download the `.vec` file for any language you want and then convert it to `.magnitude` with the [converter](#file-format-and-converter).\n\n## Other Programming Languages\nCurrently, reading Magnitude files is only supported in Python, since it has become the de-facto language for machine learning. This is sufficient for most use cases. Extending the file format to other languages shouldn't be difficult as SQLite has a native C implementation and has bindings in most languages. The file format itself and the protocol for reading and searching is also fairly straightforward upon reading the source code of this repository.\n\n## Other Domains\nCurrently, natural language processing is the most popular domain that uses pre-trained vector embedding models for word vector representations. There are, however, other domains like computer vision that have started using pre-trained vector embedding models like [Deep1B](https:\u002F\u002Fgithub.com\u002Farbabenko\u002FGNOIMI) for image representation. 
This library intends to stay agnostic to various domains and instead provides a generic key-vector store and interface that is useful for all domains.\n\n## Contributing\nThe main repository for this project can be found on [GitLab](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude). The [GitHub repository](https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude) is only a mirror. Pull requests for more tests, better error-checking, bug fixes, performance improvements, or documentation or adding additional utilities \u002F functionalities are welcome on [GitLab](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude).\n\nYou can contact us at [opensource@plasticity.ai](mailto:opensource@plasticity.ai).\n\n## Roadmap\n\n* Speed optimizations on remote streaming and exposing stream cache configuration options\n* Make `most_similar_approx` optimized for streaming\n* In addition to the \"Light\", \"Medium\", and \"Heavy\" flavors, add a \"Ludicrous\" flavor that will be of an even larger file size but removes the constraint of the initially slow `most_similar` lookups.\n* Add Google BERT support\n* Support fastText `.bin` format\n\n## Other Notable Projects\n* [spotify\u002Fannoy](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fannoy) - Powers the approximate nearest neighbors algorithm behind `most_similar_approx` in Magnitude using random-projection trees and hierarchical 2-means. 
Thanks to author [Erik Bernhardsson](https:\u002F\u002Fgithub.com\u002Ferikbern) for helping out with some of the integration details between Magnitude and Annoy.\n\n## Citing this Repository\n\nIf you'd like to [cite our paper at EMNLP 2018](http:\u002F\u002Faclweb.org\u002Fanthology\u002FD18-2021), you can use the following BibTeX citation:\n```latex\n@inproceedings{patel2018magnitude,\n  title={Magnitude: A Fast, Efficient Universal Vector Embedding Utility Package},\n  author={Patel, Ajay and Sands, Alexander and Callison-Burch, Chris and Apidianaki, Marianna},\n  booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},\n  pages={120--126},\n  year={2018}\n}\n```\nor follow the [Google Scholar link](https:\u002F\u002Fscholar.google.com\u002Fscholar?cluster=5916903042122216495&hl=en&as_sdt=0,5) for other ways to cite the paper.\n\nIf you'd like to cite this repository you can use the following DOI badge: &nbsp;[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F122715432.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F122715432)\n\nClicking on the badge will lead to a page that will help you generate proper BibTeX citations, JSON-LD citations, and other citations.\n\n## LICENSE and Attribution\n\nThis repository is licensed under the license found [here](LICENSE.txt).\n\n“[Seismic](https:\u002F\u002Fthenounproject.com\u002Fziman.jan\u002Fcollection\u002Fweather\u002F?i=1518266)” icon by JohnnyZi from the [Noun Project](https:\u002F\u002Fthenounproject.com).\n","\u003Cdiv align=\"center\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fplasticityai_magnitude_readme_45e13ec22f36.png\" alt=\"magnitude\" height=\"50\">\u003C\u002Fdiv>\n\n## \u003Cdiv align=\"center\">Magnitude：一款快速、简单的向量嵌入工具库\u003Cbr \u002F>\u003Cbr 
\u002F>[![流水线状态](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fbadges\u002Fmaster\u002Fpipeline.svg)](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fcommits\u002Fmaster)&nbsp;&nbsp;&nbsp;[![构建状态](https:\u002F\u002Ftravis-ci.org\u002Fplasticityai\u002Fmagnitude.svg?branch=master)](https:\u002F\u002Ftravis-ci.org\u002Fplasticityai\u002Fmagnitude)&nbsp;&nbsp;&nbsp;[![构建状态](https:\u002F\u002Fci.appveyor.com\u002Fapi\u002Fprojects\u002Fstatus\u002F72lwh2g7a9ddbnt2\u002Fbranch\u002Fmaster?svg=true)](https:\u002F\u002Fci.appveyor.com\u002Fproject\u002Fplasticity-admin\u002Fmagnitude\u002Fbranch\u002Fmaster)\u003Cbr\u002F>[![PyPI版本](https:\u002F\u002Fbadge.fury.io\u002Fpy\u002Fpymagnitude.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fpymagnitude\u002F)&nbsp;&nbsp;&nbsp;[![许可证](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002Fmashape\u002Fapistatus.svg?maxAge=2592000)](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fblob\u002Fmaster\u002FLICENSE.txt)&nbsp;&nbsp;&nbsp;[![Python版本](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fpyversions\u002Fpymagnitude.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fpymagnitude\u002F)&nbsp;&nbsp;&nbsp;&nbsp;[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F122715432.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F122715432)&nbsp;&nbsp;&nbsp;&nbsp;[![arXiv](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FarXiv-1810.11190-%23B41A1A.svg)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.11190)\u003C\u002Fdiv>\n由 [Plasticity](https:\u002F\u002Fwww.plasticity.ai\u002F) 开发的多功能 Python 软件包及向量存储文件格式，旨在以快速、高效且简便的方式在机器学习模型中使用向量嵌入。它主要作为 [Gensim](https:\u002F\u002Fradimrehurek.com\u002Fgensim\u002F) 的更简单、更快替代方案而设计，但也可用作自然语言处理以外领域的通用键-向量存储。该库提供诸如 [未登录词查询](#advanced-out-of-vocabulary-keys) 和 [通过 HTTP 流式传输大型模型](#remote-streaming-over-http) 等独特功能。相关研究成果已发表于 [EMNLP 2018](http:\u002F\u002Faclweb.org\u002Fanthology\u002FD18-2021)，并可在 
[arXiv](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.11190) 上查阅。\n\n## 目录\n- [安装](#installation)\n- [动机](#motivation)\n- [基准测试与特性](#benchmarks-and-features)\n- [热门嵌入模型的预转换 Magnitude 格式](#pre-converted-magnitude-formats-of-popular-embeddings-models)\n- [使用该库](#using-the-library)\n    * [构造 Magnitude 对象](#constructing-a-magnitude-object)\n    * [查询](#querying)\n    * [基础未登录词处理](#basic-out-of-vocabulary-keys)\n    * [高级未登录词处理](#advanced-out-of-vocabulary-keys)\n        + [拼写错误与打字错误的处理](#handling-misspellings-and-typos)\n    * [多个模型的拼接](#concatenation-of-multiple-models)\n    * [附加特征提取（词性等）](#additional-featurization-parts-of-speech-etc)\n    * [将 Magnitude 与机器学习库结合使用](#using-magnitude-with-a-ml-library)\n        + [Keras](#keras)\n        + [PyTorch](#pytorch)\n        + [TFLearn](#tflearn)\n    * [实用工具](#utils)\n- [并发与并行处理](#concurrency-and-parallelism)\n- [文件格式与转换器](#file-format-and-converter)\n- [远程加载](#remote-loading)\n- [通过 HTTP 远程流式传输](#remote-streaming-over-http)\n- [其他文档](#other-documentation)\n- [其他语言](#other-languages)\n- [其他编程语言](#other-programming-languages)\n- [其他领域](#other-domains)\n- [贡献](#contributing)\n- [路线图](#roadmap)\n- [其他值得关注的项目](#other-notable-projects)\n- [引用本仓库](#citing-this-repository)\n- [许可与署名](#license-and-attribution)\n\n## 安装\n您可以通过 `pip` 安装本软件包：\n```python\npip install pymagnitude # Python 2.7\npip3 install pymagnitude # Python 3\n```\n\n由于依赖冲突，Google Colaboratory 在安装 Magnitude 时可能会遇到一些问题。您可以在 Google Colab 中使用以下代码片段来安装 Magnitude：\n```bash\n# 在 Google Colab 上安装 Magnitude\n! echo \"正在安装 Magnitude.... (请稍候，可能需要一段时间)\"\n! (curl https:\u002F\u002Fraw.githubusercontent.com\u002Fplasticityai\u002Fmagnitude\u002Fmaster\u002Finstall-colab.sh | \u002Fbin\u002Fbash 1>\u002Fdev\u002Fnull 2>\u002Fdev\u002Fnull)\n! 
echo \"Magnitude 安装完成。\"\n```\n\n## 动机\n向量空间嵌入模型在机器学习中日益普及，传统上主要用于自然语言处理任务。然而，目前仍缺乏一种快速轻量级的工具来高效地使用这些庞大的向量空间嵌入模型。\n\nMagnitude 向量嵌入文件格式（`.magnitude`）旨在成为一种更高效的通用向量嵌入格式，支持延迟加载以加快开发环境中的冷启动速度，采用 LRU 内存缓存机制以提升生产环境下的性能，支持多键查询、直接为神经网络输入添加特征、高效进行相似度计算，并具备处理未登录词或拼写错误等边缘情况以及合并多个向量模型等功能。此外，该格式还适用于无法完全加载到内存中的大型向量模型。\n\nMagnitude 使用 [SQLite](http:\u002F\u002Fwww.sqlite.org)，一种快速且流行的嵌入式数据库，作为其底层数据存储。它利用索引实现快速键值查找，并通过内存映射、SIMD 指令和空间索引技术，在磁盘外的向量空间中实现高效的相似度搜索，同时保持良好的内存性能，即使在多个进程之间也能维持高效运行。更重要的是，内存映射会在不同运行之间被缓存，因此即使关闭进程后，性能优势仍然得以保留。\n\n## 基准测试与特性\n\n| **指标**                                                                                                                                            | **轻量级**   | **中量级** | **重量级** | **[流式传输](#remote-streaming-over-http)**    |\n| ----------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------: | :------------------: | :-----------------: | :----------------------------------------------------: |\n| 首次加载时间                                                                                                                                     | **0.7210秒**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 7.7550秒                                                |\n| 冷启动单键查询                                                                                                                                 | **0.0001秒**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.6437秒                                                |\n| 温启动单键查询 \u003Cbr \u002F>\u003Csup>*(与冷启动查询相同的键)*\u003C\u002Fsup>                                                                                     | **0.00004秒**          | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | **0.0004秒**                                            |\n| 冷启动多键查询 \u003Cbr 
\u002F>\u003Csup>*(n=25)*\u003C\u002Fsup>                                                                                                     | **0.0442秒**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.7753秒                                                |\n| 温启动多键查询 \u003Cbr \u002F>\u003Csup>*(n=25, 键与冷启动查询相同)*\u003C\u002Fsup>                                                                           | **0.00004秒**          | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | **0.0001秒**                                            |\n| 第一次 `most_similar` 搜索查询 \u003Cbr \u002F>\u003Csup>*(n=10, 最坏情况)*\u003C\u002Fsup>                                                                              | 247.05秒               | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| 第一次 `most_similar` 搜索查询 \u003Cbr \u002F>\u003Csup>*(n=10, 平均情况, 使用磁盘持久化缓存)*\u003C\u002Fsup>                                                 | **1.8217秒**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| 后续 `most_similar` 搜索 \u003Cbr \u002F>\u003Csup>*(n=10, 键与第一次查询不同)*\u003C\u002Fsup>                                                           | **0.2434秒**           | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | -                                                      |\n| 温启动后续 `most_similar` 搜索 \u003Cbr \u002F>\u003Csup>*(n=10, 键与第一次查询相同)*\u003C\u002Fsup>                                                             | **0.00004秒**          | **0.00004秒**         | **0.00004秒**        | -                                                      |\n| 第一次 `most_similar_approx` 搜索查询 \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0, 最坏情况)*\u003C\u002Fsup>                                                           | 不适用                   | 不适用      
            | **29.610秒**         | -                                                      |\n| 第一次 `most_similar_approx` 搜索查询 \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0, 平均情况, 使用磁盘持久化缓存)*\u003C\u002Fsup>                              | 不适用                   | 不适用                  | **0.9155秒**         | -                                                      |\n| 后续 `most_similar_approx` 搜索 \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0, 键与第一次查询不同)*\u003C\u002Fsup>                                        | 不适用                   | 不适用                  | **0.1873秒**         | -                                                      |\n| 后续 `most_similar_approx` 搜索 \u003Cbr \u002F>\u003Csup>*(n=10, effort=0.1, 键与第一次查询不同)*\u003C\u002Fsup>                                        | 不适用                   | 不适用                  | **0.0199秒**         | -                                                      |\n| 温启动后续 `most_similar_approx` 搜索 \u003Cbr \u002F>\u003Csup>*(n=10, effort=1.0, 键与第一次查询相同)*\u003C\u002Fsup>                                          | 不适用                   | 不适用                  | **0.00004秒**        | -                                                      |\n| 文件大小                                                                                                                                             | 4.21GB                | 5.29GB               | 10.74GB             | **0.00GB**                                             |\n| 进程内存（RAM）占用                                                                                                                                  | **18KB**              | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 1.71MB                                                 |\n| 执行100次键查询后的进程内存（RAM）占用                                                                                                              | **168KB**             | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> | 
1.91MB                                                 |\n| 执行100次键查询及相似度搜索后的进程内存（RAM）占用                                                                                                  | **342KB**\u003Csup>2\u003C\u002Fsup> | ━&nbsp;\u003Csup>1\u003C\u002Fsup>  | ━&nbsp;\u003Csup>1\u003C\u002Fsup> |                                                        |\n| 完整性检查和测试                                                                                                                                    | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 支持 word2vec（`.txt`、`.bin`）、GloVe（`.txt`）、fastText（`.vec`）和 ELMo（`.hdf5`）之间的通用格式，并配备转换工具                      | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 简洁的 Python 式接口                                                                                                                                | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 依赖项较少                                                                                                                                          | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 支持超出内存容量的模型                                                                                                                              | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 尽可能采用懒加载以提升速度和性能                                                                                                                  | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 针对 `threading` 和 `multiprocessing` 进行优化            
                                                                                           | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 支持批量和多键查找，包括填充、截断、占位符和特征化功能                                                         | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 可将多个向量模型拼接在一起                                                                                                          | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 基本的未登录词键查找 \u003Cbr \u002F>\u003Csup>(字符 n-gram 特征哈希)\u003C\u002Fsup>                                                                | ✅                     | ✅                    | ✅                   | ✅                                                      |\n| 高级的未登录词键查找，支持拼写错误 \u003Cbr \u002F>\u003Csup>(字符 n-gram 特征哈希映射到相似的已登录词键)\u003C\u002Fsup> | ❌                     | ✅                    | ✅                   | ✅                                                      |\n| 使用 [annoy](#other-notable-projects) 索引进行近似最相似搜索                                                                        | ❌                     | ❌                    | ✅                   | ✅                                                      |\n| 内置新模型训练功能                                                                                                                                  | ❌                     | ❌                    | ❌                   | ❌                                                      |\n\n\u003Csup>1: *与前一列相同*\u003C\u002Fsup>\u003Cbr \u002F>\n\u003Csup>2: *使用 `mmap` 从磁盘读取，因此在内存可用时操作系统仍会分配内存页，但这些内存页可以在进程间共享，并且对于超大文件来说无需在每个进程中单独管理，从而带来性能优势*\u003C\u002Fsup>\u003Cbr\u002F>\n\u003Csup>\\*: 
所有[基准测试](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude\u002Fblob\u002Fmaster\u002Ftests\u002Fbenchmark.py)均在 Google News 预训练词向量 (`GoogleNews-vectors-negative300.bin`) 上进行，使用的设备为 MacBook Pro (Retina, 15 英寸，2014 年中旬款)，配备 2.2GHz 四核 Intel Core i7 处理器和 16GB RAM，存储介质为 SSD，在可行的情况下取多次试验的平均值。\u003C\u002Fsup>\n\n\n\n## 流行嵌入模型的预转换 Magnitude 格式\n\n流行的嵌入模型已被预先转换为 `.magnitude` 格式，可供立即下载和使用：\n\n| **贡献者**                                                         | **数据**                                                        | **轻量级**\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(对未登录词的基本支持)\u003C\u002Fsup>                                                                                                                                                                                                                                                                                                | **中量级**\u003Cbr\u002F>\u003Ci>(推荐)\u003C\u002Fi>\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(对未登录词的高级支持)\u003C\u002Fsup>                                                                                                                                                                                                                                                                           | **重量级**\u003Cbr\u002F>\u003Cbr\u002F>\u003Csup>(对未登录词的高级支持以及更快的 `most_similar_approx`)\u003C\u002Fsup>                                                                                                                                                                                                                                                                |\n| :---------------------------------------------------------------------: | :-------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:                         | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |\n| Google - [word2vec](https:\u002F\u002Fcode.google.com\u002Farchive\u002Fp\u002Fword2vec\u002F)        | Google News 100B                                                | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Flight\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                                                                          | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fmedium\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                              
                                                   | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude)                                                                                                                                                                                                                                                                              |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Wikipedia 2014 + Gigaword 5 6B                                  | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.6B.300d.magnitude)                                             | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.6B.300d.magnitude)                                                 | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.6B.300d.magnitude)                                         
        |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Wikipedia 2014 + Gigaword 5 6B \u003Cbr \u002F>(经Plasticity词形还原) | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove-lemmatized.6B.300d.magnitude) | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove-lemmatized.6B.300d.magnitude)     | [50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.200d.magnitude),&nbsp;[300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove-lemmatized.6B.300d.magnitude)     |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Common Crawl 840B                                               | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.840B.300d.magnitude)                                                                                                                                                          
                                                                                                                                  | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.840B.300d.magnitude)                                                                                                                                                                                                                                                                                                   | [300D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.840B.300d.magnitude)                                                                                                                                                                                                                                                                                                |\n| Stanford - [GloVe](https:\u002F\u002Fnlp.stanford.edu\u002Fprojects\u002Fglove\u002F)            | Twitter 27B                                                     | [25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Flight\u002Fglove.twitter.27B.200d.magnitude)           | 
[25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.100d.magnitude),&nbsp;[200D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fmedium\u002Fglove.twitter.27B.200d.magnitude)               | [25D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.25d.magnitude),&nbsp;[50D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.50d.magnitude),&nbsp;[100D](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fglove\u002Fheavy\u002Fglove.twitter.27B.100d.dema…\n\n以下是将任何 `.bin`、`.txt`、`.vec` 或 `.hdf5` 文件转换为 `.magnitude` 文件的说明 [如下](#file-format-and-converter)。\n\n\n\n## 使用该库\n\n### 构建 Magnitude 对象\n\n您可以按以下方式创建一个 Magnitude 对象：\n```python\nfrom pymagnitude import *\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002Fvectors.magnitude\")\n```\n\n如果需要，为了方便起见，您也可以直接使用 Magnitude 打开 `.bin`、`.txt`、`.vec` 或 `.hdf5` 文件。不过，这种方法效率较低，对于大型模型来说速度非常慢，因为它会在首次运行时将文件转换为临时目录中的 `.magnitude` 文件。临时目录不一定会保留，并且在计算机重启后也不会保留。通常，为了提高速度，您应该先使用 `python -m pymagnitude.converter` 将 `.bin`、`.txt`、`.vec` 或 `.hdf5` 文件预先转换为 `.magnitude` 格式，但此功能对于一次性使用场景仍然很有用。当直接使用 `.bin`、`.txt`、`.vec` 或 `.hdf5` 文件实例化 Magnitude 对象时，会生成警告信息。您可以通过将构造函数中的 `supress_warnings` 参数设置为 `True` 来抑制这些警告。\n\n---------------\n\n* \u003Csup>默认情况下，惰性加载已启用。您可以向构造函数传递一个可选的 `lazy_loading` 参数，其值可以是 `-1`（禁用惰性加载并预加载所有向量到内存中，类似于 Gensim）、`0`（默认值，启用无限制的内存 LRU 缓存的惰性加载），或大于零的整数 `X`（启用惰性加载，并使用一个仅保留最近使用过的 `X` 个向量的 LRU 缓存）。\u003C\u002Fsup>\n* \u003Csup>如果您希望在初始化时就提前加载 `most_similar` 函数所需的数据，请将 `eager` 参数设置为 `True`。\u003C\u002Fsup>\n* \u003Csup>请注意，即使将 `lazy_loading` 设置为 `-1` 或将 `eager` 设置为 `True`，数据仍会在后台线程中预加载到内存中，以避免构造函数因大型模型而阻塞几分钟。如果您确实希望阻塞式行为，可以将 `blocking` 参数设置为 `True`。\u003C\u002Fsup>\n* 
\u003Csup>默认情况下，返回的是[单位长度归一化](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FUnit_vector)向量，除非您正在加载 ELMo 模型。如果您希望接收原始的未归一化向量，请将可选参数 `normalized` 设置为 `False`。\u003C\u002Fsup>\n* \u003Csup>默认情况下，查询结果以 NumPy 数组形式返回。如果您希望接收 Python 列表而不是 NumPy 数组，可以将可选参数 `use_numpy` 设置为 `False`。\u003C\u002Fsup>\n* \u003Csup>默认情况下，键的查询区分大小写。如果您希望进行不区分大小写的搜索，请将可选参数 `case_insensitive` 设置为 `True`。\u003C\u002Fsup>\n* \u003Csup>您可以选择包含 `pad_to_length` 参数，该参数用于指定在传入多个示例时，所有示例应被填充到的长度。超过填充长度的示例将会被截断。\u003C\u002Fsup>\n* \u003Csup>如果指定了 `pad_to_length`，并且某个示例的长度超过了该值，您可以将 `truncate_left` 参数设置为 `True`，以截断每个示例中键列表的开头部分，而不是结尾部分。\u003C\u002Fsup>\n* \u003Csup>您可以选择将 `pad_left` 参数设置为 `True`，使填充出现在列表的开头而不是默认的结尾。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `placeholders` 参数，该参数会将每个向量的维度增加 `placeholders` 的数量，并用零填充这些额外的维度。如果您计划向向量中添加其他值和信息，并希望预先为这些内容分配空间以提高效率，此选项将非常有用。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `language` 参数，并提供一个 [ISO 639-1 语言代码](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FList_of_ISO_639-1_codes)。如果您将 Magnitude 用于词向量，则此参数可确保库尊重该语言的词干提取和其他特定于语言的功能。默认值为 `en`，表示英语。如果您不将 Magnitude 用于词向量，也可以将此参数设置为 `None`。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `dtype` 参数，以控制 Magnitude 返回的 NumPy 数组的数据类型。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `devices` 参数，以控制底层模型支持 GPU 使用时的 GPU 设备使用情况。该参数应为一个整数列表，其中每个整数代表 GPU 设备编号（`0`、`1` 等）。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `temp_dir` 参数，以控制 Magnitude 将使用的临时目录的位置。\u003C\u002Fsup>\n* \u003Csup>您可以选择传递 `log` 参数，使 Magnitude 在执行耗时操作时将进度记录到标准错误流中。\u003C\u002Fsup>\n\n### 查询\n\n你可以这样查询文件中向量的总数：\n```python\nlen(vectors)\n```\n\n---------------\n\n你可以这样查询向量的维度：\n```python\nvectors.dim\n```\n\n---------------\n\n你可以这样检查某个键是否在词汇表中：\n```python\n\"cat\" in vectors\n```\n\n---------------\n\n你可以这样遍历所有的键和向量：\n```python\nfor key, vector in vectors:\n  ...\n```\n\n---------------\n\n你可以这样查询某个键对应的向量：\n```python\nvectors.query(\"cat\")\n```\n\n---------------\n\n你可以通过索引获取第 n 个键和向量：\n```python\nvectors[42]\n```\n\n---------------\n\n你可以这样查询多个键的向量：\n```python\nvectors.query([\"I\", \"read\", \"a\", 
\"book\"])\n```\n将返回一个二维数组（键对应向量）。\n\n---------------\n\n你可以这样查询多个例句的向量：\n```python\nvectors.query([[\"I\", \"read\", \"a\", \"book\"], [\"I\", \"read\", \"a\", \"magazine\"]])\n```\n将返回一个三维数组（例句 × 键 × 向量）。如果未指定 `pad_to_length`，且每个例句的长度不一致，则会按最长例句的长度进行填充。\n\n---------------\n\n你可以通过索引获取多个位置的键和向量：\n```python\nvectors[:42] # 切片表示法\nvectors[42, 1337, 2001] # 元组表示法\n```\n\n---------------\n\n你可以这样查询两个或多个键之间的距离：\n```python\nvectors.distance(\"cat\", \"dog\")\nvectors.distance(\"cat\", [\"dog\", \"tiger\"])\n```\n\n---------------\n\n你可以这样查询两个或多个键之间的相似度：\n```python\nvectors.similarity(\"cat\", \"dog\")\nvectors.similarity(\"cat\", [\"dog\", \"tiger\"])\n```\n\n---------------\n\n你可以这样查询给定键与一组键中最相似的键：\n```python\nvectors.most_similar_to_given(\"cat\", [\"dog\", \"television\", \"laptop\"]) # dog\n```\n\n---------------\n\n你可以这样查询给定键与一组键中不匹配的键：\n```python\nvectors.doesnt_match([\"breakfast\", \"cereal\", \"dinner\", \"lunch\"]) # cereal\n```\n\n---------------\n\n你可以这样查询最相似的键（近邻）：\n```python\nvectors.most_similar(\"cat\", topn = 100) # 按键最相似\nvectors.most_similar(vectors.query(\"cat\"), topn = 100) # 按向量最相似\n```\n你还可以选择性地为 `most_similar` 传递一个 `min_similarity` 参数，其值范围为 [-1.0, 1.0]。\n\n---------------\n\n你也可以通过提供正例和负例来查询最相似的键（这实际上可以解决类比问题）：\n```python\nvectors.most_similar(positive = [\"woman\", \"king\"], negative = [\"man\"]) # queen\n```\n\n---------------\n\n类似于 `vectors.most_similar`，还有一个 `vectors.most_similar_cosmul` 函数，它使用 [Levy and Goldberg](http:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FW14-1618) 提出的 3CosMul 方法：\n```python\nvectors.most_similar_cosmul(positive = [\"woman\", \"king\"], negative = [\"man\"]) # queen\n```\n\n---------------\n\n你还可以使用近似最近邻索引快速查询最相似的键，虽然速度更快，但不能保证完全准确的结果：\n```python\nvectors.most_similar_approx(\"cat\")\nvectors.most_similar_approx(positive = [\"woman\", \"king\"], negative = [\"man\"])\n```\n你还可以选择性地为 `most_similar_approx` 函数传递一个 `effort` 参数，其值范围为 [0.0, 1.0]，以调整运行时间和准确性。默认值为 
1.0，此时耗时最长，但结果最准确。\n\n---------------\n\n你可以这样查询所有与某个键的距离比另一个键更近的键：\n```python\nvectors.closer_than(\"cat\", \"rabbit\") # [\"dog\", ...]\n```\n\n---------------\n\n你可以访问模型中所有的底层向量，它们存储在一个大小为 (`len(vectors) x vectors.emb_dim`) 的大型 `numpy.memmap` 数组中：\n```python\nvectors.get_vectors_mmap()\n```\n\n---------------\n\n你可以清理所有相关资源、打开的文件和数据库连接：\n```python\nvectors.close()\n```\n\n### 基本的未登录词处理\n\n对于词向量表示而言，处理未登录词非常重要，这有助于应对训练数据中未出现的新词、拼写错误和打字错误，并总体上提高基于词向量表示模型的鲁棒性。\n\n未登录词的处理方式是为其分配一个随机向量值。然而，这种随机性是确定性的：如果两次遇到 *相同* 的未登录词，它会被赋予相同的随机向量值，以便能够对这些未登录词进行训练。此外，如果两个未登录词具有相似的字符 n-gram（例如“uberx”和“uberxl”），即使它们都不在词汇表中，也会被放置在彼此附近：\n\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"uberx\" in vectors # False\n\"uberxl\" in vectors # False\nvectors.query(\"uberx\") # array([ 5.07109939e-02, -7.08248823e-02, -2.74812328e-02, ... ])\nvectors.query(\"uberxl\") # array([ 0.04734962, -0.08237578, -0.0333479, -0.00229564, ... ])\nvectors.similarity(\"uberx\", \"uberxl\") # 0.955000000200815\n```\n\n### 高级的未登录词处理\n\n如果使用支持高级未登录词处理的 Magnitude 文件（Medium 或 Heavy 版本），未登录词还会被嵌入到与其相似的已登录词附近，这些相似性是根据字符串相似度判断的：\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"uberx\" in vectors # False\n\"uberification\" in vectors # False\n\"uber\" in vectors # True\nvectors.similarity(\"uberx\", \"uber\") # 0.7383483267618451\nvectors.similarity(\"uberification\", \"uber\") # 0.745452837882727\n```\n\n#### 处理拼写错误和打字错误\n这也使得 Magnitude 对许多拼写错误具有较强的鲁棒性：\n```python\nvectors = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\n\"missispi\" in vectors # False\nvectors.similarity(\"missispi\", \"mississippi\") # 0.35961736624824003\n\"discrimnatory\" in vectors # False\nvectors.similarity(\"discrimnatory\", \"discriminatory\") # 0.8309152561753461\n\"hiiiiiiiiii\" in vectors # False\nvectors.similarity(\"hiiiiiiiiii\", \"hi\") # 0.7069775034853861\n```\n\n为了实现未登录词的这一效果，使用了字符 
n-gram 技术。该功能的灵感来源于 Facebook AI Research 的论文 [Enriching Word Vectors with Subword Information](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1607.04606.pdf)，但与原文不同的是，这里并非在训练时利用字符 n-gram，而是在推理时使用字符 n-gram，从而在一些较旧的模型（如 word2vec 和 GloVe）中部分地复现了这一效果（尽管无法完全复制）。\n\n### 多模型的拼接\n可选地，你可以将多个模型的向量组合起来，以便向机器学习模型提供更丰富的信息，如下所示：\n```python\nfrom pymagnitude import *\nword2vec = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\nglove = Magnitude(\"\u002Fpath\u002Fto\u002Fglove.6B.50d.magnitude\")\nvectors = Magnitude(word2vec, glove) # 将 word2vec 与 glove 的向量拼接在一起\nvectors.query(\"cat\") # 返回一个 350 维的 NumPy 数组（来自 word2vec 的“cat”与来自 glove 的“cat”拼接而成）\nvectors.query((\"cat\", \"cats\")) # 返回一个 350 维的 NumPy 数组（来自 word2vec 的“cat”与来自 glove 的“cats”拼接而成）\n```\n\n你还可以拼接两个以上的向量模型，只需在构造函数中传入更多参数即可。\n\n### 额外的特征化（词性等）\n你可以使用 `FeaturizerMagnitude` 类，自动从你可能拥有的额外特征中创建向量，例如词性、句法依存关系或其他任何信息：\n\n```python\nfrom pymagnitude import *\npos_vectors = FeaturizerMagnitude(100, namespace = \"PartsOfSpeech\")\npos_vectors.dim # 4 - 由 Magnitude 根据 100 这个值自动确定的维度数\npos_vectors.query(\"NN\") # - array([ 0.08040417, -0.71705252,  0.61228951,  0.32322192]) \npos_vectors.query(\"JJ\") # - array([-0.11681135,  0.10259253,  0.8841201 , -0.44063763])\npos_vectors.query(\"NN\") # - array([ 0.08040417, -0.71705252,  0.61228951,  0.32322192])（由于采用确定性哈希，对于相同的键每次都会返回相同的值）\ndependency_vectors = FeaturizerMagnitude(100, namespace = \"SyntaxDependencies\")\ndependency_vectors.dim # 4 - 由 Magnitude 根据 100 这个值自动确定的维度数\ndependency_vectors.query(\"nsubj\") # - array([-0.81043793,  0.55401352, -0.10838071,  0.15656626])\ndependency_vectors.query(\"prep\") # - array([-0.30862918, -0.44487267, -0.0054573 , -0.84071788])\n```\n\nMagnitude 内部会使用 [特征哈希技巧](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FFeature_hashing)，直接利用特征值的哈希来为该特征值创建唯一的向量。\n\n`FeaturizerMagnitude` 的第一个参数应为该特征可能取值数量的近似上限。由于 
[词性标签](https:\u002F\u002Fwww.ling.upenn.edu\u002Fcourses\u002FFall_2003\u002Fling001\u002Fpenn_treebank_pos.html)和 [句法依存关系](http:\u002F\u002Funiversaldependencies.org\u002Fu\u002Fdep\u002Fall.html)的数量均少于 100 种，因此我们在上述示例中都选择了 100。所选值将决定 Magnitude 自动为特定的 `FeaturizerMagnitude` 对象分配多少维度，以降低哈希冲突的概率。`namespace` 参数可以是任何描述你所添加特征的字符串，它是可选的，但强烈建议使用。\n\n随后，你可以将这些特征与标准的 Magnitude 对象拼接在一起，以供使用：\n```python\nfrom pymagnitude import *\nword2vec = Magnitude(\"\u002Fpath\u002Fto\u002FGoogleNews-vectors-negative300.magnitude\")\npos_vectors = FeaturizerMagnitude(100, namespace = \"PartsOfSpeech\")\ndependency_vectors = FeaturizerMagnitude(100, namespace = \"SyntaxDependencies\")\nvectors = Magnitude(word2vec, pos_vectors, dependency_vectors) # 将 word2vec 与词性和依存关系的向量拼接在一起\nvectors.query([\n    (\"I\", \"PRP\", \"nsubj\"), \n    (\"saw\", \"VBD\", \"ROOT\"), \n    (\"a\", \"DT\", \"det\"), \n    (\"cat\", \"NN\", \"dobj\"), \n    (\".\",  \".\", \"punct\")\n  ]) # 一个大小为 5 x (300 + 4 + 4)，即 5 x 308 的数组\n\n# 或者为以下句子中的每一个“buffalo”获取唯一的向量：\n# “Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo”\n# （https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBuffalo_buffalo_Buffalo_buffalo_buffalo_buffalo_Buffalo_buffalo）\nvectors.query([\n    (\"Buffalo\", \"JJ\", \"amod\"), \n    (\"buffalo\", \"NNS\", \"nsubj\"), \n    (\"Buffalo\", \"JJ\", \"amod\"), \n    (\"buffalo\", \"NNS\", \"nsubj\"), \n    (\"buffalo\",  \"VBP\", \"rcmod\"),\n    (\"buffalo\",  \"VB\", \"ROOT\"),\n    (\"Buffalo\",  \"JJ\", \"amod\"),\n    (\"buffalo\",  \"NNS\", \"dobj\")\n  ]) # 一个大小为 8 x (300 + 4 + 4)，即 8 x 308 的数组\n\n```\n\n有了这样的输出，机器学习模型现在不仅可以访问词向量信息，还可以获得词性和句法依存关系等额外信息。在这种情况下，这些额外的信息可以为神经网络提供更强的语义信号，并减少对训练数据的需求。\n\n### 在机器学习库中使用 Magnitude\nMagnitude 通过处理大量预处理代码，将文本数据集（或键）转换为向量，从而使得构建和迭代需要使用向量表示的模型变得非常容易。此外，它还能使这些模型对 [未登录词](#advanced-out-of-vocabulary-keys) 和 [拼写错误](#handling-misspellings-and-typos) 具有更强的鲁棒性。\n\n下面是一些流行的机器学习库中，使用 Magnitude 构建针对 
[ATIS（航空公司旅行信息系统）数据集](https:\u002F\u002Fcatalog.ldc.upenn.edu\u002Fdocs\u002FLDC93S4B\u002Fcorpus.html)（[训练集](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fdata\u002Fatis\u002Fatis-intent-train.txt)\u002F[测试集](http:\u002F\u002Fmagnitude.plasticity.ai\u002Fdata\u002Fatis\u002Fatis-intent-test.txt)）的意图分类模型的示例代码。该数据集常用于聊天机器人或对话式界面。\n\n#### Keras\n你可以在这个 [Google Colaboratory Python 笔记本](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4)中找到使用 Magnitude 与 Keras（支持 TensorFlow、Theano、CNTK）的指南。\n\n#### PyTorch\n*PyTorch 指南即将推出。*\n\n#### TFLearn\n*TFlearn 指南即将推出。*\n\n### 工具类\n\n您可以使用 `MagnitudeUtils` 类，以便便捷地访问在构建机器学习模型时可能用到的函数。\n\n您可以按如下方式导入 `MagnitudeUtils`：\n```python\n  from pymagnitude import MagnitudeUtils\n```\n\n您也可以从远程源下载 Magnitude 模型，如下所示：\n```python\n  vecs = Magnitude(MagnitudeUtils.download_model('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300'))\n```\n\n默认情况下，`download_model` 会从 `http:\u002F\u002Fmagnitude.plasticity.ai` 下载文件，并将其保存到自动创建的 `~\u002F.magnitude` 文件夹中。如果文件已下载，则不会再次下载。您可以通过可选的 `download_dir` 参数更改本地下载文件夹的路径。此外，还可以通过可选的 `remote_path` 参数更改模型的下载域名。\n\n您还可以使用 `batchify` 创建 `X` 和 `y` 数据的批次生成器，如下所示：\n```python\n  X = [.3, .2, .7, .8, .1]\n  y = [0, 0, 1, 1, 0]\n  batch_gen = MagnitudeUtils.batchify(X, y, 2)\n  for X_batch, y_batch in batch_gen:\n    print(X_batch, y_batch)\n  # 输出：\n  # 第1轮：X_batch = [.3, .2], y_batch = [0, 0]\n  # 第2轮：X_batch = [.7, .8], y_batch = [1, 1]\n  # 第3轮：X_batch = [.1], y_batch = [0]\n  # 后续轮次：无限循环...\n```\n\n您还可以使用 `class_encoding` 将类别标签编码为整数并解码回标签，如下所示：\n```python\n  add_class, class_to_int, int_to_class = MagnitudeUtils.class_encoding()\n  add_class(\"cat\") # 返回：0\n  add_class(\"dog\") # 返回：1\n  add_class(\"cat\") # 返回：0\n  class_to_int(\"dog\") # 返回：1\n  class_to_int(\"cat\") # 返回：0\n  int_to_class(1) # 返回：“dog”\n  int_to_class(0) # 返回：“cat”\n```\n\n您还可以使用 `to_categorical` 将带有类别整数的分类数据转换为独热编码的 NumPy 数组，如下所示：\n```python\n  y = [1, 5, 2]\n  MagnitudeUtils.to_categorical(y, 
num_classes=6) # num_classes 为可选参数\n  # 输出：\n  # array([[0., 1., 0., 0., 0., 0.],\n  #        [0., 0., 0., 0., 0., 1.],\n  #        [0., 0., 1., 0., 0., 0.]])\n```\n\n您还可以使用 `from_categorical` 将独热编码的 NumPy 数组转换回包含类别整数的 1D NumPy 数组，如下所示：\n```python\n  y_c = [[0., 1., 0., 0., 0., 0.],\n         [0., 0., 0., 0., 0., 1.]]\n  MagnitudeUtils.from_categorical(y_c)\n  # 输出：\n  # array([1., 5.])\n```\n\n## 并发与并行性\n该库是线程安全的（每个线程使用不同的底层存储连接），并且是只读的，永远不会向文件写入数据。由于内存占用较低，您也可以在多个进程（或使用 `multiprocessing`）中运行它，每个进程拥有独立的地址空间，而无需像其他库那样在内存中复制数据，也无需创建多进程共享变量，因为数据是从磁盘读取的，且每个进程维护自己的 LRU 内存缓存。对于较重的操作，例如 `most_similar`，会创建一个共享内存映射文件，以实现进程间内存共享。\n\n## 文件格式与转换工具\nMagnitude 包采用 `.magnitude` 文件格式，而不是其他向量模型（如 word2vec、GloVe、fastText 和 ELMo）所使用的 `.bin`、`.txt`、`.vec` 或 `.hdf5` 格式。该软件包附带了一个命令行工具，用于将 word2vec、GloVe、fastText 和 ELMo 文件转换为 Magnitude 文件。\n\n您可以按如下方式进行转换：\n```bash\npython -m pymagnitude.converter -i \u003C待转换文件路径> -o \u003CMagnitude 文件输出路径>\n```\n\n输入文件的格式将根据其扩展名或内容自动确定。通常只需对某个模型执行一次转换即可。转换完成后，Magnitude 文件格式保持静态，不会被修改或写入，从而确保并发读取的安全性。\n\n`pymagnitude.converter` 的可用标志如下：\n* 使用 `-h` 标志可获取帮助并列出所有标志。\n* 使用 `-p \u003C精度>` 标志可以指定保留的小数位数（选择较小的数值会生成更小的文件）。实际底层值以整数形式存储，而非浮点数，因此这本质上是一种[量化](https:\u002F\u002Fwww.tensorflow.org\u002Fperformance\u002Fquantization)技术，用于减小模型的体积。\n* 使用 `-a` 标志可以为文件添加近似最近邻索引（会增加文件大小），从而启用 `most_similar_approx` 函数。配合 `-a` 标志使用时，还可以通过 `-t \u003C树的数量>` 控制近似最近邻索引中的树数量（树越多，精度越高）。若未指定树的数量，则会自动确定。\n* 使用 `-s` 标志可以禁用向文件中添加子词信息（从而减小文件大小），但同时也会禁用对词汇表外关键词的高级支持。\n* 如果要转换的模型没有词汇表（如 ELMo），可以在提供路径的同时使用 `-v` 标志，并指定一个您希望从中继承词汇表的 Magnitude 文件。\n\n此外，您还可以批量转换文件，只需分别指定输入文件夹和输出文件夹，而无需逐个指定输入和输出文件。输入文件夹中的所有 `.txt`、`.bin`、`.vec` 和 `.hdf5` 文件都将被转换为输出文件夹中的 `.magnitude` 文件。在执行批量转换之前，输出文件夹必须已经存在。\n\n## 远程加载\n您可以指示 Magnitude 从其远程仓库下载并打开模型，而无需指定本地文件路径。首次运行时，文件会自动下载到 `~\u002F.magnitude\u002F` 目录下；后续运行时，如果本地已存在该文件，则会跳过下载步骤。\n\n```python\n  vecs = Magnitude('http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude') # 完整 URL\n  vecs = 
Magnitude('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300') # 或者使用 URL 的简写形式\n```\n\n如需更精细地控制远程下载域名和本地下载目录，请参阅如何使用 [`MagnitudeUtils.download_model`](#utils)。\n\n## 通过 HTTP 远程流式传输\n\nMagnitude 模型通常是大型文件（多个 GB），会占用大量磁盘空间，尽管 `.magnitude` 格式使得向量的利用非常快速。Magnitude 提供了一个选项，可以通过 HTTP 流式传输这些大文件。\n这与[远程加载功能](#远程加载)有明显区别，因为模型甚至完全不需要下载。您可以立即开始查询模型，而无需占用任何磁盘空间。\n\n\n```python\n  vecs = Magnitude('http:\u002F\u002Fmagnitude.plasticity.ai\u002Fword2vec\u002Fheavy\u002FGoogleNews-vectors-negative300.magnitude', stream=True) # 完整 URL\n  vecs = Magnitude('word2vec\u002Fheavy\u002FGoogleNews-vectors-negative300', stream=True) # 或者使用 URL 的简写形式\n\n  vecs.query(\"king\") # 返回结果：即使没有下载本地模型文件，也能快速返回“king”的向量\n```\n\n您可以在 [Google Colaboratory Python 笔记本](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1zkPhoNM1NvbTmEk9gr0Jnt8hONrca1Fv) 中体验此演示。\n\n如果您的计算环境资源有限（内存和磁盘空间不足），或者您希望在不下载和设置大型模型文件的情况下快速尝试向量，又或者您正在训练一个小模型，那么此功能将非常有用。\n虽然由于数据是流式传输的，会增加一些网络延迟，但 Magnitude 仍会使用由 [`lazy_loading`](#构造一个Magnitude对象) 构造函数参数指定的内存缓存。由于语言通常遵循 [齐夫定律](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FZipf%27s_law)，在缓存被少量查询预热后，网络延迟通常不会成为问题。\n\n它们将直接从静态 HTTP Web 服务器上通过 [HTTP Range Request](https:\u002F\u002Fdeveloper.mozilla.org\u002Fen-US\u002Fdocs\u002FWeb\u002FHTTP\u002FRange_requests) 头进行查询。所有 Magnitude 方法都支持流式传输，然而，`most_similar` 和 `most_similar_approx`\n可能会较慢，因为它们目前尚未针对流式传输进行优化 [#路线图]。您可以在【基准测试与功能】中查看当前流式模式的性能表现，不过随着我们未来对其进行优化[#路线图]，其速度将会更快！\n\n## 其他文档\n目前暂无其他文档。如果您需要了解某个方法的参数详情或查看所有支持的功能，可以直接查阅源代码文件（注释较为详尽）。\n\n## 其他语言\n目前，本页面仅提供预先转换为 `.magnitude` 格式的英语词向量模型。不过，您仍然可以使用 Magnitude 来处理其他语言的词向量。Facebook 已经训练了多种语言的 [fastText 向量](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FfastText\u002Fblob\u002Fmaster\u002Fpretrained-vectors.md)。您可以下载所需语言的 `.vec` 文件，然后使用【文件格式与转换器】中的工具将其转换为 `.magnitude` 格式。\n\n## 其他编程语言\n目前，读取 Magnitude 文件仅支持 Python，因为 Python 已成为机器学习领域的事实标准语言。对于大多数应用场景来说，这已经足够。将文件格式扩展到其他语言并不困难，因为 SQLite 有原生的 C 实现，并且在大多数语言中都有绑定。此外，该文件格式以及读取和搜索协议本身也非常简单，只需阅读本仓库的源代码即可理解。\n\n## 
其他领域\n目前，自然语言处理是使用预训练向量嵌入模型进行词向量表示的最热门领域。不过，也有一些其他领域，例如计算机视觉，开始使用类似 [Deep1B](https:\u002F\u002Fgithub.com\u002Farbabenko\u002FGNOIMI) 的预训练向量嵌入模型来表示图像。本库旨在保持对各个领域的中立性，提供一个适用于所有领域的通用键值存储和接口。\n\n## 贡献\n该项目的主要仓库位于 [GitLab](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude) 上。[GitHub 仓库](https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude)仅为镜像。欢迎在 [GitLab](https:\u002F\u002Fgitlab.com\u002FPlasticity\u002Fmagnitude) 上提交更多测试、改进错误检查、修复 bug、提升性能、完善文档，或添加额外工具\u002F功能的拉取请求。\n\n您可以通过 [opensource@plasticity.ai](mailto:opensource@plasticity.ai) 与我们联系。\n\n## 路线图\n\n* 对远程流式传输进行速度优化，并公开流式缓存配置选项\n* 使 `most_similar_approx` 也针对流式传输进行优化\n* 除了“轻量”、“中量”和“重量”三种规格外，新增“超重量”规格，文件体积更大，但可消除最初 `most_similar` 查询较慢的限制。\n* 添加对 Google BERT 的支持\n* 支持 fastText `.bin` 格式\n\n## 其他知名项目\n* [spotify\u002Fannoy](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fannoy) - 使用随机投影树和分层二均值算法，为 Magnitude 中的 `most_similar_approx` 提供近似最近邻算法的支持。感谢作者 [Erik Bernhardsson](https:\u002F\u002Fgithub.com\u002Ferikbern) 在 Magnitude 与 Annoy 集成细节方面提供的帮助。\n\n## 引用本仓库\n\n如果您想引用我们在 EMNLP 2018 上发表的论文，请使用以下 BibTeX 引用：\n```latex\n@inproceedings{patel2018magnitude,\n  title={Magnitude: 一种快速高效的通用向量嵌入工具包},\n  author={Patel, Ajay and Sands, Alexander and Callison-Burch, Chris and Apidianaki, Marianna},\n  booktitle={2018 年自然语言处理经验方法会议系统演示文集},\n  pages={120--126},\n  year={2018}\n}\n```\n或者访问 [Google Scholar 链接](https:\u002F\u002Fscholar.google.com\u002Fscholar?cluster=5916903042122216495&hl=en&as_sdt=0,5) 查看其他引用方式。\n\n如果您想引用本仓库，可以使用以下 DOI 徽章： &nbsp;[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F122715432.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F122715432)\n\n点击徽章将跳转至页面，帮助您生成正确的 BibTeX 引用、JSON-LD 引用以及其他引用格式。\n\n## 许可证与署名\n本仓库采用 [此处](LICENSE.txt) 所列许可证授权。\n\n图标“[Seismic](https:\u002F\u002Fthenounproject.com\u002Fziman.jan\u002Fcollection\u002Fweather\u002F?i=1518266)”由 JohnnyZi 来自 [The Noun Project](https:\u002F\u002Fthenounproject.com) 创作。","# Magnitude 快速上手指南\n\nMagnitude 
是一个高性能的向量嵌入（Vector Embedding）实用工具库和文件格式，由 Plasticity 开发。它旨在作为 Gensim 的更轻量、更快速的替代方案，支持懒加载、内存映射和高效的相似度搜索，特别适用于处理大型预训练词向量模型（如 Word2Vec, GloVe, fastText 等）。\n\n## 环境准备\n\n*   **操作系统**：Linux, macOS, Windows (支持多进程和内存映射)\n*   **Python 版本**：Python 2.7 或 Python 3.x\n*   **核心依赖**：\n    *   `sqlite3` (内置于 Python 标准库，用于底层数据存储)\n    *   `numpy` (用于数值计算)\n    *   `scipy` (用于空间索引和相似度计算)\n    *   `pymagnitude` 包会自动处理大部分依赖安装。\n\n> **注意**：如果在 Google Colaboratory 环境中使用，由于依赖冲突问题，建议使用官方提供的专用安装脚本（见下文安装步骤）。\n\n## 安装步骤\n\n### 1. 常规安装 (本地环境)\n\n使用 `pip` 直接安装：\n\n```bash\n# Python 2.7\npip install pymagnitude\n\n# Python 3\npip3 install pymagnitude\n```\n\n*(国内用户若下载缓慢，可添加清华源加速：`pip3 install pymagnitude -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`)*\n\n### 2. Google Colab 专用安装\n\n如果在 Google Colab 中遇到依赖问题，请运行以下命令：\n\n```bash\n# Install Magnitude on Google Colab\n! echo \"Installing Magnitude.... (please wait, can take a while)\"\n! (curl https:\u002F\u002Fraw.githubusercontent.com\u002Fplasticityai\u002Fmagnitude\u002Fmaster\u002Finstall-colab.sh | \u002Fbin\u002Fbash 1>\u002Fdev\u002Fnull 2>\u002Fdev\u002Fnull)\n! echo \"Done installing Magnitude.\"\n```\n\n### 3. 转换模型格式 (可选)\n\nMagnitude 需要将常见的词向量文件（`.txt`, `.bin`, `.vec`, `.hdf5`）转换为高效的 `.magnitude` 格式。安装完成后，可使用命令行工具进行转换：\n\n```bash\n# 示例：将 GloVe txt 文件转换为 magnitude 格式\nmagnitude-converter -c glove.txt -o glove.magnitude\n```\n\n## 基本使用\n\n以下是使用 Magnitude 加载模型并查询向量的最简示例。\n\n### 1. 加载模型与查询向量\n\n假设你已经拥有一个转换好的 `.magnitude` 文件（例如 `glove.magnitude`）：\n\n```python\nfrom pymagnitude import Magnitude\n\n# 初始化对象 (支持懒加载，启动速度极快)\nvectors = Magnitude(\"glove.magnitude\")\n\n# 查询单个词的向量\nvector = vectors.query(\"king\")\nprint(vector.shape)  # 输出向量维度，例如 (300,)\n\n# 批量查询多个词\nbatch_vectors = vectors.query([\"king\", \"queen\", \"man\"])\nprint(batch_vectors.shape)  # 输出 (3, 300)\n```\n\n### 2. 
相似度搜索\n\n查找与给定词最相似的词：\n\n```python\n# 查找与 'king' 最相似的 5 个词\nsimilar_words = vectors.most_similar(\"king\", n=5)\n\nfor word, score in similar_words:\n    print(f\"{word}: {score}\")\n```\n\n### 3. 处理未登录词 (Out-of-Vocabulary)\n\nMagnitude 内置了简单的子词处理机制，可以处理拼写错误或未收录的词：\n\n```python\n# 即使 'kng' 不在词典中，也能尝试返回一个基于字符特征的向量\nvector = vectors.query(\"kng\") \n```\n\n### 4. 与深度学习框架结合 (简要示例)\n\nMagnitude 可以直接作为 Keras 或 PyTorch 的 Embedding 层数据源。\n\n**Keras 示例片段：**\n```python\nfrom pymagnitude.keras import MagnitudeEmbedding\n\n# 在 Keras 模型中直接使用\nembedding_layer = MagnitudeEmbedding(vectors, input_length=10, trainable=False)\n```\n\n**PyTorch 示例片段：**\n```python\nfrom pymagnitude.pytorch import MagnitudeDataset, MagnitudeLoader\n\n# 创建 DataLoader\ndataset = MagnitudeDataset(vectors, batch_size=32)\nloader = MagnitudeLoader(dataset, num_workers=4)\n```","某电商公司的 NLP 团队正在构建一个实时商品评论情感分析系统，需要快速加载包含数百万词汇的预训练词向量模型以支持高并发查询。\n\n### 没有 magnitude 时\n- **启动缓慢**：加载巨大的 Word2Vec 或 GloVe 文本文件需耗时数分钟，导致服务重启或扩容时等待时间过长。\n- **内存爆炸**：传统库（如 Gensim）往往需要将整个向量矩阵载入内存，极易触发 OOM（内存溢出）错误，迫使团队升级昂贵的高配服务器。\n- **查词僵化**：遇到用户评论中的拼写错误或未登录词（OOV）时，系统直接返回空值，导致情感判断准确率大幅下降。\n- **并发瓶颈**：在高并发请求下，简单的键值查询响应延迟高，难以满足实时性要求。\n\n### 使用 magnitude 后\n- **秒级启动**：利用 magnitude 专有的二进制文件格式，百万级向量模型可实现毫秒级加载，服务部署效率提升数十倍。\n- **内存友好**：支持内存映射（Memory Mapping）技术，仅按需读取数据，大幅降低内存占用，使模型能在普通配置服务器上流畅运行。\n- **智能容错**：内置模糊匹配算法，能自动处理拼写错误和未登录词，通过近似向量推算出合理结果，显著提升了鲁棒性。\n- **高速查询**：针对查询操作进行了底层优化，即使在多线程高并发场景下，也能保持极低的延迟，确保实时分析流畅无阻。\n\nmagnitude 通过高效的存储格式和智能查询机制，将沉重的向量加载与检索任务变得轻量、快速且健壮，是生产级 NLP 应用的理想加速器。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fplasticityai_magnitude_45e13ec2.png","plasticityai","Plasticity","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fplasticityai_8968bf38.png","The official GitHub account of 
Plasticity",null,"opensource@plasticity.ai","https:\u002F\u002Fwww.plasticity.ai","https:\u002F\u002Fgithub.com\u002Fplasticityai",[81,85],{"name":82,"color":83,"percentage":84},"Python","#3572A5",100,{"name":86,"color":87,"percentage":88},"Shell","#89e051",0,1658,122,"2026-04-06T10:05:34","MIT",1,"Linux, macOS, Windows","未说明","最低需求极低（测试显示仅 18KB - 342KB），支持内存映射处理超出物理内存的大型模型",{"notes":98,"python":99,"dependencies":100},"该工具基于 SQLite 构建，利用内存映射和 SIMD 指令实现高效运行。主要优势是内存占用极低且支持懒加载，无需将整个向量模型载入 RAM。在 Google Colab 上安装需使用专用脚本以解决依赖冲突。支持通过 HTTP 流式传输大型模型。","2.7, 3.x",[101,102,103,104],"sqlite3","numpy","scipy","h5py",[35,106,14],"音频",[108,109,110,111,112,113,114,115,116,117,118,119,120,121],"python","natural-language-processing","nlp","machine-learning","vectors","embeddings","word2vec","fasttext","glove","gensim","fast","memory-efficient","machine-learning-library","word-embeddings","2026-03-27T02:49:30.150509","2026-04-18T22:34:11.741613",[125,130,135,140,145,150],{"id":126,"question_zh":127,"answer_zh":128,"source_url":129},40583,"在 Windows 上使用 vectors.most_similar() 时遇到 PermissionError (WinError 32) 错误怎么办？","这是由于 Windows 上的垃圾回收机制或 NumPy 文件引用冲突导致的。该问题已在版本 0.1.14 中修复。请升级库到最新版本：`pip install pymagnitude --upgrade`。如果问题仍然存在，可能是旧版本缓存导致，请尝试清理缓存后重新安装。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F4",{"id":131,"question_zh":132,"answer_zh":133,"source_url":134},40584,"运行 most_similar 查询大量结果时报错 \"too many SQL variables\" 如何解决？","这是 SQLite 变量数量限制导致的已知问题。该问题已在版本 0.1.7 中修复。请将 pymagnitude 升级到 0.1.7 或更高版本：`pip install pymagnitude --upgrade`。升级后再次运行相同的代码即可正常工作。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F1",{"id":136,"question_zh":137,"answer_zh":138,"source_url":139},40585,"在中国地区安装 pymagnitude 速度极慢或超时怎么办？","由于默认下载源位于美国，中国用户下载二进制 wheel 包可能很慢。解决方案包括：\n1. 尝试手动下载特定版本的 wheel 文件（例如：https:\u002F\u002Fs3.amazonaws.com\u002Fmagnitude.plasticity.ai\u002Fwheelhouse\u002Fpymagnitude-0.1.119-cp36-cp36m-manylinux1_x86_64.whl）。\n2. 
使用 `pip install pymagnitude -vvv` 查看详细信息，确认是下载慢还是编译慢（SQLite 编译较慢）。\n3. 维护者已建议后续会通过 CDN 加速全球下载，或者支持加载离线的 .whl 文件进行安装。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F40",{"id":141,"question_zh":142,"answer_zh":143,"source_url":144},40586,"在虚拟环境中安装 pymagnitude 后导入报错 \"ModuleNotFoundError: No module named 'lz4'\" 怎么办？","这通常是因为依赖项未正确安装。请尝试以下步骤：\n1. 使用 verbose 模式重新安装以查看具体失败原因：`pip3 install pymagnitude -vvvv`。\n2. 手动安装缺失的依赖：`pip install lz4`。\n3. 确保使用的是最新版本的 pip (`pip install --upgrade pip`) 后再安装 pymagnitude。\n如果是在 CI\u002FCD (如 GitHub Actions) 中遇到此问题，请检查缓存策略是否导致依赖缺失。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F21",{"id":146,"question_zh":147,"answer_zh":148,"source_url":149},40587,"使用 pymagnitude.converter 添加 -a 标志构建近似最近邻索引时出现 \"Segmentation fault\" (段错误) 怎么办？","这通常是由于内存不足 (Out-of-Memory) 或数据类型溢出（如 signed int32 限制）引起的。建议：\n1. 在内存更大的服务器上运行转换任务。\n2. 检查输入向量数据的维度是否过大，尝试减少数据量或使用更小的索引参数。\n3. 这是一个已知的复杂问题，如果常规方法无效，可能需要等待官方针对大模型转换的优化更新或提交详细的复现步骤给维护者排查。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F34",{"id":151,"question_zh":152,"answer_zh":153,"source_url":154},40588,"在企业内部网络或有严格安全扫描的环境中安装 pymagnitude 挂起或失败怎么办？","这通常是因为安装脚本试图从外部 S3 服务器下载额外的二进制文件或依赖，而被防火墙拦截。解决方案：\n1. 不要直接使用 `pip install pymagnitude`，而是先手动下载对应的 `.whl` 文件和任何需要的额外二进制资源。\n2. 在离线环境下使用 `pip install \u003C本地路径>\u002Fpymagnitude-x.x.x-cpXX-cpXXm-xxx.whl` 进行安装。\n3. 
确保企业内部代理允许访问所需的资源地址，或者联系维护者获取完整的离线安装包。","https:\u002F\u002Fgithub.com\u002Fplasticityai\u002Fmagnitude\u002Fissues\u002F62",[156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232],{"id":157,"version":158,"summary_zh":76,"released_at":159},324072,"0.1.143","2020-05-25T11:26:09",{"id":161,"version":162,"summary_zh":76,"released_at":163},324073,"0.1.142","2020-05-25T09:38:10",{"id":165,"version":166,"summary_zh":76,"released_at":167},324074,"0.1.140","2020-05-25T07:46:28",{"id":169,"version":170,"summary_zh":76,"released_at":171},324075,"0.1.139","2020-05-25T07:29:11",{"id":173,"version":174,"summary_zh":76,"released_at":175},324076,"0.1.138","2020-05-25T07:17:52",{"id":177,"version":178,"summary_zh":76,"released_at":179},324077,"0.1.137","2020-05-25T07:12:12",{"id":181,"version":182,"summary_zh":76,"released_at":183},324078,"0.1.136","2020-05-25T06:55:10",{"id":185,"version":186,"summary_zh":76,"released_at":187},324079,"0.1.135","2020-05-25T06:27:25",{"id":189,"version":190,"summary_zh":76,"released_at":191},324080,"0.1.132","2020-05-25T05:22:51",{"id":193,"version":194,"summary_zh":76,"released_at":195},324081,"0.1.131","2020-05-25T05:14:51",{"id":197,"version":198,"summary_zh":76,"released_at":199},324082,"0.1.130","2020-05-25T05:03:24",{"id":201,"version":202,"summary_zh":76,"released_at":203},324083,"0.1.129","2020-05-25T01:29:45",{"id":205,"version":206,"summary_zh":76,"released_at":207},324084,"0.1.128","2020-05-25T00:47:23",{"id":209,"version":210,"summary_zh":76,"released_at":211},324085,"0.1.127","2020-05-24T23:26:33",{"id":213,"version":214,"summary_zh":76,"released_at":215},324086,"0.1.126","2020-05-24T23:19:24",{"id":217,"version":218,"summary_zh":76,"released_at":219},324087,"0.1.125","2020-05-24T23:13:46",{"id":221,"version":222,"summary_zh":76,"released_at":223},324088,"0.1.124","2020-05-24T23:07:15",{"id":225,"version":226,"summary_zh":76,"released_at":227},324089,"0.1.123","2020-05-24T23:01:23",{"id":229,"version":230,"summa
ry_zh":76,"released_at":231},324090,"0.1.122","2020-05-24T22:52:02",{"id":233,"version":234,"summary_zh":76,"released_at":235},324091,"0.1.121","2020-05-24T22:39:43"]