[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-QwenLM--Qwen3-Embedding":3,"tool-QwenLM--Qwen3-Embedding":64},[4,17,25,39,48,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":10,"last_commit_at":23,"category_tags":24,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":26,"name":27,"github_repo":28,"description_zh":29,"stars":30,"difficulty_score":10,"last_commit_at":31,"category_tags":32,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[33,34,35,36,14,37,15,13,38],"图像","数据工具","视频","插件","其他","音频",{"id":40,"name":41,"github_repo":42,"description_zh":43,"stars":44,"difficulty_score":45,"last_commit_at":46,"category_tags":47,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[14,33,13,15,37],{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":45,"last_commit_at":54,"category_tags":55,"status":16},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 
Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",74913,"2026-04-05T10:44:17",[15,33,13,37],{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":45,"last_commit_at":62,"category_tags":63,"status":16},2181,"OpenHands","OpenHands\u002FOpenHands","OpenHands 是一个专注于 AI 驱动开发的开源平台，旨在让智能体（Agent）像人类开发者一样理解、编写和调试代码。它解决了传统编程中重复性劳动多、环境配置复杂以及人机协作效率低等痛点，通过自动化流程显著提升开发速度。\n\n无论是希望提升编码效率的软件工程师、探索智能体技术的研究人员，还是需要快速原型验证的技术团队，都能从中受益。OpenHands 提供了灵活多样的使用方式：既可以通过命令行（CLI）或本地图形界面在个人电脑上轻松上手，体验类似 Devin 的流畅交互；也能利用其强大的 Python SDK 自定义智能体逻辑，甚至在云端大规模部署上千个智能体并行工作。\n\n其核心技术亮点在于模块化的软件智能体 SDK，这不仅构成了平台的引擎，还支持高度可组合的开发模式。此外，OpenHands 在 SWE-bench 基准测试中取得了 77.6% 的优异成绩，证明了其解决真实世界软件工程问题的能力。平台还具备完善的企业级功能，支持与 Slack、Jira 等工具集成，并提供细粒度的权限管理，适合从个人开发者到大型企业的各类用户场景。",70612,"2026-04-05T11:12:22",[15,14,13,36],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":68,"owner_location":68,"owner_email":79,"owner_twitter":80,"owner_website":81,"owner_url":82,"languages":83,"stars":92,"forks":93,"last_commit_at":94,"license":68,"difficulty_score":45,"env_os":95,"env_gpu":95,"env_ram":95,"env_deps":96,"category_tags":103,"github_topics":68,"view_count":45,"oss_zip_url":68,"oss_zip_packed_at":68,"status":16,"created_at":104,"updated_at":105,"faqs":106,"releases":135},736,"QwenLM\u002FQwen3-Embedding","Qwen3-Embedding",null,"Qwen3 Embedding 是通义千问家族最新推出的文本嵌入与重排序模型系列，旨在将自然语言精准转化为高维向量表示。它有效解决了传统检索系统中语义匹配偏差大、多语言支持不足以及长文本处理受限等挑战，显著提升了信息检索、文本分类及聚类的效果。\n\n对于希望构建智能搜索、知识库问答或进行跨语言数据分析的开发者与研究人员而言，Qwen3 Embedding 是理想选择。其亮点在于提供 0.6B 至 8B 多种参数规模，兼顾推理效率与模型性能。作为多语言榜单（MTEB）的领先者，它原生支持超过 100 种人类语言及编程代码，并具备 32K 超长上下文处理能力。此外，模型支持自定义指令以适配特定场景，灵活的向量维度定义让文本向量化更加高效，助力用户轻松实现高质量的语义理解与应用。","\n\n\u003Cp align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Fqianwen-res.oss-accelerate.aliyuncs.com\u002Flogo_qwen_embedding.png\" width=\"400\"\u002F>\n\u003Cp>\n\n\u003Cp align=\"center\">\n   &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002FQwen\u002Fqwen3-embedding-6841b2055b99c44d9a4c371f\">Huggingface\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fmodelscope.cn\u002Fcollections\u002FQwen3-Embedding-3edc3762d50f48\">ModelScope\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fqwenlm.github.io\u002Fblog\u002Fqwen3-embedding\u002F\">Blog\u003C\u002Fa> &nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.05176\">Arxiv\u003C\u002Fa> &nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fbailian.console.aliyun.com\u002F?tab=model#\u002Fmodel-market\u002Fdetail\u002Ftext-embedding-v4\">API\u003C\u002Fa> ｜ &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002FyPEP2vHTu4\">Discord\u003C\u002Fa> \n\u003C\u002Fp>\n\n# Qwen3 Embedding\n\n## Highlights\n\nThe Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. 
The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.\n\n**Exceptional Versatility**: The embedding model has achieved state-of-the-art performance across a wide range of downstream application evaluations. The 8B size embedding model ranks **No.1** in the MTEB multilingual leaderboard (as of June 5, 2025, score **70.58**), while the reranking model excels in various text retrieval scenarios.\n\n**Comprehensive Flexibility**: The Qwen3 Embedding series offers a full spectrum of sizes (from 0.6B to 8B) for both embedding and reranking models, catering to diverse use cases that prioritize efficiency and effectiveness. Developers can seamlessly combine these two modules. Additionally, the embedding model allows for flexible vector definitions across all dimensions, and both embedding and reranking models support user-defined instructions to enhance performance for specific tasks, languages, or scenarios.\n\n**Multilingual Capability**: The Qwen3 Embedding series offer support for over 100 languages, thanks to the multilingual capabilities of Qwen3 models. This includes various programming languages, and provides robust multilingual, cross-lingual, and code retrieval capabilities.\n\n\n## Qwen3 Embedding Series Model list\n\n| Model Type       | Models               | Size | Layers | Sequence Length | Embedding Dimension | MRL Support | Instruction Aware |\n|------------------|----------------------|------|--------|-----------------|---------------------|-------------|----------------|\n| Text Embedding   | [Qwen3-Embedding-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-0.6B) | 0.6B | 28     | 32K             | 1024                | Yes         | Yes            |\n| Text Embedding   | [Qwen3-Embedding-4B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-4B)   | 4B   | 36     | 32K             | 2560                | Yes         | Yes            |\n| Text Embedding   | [Qwen3-Embedding-8B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-8B)   | 8B   | 36     | 32K             | 4096                | Yes         | Yes            |\n| Text Reranking   | [Qwen3-Reranker-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-0.6B) | 0.6B | 28     | 32K             | -                   | -           | Yes            |\n| Text Reranking   | [Qwen3-Reranker-4B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-4B)   | 4B   | 36     | 32K             | -                   | -           | Yes            |\n| Text Reranking   | [Qwen3-Reranker-8B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-8B)   | 8B   | 36     | 32K             | -                   | -           | Yes            |\n\n> **Note**:\n> - `MRL (Matryoshka Representation Learning) Support` indicates whether the embedding model supports custom dimensions for the final embedding. \n> - `Instruction Aware` notes whether the embedding or reranking model supports customizing the input instruction according to different tasks.\n> - Our evaluation indicates that, for most downstream tasks, using instructions (instruct) typically yields an improvement of 1% to 5% compared to not using them. Therefore, we recommend that developers create tailored instructions specific to their tasks and scenarios. 
In multilingual contexts, we also advise users to write their instructions in English, as most instructions utilized during the model training process were originally written in English.\n\n### Multilingual Support\n\nQwen3-Embedding model series shares the multilingual support capabilities of the Qwen3 base model. \n\n\u003Cdetails>\n\u003Csummary>Click to expand the list of supported languages\u003C\u002Fsummary>\n\n| Language Family | Languages & Dialects |\n|---|---|\n| Indo-European | English, French, Portuguese, German, Romanian, Swedish, Danish, Bulgarian, Russian, Czech, Greek, Ukrainian, Spanish, Dutch, Slovak, Croatian, Polish, Lithuanian, Norwegian Bokmål, Norwegian Nynorsk, Persian, Slovenian, Gujarati, Latvian, Italian, Occitan, Nepali, Marathi, Belarusian, Serbian, Luxembourgish, Venetian, Assamese, Welsh, Silesian, Asturian, Chhattisgarhi, Awadhi, Maithili, Bhojpuri, Sindhi, Irish, Faroese, Hindi, Punjabi, Bengali, Oriya, Tajik, Eastern Yiddish, Lombard, Ligurian, Sicilian, Friulian, Sardinian, Galician, Catalan, Icelandic, Tosk Albanian, Limburgish, Dari, Afrikaans, Macedonian, Sinhala, Urdu, Magahi, Bosnian, Armenian |\n| Sino-Tibetan | Chinese (Simplified Chinese, Traditional Chinese, Cantonese), Burmese |\n| Afro-Asiatic | Arabic (Standard, Najdi, Levantine, Egyptian, Moroccan, Mesopotamian, Ta'izzi-Adeni, Tunisian), Hebrew, Maltese |\n| Austronesian | Indonesian, Malay, Tagalog, Cebuano, Javanese, Sundanese, Minangkabau, Balinese, Banjar, Pangasinan, Iloko, Waray (Philippines)  |\n| Dravidian | Tamil, Telugu, Kannada, Malayalam |\n| Turkic | Turkish, North Azerbaijani, Northern Uzbek, Kazakh, Bashkir, Tatar |\n| Tai-Kadai | Thai, Lao |\n| Uralic | Finnish, Estonian, Hungarian |\n| Austroasiatic | Vietnamese, Khmer |\n| Other | Japanese, Korean, Georgian, Basque, Haitian, Papiamento, Kabuverdianu, Tok Pisin, Swahili | \n\n\u003C\u002Fdetails>\n\n## Usage\n\nWith Transformers versions earlier than 4.51.0, you may encounter the following error:\n```\nKeyError: 'qwen3'\n```\n### Embedding Model\n\n#### Transformers Usage\n\n```python\n# Requires transformers>=4.51.0\n\nimport torch\nimport torch.nn.functional as F\n\nfrom torch import Tensor\nfrom transformers import AutoTokenizer, AutoModel\n\n\ndef last_token_pool(last_hidden_states: Tensor,\n                 attention_mask: Tensor) -> Tensor:\n    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n    if left_padding:\n        return last_hidden_states[:, -1]\n    else:\n        sequence_lengths = attention_mask.sum(dim=1) - 1\n        batch_size = last_hidden_states.shape[0]\n        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\n\n\ndef get_detailed_instruct(task_description: str, query: str) -> str:\n    return f'Instruct: {task_description}\\nQuery:{query}'\n\n# Each query must come with a one-sentence instruction that describes the task\ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\n    get_detailed_instruct(task, 'What is the capital of China?'),\n    get_detailed_instruct(task, 'Explain gravity')\n]\n# No need to add instruction for retrieval documents\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\n]\ninput_texts = queries + documents\n\ntokenizer = AutoTokenizer.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B', padding_side='left')\nmodel = AutoModel.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B')\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\n# model = AutoModel.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B', attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16).cuda()\n\nmax_length = 8192\n\n# Tokenize the input texts\nbatch_dict = tokenizer(\n    input_texts,\n    padding=True,\n    truncation=True,\n    max_length=max_length,\n    return_tensors=\"pt\",\n)\nbatch_dict.to(model.device)\nwith torch.no_grad():\n    outputs = model(**batch_dict)\n    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])\n\n    # normalize embeddings\n    embeddings = F.normalize(embeddings, p=2, dim=1)\n    scores = (embeddings[:2] @ embeddings[2:].T)\n\nprint(scores.tolist())\n# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]\n```\n\n#### vLLM Usage \n```python\n# Requires vllm>=0.8.5\nimport torch\nimport vllm\nfrom vllm import LLM\n\ndef get_detailed_instruct(task_description: str, query: str) -> str:\n    return f'Instruct: {task_description}\\nQuery:{query}'\n\n# Each query must come with a one-sentence instruction that describes the task\ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\n    get_detailed_instruct(task, 'What is the capital of China?'),\n    get_detailed_instruct(task, 'Explain gravity')\n]\n# No need to add instruction for retrieval documents\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\n]\ninput_texts = queries + documents\n\nmodel = LLM(model=\"Qwen\u002FQwen3-Embedding-0.6B\", task=\"embed\")\n\noutputs = model.embed(input_texts)\nembeddings = torch.tensor([o.outputs.embedding for o in outputs])\nscores = (embeddings[:2] @ embeddings[2:].T)\nprint(scores.tolist())\n# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]\n```\n\n#### Sentence Transformers Usage\n```python\n# Requires transformers>=4.51.0\n# Requires sentence-transformers>=2.7.0\n\nfrom sentence_transformers import SentenceTransformer\n\n# Load the model\nmodel = SentenceTransformer(\"Qwen\u002FQwen3-Embedding-0.6B\")\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving,\n# together with setting `padding_side` to \"left\":\n# model = SentenceTransformer(\n#     \"Qwen\u002FQwen3-Embedding-0.6B\",\n#     model_kwargs={\"attn_implementation\": \"flash_attention_2\", \"device_map\": \"auto\"},\n#     tokenizer_kwargs={\"padding_side\": \"left\"},\n# )\n\n# The queries and documents to embed\nqueries = [\n    \"What is the capital of China?\",\n    \"Explain gravity\",\n]\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\nwith torch.no_grad():\n    # Encode the queries and documents. 
Note that queries benefit from using a prompt\n    # Here we use the prompt called \"query\" stored under `model.prompts`, but you can\n    # also pass your own prompt via the `prompt` argument\n    query_embeddings = model.encode(queries, prompt_name=\"query\")\n    document_embeddings = model.encode(documents)\n\n    # Compute the (cosine) similarity between the query and document embeddings\n    similarity = model.similarity(query_embeddings, document_embeddings)\n\nprint(similarity)\n# tensor([[0.7646, 0.1414], [0.1355, 0.6000]])\n```\n### Reranker Model\n\n#### Transformers Usage\n\n```python\n# Requires transformers>=4.51.0\nimport torch\nfrom transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM\n\ndef format_instruction(instruction, query, doc):\n    if instruction is None:\n        instruction = 'Given a web search query, retrieve relevant passages that answer the query'\n    output = \"\u003CInstruct>: {instruction}\\n\u003CQuery>: {query}\\n\u003CDocument>: {doc}\".format(instruction=instruction,query=query, doc=doc)\n    return output\n\ndef process_inputs(pairs):\n    inputs = tokenizer(\n        pairs, padding=False, truncation='longest_first',\n        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)\n    )\n    for i, ele in enumerate(inputs['input_ids']):\n        inputs['input_ids'][i] = prefix_tokens + ele + suffix_tokens\n    inputs = tokenizer.pad(inputs, padding=True, return_tensors=\"pt\", max_length=max_length)\n    for key in inputs:\n        inputs[key] = inputs[key].to(model.device)\n    return inputs\n\n@torch.no_grad()\ndef compute_logits(inputs, **kwargs):\n    batch_scores = model(**inputs).logits[:, -1, :]\n    true_vector = batch_scores[:, token_true_id]\n    false_vector = batch_scores[:, token_false_id]\n    batch_scores = torch.stack([false_vector, true_vector], dim=1)\n    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)\n    scores = batch_scores[:, 1].exp().tolist()\n    return scores\n\ntokenizer = AutoTokenizer.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\", padding_side='left')\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\").eval()\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\n# model = AutoModelForCausalLM.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").cuda().eval()\n\ntoken_false_id = tokenizer.convert_tokens_to_ids(\"no\")\ntoken_true_id = tokenizer.convert_tokens_to_ids(\"yes\")\nmax_length = 8192\n\nprefix = \"\u003C|im_start|>system\\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \\\"yes\\\" or \\\"no\\\".\u003C|im_end|>\\n\u003C|im_start|>user\\n\"\nsuffix = \"\u003C|im_end|>\\n\u003C|im_start|>assistant\\n\u003Cthink>\\n\\n\u003C\u002Fthink>\\n\\n\"\nprefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)\nsuffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)\n        \ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\"What is the capital of China?\",\n    \"Explain gravity\",\n]\n\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\npairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]\n\n# Tokenize the input texts\ninputs = process_inputs(pairs)\nscores = compute_logits(inputs)\n\nprint(\"scores: \", scores)\n```\n\n#### vLLM Usage \n\n```python\n# Requires vllm>=0.8.5\nimport logging\nfrom typing import Dict, Optional, List\n\nimport json\nimport logging\n\nimport torch\n\nfrom transformers import AutoTokenizer, is_torch_npu_available\nfrom vllm import LLM, SamplingParams\nfrom vllm.distributed.parallel_state import destroy_model_parallel\nimport gc\nimport math\nfrom vllm.inputs.data import TokensPrompt\n\ndef format_instruction(instruction, query, doc):\n    text = [\n        {\"role\": \"system\", \"content\": \"Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \\\"yes\\\" or \\\"no\\\".\"},\n        {\"role\": \"user\", \"content\": f\"\u003CInstruct>: {instruction}\\n\\n\u003CQuery>: {query}\\n\\n\u003CDocument>: {doc}\"}\n    ]\n    return text\n\ndef process_inputs(pairs, instruction, max_length, suffix_tokens):\n    messages = [format_instruction(instruction, query, doc) for query, doc in pairs]\n    messages =  tokenizer.apply_chat_template(\n        messages, tokenize=True, add_generation_prompt=False, enable_thinking=False\n    )\n    messages = [ele[:max_length] + suffix_tokens for ele in messages]\n    messages = [TokensPrompt(prompt_token_ids=ele) for ele in messages]\n    return messages\n\ndef compute_logits(model, messages, sampling_params, true_token, false_token):\n    outputs = model.generate(messages, sampling_params, use_tqdm=False)\n    scores = []\n    for i in range(len(outputs)):\n        final_logits = outputs[i].outputs[0].logprobs[-1]\n        token_count = len(outputs[i].outputs[0].token_ids)\n        if true_token not in final_logits:\n            true_logit = -10\n        else:\n            true_logit = final_logits[true_token].logprob\n        if false_token not in final_logits:\n            false_logit = -10\n        else:\n            false_logit = final_logits[false_token].logprob\n        true_score = math.exp(true_logit)\n        false_score = math.exp(false_logit)\n        score = true_score \u002F (true_score + false_score)\n        scores.append(score)\n    return scores\n\nnumber_of_gpu = torch.cuda.device_count()\ntokenizer = AutoTokenizer.from_pretrained('Qwen\u002FQwen3-Reranker-0.6B')\nmodel = LLM(model='Qwen\u002FQwen3-Reranker-0.6B', tensor_parallel_size=number_of_gpu, max_model_len=10000, enable_prefix_caching=True, gpu_memory_utilization=0.8)\ntokenizer.padding_side = \"left\"\ntokenizer.pad_token = tokenizer.eos_token\nsuffix = \"\u003C|im_end|>\\n\u003C|im_start|>assistant\\n\u003Cthink>\\n\\n\u003C\u002Fthink>\\n\\n\"\nmax_length=8192\nsuffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)\ntrue_token = tokenizer(\"yes\", add_special_tokens=False).input_ids[0]\nfalse_token = tokenizer(\"no\", add_special_tokens=False).input_ids[0]\nsampling_params = SamplingParams(temperature=0, \n    max_tokens=1,\n    logprobs=20, \n    allowed_token_ids=[true_token, false_token],\n)\n\n        \ntask = 'Given a web search query, retrieve relevant passages that answer the query'\nqueries = [\"What is the capital of China?\",\n    \"Explain gravity\",\n]\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts 
two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\npairs = list(zip(queries, documents))\ninputs = process_inputs(pairs, task, max_length-len(suffix_tokens), suffix_tokens)\nscores = compute_logits(model, inputs, sampling_params, true_token, false_token)\nprint('scores', scores)\n\ndestroy_model_parallel()\n```\n\n\n📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.\n\nFor more usage examples, see the code in the [examples]() sections.\n\n## Training\n\nThe code and instructions for training Qwen3-Embedding models can be found in the [training docs](docs\u002Ftraining).\n\n## Evaluation\n\nThe code for reproducing the following results is available in the [evaluation]() section.\n\n### MTEB (Multilingual)\n\n| Model                            |  Size   |  Mean (Task)  | Mean (Type) | Bitxt Mining | Class. | Clust. | Inst. Retri. | Multi. Class. | Pair. Class. | Rerank | Retri. | STS  |\n|----------------------------------|:-------:|:-------------:|:-------------:|:--------------:|:--------:|:--------:|:--------------:|:---------------:|:--------------:|:--------:|:--------:|:------:|\n| NV-Embed-v2                      |   7B    |     56.29     | 49.58       | 57.84        | 57.29  | 40.80  | 1.04         | 18.63         | 78.94        | 63.82  | 56.72  | 71.10|\n| GritLM-7B                        |   7B    |     60.92     | 53.74       | 70.53        | 61.83  | 49.75  | 3.45         | 22.77         | 79.94        | 63.78  | 58.31  | 73.33|\n| BGE-M3                           |  0.6B   |     59.56     | 52.18       | 79.11        | 60.35  | 40.88  | -3.11        | 20.1          | 80.76        | 62.79  | 54.60  | 74.12|\n| multilingual-e5-large-instruct   |  0.6B   |     63.22     | 55.08       | 80.13        | 64.94  | 50.75  | -0.40        | 22.91         | 80.86        | 62.61  | 57.12  | 76.81|\n| gte-Qwen2-1.5B-instruct          |  1.5B   |     59.45     | 52.69       | 62.51        | 58.32  | 52.05  | 0.74         | 24.02         | 81.58        | 62.58  | 60.78  | 71.61|\n| gte-Qwen2-7b-Instruct            |   7B    |     62.51     | 55.93       | 73.92        | 61.55  | 52.77  | 4.94         | 25.48         | 85.13        | 65.55  | 60.08  | 73.98|\n| text-embedding-3-large           |    -    |     58.93     | 51.41       | 62.17        | 60.27  | 46.89  | -2.68        | 22.03         | 79.17        | 63.89  | 59.27  | 71.68|\n| Cohere-embed-multilingual-v3.0   |    -    |     61.12     | 53.23       | 70.50        | 62.95  | 46.89  | -1.89        | 22.74         | 79.88        | 64.07  | 59.16  | 74.80|\n| gemini-embedding-exp-03-07       |    -    |     68.37     | 59.59       | 79.28        | 71.82  | 54.59  | 5.18         | **29.16**     | 83.63        | 65.58  | 67.71  | 79.40|\n| **Qwen3-Embedding-0.6B**         |  0.6B   |     64.33     | 56.00       | 72.22        | 66.83  | 52.33  | 5.09         | 24.59         | 80.83        | 61.41  | 64.64  | 76.17|\n| **Qwen3-Embedding-4B**           |   4B    |     69.45     | 60.86       | 79.36        | 72.33  | 57.15  | **11.56**    | 26.77         | 85.05        | 65.08  | 69.60  | 80.86|\n| **Qwen3-Embedding-8B**           |   8B    |   **70.58**   | **61.69**   | **80.89**    | **74.00** | **57.65** | 
10.06      | 28.66         | **86.40**    | **65.63** | **70.88** | **81.08** |\n\n> **Note**: For compared models, the scores are retrieved from MTEB online [leaderboard](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard) on June 6th, 2025.\n\n### MTEB (Eng v2)\n\n| MTEB English \u002F Models          |  Param.  | Mean(Task) | Mean(Type) | Class. | Clust. | Pair Class. | Rerank. | Retri. | STS   | Summ. |\n|--------------------------------|:--------:|:------------:|:------------:|:--------:|:--------:|:-------------:|:---------:|:--------:|:-------:|:-------:|\n| multilingual-e5-large-instruct |   0.6B   | 65.53      | 61.21      | 75.54  | 49.89  | 86.24       | 48.74   | 53.47  | 84.72 | 29.89 |\n| NV-Embed-v2                    |   7.8B   | 69.81      | 65.00      | 87.19  | 47.66  | 88.69       | 49.61   | 62.84  | 83.82 | 35.21 |\n| GritLM-7B                      |   7.2B   | 67.07      | 63.22      | 81.25  | 50.82  | 87.29       | 49.59   | 54.95  | 83.03 | 35.65 |\n| gte-Qwen2-1.5B-instruct        |   1.5B   | 67.20      | 63.26      | 85.84  | 53.54  | 87.52       | 49.25   | 50.25  | 82.51 | 33.94 |\n| stella_en_1.5B_v5              |   1.5B   | 69.43      | 65.32      | 89.38  | 57.06  | 88.02       | 50.19   | 52.42  | 83.27 | 36.91 |\n| gte-Qwen2-7B-instruct          |   7.6B   | 70.72      | 65.77      | 88.52  | 58.97  | 85.9        | 50.47   | 58.09  | 82.69 | 35.74 |\n| gemini-embedding-exp-03-07     |    -     | 73.3       | 67.67      | 90.05  | **59.39**  | **87.7**   | 48.59   | 64.35  | 85.29 | **38.28** |\n| **Qwen3-Embedding-0.6B**       |   0.6B   | 70.70      | 64.88      | 85.76  | 54.05  | 84.37       | 48.18   | 61.83  | 86.57 | 33.43 |\n| **Qwen3-Embedding-4B**         |    4B    | 74.60      | 68.10      | 89.84  | 57.51  | 87.01       | 50.76   | 68.46  | **88.72** | 34.39 |\n| **Qwen3-Embedding-8B**         |    8B    | **75.22**  | **68.71**  | **90.43** | 58.57  | 87.52       | **51.56**   | **69.44**  | 88.58 | 34.83 |\n\n### C-MTEB (MTEB Chinese)\n\n| C-MTEB           | Param. | Mean(Task) | Mean(Type) | Class. | Clust. | Pair Class. | Rerank. | Retr. 
| STS   |\n|------------------|--------|------------|------------|--------|--------|-------------|---------|-------|-------|\n| multilingual-e5-large-instruct | 0.6B   | 58.08      | 58.24      | 69.80  | 48.23  | 64.52       | 57.45   | 63.65 | 45.81 |\n| bge-multilingual-gemma2 | 9B     | 67.64      |68.52   | 75.31      | 59.30  | 86.67  | 68.28       | 73.73   | 55.19 | \n| gte-Qwen2-1.5B-instruct  | 1.5B   | 67.12      | 67.79      | 72.53  | 54.61  | 79.5        | 68.21   | 71.86 | 60.05 |\n| gte-Qwen2-7B-instruct    | 7.6B   | 71.62      | 72.19      | 75.77  | 66.06  | 81.16       | 69.24   | 75.70 | 65.20 |\n| ritrieve_zh_v1          | 0.3B   | 72.71      | 73.85      | 76.88  | 66.5   | **85.98**       | **72.86**   | 76.97 | **63.92** |\n| **Qwen3-Embedding-0.6B** | 0.6B   | 66.33      | 67.45      | 71.40  | 68.74  | 76.42       | 62.58   | 71.03 | 54.52 |\n| **Qwen3-Embedding-4B**   | 4B     | 72.27      | 73.51      | 75.46  | 77.89  | 83.34       | 66.05   | 77.03 | 61.26 |\n| **Qwen3-Embedding-8B**   | 8B     | **73.84**  | **75.00**  | **76.97**  | **80.08**  | 84.23       | 66.99   | **78.21** | 63.53 |\n\n### Reranker\n| Model                              | Param  | MTEB-R  | CMTEB-R | MMTEB-R | MLDR   | MTEB-Code | FollowIR |\n|------------------------------------|--------|---------|---------|---------|--------|-----------|----------|\n| **Qwen3-Embedding-0.6B**               | 0.6B   | 61.82   | 71.02   | 64.64   | 50.26  | 75.41     | 5.09     |\n| Jina-multilingual-reranker-v2-base | 0.3B   | 58.22   | 63.37   | 63.73   | 39.66  | 58.98     | -0.68    |\n| gte-multilingual-reranker-base                      | 0.3B   | 59.51   | 74.08   | 59.44   | 66.33  | 54.18     | -1.64    |\n| BGE-reranker-v2-m3                 | 0.6B   | 57.03   | 72.16   | 58.36   | 59.51  | 41.38     | -0.01    |\n| **Qwen3-Reranker-0.6B**                | 0.6B   | 65.80   | 71.31   | 66.36   | 67.28  | 73.42     | 5.41     |\n| **Qwen3-Reranker-4B**                  | 4B   | **69.76** | 75.94   | 72.74   | 69.97  | 81.20     | **14.84** |\n| **Qwen3-Reranker-8B**                  | 8B     | 69.02   | **77.45** | **72.94** | **70.19** | **81.22** | 8.05     |\n\n> **Note**:  \n> - Evaluation results for reranking models. 
We use the retrieval subsets of MTEB(eng, v2), MTEB(cmn, v1), MMTEB and MTEB (Code), which are MTEB-R, CMTEB-R, MMTEB-R and MTEB-Code.\n> - All scores are our runs based on the top-100 candidates retrieved by dense embedding model [Qwen3-Embedding-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-0.6B).\n\n\n## Citation\nIf you find our work helpful, feel free to give us a cite.\n\n```\n@article{qwen3embedding,\n  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},\n  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},\n  journal={arXiv preprint arXiv:2506.05176},\n  year={2025}\n}\n```\n","\u003Cp align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Fqianwen-res.oss-accelerate.aliyuncs.com\u002Flogo_qwen_embedding.png\" width=\"400\"\u002F>\n\u003Cp>\n\n\u003Cp align=\"center\">\n   &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002FQwen\u002Fqwen3-embedding-6841b2055b99c44d9a4c371f\">Huggingface\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fmodelscope.cn\u002Fcollections\u002FQwen3-Embedding-3edc3762d50f48\">ModelScope\u003C\u002Fa>&nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fqwenlm.github.io\u002Fblog\u002Fqwen3-embedding\u002F\">博客\u003C\u002Fa> &nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.05176\">Arxiv\u003C\u002Fa> &nbsp&nbsp | &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fbailian.console.aliyun.com\u002F?tab=model#\u002Fmodel-market\u002Fdetail\u002Ftext-embedding-v4\">API\u003C\u002Fa> ｜ &nbsp&nbsp \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002FyPEP2vHTu4\">Discord\u003C\u002Fa> \n\u003C\u002Fp>\n\n# Qwen3 Embedding\n\n## 亮点\n\nQwen3 嵌入模型系列是 Qwen 家族最新的专有模型，专门设计用于文本嵌入（Text Embedding）和排序（Ranking）任务。基于 Qwen3 系列的稠密基础模型构建，它提供了一系列全面的文本嵌入和重排序（Reranking）模型，包含多种尺寸（0.6B、4B 和 8B）。该系列继承了其基础模型卓越的多语言能力、长文本理解和推理技能。Qwen3 嵌入系列代表了在多个文本嵌入和排序任务中的重大进步，包括文本检索、代码检索、文本分类、文本聚类和双语文本挖掘（Bitext Mining）。\n\n**卓越的通用性**：该嵌入模型在广泛的下游应用评估中实现了最先进的性能。8B 尺寸的嵌入模型在 MTEB 多语言排行榜上排名**第 1**（截至 2025 年 6 月 5 日，得分**70.58**），而重排序模型在各种文本检索场景中表现出色。\n\n**全面的灵活性**：Qwen3 嵌入系列提供了从 0.6B 到 8B 的全尺寸范围的嵌入和重排序模型，以满足优先考虑效率和效果的多样化用例。开发者可以无缝地组合这两个模块。此外，嵌入模型允许在所有维度上进行灵活的向量定义，且嵌入和重排序模型均支持用户自定义指令，以增强特定任务、语言或场景的性能。\n\n**多语言能力**：得益于 Qwen3 模型的多语言能力，Qwen3 嵌入系列支持超过 100 种语言。这包括各种编程语言，并提供强大的多语言、跨语言和代码检索能力。\n\n\n## Qwen3 Embedding 系列模型列表\n\n| 模型类型       | 模型               | 大小 | 层数 | 序列长度 | 嵌入维度 | MRL 支持 | 指令感知 |\n|------------------|----------------------|------|--------|-----------------|---------------------|-------------|----------------|\n| 文本嵌入   | [Qwen3-Embedding-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-0.6B) | 0.6B | 28     | 32K             | 1024                | 是         | 是            |\n| 文本嵌入   | [Qwen3-Embedding-4B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-4B)   | 4B   | 36     | 32K             | 2560                | 是         | 是            |\n| 文本嵌入   | [Qwen3-Embedding-8B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-8B)   | 8B   | 36     | 32K             | 4096                | 是         | 是            |\n| 文本重排序   | [Qwen3-Reranker-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-0.6B) | 0.6B | 28     | 32K             | -                   | -           | 是            |\n| 文本重排序   | 
[Qwen3-Reranker-4B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-4B)   | 4B   | 36     | 32K             | -                   | -           | 是            |\n| 文本重排序   | [Qwen3-Reranker-8B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Reranker-8B)   | 8B   | 36     | 32K             | -                   | -           | 是            |\n\n> **注意**：\n> - `MRL (Matryoshka Representation Learning) 支持` 表示嵌入模型是否支持最终嵌入的自定义维度。 \n> - `指令感知` 注明嵌入或重排序模型是否支持根据不同任务自定义输入指令。\n> - 我们的评估表明，对于大多数下游任务，使用指令（instruct）通常比不使用能带来 1% 到 5% 的提升。因此，我们建议开发者创建针对其任务和场景的定制指令。在多语言环境中，我们也建议用户使用英文编写指令，因为在模型训练过程中使用的多数指令最初是用英文编写的。\n\n### 多语言支持\n\nQwen3-Embedding 模型系列共享 Qwen3 基础模型的多语言支持能力。 \n\n\u003Cdetails>\n\u003Csummary>点击展开支持的语言列表\u003C\u002Fsummary>\n\n| 语言家族 | 语言及方言 |\n|---|---|\n| 印欧语系 | 英语，法语，葡萄牙语，德语，罗马尼亚语，瑞典语，丹麦语，保加利亚语，俄语，捷克语，希腊语，乌克兰语，西班牙语，荷兰语，斯洛伐克语，克罗地亚语，波兰语，立陶宛语，挪威书面语，挪威新挪威语，波斯语，斯洛文尼亚语，古吉拉特语，拉脱维亚语，意大利语，奥克语，尼泊尔语，马拉地语，白俄罗斯语，塞尔维亚语，卢森堡语，威尼斯语，阿萨姆语，威尔士语，西里西亚语，阿斯图里亚斯语，恰蒂斯加尔语，阿瓦德语，迈蒂利语，博杰普尔语，信德语，爱尔兰语，法罗语，印地语，旁遮普语，孟加拉语，奥里亚语，塔吉克语，东意第绪语，伦巴第语，利古里亚语，西西里语，弗留利语，撒丁语，加利西亚语，加泰罗尼亚语，冰岛语，托斯克阿尔巴尼亚语，林堡语，达里语，南非荷兰语，马其顿语，僧伽罗语，乌尔都语，摩揭陀语，波斯尼亚语，亚美尼亚语 |\n| 汉藏语系 | 中文（简体中文，繁体中文，粤语），缅甸语 |\n| 闪含语系 | 阿拉伯语（标准，纳季迪，黎凡特，埃及，摩洛哥，美索不达米亚，塔伊兹-阿登，突尼斯），希伯来语，马耳他语 |\n| 南岛语系 | 印度尼西亚语，马来语，菲律宾语，宿务语，爪哇语，巽他语，米南加保语，巴厘语，班贾尔语，帕纳辛语，伊洛卡诺语，瓦雷语（菲律宾） |\n| 达罗毗荼语系 | 泰米尔语，泰卢固语，卡纳达语，马拉雅拉姆语 |\n| 突厥语系 | 土耳其语，北阿塞拜疆语，北乌兹别克语，哈萨克语，巴什基尔语，鞑靼语 |\n| 台-卡岱语系 | 泰语，老挝语 |\n| 乌拉尔语系 | 芬兰语，爱沙尼亚语，匈牙利语 |\n| 南亚语系 | 越南语，高棉语 |\n| 其他 | 日语，韩语，格鲁吉亚语，巴斯克语，海地语，帕皮阿门托语，佛得角克里奥尔语，巴布亚皮钦语，斯瓦希里语 | \n\n\u003C\u002Fdetails>\n\n## 使用方法\n\n如果使用早于 4.51.0 版本的 Transformers，您可能会遇到以下错误：\n```\nKeyError: 'qwen3'\n```\n### 嵌入模型\n\n#### Transformers 用法\n\n```python\n\n# 需要 transformers>=4.51.0\n\nimport torch\nimport torch.nn.functional as F\n\nfrom torch import Tensor\nfrom transformers import AutoTokenizer, AutoModel\n\n\ndef last_token_pool(last_hidden_states: Tensor,\n                 attention_mask: Tensor) -> Tensor:\n    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n    if left_padding:\n        return last_hidden_states[:, -1]\n    else:\n        sequence_lengths = attention_mask.sum(dim=1) - 1\n        batch_size = last_hidden_states.shape[0]\n        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\n\n\ndef get_detailed_instruct(task_description: str, query: str) -> str:\n    return f'Instruct: {task_description}\\nQuery:{query}'\n\n# Each query must come with a one-sentence instruction that describes the task\ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\n    get_detailed_instruct(task, 'What is the capital of China?'),\n    get_detailed_instruct(task, 'Explain gravity')\n]\n# No need to add instruction for retrieval documents\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\n]\ninput_texts = queries + documents\n\ntokenizer = AutoTokenizer.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B', padding_side='left')\nmodel = AutoModel.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B')\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\n# model = AutoModel.from_pretrained('Qwen\u002FQwen3-Embedding-0.6B', attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16).cuda()\n\nmax_length = 8192\n\n# Tokenize the input texts\nbatch_dict = tokenizer(\n    input_texts,\n    padding=True,\n    truncation=True,\n    max_length=max_length,\n    return_tensors=\"pt\",\n)\nbatch_dict.to(model.device)\nwith torch.no_grad():\n    outputs = model(**batch_dict)\n    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])\n\n    # normalize embeddings\n    embeddings = F.normalize(embeddings, p=2, dim=1)\n    scores = (embeddings[:2] @ embeddings[2:].T)\n\nprint(scores.tolist())\n# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]\n```\n\n#### vLLM 使用 \n```python\n# Requires vllm>=0.8.5\nimport torch\nimport vllm\nfrom vllm import LLM\n\ndef get_detailed_instruct(task_description: str, query: str) -> str:\n    return f'Instruct: {task_description}\\nQuery:{query}'\n\n# Each query must come with a one-sentence instruction that describes the task\ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\n    get_detailed_instruct(task, 'What is the capital of China?'),\n    get_detailed_instruct(task, 'Explain gravity')\n]\n# No need to add instruction for retrieval documents\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\n]\ninput_texts = queries + documents\n\nmodel = LLM(model=\"Qwen\u002FQwen3-Embedding-0.6B\", task=\"embed\")\n\noutputs = model.embed(input_texts)\nembeddings = torch.tensor([o.outputs.embedding for o in outputs])\nscores = (embeddings[:2] @ embeddings[2:].T)\nprint(scores.tolist())\n# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]\n```\n\n#### Sentence Transformers 使用\n```python\n# Requires transformers>=4.51.0\n# Requires sentence-transformers>=2.7.0\n\nfrom sentence_transformers import SentenceTransformer\n\n# Load the model\nmodel = SentenceTransformer(\"Qwen\u002FQwen3-Embedding-0.6B\")\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving,\n# together with setting `padding_side` to \"left\":\n# model = SentenceTransformer(\n#     \"Qwen\u002FQwen3-Embedding-0.6B\",\n#     model_kwargs={\"attn_implementation\": \"flash_attention_2\", \"device_map\": \"auto\"},\n#     tokenizer_kwargs={\"padding_side\": \"left\"},\n# )\n\n# The queries and documents to embed\nqueries = [\n    \"What is the capital of China?\",\n    \"Explain gravity\",\n]\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\nwith torch.no_grad():\n    # Encode the queries and documents. 
Note that queries benefit from using a prompt\n    # Here we use the prompt called \"query\" stored under `model.prompts`, but you can\n    # also pass your own prompt via the `prompt` argument\n    query_embeddings = model.encode(queries, prompt_name=\"query\")\n    document_embeddings = model.encode(documents)\n\n    # Compute the (cosine) similarity between the query and document embeddings\n    similarity = model.similarity(query_embeddings, document_embeddings)\n\nprint(similarity)\n# tensor([[0.7646, 0.1414], [0.1355, 0.6000]])\n```\n
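\n上文模型列表中的 `MRL 支持` 表示嵌入模型允许自定义最终嵌入的维度。下面给出一个示意性写法（非官方示例，`target_dim` 等名称仅为演示假设）：对完整嵌入截断到目标维度后重新做 L2 归一化即可；据项目 FAQ 的说明，0.6B 模型支持 32 到 1024 之间的任意整数维度。\n\n```python\n# 示意性示例：利用 MRL 将嵌入截断到自定义维度（假设沿用上文的 sentence-transformers 环境）\nimport torch.nn.functional as F\nfrom sentence_transformers import SentenceTransformer\n\nmodel = SentenceTransformer(\"Qwen\u002FQwen3-Embedding-0.6B\")\n\n# 与上文相同的查询与文档\nquery_embeddings = model.encode([\"What is the capital of China?\"], prompt_name=\"query\", convert_to_tensor=True)\ndocument_embeddings = model.encode([\"The capital of China is Beijing.\"], convert_to_tensor=True)\n\n# 截断到自定义维度后重新做 L2 归一化\ntarget_dim = 128  # 示例值；0.6B 模型完整维度为 1024\nquery_embeddings = F.normalize(query_embeddings[:, :target_dim], p=2, dim=1)\ndocument_embeddings = F.normalize(document_embeddings[:, :target_dim], p=2, dim=1)\n\n# 低维向量下的余弦相似度\nprint((query_embeddings @ document_embeddings.T).tolist())\n```\n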
### 重排序模型 (Reranker)\n\n#### Transformers 使用\n\n```python\n# Requires transformers>=4.51.0\nimport torch\nfrom transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM\n\ndef format_instruction(instruction, query, doc):\n    if instruction is None:\n        instruction = 'Given a web search query, retrieve relevant passages that answer the query'\n    output = \"\u003CInstruct>: {instruction}\\n\u003CQuery>: {query}\\n\u003CDocument>: {doc}\".format(instruction=instruction,query=query, doc=doc)\n    return output\n\ndef process_inputs(pairs):\n    inputs = tokenizer(\n        pairs, padding=False, truncation='longest_first',\n        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)\n    )\n    for i, ele in enumerate(inputs['input_ids']):\n        inputs['input_ids'][i] = prefix_tokens + ele + suffix_tokens\n    inputs = tokenizer.pad(inputs, padding=True, return_tensors=\"pt\", max_length=max_length)\n    for key in inputs:\n        inputs[key] = inputs[key].to(model.device)\n    return inputs\n\n@torch.no_grad()\ndef compute_logits(inputs, **kwargs):\n    batch_scores = model(**inputs).logits[:, -1, :]\n    true_vector = batch_scores[:, token_true_id]\n    false_vector = batch_scores[:, token_false_id]\n    batch_scores = torch.stack([false_vector, true_vector], dim=1)\n    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)\n    scores = batch_scores[:, 1].exp().tolist()\n    return scores\n\ntokenizer = AutoTokenizer.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\", padding_side='left')\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\").eval()\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\n# model = AutoModelForCausalLM.from_pretrained(\"Qwen\u002FQwen3-Reranker-0.6B\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").cuda().eval()\n\ntoken_false_id = tokenizer.convert_tokens_to_ids(\"no\")\ntoken_true_id = tokenizer.convert_tokens_to_ids(\"yes\")\nmax_length = 8192\n\nprefix = \"\u003C|im_start|>system\\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \\\"yes\\\" or \\\"no\\\".\u003C|im_end|>\\n\u003C|im_start|>user\\n\"\nsuffix = \"\u003C|im_end|>\\n\u003C|im_start|>assistant\\n\u003Cthink>\\n\\n\u003C\u002Fthink>\\n\\n\"\nprefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)\nsuffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)\n        \ntask = 'Given a web search query, retrieve relevant passages that answer the query'\n\nqueries = [\"What is the capital of China?\",\n    \"Explain gravity\",\n]\n\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\npairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]\n\n# Tokenize the input texts\ninputs = process_inputs(pairs)\nscores = compute_logits(inputs)\n\nprint(\"scores: \", scores)\n```\n\n#### vLLM 使用 \n\n```python\n# Requires vllm>=0.8.5\nimport logging\nfrom typing import Dict, Optional, List\n\nimport json\nimport logging\n\nimport torch\n\nfrom transformers import AutoTokenizer, is_torch_npu_available\nfrom vllm import LLM, SamplingParams\nfrom vllm.distributed.parallel_state import destroy_model_parallel\nimport gc\nimport math\nfrom vllm.inputs.data import TokensPrompt\n\ndef format_instruction(instruction, query, doc):\n    text = [\n        {\"role\": \"system\", \"content\": \"Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \\\"yes\\\" or \\\"no\\\".\"},\n        {\"role\": \"user\", \"content\": f\"\u003CInstruct>: {instruction}\\n\\n\u003CQuery>: {query}\\n\\n\u003CDocument>: {doc}\"}\n    ]\n    return text\n\ndef process_inputs(pairs, instruction, max_length, suffix_tokens):\n    messages = [format_instruction(instruction, query, doc) for query, doc in pairs]\n    messages =  tokenizer.apply_chat_template(\n        messages, tokenize=True, add_generation_prompt=False, enable_thinking=False\n    )\n    messages = [ele[:max_length] + suffix_tokens for ele in messages]\n    messages = [TokensPrompt(prompt_token_ids=ele) for ele in messages]\n    return messages\n\ndef compute_logits(model, messages, sampling_params, true_token, false_token):\n    outputs = model.generate(messages, sampling_params, use_tqdm=False)\n    scores = []\n    for i in range(len(outputs)):\n        final_logits = outputs[i].outputs[0].logprobs[-1]\n        token_count = len(outputs[i].outputs[0].token_ids)\n        if true_token not in final_logits:\n            true_logit = -10\n        else:\n            true_logit = final_logits[true_token].logprob\n        if false_token not in final_logits:\n            false_logit = -10\n        else:\n            false_logit = final_logits[false_token].logprob\n        true_score = math.exp(true_logit)\n        false_score = math.exp(false_logit)\n        score = true_score \u002F (true_score + false_score)\n        scores.append(score)\n    return scores\n\nnumber_of_gpu = torch.cuda.device_count()\ntokenizer = AutoTokenizer.from_pretrained('Qwen\u002FQwen3-Reranker-0.6B')\nmodel = LLM(model='Qwen\u002FQwen3-Reranker-0.6B', tensor_parallel_size=number_of_gpu, max_model_len=10000, enable_prefix_caching=True, gpu_memory_utilization=0.8)\ntokenizer.padding_side = \"left\"\ntokenizer.pad_token = tokenizer.eos_token\nsuffix = \"\u003C|im_end|>\\n\u003C|im_start|>assistant\\n\u003Cthink>\\n\\n\u003C\u002Fthink>\\n\\n\"\nmax_length=8192\nsuffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)\ntrue_token = tokenizer(\"yes\", add_special_tokens=False).input_ids[0]\nfalse_token = tokenizer(\"no\", add_special_tokens=False).input_ids[0]\nsampling_params = SamplingParams(temperature=0, \n    max_tokens=1,\n    logprobs=20, \n    allowed_token_ids=[true_token, false_token],\n)\n\n        \ntask = 'Given a web search query, retrieve relevant passages that answer the query'\nqueries = [\"What is the capital of China?\",\n    \"Explain gravity\",\n]\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts 
two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\npairs = list(zip(queries, documents))\ninputs = process_inputs(pairs, task, max_length-len(suffix_tokens), suffix_tokens)\nscores = compute_logits(model, inputs, sampling_params, true_token, false_token)\nprint('scores', scores)\n\ndestroy_model_parallel()\n```\n\n\n📌 **提示**：我们建议开发者根据具体的场景、任务和语言自定义 `instruct`。我们的测试表明，在大多数检索（retrieval）场景中，如果在查询侧不使用 `instruct`，检索性能可能会下降约 1% 到 5%。\n\n更多使用示例，请参见 [examples]() 部分中的代码。\n\n## 训练\n\nQwen3-Embedding 模型的训练代码和说明可在 [训练文档](docs\u002Ftraining) 中找到。\n\n## 评估\n\n复现以下结果的代码可在 [评估]() 部分找到。\n\n### MTEB（大规模文本嵌入基准，多语言）\n\n| 模型                            |  大小   |  平均（任务）  | 平均（类型） | Bitext Mining（双语文本挖掘） | 分类 | 聚类 | 实例检索 | 多分类 | 配对分类 | 重排序 | 检索 | STS（语义文本相似度） |\n|----------------------------------|:-------:|:-------------:|:-------------:|:--------------:|:--------:|:--------:|:--------------:|:---------------:|:--------------:|:--------:|:--------:|:------:|\n| NV-Embed-v2                      |   7B    |     56.29     | 49.58       | 57.84        | 57.29  | 40.80  | 1.04         | 18.63         | 78.94        | 63.82  | 56.72  | 71.10|\n| GritLM-7B                        |   7B    |     60.92     | 53.74       | 70.53        | 61.83  | 49.75  | 3.45         | 22.77         | 79.94        | 63.78  | 58.31  | 73.33|\n| BGE-M3                           |  0.6B   |     59.56     | 52.18       | 79.11        | 60.35  | 40.88  | -3.11        | 20.1          | 80.76        | 62.79  | 54.60  | 74.12|\n| multilingual-e5-large-instruct   |  0.6B   |     63.22     | 55.08       | 80.13        | 64.94  | 50.75  | -0.40        | 22.91         | 80.86        | 62.61  | 57.12  | 76.81|\n| gte-Qwen2-1.5B-instruct          |  1.5B   |     59.45     | 52.69       | 62.51        | 58.32  | 52.05  | 0.74         | 24.02         | 81.58        | 62.58  | 60.78  | 71.61|\n| gte-Qwen2-7b-Instruct            |   7B    |     62.51     | 55.93       | 73.92        | 61.55  | 52.77  | 4.94         | 25.48         | 85.13        | 65.55  | 60.08  | 73.98|\n| text-embedding-3-large           |    -    |     58.93     | 51.41       | 62.17        | 60.27  | 46.89  | -2.68        | 22.03         | 79.17        | 63.89  | 59.27  | 71.68|\n| Cohere-embed-multilingual-v3.0   |    -    |     61.12     | 53.23       | 70.50        | 62.95  | 46.89  | -1.89        | 22.74         | 79.88        | 64.07  | 59.16  | 74.80|\n| gemini-embedding-exp-03-07       |    -    |     68.37     | 59.59       | 79.28        | 71.82  | 54.59  | 5.18         | **29.16**     | 83.63        | 65.58  | 67.71  | 79.40|\n| **Qwen3-Embedding-0.6B**         |  0.6B   |     64.33     | 56.00       | 72.22        | 66.83  | 52.33  | 5.09         | 24.59         | 80.83        | 61.41  | 64.64  | 76.17|\n| **Qwen3-Embedding-4B**           |   4B    |     69.45     | 60.86       | 79.36        | 72.33  | 57.15  | **11.56**    | 26.77         | 85.05        | 65.08  | 69.60  | 80.86|\n| **Qwen3-Embedding-8B**           |   8B    |   **70.58**   | **61.69**   | **80.89**    | **74.00** | **57.65** | 10.06      | 28.66         | **86.40**    | **65.63** | **70.88** | **81.08** |\n\n> **注意**：对比模型的分数来源于 MTEB 在线 [排行榜](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard)，截至 2025 年 6 月 6 日。\n\n### MTEB（英文 v2）\n\n| MTEB 英文 \u002F 模型          |  参数量  | 平均（任务） | 平均（类型） | 分类 | 聚类 | 配对分类 | 重排序 | 检索 | 语义文本相似度 | 摘要 
|\n|--------------------------------|:--------:|:------------:|:------------:|:--------:|:--------:|:-------------:|:---------:|:--------:|:-------:|:-------:|\n| multilingual-e5-large-instruct |   0.6B   | 65.53      | 61.21      | 75.54  | 49.89  | 86.24       | 48.74   | 53.47  | 84.72 | 29.89 |\n| NV-Embed-v2                    |   7.8B   | 69.81      | 65.00      | 87.19  | 47.66  | 88.69       | 49.61   | 62.84  | 83.82 | 35.21 |\n| GritLM-7B                      |   7.2B   | 67.07      | 63.22      | 81.25  | 50.82  | 87.29       | 49.59   | 54.95  | 83.03 | 35.65 |\n| gte-Qwen2-1.5B-instruct        |   1.5B   | 67.20      | 63.26      | 85.84  | 53.54  | 87.52       | 49.25   | 50.25  | 82.51 | 33.94 |\n| stella_en_1.5B_v5              |   1.5B   | 69.43      | 65.32      | 89.38  | 57.06  | 88.02       | 50.19   | 52.42  | 83.27 | 36.91 |\n| gte-Qwen2-7B-instruct          |   7.6B   | 70.72      | 65.77      | 88.52  | 58.97  | 85.9        | 50.47   | 58.09  | 82.69 | 35.74 |\n| gemini-embedding-exp-03-07     |    -     | 73.3       | 67.67      | 90.05  | **59.39**  | **87.7**   | 48.59   | 64.35  | 85.29 | **38.28** |\n| **Qwen3-Embedding-0.6B**       |   0.6B   | 70.70      | 64.88      | 85.76  | 54.05  | 84.37       | 48.18   | 61.83  | 86.57 | 33.43 |\n| **Qwen3-Embedding-4B**         |    4B    | 74.60      | 68.10      | 89.84  | 57.51  | 87.01       | 50.76   | 68.46  | **88.72** | 34.39 |\n| **Qwen3-Embedding-8B**         |    8B    | **75.22**  | **68.71**  | **90.43** | 58.57  | 87.52       | **51.56**   | **69.44**  | 88.58 | 34.83 |\n\n### C-MTEB（MTEB 中文）\n\n| C-MTEB           | 参数量 | 平均（任务） | 平均（类型） | 分类 | 聚类 | 配对分类 | 重排序 | 检索 | 语义文本相似度 |\n|------------------|--------|------------|------------|--------|--------|-------------|---------|-------|-------|\n| multilingual-e5-large-instruct | 0.6B   | 58.08      | 58.24      | 69.80  | 48.23  | 64.52       | 57.45   | 63.65 | 45.81 |\n| bge-multilingual-gemma2 | 9B     | 67.64      |68.52   | 75.31      | 59.30  | 86.67  | 68.28       | 73.73   | 55.19 | \n| gte-Qwen2-1.5B-instruct  | 1.5B   | 67.12      | 67.79      | 72.53  | 54.61  | 79.5        | 68.21   | 71.86 | 60.05 |\n| gte-Qwen2-7B-instruct    | 7.6B   | 71.62      | 72.19      | 75.77  | 66.06  | 81.16       | 69.24   | 75.70 | 65.20 |\n| ritrieve_zh_v1          | 0.3B   | 72.71      | 73.85      | 76.88  | 66.5   | **85.98**       | **72.86**   | 76.97 | **63.92** |\n| **Qwen3-Embedding-0.6B** | 0.6B   | 66.33      | 67.45      | 71.40  | 68.74  | 76.42       | 62.58   | 71.03 | 54.52 |\n| **Qwen3-Embedding-4B**   | 4B     | 72.27      | 73.51      | 75.46  | 77.89  | 83.34       | 66.05   | 77.03 | 61.26 |\n| **Qwen3-Embedding-8B**   | 8B     | **73.84**  | **75.00**  | **76.97**  | **80.08**  | 84.23       | 66.99   | **78.21** | 63.53 |\n\n### 重排序模型\n| 模型                              | 参数量 | MTEB-R  | CMTEB-R | MMTEB-R | MLDR   | MTEB-Code | FollowIR |\n|------------------------------------|--------|---------|---------|---------|--------|-----------|----------|\n| **Qwen3-Embedding-0.6B**               | 0.6B   | 61.82   | 71.02   | 64.64   | 50.26  | 75.41     | 5.09     |\n| Jina-multilingual-reranker-v2-base | 0.3B   | 58.22   | 63.37   | 63.73   | 39.66  | 58.98     | -0.68    |\n| gte-multilingual-reranker-base                      | 0.3B   | 59.51   | 74.08   | 59.44   | 66.33  | 54.18     | -1.64    |\n| BGE-reranker-v2-m3                 | 0.6B   | 57.03   | 72.16   | 58.36   | 59.51  | 41.38     | -0.01    |\n| **Qwen3-Reranker-0.6B**               
 | 0.6B   | 65.80   | 71.31   | 66.36   | 67.28  | 73.42     | 5.41     |\n| **Qwen3-Reranker-4B**                  | 4B   | **69.76** | 75.94   | 72.74   | 69.97  | 81.20     | **14.84** |\n| **Qwen3-Reranker-8B**                  | 8B     | 69.02   | **77.45** | **72.94** | **70.19** | **81.22** | 8.05     |\n\n> **注意**：  \n> - 重排序模型的评估结果。我们使用了 MTEB(eng, v2)、MTEB(cmn, v1)、MMTEB 和 MTEB (Code) 的检索子集，分别为 MTEB-R、CMTEB-R、MMTEB-R 和 MTEB-Code。\n> - 所有分数均为我们基于密集嵌入模型 [Qwen3-Embedding-0.6B](https:\u002F\u002Fhuggingface.co\u002FQwen\u002FQwen3-Embedding-0.6B) 检索的前 100 个候选项的运行结果。\n\n\n## 引用\n如果您觉得我们的工作有帮助，欢迎引用。\n\n```\n@article{qwen3embedding,\n  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},\n  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},\n  journal={arXiv preprint arXiv:2506.05176},\n  year={2025}\n}\n```","# Qwen3-Embedding 快速上手指南\n\nQwen3-Embedding 是通义千问（Qwen）系列最新的专用文本嵌入与重排序模型，支持多语言、长上下文及指令感知能力。本指南将帮助您快速部署和运行该模型。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n- **Python 版本**：建议 Python 3.8 及以上。\n- **深度学习框架**：PyTorch (`torch`)。\n- **核心依赖**：`transformers` 版本必须 **>= 4.51.0**。\n  > ⚠️ **注意**：如果使用低于 4.51.0 版本的 transformers，可能会遇到 `KeyError: 'qwen3'` 错误。\n- **硬件建议**：推荐使用 GPU 以加速推理，尤其是运行 4B 或 8B 大尺寸模型时。\n\n## 安装步骤\n\n通过 pip 安装必要的依赖库。为了在中国大陆地区获得更快的下载速度，建议配置国内镜像源或使用 ModelScope。\n\n```bash\npip install torch transformers sentence-transformers accelerate\n```\n\n如需优化显存和计算速度，可额外安装 flash attention：\n```bash\npip install flash-attn --no-build-isolation\n```\n\n## 基本使用\n\n以下示例展示了如何使用 `Sentence Transformers` 库加载并生成嵌入向量。代码中已包含指令处理逻辑。\n\n### 1. 选择模型\n根据需求选择合适的模型尺寸（0.6B \u002F 4B \u002F 8B）。例如：\n- **轻量级**：`Qwen\u002FQwen3-Embedding-0.6B`\n- **高性能**：`Qwen\u002FQwen3-Embedding-8B`\n\n### 2. 代码示例\n\n```python\n# Requires transformers>=4.51.0\n# Requires sentence-transformers>=2.7.0\n\nfrom sentence_transformers import SentenceTransformer\n\n# Load the model\nmodel = SentenceTransformer(\"Qwen\u002FQwen3-Embedding-0.6B\")\n\n# We recommend enabling flash_attention_2 for better acceleration and memory saving,\n# together with setting `padding_side` to \"left\":\n# model = SentenceTransformer(\n#     \"Qwen\u002FQwen3-Embedding-0.6B\",\n#     model_kwargs={\"attn_implementation\": \"flash_attention_2\", \"device_map\": \"auto\"},\n#     tokenizer_kwargs={\"padding_side\": \"left\"},\n# )\n\n# The queries and documents to embed\nqueries = [\n    \"What is the capital of China?\",\n    \"Explain gravity\",\n]\ndocuments = [\n    \"The capital of China is Beijing.\",\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\n]\n\nwith torch.no_grad():\n    # Encode the queries and documents. 
Note that queries benefit from using a prompt\n    # Here we use the prompt called \"query\" stored under `model.prompts`, but you can\n    # also pass your own prompt via the `prompt` argument\n    query_embeddings = model.encode(queries, prompt_name=\"query\")\n    document_embeddings = model.encode(documents)\n\n    # Compute the (cosine) similarity between the query and document embeddings\n    similarity = model.similarity(query_embeddings, document_embeddings)\n\nprint(similarity)\n# tensor([[0.7646, 0.1414], [0.1355, 0.6000]])\n```\n\n### 提示\n- **指令优化**：虽然 `Sentence Transformers` 封装了部分指令逻辑，但在自定义任务时，建议在查询前添加描述性指令（建议使用英文），通常能提升 1%~5% 的效果。\n- **国内加速**：如果从 Hugging Face 下载模型较慢，可尝试设置环境变量使用 ModelScope 镜像，或直接访问 [ModelScope](https:\u002F\u002Fmodelscope.cn\u002Fcollections\u002FQwen3-Embedding-3edc3762d50f48) 获取模型文件。","某跨境电商企业正在搭建全球售后知识库系统，需处理来自不同国家用户的复杂咨询并关联内部长文档。\n\n### 没有 Qwen3-Embedding 时\n- 通用嵌入模型对非英语小语种理解能力弱，导致日语或西班牙语咨询匹配错误率高。\n- 产品说明书长达数万字，传统短窗口切分导致关键信息断裂，无法精准定位故障原因。\n- 初步检索返回大量低相关性文档，客服团队需花费大量时间人工二次筛选有效信息。\n- 现有方案依赖昂贵的大参数模型，推理延迟高且服务器成本超出初创团队预算。\n\n### 使用 Qwen3-Embedding 后\n- Qwen3-Embedding 原生支持超百种语言，显著提升多语种查询的语义对齐精度。\n- 凭借 32K 长文本处理能力，完整编码整份技术手册，确保长文档中的细微条款被准确识别。\n- 结合 Qwen3-Reranker 重排序模块，自动过滤噪声，将最相关的解决方案置顶展示。\n- 可选用 0.6B 轻量版或 8B 高精度版，根据业务流量灵活调整，实现性价比最优解。\n\n通过引入 Qwen3-Embedding 系列，系统在降低算力成本的同时，实现了跨语言长文档的高精度智能检索。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FQwenLM_Qwen3-Embedding_80a34fa5.png","QwenLM","Qwen","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FQwenLM_4756c6c9.png","Alibaba Cloud's general-purpose AI models","qianwen_opensource@alibabacloud.com","Alibaba_Qwen","https:\u002F\u002Fqwen.ai\u002F","https:\u002F\u002Fgithub.com\u002FQwenLM",[84,88],{"name":85,"color":86,"percentage":87},"Python","#3572A5",98.3,{"name":89,"color":90,"percentage":91},"Shell","#89e051",1.7,1881,118,"2026-04-04T22:03:38","未说明",{"notes":97,"python":95,"dependencies":98},"需确保 transformers 版本不低于 4.51.0 以避免 KeyError；建议使用 flash_attention_2 实现加速并节省显存；多语言场景下指令建议使用英文以优化效果；支持自定义嵌入维度（MRL）；提供 0.6B 至 8B 多种模型规格可选。",[99,100,101,102],"transformers>=4.51.0","sentence-transformers>=2.7.0","vllm>=0.8.5","torch",[15],"2026-03-27T02:49:30.150509","2026-04-06T05:38:01.687353",[107,112,116,121,126,131],{"id":108,"question_zh":109,"answer_zh":110,"source_url":111},3120,"LoRA 微调合并后模型效果无变化，如何解决？","如果是 LoRA 微调，在 merge 权重时，import 类不应使用 AutoModel，而应使用 AutoModelForCausalModel。全量微调通常不会出现此问题。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F70","https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F70",{"id":113,"question_zh":114,"answer_zh":115,"source_url":111},3121,"更换机器后微调结果差异巨大，原因是什么？","确保两台机器的 swift 和 transformers 版本相同。若仍出现差异，建议从 ModelScope 下载模型，因为从 HuggingFace 下载的模型在使用该框架运行时可能会出现问题。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F70",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},3122,"新版数据集格式（messages\u002Fpositive_messages）无法识别怎么办？","旧版通过 pip 安装的 ms-swift 可能无法识别最新的消息格式。需要重新从源码安装最新版的 ms-swift 才能支持包含 messages、positive_messages 和 negative_messages 的新格式。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F155","https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F155",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},3123,"Embedding 输出维度是否必须为 32 的倍数？如何自定义？","支持 32 到 1024 之间的任意整数。可以通过代码手动提取未归一化的 embedding 并调整维度，例如直接截断向量长度（embeddings = embeddings[:, 
:128]）或使用投影层。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F2","https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F2",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},3124,"使用 vLLM 服务 Embedding 模型时启动参数是什么？","需在 `vllm serve` 命令中添加 `--task embedding` 参数。如果该模式报错或暂不支持，可以尝试使用 `--task score` 参数启动，但需注意部分功能可能受限。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F12","https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F12",{"id":132,"question_zh":133,"answer_zh":134,"source_url":130},3125,"vLLM 调用 Embedding 时报错 `AttributeError: 'Qwen3ForCausalLM' object has no attribute 'pooler'` 如何处理？","这表明当前版本对 vLLM 作为 embedding 后端的支持尚不完善，模型对象缺少 pooler 属性。建议暂时避免使用 vLLM 进行 embedding 推理，或检查任务类型配置是否正确，考虑使用标准 transformers 推理方式。参考：https:\u002F\u002Fgithub.com\u002FQwenLM\u002FQwen3-Embedding\u002Fissues\u002F12",[]]