[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-xlang-ai--instructor-embedding":3,"tool-xlang-ai--instructor-embedding":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",140436,2,"2026-04-05T23:32:43",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":79,"owner_email":79,"owner_twitter":80,"owner_website":81,"owner_url":82,"languages":83,"stars":88,"forks":89,"last_commit_at":90,"license":91,"difficulty_score":23,"env_os":92,"env_gpu":93,"env_ram":92,"env_deps":94,"category_tags":102,"github_topics":103,"view_count":23,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":114,"updated_at":115,"faqs":116,"releases":117},3422,"xlang-ai\u002Finstructor-embedding","instructor-embedding","[ACL 2023] One Embedder, Any Task: Instruction-Finetuned Text Embeddings","Instructor-embedding 是一款基于指令微调的文本嵌入模型，旨在用同一个模型灵活应对各类自然语言处理任务。传统嵌入模型通常针对特定任务（如分类或检索）单独训练，通用性较差；而 Instructor 通过引入“任务指令”，让用户只需在输入文本时附带简单的自然语言描述（例如“代表科学标题”或“用于检索重复句子”），即可动态生成适配该场景的高质量向量表示，无需重新训练或微调模型。\n\n这一特性有效解决了多任务场景下模型部署成本高、适应性差的痛点。它在 70 多项 diverse 的嵌入任务中取得了业界领先的性能，涵盖分类、聚类、信息检索及文本评估等多个领域，尤其擅长处理跨域数据（如科研、金融等专业语境）。\n\nInstructor-embedding 非常适合 AI 开发者、数据科学家及研究人员使用。对于需要构建搜索引擎、推荐系统或进行大规模文本聚类的团队，它能显著简化工作流程，降低维护多个专用模型的复杂度。其核心技术亮点在于“指令驱动”的泛化能力，将任务定义从代码逻辑转化为自然语言提示，使得模型能够像理解人类意图一样，按需定制输出结果，极大地提升了文本表示的灵活性与准确性。","# One Embedder, Any Task: Instruction-Finetuned Text Embeddings\n\nThis repository contains the code and pre-trained models for our paper [One Embedder, Any Task: 
Instruction-Finetuned Text Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741). Please refer to our [project page](https:\u002F\u002Finstructor-embedding.github.io\u002F) for a quick project overview.\n\nWe introduce **Instructor**👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domain (e.g., science, finance, etc.) ***by simply providing the task instruction, without any finetuning***. Instructor👨‍🏫 achieves state-of-the-art (SOTA) results on 70 diverse embedding tasks!\n\n**************************** **Updates** ****************************\n\n* 01\u002F21: We updated the code structure, which supports easy package installation.\n* 12\u002F28: We updated the [checkpoint](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large) with hard negatives.\n* 12\u002F20: We released [our paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741), [code](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding), [project page](https:\u002F\u002Finstructor-embedding.github.io\u002F) and [checkpoint](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large). 
Check them out!\n\n## Quick Links\n\n- [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](#one-embedder-any-task-instruction-finetuned-text-embeddings)\n  - [Quick Links](#quick-links)\n  - [Installation](#installation)\n    - [Environment setup](#environment-setup)\n  - [Getting Started](#getting-started)\n    - [The `encode` function](#the-encode-function)\n  - [Model List](#model-list)\n  - [Use Cases](#use-cases)\n    - [Calculate embeddings for your customized texts](#calculate-embeddings-for-your-customized-texts)\n    - [Compute similarities between texts](#compute-similarities-between-texts)\n    - [Use customized embeddings for information retrieval](#use-customized-embeddings-for-information-retrieval)\n    - [Use customized embeddings for clustering](#use-customized-embeddings-for-clustering)\n  - [Training](#training)\n    - [Data](#data)\n    - [Train INSTRUCTOR](#train-instructor)\n  - [Evaluation](#evaluation)\n    - [MTEB](#mteb)\n    - [Billboard](#billboard)\n    - [Prompt Retrieval](#prompt-retrieval)\n  - [Quantization](#quantization)\n  - [Bugs or questions?](#bugs-or-questions)\n  - [Citation](#citation)\n  - [INSTRUCTOR Elsewhere](#instructor-elsewhere)\n\n## Installation\nIt is very easy to use INSTRUCTOR for any text embeddings. You can easily try it out in [Colab notebook](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1P7ivNLMosHyG7XOHmoh7CoqpXryKy3Qt?usp=sharing). On your local machine, we recommend first creating a virtual environment:\n```bash\nconda create -n instructor python=3.7\ngit clone https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding\ncd instructor-embedding\npip install -r requirements.txt\n```\nThat will create the environment `instructor` we used. 
To use the embedding tool, first install the `InstructorEmbedding` package from PyPI\n```bash\npip install InstructorEmbedding\n```\nor directly install it from our code\n```bash\npip install -e .\n```\n\n### Environment setup\n\nActivate the environment by running\n```bash\nconda activate instructor\n```\n\n## Getting Started\n\nFirst download a pretrained model (See [model list](#model-list) for a full list of available models)\n\n```python\nfrom InstructorEmbedding import INSTRUCTOR\nmodel = INSTRUCTOR('hkunlp\u002Finstructor-large')\n```\n\nThen provide the sentence and customized instruction to the model.\n```python\n# prepare texts with instructions\ntext_instruction_pairs = [\n    {\"instruction\": \"Represent the Science title:\", \"text\": \"3D ActionSLAM: wearable person tracking in multi-floor environments\"},\n    {\"instruction\": \"Represent the Medicine sentence for retrieving a duplicate sentence:\", \"text\": \"Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear.\"}\n]\n\n# postprocess\ntexts_with_instructions = []\nfor pair in text_instruction_pairs:\n    texts_with_instructions.append([pair[\"instruction\"], pair[\"text\"]])\n\n# calculate embeddings\ncustomized_embeddings = model.encode(texts_with_instructions)\n```\n\nAnd that's it already. 
We now have a list of numpy arrays with the embeddings.\n\n```python\nfor pair, embedding in zip(text_instruction_pairs, customized_embeddings):\n    print(\"Instruction: \", pair[\"instruction\"])\n    print(\"text: \", pair[\"text\"])\n    print(\"Embedding: \", embedding)\n    print(\"\")\n```\n\n### The `encode` function\n\nThe users of the model need to use only the `encode` function:\n\n```python\nmodel.encode( sentences,\n              batch_size: int = 32,\n              show_progress_bar: bool = None,\n              output_value: str = 'sentence_embedding',\n              convert_to_numpy: bool = True,\n              convert_to_tensor: bool = False,\n              device: str = None,\n              normalize_embeddings: bool = False)\n```\n\n* `sentences`: The sentences to be embedded. It should be in the format of `[[\"instruction prompt 0\", \"text to be embedded 0\"], [\"instruction prompt 1\", \"text to be embedded 1\"], ...]`.\n* `batch_size` (default: 32): The batch size used for the computation. It determines the number of sentences processed together in each batch.\n* `show_progress_bar` (default: None): If set to `True`, it displays a progress bar while encoding sentences, providing a visual indication of the encoding progress.\n* `output_value` (default: 'sentence\\_embedding'): Specifies the desired output type. The default value 'sentence\\_embedding' returns sentence embeddings. Setting it to 'token\\_embeddings' returns wordpiece token embeddings. Setting it to None returns all output values.\n* `convert_to_numpy` (default: `True`): If set to `True`, the output is a list of numpy vectors. If set to `False`, the output is a list of PyTorch tensors.\n* `convert_to_tensor` (default: `False`): If set to `True`, the function returns a stacked tensor as a single output. This parameter overrides any setting specified by `convert_to_numpy`.\n* `device` (default: None): Specifies the torch.device to use for the computation. 
If not specified, the function uses the default device.\n* `normalize_embeddings` (default: `False`): If set to `True`, the returned vectors will have a length of 1, indicating that they are normalized. In this case, similarity search would use the faster dot-product (`util.dot_score`), instead of cosine similarity.\n\n## Model List\n\nWe released a series of INSTRUCTOR checkpoints with different sizes. You can easily load these models with `InstructorEmbedding` package. \n|              Model              | Avg. Score |\n|:-------------------------------|:--------:|\n|  [hkunlp\u002Finstructor-base](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-base) |   55.9 |\n| [hkunlp\u002Finstructor-large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large) |   58.4  |\n|    [hkunlp\u002Finstructor-xl](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-xl)    |   58.8  |\n\n## Use Cases\nWe provide a few specific use cases in the following. For more examples and applications, refer to [our paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741)\n### Calculate embeddings for your customized texts\nIf you want to calculate customized embeddings for specific sentences, you may follow the unified template to write instructions: \n\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Represent the `domain` `text_type` for `task_objective`:\n* `domain` is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.\n* `text_type` is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.\n* `task_objective` is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc.\n\n### Compute similarities between texts\nYou can use **INSTRUCTOR** to compute similarities between two groups of sentences, with **customized 
embeddings**.\n```python\nfrom sklearn.metrics.pairwise import cosine_similarity\nsentences_a = [['Represent the Science sentence: ','Parton energy loss in QCD matter'], \n               ['Represent the Financial statement: ','The Federal Reserve on Wednesday raised its benchmark interest rate.']]\nsentences_b = [['Represent the Science sentence: ','The Chiral Phase Transition in Dissipative Dynamics'],\n               ['Represent the Financial statement: ','The funds rose less than 0.5 per cent on Friday']]\nembeddings_a = model.encode(sentences_a)\nembeddings_b = model.encode(sentences_b)\nsimilarities = cosine_similarity(embeddings_a,embeddings_b)\n```\n\n### Use customized embeddings for information retrieval\n```python\nimport numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\nquery  = [['Represent the Wikipedia question for retrieving supporting documents: ','where is the food stored in a yam plant']]\ncorpus = [['Represent the Wikipedia document for retrieval: ','Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that the term \"mixed economies\" more precisely describes most contemporary economies, due to their containing both private-owned and state-owned enterprises. In capitalism, prices determine the demand-supply scale. For example, higher demand for certain goods and services lead to higher prices and lower demand for certain goods lead to lower prices.'],\n          ['Represent the Wikipedia document for retrieval: ',\"The disparate impact theory is especially controversial under the Fair Housing Act because the Act regulates many activities relating to housing, insurance, and mortgage loans—and some scholars have argued that the theory's use under the Fair Housing Act, combined with extensions of the Community Reinvestment Act, contributed to rise of sub-prime lending and the crash of the U.S. 
housing market and ensuing global economic recession\"],\n          ['Represent the Wikipedia document for retrieval: ','Disparate impact in United States labor law refers to practices in employment, housing, and other areas that adversely affect one group of people of a protected characteristic more than another, even though rules applied by employers or landlords are formally neutral. Although the protected classes vary by statute, most federal civil rights laws protect based on race, color, religion, national origin, and sex as protected traits, and some laws include disability status and other traits as well.']]\nquery_embeddings = model.encode(query)\ncorpus_embeddings = model.encode(corpus)\nsimilarities = cosine_similarity(query_embeddings,corpus_embeddings)\nretrieved_doc_id = np.argmax(similarities)\nprint(retrieved_doc_id)\n```\n\n### Use customized embeddings for clustering\n```python\nimport sklearn.cluster\nsentences = [['Represent the Medicine sentence for clustering: ','Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity'],\n             ['Represent the Medicine sentence for clustering: ','Comparison of Atmospheric Neutrino Flux Calculations at Low Energies'],\n             ['Represent the Medicine sentence for clustering: ','Fermion Bags in the Massive Gross-Neveu Model'],\n             ['Represent the Medicine sentence for clustering: ',\"QCD corrections to Associated t-tbar-H production at the Tevatron\"],\n             ['Represent the Medicine sentence for clustering: ','A New Analysis of the R Measurements: Resonance Parameters of the Higher,  Vector States of Charmonium']]\nembeddings = model.encode(sentences)\nclustering_model = sklearn.cluster.MiniBatchKMeans(n_clusters=2)\nclustering_model.fit(embeddings)\ncluster_assignment = clustering_model.labels_\nprint(cluster_assignment)\n```\n## Training\n### Data\nWe construct Multitask Embeddings Data\nwith Instructions (MEDI), consisting of a collection of 330 datasets from 
[Super-NI](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)(Super-NaturalInstructions), [sentence-transformer embedding training data](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsentence-transformers\u002Fembedding-training-data), [KILT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.02252) and [MedMCQA](https:\u002F\u002Fproceedings.mlr.press\u002Fv174\u002Fpal22a\u002Fpal22a.pdf), spanning a wide range of domains and tasks. We construct positive and negative pairs if they are not provided, and store them in a unified format:\n```\n[\n    {'query': ['Represent the Wikipedia question for retrieving relevant documents;', 'big little lies season 2 how many episodes'], 'pos': ['Represent the Wikipedia document for retrieval;', 'Big Little Lies (TV series) series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley'], 'neg': ['Represent the Wikipedia document for retrieval;', 'Little People, Big World final minutes of the season two-A finale, \"Farm Overload\". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of \"Little People, Big World\" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\\'s renewal for a second season. 
Critical reviews of the series have been generally positive, citing the show\\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend'], 'task_id': 1}\n    {'query': ['Represent the Wikipedia question for retrieving relevant documents;', 'who sang waiting for a girl like you'], 'pos': ['Represent the Wikipedia document for retrieval;', 'Waiting for a Girl Like You Waiting for a Girl Like You \"Waiting for a Girl Like You\" is a 1981 power ballad by the British-American rock band Foreigner. The distinctive synthesizer theme was performed by the then-little-known Thomas Dolby, and this song also marked a major departure from their earlier singles because their previous singles were mid to upper tempo rock songs while this song was a softer love song with the energy of a power ballad. It was the second single released from the album \"4\" (1981) and was co-written by Lou Gramm and Mick Jones. It has become one of the band\\'s most'], 'neg': ['Represent the Wikipedia document for retrieval;', 'Waiting for a Girl Like You held off the number 1 spot by Olivia Newton-John\\'s single \"Physical\" for nine consecutive weeks, and then by Hall & Oates\\' \"I Can\\'t Go for That (No Can Do)\" for a tenth week on January 30, 1982. Because of its chart longevity, it ended up being the number 19 song on the Top 100 singles of 1982. The song was the band\\'s biggest hit until \"I Want to Know What Love Is\" hit number 1 in 1985. The song lists at number 100 on \"\"Billboard\"\\'s Greatest Songs of All Time\". 
Waiting for a Girl Like You \"Waiting for a Girl'], 'task_id': 1}\n    ...\n    {'query': ['Represent the Wikipedia sentence for retrieving relevant documents;', 'i LOVE sweet martini drinks!'], 'pos': ['Represent the Wikipedia document for retrieval;', \"Appletini Appletini\\nAn Apple martini (Appletini for short) is a cocktail containing vodka and one or more of apple juice, apple cider, apple liqueur, or apple brandy.\\nThis drink, originally called an Adam's Apple Martini because the bartender who created it was named Adam, was created in 1996 at Lola's West Hollywood restaurant.\\nThe drink, Adam's Apple was advertised by Smirnoff in the July 1972 issue of Playboy Magazine to the inside front cover. The recipe called for an ounce or so of Smirnoff\"], 'neg': ['Represent the Wikipedia document for retrieval;', \"Aromatised wine similar beverages described in this legislation are 'aromatised wine-based drinks' (non-fortified) and 'aromatised wine-product cocktail' (blended, lower alcohol drink under 7% ABV).\\nVarieties of aromatised wine.\\nVarieties of aromatised wine Vermouth.\\nVermouth is the most widely used aromatised wine due to its use in cocktails and famous commercial brands such as Martini and Cinzano which are commonplace around the world. Vermouth can be sweet or dry and red, white, pink or orange. It is traditionally\"], 'task_id': 300}\n]\n```\nEach instance consists of a query, a positive pair, a negative pair and the id of the task, which is used to ensure data in the same training batch are from the same task.\nThe MEDI data is available to be downloaded at [this link](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1vZ5c2oJNonGOvXzppNg5mHz24O6jcc52\u002Fview?usp=sharing).\n\n### Train INSTRUCTOR\nWe provide the example script for training INSTRUCTOR. 
You may need to first download the [MEDI data](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1vZ5c2oJNonGOvXzppNg5mHz24O6jcc52\u002Fview?usp=sharing), unzip the folder and put `medi-data.json` under `--cache_dir`.\n```python\npython train.py --model_name_or_path sentence-transformers\u002Fgtr-t5-large --output_dir {output_directory} --cache_dir {cache_directory} --max_source_length 512 --num_train_epochs 10 --save_steps 500 --cl_temperature 0.1 --warmup_ratio 0.1 --learning_rate 2e-5 --overwrite_output_dir\n```\nWe explain the arguments in the following:\n* `--model_name_or_path`: Pretrained checkpoints to start with. We support both model id (e.g., `sentence-transformers\u002Fgtr-t5-large`, `sentence-transformers\u002Fsentence-t5-large`) or checkpoint path (e.g., checkpoint saved by transformers trainer).\n* `--cl_temperature`: Temperature for contrastive loss\n* `--cache_dir`: The directory to cache downloaded models and data. The downloaded MEDI data(`medi-data.json`) should be put under the directory `--cache_dir`.\n* `--output_dir`: The directory to store the trained models(checkpoints) for evaluation. \n\nAll the other arguments are standard `Huggingface's transformers` training arguments, such as `--overwrite_output_dir`, `--num_train_epochs`, `--learning_rate`. For details, refer to [Huggingface transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) \n\n## Evaluation\nWe evaluate INSTRUCTOR massively on 70 diverse tasks, spanning a wide range of tasks and domains. Specifically, we build our evaluation on three benchmarks, [MTEB](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard), [Billboard](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04139), and [Prompt Retrieval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.01975). 
We explain the details about running evaluation scripts in the following.\n\u003C!-- * MTEB is a comprehensive embedding evaluation benchmark that aims to provide a holistic view of embedding models.  It combines several conventional benchmarks (e.g., BEIR and STS) and spans a wide range of domain-specific datasets, including science, biology, and medicine. \n* Prompt Retrieval tasks aim to retrieve a few in-context learning (i.e., demonstration) examples from annotated examples given a test instance. The embedding model is used to encode all annotated examples and to find the few most similar examples to the test instance based on the cosine similarity. We evaluate embeddings by measuring the average performance on the downstream tasks. \n* Billboard applies INSTRUCTOR to automatic evaluations for text generation tasks. Following [Kasai et al. (2022a)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04139), we measure the cosine similarity between the generated text and each reference text and take the maximum similarity score over all references available. We evaluate all embedding models by the Pearson correlation with the human judgments. -->\n\n### MTEB\nTo evaluate the model performance on MTEB benchmark dataset, first install the MTEB library\n\n```python\ncd evaluation\u002FMTEB\npip install -e .\n```\nThen run the following command:\n```python\npython examples\u002Fevaluate_model.py --model_name hkunlp\u002Finstructor-large --output_dir outputs --task_name ArguAna --result_file results\n```\nYou can evaluate your trained model checkpoints by specifying `--model_name` and run all MTEB datasets by changing `--task_name`. 
Check [our paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741) or [MTEB benchmark](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard) for evaluation metrics of all tasks.\n\n### Billboard\nTo evaluate the model performance on Billboard, run the following command:\n```python\ncd evaluation\u002Ftext_evaluation\npython main.py --model_name hkunlp\u002Finstructor-large --task mscoco --add_prompt\n```\nYou can evaluate your trained model checkpoints by specifying `--model_name` and run all Billboard datasets by changing `--task`. In all of the three datasets in Billboard, we report the Pearson correlation.\n\n### Prompt Retrieval\nTo evaluate the model performance on Prompt Retrieval, run the following command:\n```python\ncd evaluation\u002Fprompt_retrieval\npython main.py --embedding_model hkunlp\u002Finstructor-large --task rte --model_cache_dir {cache_dir} --output_dir {output_dir} --add_prompt\n```\nYou can evaluate your trained model checkpoints by specifying `--embedding_model` and run prompt retrieval datasets by changing `--task`. 
In order to have a consistent metric, we cast all tasks in Prompt Retrieval into a \"text-to-text\" format, and report the Rouge-L score.\n\n\n## Quantization \nTo [**Quantize**](https:\u002F\u002Fpytorch.org\u002Fdocs\u002Fstable\u002Fquantization.html) the Instructor embedding model, run the following code: \n\n```python \n# imports \nimport torch\nfrom InstructorEmbedding import INSTRUCTOR\n\n# load the model \nmodel = INSTRUCTOR('hkunlp\u002Finstructor-large', device='cpu')  # you can use GPU\n\n# quantize the model \nqmodel = torch.quantization.quantize_dynamic(\nmodel, {torch.nn.Linear}, dtype=torch.qint8)\n\n# Inference \nsentence = \"3D ActionSLAM: wearable person tracking in multi-floor environments\"\ninstruction = \"Represent the Science title:\"\n\nembeddings = qmodel.encode([[instruction,sentence]])  \n# you can also normalize the embeddings:  normalize_embeddings=True \n\nprint(f\"Quantized Embeddings:\\n {embeddings}\")\n```\n\nIt reduces the model size by 10x, and inference is faster than with the unquantized model :) \n\n\n## Bugs or questions?\nIf you have any questions related to the code or the paper, feel free to email Hongjin (`hjsu@cs.hku.hk`) and Weijia (`swj0419@cs.washington.edu`). Please try to specify the problem with details so we can help you better and quicker.\n\n## Citation\nIf you find our work helpful, please cite us:\n\n```bibtex\n@inproceedings{INSTRUCTOR,\n  title={One Embedder, Any Task: Instruction-Finetuned Text Embeddings},\n  author={Su, Hongjin and Shi, Weijia and Kasai, Jungo and Wang, Yizhong and Hu, Yushi and  Ostendorf, Mari and Yih, Wen-tau and Smith, Noah A. 
and  Zettlemoyer, Luke and Yu, Tao},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741},\n  year={2022},\n}\n```\n\n## INSTRUCTOR Elsewhere\nWe thank the community's efforts for extending INSTRUCTOR!\n* [LangChain](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Ftext_embedding\u002Finstruct_embeddings) supports InstructEmbeddings, which use the INSTRUCTOR model.\n* [MosaicML](https:\u002F\u002Fwww.mosaicml.com\u002Finference) has included [Instructor-Large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large) and [Instructor-XL](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-xl)\n* [embaas](https:\u002F\u002Fembaas.io\u002Fdocs\u002Fmodels\u002Finstructor) integrated [Instructor-Large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large)\n* [Haystack](https:\u002F\u002Fhaystack.deepset.ai\u002Fintegrations\u002Finstructor-embedder) includes `InstructorTextEmbedder` and `InstructorDocumentEmbedder` components.\n","# 一个嵌入器，适用于任何任务：指令微调文本嵌入\n\n本仓库包含我们论文《一个嵌入器，适用于任何任务：指令微调文本嵌入》（[arXiv:2212.09741](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741)）的代码及预训练模型。请访问我们的[项目主页](https:\u002F\u002Finstructor-embedding.github.io\u002F)以获取项目的快速概览。\n\n我们推出了**Instructor**👨‍🏫，这是一种经过指令微调的文本嵌入模型，能够根据任务指令生成针对任意任务（如分类、检索、聚类、文本评估等）和领域的文本嵌入（如科学、金融等领域），***只需提供任务指令即可，无需额外微调***。Instructor👨‍在70个多样化的嵌入任务上均取得了当前最优性能！\n\n**************************** **更新** ****************************\n\n* 01\u002F21：我们更新了代码结构，支持便捷的包安装。\n* 12\u002F28：我们使用硬负样本更新了[检查点](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large)。\n* 12\u002F20：我们发布了[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741)、[代码](https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding)、[项目主页](https:\u002F\u002Finstructor-embedding.github.io\u002F)以及[检查点](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large)。快来查看吧！\n\n## 快速链接\n\n- [一个嵌入器，适用于任何任务：指令微调文本嵌入](#one-embedder-any-task-instruction-finetuned-text-embeddings)\n  - 
[快速链接](#quick-links)\n  - [安装](#installation)\n    - [环境设置](#environment-setup)\n  - [入门指南](#getting-started)\n    - [`encode` 函数](#the-encode-function)\n  - [模型列表](#model-list)\n  - [使用场景](#use-cases)\n    - [为自定义文本计算嵌入](#calculate-embeddings-for-your-customized-texts)\n    - [计算文本之间的相似度](#compute-similarities-between-texts)\n    - [将自定义嵌入用于信息检索](#use-customized-embeddings-for-information-retrieval)\n    - [将自定义嵌入用于聚类](#use-customized-embeddings-for-clustering)\n  - [训练](#training)\n    - [数据](#data)\n    - [训练 INSTRUCTOR](#train-instructor)\n  - [评估](#evaluation)\n    - [MTEB](#mteb)\n    - [Billboard](#billboard)\n    - [提示检索](#prompt-retrieval)\n  - [量化](#quantization)\n  - [遇到问题或有疑问？](#bugs-or-questions)\n  - [引用](#citation)\n  - [INSTRUCTOR 其他资源](#instructor-elsewhere)\n\n## 安装\n使用 INSTRUCTOR 进行任何文本嵌入都非常简单。您可以在[Colab 笔记本](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1P7ivNLMosHyG7XOHmoh7CoqpXryKy3Qt?usp=sharing)中轻松试用。在本地机器上，我们建议首先创建一个虚拟环境：\n```bash\nconda env create -n instructor python=3.7\ngit clone https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding\npip install -r requirements.txt\n```\n这将创建我们使用的 `instructor` 环境。要使用嵌入工具，首先从 PyPI 安装 `InstructorEmbedding` 包：\n```bash\npip install InstructorEmbedding\n```\n或者直接从我们的代码中安装：\n```bash\npip install -e .\n```\n\n### 环境设置\n\n通过运行以下命令激活环境：\n```bash\nconda activate instructor\n```\n\n## 入门指南\n\n首先下载一个预训练模型（完整可用模型列表见[模型列表](#model-list)）：\n\n```python\nfrom InstructorEmbedding import INSTRUCTOR\nmodel = INSTRUCTOR('hkunlp\u002Finstructor-large')\n```\n\n然后将句子和自定义指令提供给模型：\n```python\n# 准备带有指令的文本对\ntext_instruction_pairs = [\n    {\"instruction\": \"表示科学标题:\", \"text\": \"3D ActionSLAM：多层环境中可穿戴式人物跟踪\"},\n    {\"instruction\": \"表示医学句子以便检索重复句子:\", \"text\": \"近期研究表明，他汀类药物——一种在心血管疾病死亡率预防中广泛应用的药物——可能延缓或预防乳腺癌复发，但其对疾病特异性死亡率的影响尚不明确。\"}\n]\n\n# 后处理\ntexts_with_instructions = []\nfor pair in text_instruction_pairs:\n    texts_with_instructions.append([pair[\"instruction\"], pair[\"text\"]])\n\n# 
计算嵌入\ncustomized_embeddings = model.encode(texts_with_instructions)\n```\n\n就这样完成了。我们现在得到了一组包含嵌入的 NumPy 数组。\n\n```python\nfor pair, embedding in zip(text_instruction_pairs, customized_embeddings):\n    print(\"指令: \", pair[\"instruction\"])\n    print(\"文本: \", pair[\"text\"])\n    print(\"嵌入: \", embedding)\n    print(\"\")\n```\n\n### `encode` 函数\n\n用户只需使用 `encode` 函数即可：\n\n```python\nmodel.encode( sentences,\n              batch_size: int = 32,\n              show_progress_bar: bool = None,\n              output_value: str = 'sentence_embedding',\n              convert_to_numpy: bool = True,\n              convert_to_tensor: bool = False,\n              device: str = None,\n              normalize_embeddings: bool = False)\n```\n\n* `sentences`：待嵌入的句子。格式应为 `[[\"指令提示 0\", \"待嵌入文本 0], [\"指令提示 1\", \"待嵌入文本 1], ...]`。\n* `batch_size`（默认：32）：用于计算的批次大小，决定每批同时处理的句子数量。\n* `show_progress_bar`（默认：None）：若设置为 `True`，编码过程中会显示进度条，直观地展示编码进度。\n* `output_value`（默认：'sentence_embedding'）：指定所需的输出类型。默认值 `'sentence_embedding'` 返回句子嵌入；设置为 `'token_embeddings'` 则返回词元嵌入；设置为 `None` 则返回所有输出值。\n* `convert_to_numpy`（默认：`True`）：若设置为 `True`，输出为 NumPy 向量列表；若设置为 `False`，则输出为 PyTorch 张量列表。\n* `convert_to_tensor`（默认：`False`）：若设置为 `True`，函数将以堆叠张量的形式作为单一输出返回。此参数会覆盖 `convert_to_numpy` 的设置。\n* `device`（默认：`None`）：指定用于计算的 `torch.device`。若未指定，则使用默认设备。\n* `normalize_embeddings`（默认：`False`）：若设置为 `True`，返回的向量长度将归一化为 1，此时相似度搜索将使用更快的点积方法（`util.dot_score`），而非余弦相似度。\n\n## 模型列表\n\n我们发布了一系列不同规模的 INSTRUCTOR 检查点。您可以使用 `InstructorEmbedding` 包轻松加载这些模型。\n|              模型              | 平均得分 |\n|:-------------------------------|:--------:|\n|  [hkunlp\u002Finstructor-base](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-base) |   55.9 |\n| [hkunlp\u002Finstructor-large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large) |   58.4  |\n|    [hkunlp\u002Finstructor-xl](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-xl)    |   58.8  |\n\n## 
使用场景\n以下我们提供了一些具体的使用场景。更多示例和应用，请参阅[我们的论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741)。\n### 为自定义文本计算嵌入\n如果您希望为特定句子计算自定义嵌入，可以按照统一模板编写指令：\n\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;表示用于 `任务目标` 的 `领域` `文本类型`：\n* `领域` 是可选的，用于指定文本所属的领域，例如科学、金融、医学等。\n* `文本类型` 是必需的，用于指定编码单位，例如句子、文档、段落等。\n* `任务目标` 是可选的，用于指定嵌入的目的，例如检索文档、分类句子等。\n\n### 计算文本之间的相似度\n您可以使用 **INSTRUCTOR** 来计算两组句子之间的相似度，并生成**自定义嵌入**。\n```python\nfrom sklearn.metrics.pairwise import cosine_similarity\nsentences_a = [['表示科学领域的句子：','QCD物质中的部分子能量损失'], \n               ['表示金融领域的句子：','美联储周三上调了基准利率。']]\nsentences_b = [['表示科学领域的句子：','耗散动力学中的手征相变'],\n               ['表示金融领域的句子：','周五该基金上涨不足0.5%']]\nembeddings_a = model.encode(sentences_a)\nembeddings_b = model.encode(sentences_b)\nsimilarities = cosine_similarity(embeddings_a,embeddings_b)\n```\n\n### 将自定义嵌入用于信息检索\n```python\nimport numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\nquery  = [['表示维基百科问题，用于检索支持性文档：','山药植物中食物储存在哪里']],\ncorpus = [['表示维基百科文档，用于检索：','资本主义自封建制度结束以来一直在西方世界占据主导地位，但大多数人认为[谁？]“混合经济”这一术语更准确地描述了大多数当代经济，因为它们同时包含私营企业和国有企业。在资本主义制度中，价格决定了供需关系。例如，对某些商品和服务的需求增加会导致价格上涨，而需求减少则会导致价格下降。'],\n          ['表示维基百科文档，用于检索：',\"公平住房法案下的差异影响理论尤其具有争议性，因为该法案规范了许多与住房、保险和抵押贷款相关的活动——一些学者认为，该理论在公平住房法案下的应用，加上社区再投资法案的扩展，导致了次级抵押贷款的兴起以及美国房地产市场的崩溃和随之而来的全球经济衰退\"],\n          ['表示维基百科文档，用于检索：','在美国劳动法中，差异影响是指在就业、住房等领域存在的做法，这些做法会以某种受保护特征为依据，对某一特定群体造成比其他群体更大的不利影响，即使雇主或房东所采用的规则表面上是中立的。尽管受保护的类别因法律而异，但大多数联邦民权法律都以种族、肤色、宗教、国籍和性别作为受保护的特征，有些法律还涵盖了残疾状况和其他特征。']]\nquery_embeddings = model.encode(query)\ncorpus_embeddings = model.encode(corpus)\nsimilarities = cosine_similarity(query_embeddings,corpus_embeddings)\nretrieved_doc_id = np.argmax(similarities)\nprint(retrieved_doc_id)\n```\n\n### 将自定义嵌入用于聚类\n```python\nimport sklearn.cluster\nsentences = [['表示医学领域的句子，用于聚类：','霍瓦瓦-利夫希茨引力中的动力学标量自由度'],\n             ['表示医学领域的句子，用于聚类：','低能区大气中微子通量计算的比较'],\n        
     ['表示医学领域的句子，用于聚类：','大质量格罗斯-内韦模型中的费米子袋'],\n             ['表示医学领域的句子，用于聚类：',\"泰瓦特隆加速器上伴随t-tbar-H产生的QCD修正\"],\n             ['表示医学领域的句子，用于聚类：','R测量的新分析：粲夸克偶素更高能态矢量粒子的共振参数']]\nembeddings = model.encode(sentences)\nclustering_model = sklearn.cluster.MiniBatchKMeans(n_clusters=2)\nclustering_model.fit(embeddings)\ncluster_assignment = clustering_model.labels_\nprint(cluster_assignment)\n```\n## 训练\n\n### 数据\n我们构建了包含指令的多任务嵌入数据集（MEDI），该数据集由来自 [Super-NI](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07705)（Super-NaturalInstructions）、[sentence-transformer 嵌入训练数据](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsentence-transformers\u002Fembedding-training-data)、[KILT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.02252) 和 [MedMCQA](https:\u002F\u002Fproceedings.mlr.press\u002Fv174\u002Fpal22a\u002Fpal22a.pdf) 的 330 个数据集组成，涵盖了广泛的领域和任务。如果未提供正样本和负样本对，我们会自行构造，并以统一格式存储：\n```\n[\n    {'query': ['表示用于检索相关文档的维基百科问题;', '大小谎言第二季有多少集'], 'pos': ['表示用于检索的维基百科文档;', '《大小谎言》（电视剧）获得了多项荣誉。它获得了16项艾美奖提名并赢得了8项，包括最佳限定剧集以及基德曼、斯卡斯加德和德恩的表演奖项。这三位演员还获得了金球奖，此外该剧还荣获了最佳迷你剧或电视电影金球奖。基德曼和斯卡斯加德也因其表演获得了美国影视演员协会奖。尽管最初被宣传为迷你剧，HBO还是续订了该剧的第二季。第二季的制作于2018年3月开始，预计将于2019年首播。全部七集均由凯利编剧'], 'neg': ['表示用于检索的维基百科文档;', '《小人物，大世界》第二季B部分的最后一集——“农场超载”。一群人围在躺在投石机旁地上的雅各布身边。第二季B的前两集聚焦于这起事故以及当地媒体对此事的反应。第一季《小人物，大世界》为TLC频道带来了稳定的收视率（尤其是在18至49岁这一重要人群中），从而促使该节目续订第二季。对该节目的评论总体上较为正面，称赞其对身材矮小人群的积极刻画。另一方面，也有评论认为该节目带有窥探性质'], 'task_id': 1}\n    {'query': ['表示用于检索相关文档的维基百科问题;', '谁演唱了《等待像你这样的女孩》'], 'pos': ['表示用于检索的维基百科文档;', '《等待像你这样的女孩》\\n《等待像你这样的女孩》是英国-美国摇滚乐队Foreigner于1981年发行的一首力量抒情歌曲。这首歌曲中标志性的合成器旋律由当时尚不知名的托马斯·多尔比演奏，这首歌也标志着乐队风格的重大转变：此前他们的单曲多为中快节奏的摇滚乐，而此曲则是一首更为柔和的情歌，同时兼具力量抒情曲的能量。它是专辑《4》（1981）中的第二支单曲，由卢·格拉姆和米克·琼斯共同创作。它已成为乐队最'], 'neg': ['表示用于检索的维基百科文档;', '《等待像你这样的女孩》曾连续九周力压奥莉维亚·牛顿-约翰的单曲《Physical》占据榜首，随后又在1982年1月30日被霍尔与欧茨的《I Can\\'t Go for That (No Can Do)》挤下，保持了第十周的冠军位置。由于其在榜单上的持久表现，最终成为1982年百强单曲榜第19位的歌曲。这首歌是乐队当时最大的热门单曲，直到1985年《I Want to Know What Love Is》登顶为止。该歌曲在《公告牌》“史上百大歌曲”中排名第100位。《等待像你这样的女孩》\\n《等待像一个女孩']], 'task_id': 1}\n    
...\n    {'query': ['表示用于检索相关文档的维基百科句子;', '我超爱甜味马提尼酒！'], 'pos': ['表示用于检索的维基百科文档;', '苹果马提尼\\n苹果马提尼（简称Appletini）是一种含有伏特加以及一种或多种苹果汁、苹果酒、苹果利口酒或苹果白兰地的鸡尾酒。\\n这种饮品最初被称为“亚当的苹果马提尼”，因为创造它的调酒师名叫亚当。它诞生于1996年位于西好莱坞的Lola's餐厅。\\n这款名为“亚当的苹果”的饮品曾在1972年7月号的《花花公子》杂志内封面上由Smirnoff品牌进行广告宣传。配方要求使用大约一盎司的Smirnoff'], 'neg': ['表示用于检索的维基百科文档;', '本法案所描述的调味葡萄酒类饮料包括“调味葡萄酒基饮料”（非加强型）和“调味葡萄酒产品鸡尾酒”（混合型、酒精含量低于7%的低度饮品）。\\n调味葡萄酒的种类。\\n调味葡萄酒的种类——苦艾酒。\\n苦艾酒是最常用的调味葡萄酒，因为它广泛用于调制鸡尾酒，并且拥有Martini和Cinzano等在全球范围内广为人知的知名品牌。苦艾酒有甜味和干味之分，颜色可以是红色、白色、粉色或橙色。传统上'], 'task_id': 300}\n]\n```\n每个实例包含一个问题、一个正样本对、一个负样本对以及任务ID，任务ID用于确保同一训练批次中的数据来自同一任务。\nMEDI 数据可通过[此链接](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1vZ5c2oJNonGOvXzppNg5mHz24O6jcc52\u002Fview?usp=sharing)下载。\n\n### 训练 INSTRUCTOR\n我们提供了 INSTRUCTOR 的训练示例脚本。您可能需要先下载 [MEDI 数据](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1vZ5c2oJNonGOvXzppNg5mHz24O6jcc52\u002Fview?usp=sharing)，解压缩文件夹并将 `medi-data.json` 放置在 `--cache_dir` 目录下。\n```python\npython train.py --model_name_or_path sentence-transformers\u002Fgtr-t5-large --output_dir {输出目录 } --cache_dir { 缓存目录 } --max_source_length 512 --num_train_epochs 10 --save_steps 500 --cl_temperature 0.1 --warmup_ratio 0.1 --learning_rate 2e-5 --overwrite_output_dir\n```\n我们对各个参数说明如下：\n* `--model_name_or_path`：用于开始训练的预训练检查点。我们支持模型标识符（例如 `sentence-transformers\u002Fgtr-t5-large`、`sentence-transformers\u002Fsentence-t5-large`）或检查点路径（例如由 transformers 训练器保存的检查点）。\n* `--cl_temperature`：对比损失的温度系数。\n* `--cache_dir`：用于缓存已下载模型和数据的目录。下载的 MEDI 数据（`medi-data.json`）应放置在 `--cache_dir` 目录下。\n* `--output_dir`：用于存储训练好的模型（检查点）以便评估的目录。\n\n其余参数均为标准的 `Huggingface's transformers` 训练参数，例如 `--overwrite_output_dir`、`--num_train_epochs`、`--learning_rate`。更多详细信息请参阅 [Huggingface transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers)。\n\n## 
评估\n我们在70个多样化的任务上大规模评估INSTRUCTOR，这些任务涵盖了广泛的任务和领域。具体来说，我们的评估基于三个基准：[MTEB](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard)、[Billboard](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04139)和[Prompt Retrieval](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.01975)。以下我们将详细说明如何运行评估脚本。\n\u003C!-- * MTEB是一个全面的嵌入模型评估基准，旨在提供对嵌入模型的全局视角。它结合了多个传统基准（如BEIR和STS），并覆盖了广泛的领域特定数据集，包括科学、生物学和医学。 \n* Prompt Retrieval任务的目标是在给定测试实例的情况下，从标注好的示例中检索出少量上下文学习（即示范）示例。嵌入模型用于编码所有标注示例，并根据余弦相似度找到与测试实例最相似的几个示例。我们通过衡量下游任务上的平均性能来评估嵌入效果。 \n* Billboard将INSTRUCTOR应用于文本生成任务的自动评估。遵循[Kasai等人（2022a）](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.04139)，我们计算生成文本与每篇参考文本之间的余弦相似度，并取所有可用参考文本中的最大相似度分数。我们通过嵌入模型与人工评分之间的皮尔逊相关系数来评估所有嵌入模型。 -->\n\n### MTEB\n为了在MTEB基准数据集上评估模型性能，首先安装MTEB库：\n\n```python\ncd evaluation\u002FMTEB\npip install -e .\n```\n然后运行以下命令：\n```python\npython examples\u002Fevaluate_model.py --model_name hkunlp\u002Finstructor-large --output_dir outputs --task_name ArguAna --result_file results\n```\n您可以通过指定`--model_name`来评估自己训练的模型检查点，并通过更改`--task_name`来运行所有的MTEB数据集。有关所有任务的评估指标，请参阅[我们的论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741)或[MTEB基准](https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmteb\u002Fleaderboard)。\n\n### Billboard\n要评估模型在Billboard上的性能，运行以下命令：\n```python\ncd evaluation\u002Ftext_evaluation\npython main.py --model_name hkunlp\u002Finstructor-large --task mscoco --add_prompt\n```\n您可以通过指定`--model_name`来评估自己训练的模型检查点，并通过更改`--task`来运行所有的Billboard数据集。在Billboard的三个数据集中，我们都报告了皮尔逊相关系数。\n\n### Prompt Retrieval\n要评估模型在Prompt Retrieval上的性能，运行以下命令：\n```python\ncd evaluation\u002Fprompt_retrieval\npython main.py --embedding_model hkunlp\u002Finstructor-large --task rte --model_cache_dir {cache_dir} --output_dir {output_dir} --add_prompt\n```\n您可以通过指定`--model_name`来评估自己训练的模型检查点，并通过更改`--task`来运行Prompt Retrieval数据集。为了保持一致的评价标准，我们将Prompt Retrieval中的所有任务统一转换为“文本到文本”的格式，并报告Rouge-L分数。\n\n## 
量化\n要对Instructor嵌入模型进行[**量化**](https:\u002F\u002Fpytorch.org\u002Fdocs\u002Fstable\u002Fquantization.html)，请运行以下代码：\n\n```python\n# 导入模块\nimport torch\nfrom InstructorEmbedding import INSTRUCTOR\n\n# 加载模型\nmodel = INSTRUCTOR('hkunlp\u002Finstructor-large', device='cpu')  # 您也可以使用GPU\n\n# 对模型进行量化\nqmodel = torch.quantization.quantize_dynamic(\n    model, {torch.nn.Linear}, dtype=torch.qint8)\n\n# 推理\nsentence = \"3D ActionSLAM: 可穿戴式多层环境人体跟踪\"\ninstruction = \"表示科学标题：\"\n\nembeddings = qmodel.encode([[instruction,sentence]])  \n# 您也可以对嵌入进行归一化：normalize_embeddings=True \n\nprint(f\"量化后的嵌入：\\n {embeddings}\")\n```\n\n这会将模型大小缩小10倍，并且推理时间也会比普通模型更短 :) \n\n## 遇到问题或有疑问？\n如果您对代码或论文有任何疑问，请随时发送邮件至Hongjin（`hjsu@cs.hku.hk`）和Weijia（`swj0419@cs.washington.edu`）。请尽量详细描述问题，以便我们能够更快更好地帮助您。\n\n## 引用\n如果您觉得我们的工作有所帮助，请引用我们：\n\n```bibtex\n@inproceedings{INSTRUCTOR,\n  title={一个嵌入器，适用于任何任务：指令微调的文本嵌入},\n  author={Su, Hongjin and Shi, Weijia and Kasai, Jungo and Wang, Yizhong and Hu, Yushi and  Ostendorf, Mari and Yih, Wen-tau and Smith, Noah A. 
and  Zettlemoyer, Luke and Yu, Tao},\n  url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09741},\n  year={2022},\n}\n```\n\n## INSTRUCTOR的其他应用\n我们感谢社区为扩展INSTRUCTOR所做的努力！\n* [LangChain](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Ftext_embedding\u002Finstruct_embeddings) 支持使用INSTRUCTOR模型的InstructEmbeddings。\n* [MosaicML](https:\u002F\u002Fwww.mosaicml.com\u002Finference) 已将[Instructor-Large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large)和[Instructor-XL](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-xl)纳入其支持列表。\n* [embaas](https:\u002F\u002Fembaas.io\u002Fdocs\u002Fmodels\u002Finstructor) 集成了[Instructor-Large](https:\u002F\u002Fhuggingface.co\u002Fhkunlp\u002Finstructor-large)。\n* [Haystack](https:\u002F\u002Fhaystack.deepset.ai\u002Fintegrations\u002Finstructor-embedder) 包含了`InstructorTextEmbedder`和`InstructorDocumentEmbedder`组件。","# Instructor-Embedding 快速上手指南\n\nInstructor 是一个指令微调的文本嵌入模型。它无需针对特定任务进行微调，只需在输入时提供简单的**任务指令**（Instruction），即可生成适用于分类、检索、聚类等任意任务和领域的文本向量。\n\n## 环境准备\n\n*   **操作系统**: Linux, macOS, Windows\n*   **Python 版本**: 推荐 Python 3.7 及以上\n*   **依赖管理**: 推荐使用 `conda` 创建虚拟环境以避免依赖冲突\n*   **硬件**: 支持 CPU 运行，建议使用 NVIDIA GPU 以加速推理\n\n## 安装步骤\n\n### 1. 创建并激活虚拟环境\n```bash\nconda create -n instructor python=3.7\nconda activate instructor\n```\n\n### 2. 安装依赖包\n你可以直接从 PyPI 安装官方发布的包（推荐）：\n```bash\npip install InstructorEmbedding\n```\n\n或者克隆源码进行开发模式安装：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FHKUNLP\u002Finstructor-embedding\ncd instructor-embedding\npip install -e .\n```\n\n> **国内加速提示**：如果下载速度较慢，建议使用清华或阿里镜像源：\n> `pip install InstructorEmbedding -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n## 基本使用\n\n### 1. 
加载模型\n首先导入库并加载预训练模型（如 `instructor-large`）。首次运行时会自动从 Hugging Face 下载模型。\n\n```python\nfrom InstructorEmbedding import INSTRUCTOR\n\n# 加载模型，可选模型包括：hkunlp\u002Finstructor-base, hkunlp\u002Finstructor-large, hkunlp\u002Finstructor-xl\nmodel = INSTRUCTOR('hkunlp\u002Finstructor-large')\n```\n\n### 2. 准备数据与指令\nInstructor 的核心在于**指令**。输入格式必须为列表嵌套列表：`[[指令，文本], [指令，文本], ...]`。\n\n通用指令模板：\n> \"Represent the `领域` `文本类型` for `任务目标`:\"\n> *   `领域` (可选): 如 science, finance, medicine\n> *   `文本类型` (必填): 如 sentence, document, paragraph\n> *   `任务目标` (可选): 如 retrieving a duplicate sentence, clustering, classification\n\n```python\n# 定义包含指令和文本的数据对\ntext_instruction_pairs = [\n    [\"Represent the Science title:\", \"3D ActionSLAM: wearable person tracking in multi-floor environments\"],\n    [\"Represent the Medicine sentence for retrieving a duplicate sentence:\", \"Recent studies have suggested that statins could delay breast cancer recurrence.\"]\n]\n```\n\n### 3. 生成嵌入向量\n调用 `encode` 方法即可获取向量表示。\n\n```python\n# 计算嵌入向量\ncustomized_embeddings = model.encode(text_instruction_pairs)\n\n# 查看结果\nfor pair, embedding in zip(text_instruction_pairs, customized_embeddings):\n    print(f\"指令：{pair[0]}\")\n    print(f\"文本：{pair[1]}\")\n    print(f\"向量维度：{embedding.shape}\")\n    print(\"-\" * 30)\n```\n\n### 进阶参数说明\n`model.encode()` 支持以下常用参数：\n*   `batch_size`: 批处理大小，默认 32。\n*   `show_progress_bar`: 是否显示进度条，默认 `None`。\n*   `convert_to_numpy`: 是否转换为 numpy 数组，默认 `True`。\n*   `normalize_embeddings`: 是否归一化向量（长度为 1），默认 `False`。若设为 `True`，后续相似度计算可使用更快的点积。\n\n```python\n# 示例：开启进度条并归一化输出\nembeddings = model.encode(\n    text_instruction_pairs, \n    show_progress_bar=True, \n    normalize_embeddings=True\n)\n```","某医疗科技公司的数据团队正在构建一个跨科室的临床文献智能检索系统，需要让医生能通过自然语言快速找到高度相关的病例报告和研究论文。\n\n### 没有 instructor-embedding 时\n- **任务适配成本高**：面对“查找相似病例”和“筛选最新疗法”等不同需求，团队必须为每个特定任务单独微调模型，开发周期长达数周。\n- **领域泛化能力弱**：在通用语料上训练的嵌入模型，难以理解医学专有名词的深层语义，导致心血管领域的查询常误匹配到骨科文献。\n- 
**指令感知缺失**：模型无法区分用户是想“总结文章”还是“寻找反例”，只能机械地计算字面相似度，检索结果往往答非所问。\n- **维护负担重**：随着业务扩展至金融合规等新领域，需重复训练多套模型，导致服务器资源紧张且版本管理混乱。\n\n### 使用 instructor-embedding 后\n- **零样本任务切换**：只需在输入时添加如“代表医学句子以检索重复项”的指令文本，instructor-embedding 即可立即适应新任务，无需任何额外训练。\n- **跨域语义精准**：凭借指令微调机制，模型能准确捕捉“他汀类药物”在预防复发场景下的特定含义，显著提升了跨科室检索的准确率。\n- **意图动态对齐**：通过改变指令前缀（如“用于聚类的科学标题”），instructor-embedding 能动态调整向量空间分布，完美契合分类、聚类或评估等不同目标。\n- **统一架构部署**：仅需维护一个大模型即可覆盖公司所有业务线，大幅降低了算力成本和运维复杂度。\n\ninstructor-embedding 通过“一条指令适配任意任务”的特性，将原本繁琐的多模型训练流程简化为即插即用的文本提示，极大提升了垂直领域 AI 应用的落地效率。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fxlang-ai_instructor-embedding_7c53ab22.png","xlang-ai","XLANG Lab","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fxlang-ai_02033ec0.png","Developing embodied AI agents that empower users to use language to interact with digital and physical environments to carry out real-world tasks.",null,"XLangNLP","https:\u002F\u002Fxlang.ai","https:\u002F\u002Fgithub.com\u002Fxlang-ai",[84],{"name":85,"color":86,"percentage":87},"Python","#3572A5",100,2022,157,"2026-03-31T07:09:47","Apache-2.0","未说明","未说明 (代码支持通过 device 参数指定计算设备，默认使用系统默认设备，通常隐含支持 CPU 和 CUDA GPU)",{"notes":95,"python":96,"dependencies":97},"建议使用 conda 创建虚拟环境（示例命令为 python=3.7）。可通过 PyPI 安装 'InstructorEmbedding' 包或从源码安装。模型需从 Hugging Face 下载（提供 base\u002Flarge\u002Fxl 不同尺寸）。编码时需注意输入格式为 [['指令', '文本'], ...]。","3.7+",[98,99,100,101],"InstructorEmbedding","torch","transformers","sentence-transformers",[13,54,26],[104,105,106,107,108,109,110,111,112,113],"embeddings","information-retrieval","language-model","text-classification","text-clustering","text-embedding","text-evaluation","text-semantic-similarity","prompt-retrieval","text-reranking","2026-03-27T02:49:30.150509","2026-04-06T11:30:58.361466",[],[]]