[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-code-kern-ai--refinery":3,"tool-code-kern-ai--refinery":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160784,2,"2026-04-19T11:32:54",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":81,"stars":94,"forks":95,"last_commit_at":96,"license":97,"difficulty_score":98,"env_os":99,"env_gpu":99,"env_ram":99,"env_deps":100,"category_tags":106,"github_topics":108,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":129,"updated_at":130,"faqs":131,"releases":157},9806,"code-kern-ai\u002Frefinery","refinery","The data scientist's open-source choice to scale, assess and maintain natural language data. 
Treat training data like a software artifact.","Refinery 是一款专为数据科学家打造的开源工具，旨在帮助用户高效地扩展、评估和维护自然语言处理（NLP）所需的训练数据。它的核心理念是将训练数据视为可版本化、可管理的“软件工件”，而不仅仅是静态文件。\n\n在实际工作中，许多团队面临标注数据不足、数据散落在表格或文本文件中难以评估质量，或在资源有限的情况下不知如何优化标注效率等痛点。Refinery 正是为解决这些问题而生。它支持半自动化标注流程，帮助快速识别训练数据中的低质量子集，并提供统一的数据监控面板，让用户能以“数据为中心”的方式构建更优质的 NLP 模型。虽然 Refinery 不取代人工标注，但它能确保宝贵的人力时间花在刀刃上。此外，项目正积极开发与其他标注工具的集成，方便用户灵活切换工作流。\n\n无论是独立开发者开展个人 NLP 项目，还是协作团队希望在有限预算和时间内最大化数据价值，Refinery 都是理想选择。其开源特性、对数据质量的深度洞察以及模块化设计，使其成为现代自然语言数据处理流程中不可或缺的一环。","![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_f115b18f2e89.png)\n\n\u003Cdiv align=\"center\">\n    \u003Cp>\u003Cb>The data scientist's open-source choice to scale, assess and maintain natural language data.\u003C\u002Fb>\u003C\u002Fp>\n    \u003Cp>\u003Cb>Treat training data like a software artifact.\u003C\u002Fb>\u003C\u002Fp>\n\u003C\u002Fdiv>\n\n\u003Cp align=center>\n    \u003Ca href=\"https:\u002F\u002Fpypi.org\u002Fproject\u002Fkern-refinery\u002F1.3.0\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpypi-yellow.svg\" alt=\"pypi\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fblob\u002Fmaster\u002FLICENSE\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache%202.0-success\" alt=\"Apache 2.0 License\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fdiscussions\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscussions-gray.svg?logo=github\" alt=\"GitHub Discussions\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscord-gray.svg?logo=discord\" alt=\"Discord\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Ftwitter.com\u002FMeetKern\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FTwitter-white.svg?logo=twitter\" 
alt=\"Twitter\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fcompany\u002Fkern-ai\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLinkedIn-0A66C2.svg?logo=linkedin\" alt=\"LinkedIn\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.youtube.com\u002F@kern_ai\u002Fvideos\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FYouTube-FF0000.svg?logo=youtube\" alt=\"YouTube\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fapp.kern.ai\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCloud-black.svg?logo=data:image\u002Fpng;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAeCAYAAABNChwpAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAALDSURBVHgBvVfBjRUxDLWzf+GCxJeQkLhNJ2wLFIC02wFIcNgG4IBAdMCBBrYD6IS5oYXLcoAL\u002FBg7iRM7Mx+ElMHSTDxOxn52nMRBKPR4vvwAQA8BELOE+EGQD8QAIYkRgjzMiwyl5WGh8NKXZFD6y1jpP0Jz15ONYzGNRSQtUZYrNBKBPPqn4RDRaSVYEuV\u002FpwYgeEVUYyAtHVW0ZqgoT+5Y4JYUZAMQdSBWr1GVkgGnfegVQZWRjxomScWOHZyw9IKqB8WdPK\u002FFG3lZUOkLdUqsHKrraIKg0RSd8s+uByAIyQSbzV8R4k1KwJIhWKOQEw+InxDq\u002FwlpKJoEOdKeRWcs2WefGoid9x2hm+n5\u002FfTiEQyg59ev37H6817upgBN+7eE+1ciaMmi4e8A+NC7zBlAbJS679S6HOgzFGF7Citcs45jIeCKvt2Tzy8nYb79\u002FM4Zm5dcjJGzPt7wPM0wkGTe7RJPq+D05PYn+bgXbhmEePXq\u002FtMLGEhsOKm3URAwwaLZksISUNlH\u002FhOl4HfhXwBzW\u002FBoKkd2b89vRGuoRmLoABydggjbkEbYroZdP2B0YkilxX5P1z++7k8C6hH98e7pnYsKgOqRm0M0MgJinJuJWOuv2OqEtw8uZ+lPzubDATajXBXh6iFXp2DLbaBURQZOo7BYerTNKqh7LAbnarBns44cvQpaeaaVdMOwy6j6+NP5sy9vUuLIT3w4Mcdtqgdj0iGyxAPkN+XRkQ5JdogHMbNnftIyrSt1GwBnuqwIZs4gKWyzljMYa6EqvC3ZiWIto0OJbL1okJb5fspdDuQ1Wopzbf2rITGHmivbiy0yhfiftvjQ1\u002FVaFVHRa0vwasR\u002FY7vM1TtFKompFPE26D2U1XtBVmwuIOhimMfRcmHZ61muQn122aC1CADOsEK0YBC05FeACzLXjxbHPu\u002FFapiV\u002FQ23LD\u002F8UGO+6AAAAABJRU5ErkJggg==\" alt=\"Kern AI\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdemo.kern.ai\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPlayground-white.svg\" 
alt=\"Playground\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdocs.kern.ai\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDocs-blue.svg\" alt=\"Docs\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.kern.ai\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWeb-white.svg\" alt=\"Website\">\u003C\u002Fa>\n\u003C\u002Fp>\n\nDoes one of these scenarios sounds familiar to you?\n\n- You are working on your own side-project in NLP, but you don't have enough labeled data to train a good model.\n- You are working in a team and already have some labeled data, but your training data is just stored in a spreadsheet or some TXT-file, and you have no idea how _good_ it actually is.\n- You are working in a team about to start a new project with limited resources (annotators, budget, time), and now want to understand how you can best make use of them\n\nIf so, you are one of the people we've built refinery for. refinery helps you to build better NLP models in a data-centric approach. Semi-automate your labeling, find low-quality subsets in your training data, and monitor your data in one place.\n\n_refinery_ doesn't get rid of manual labeling, but it makes sure that your valuable time is spent well. Also, the makers of _refinery_ currently work on integrations to other labeling tools, such that you can easily switch between different choices.\n\n![Showcase GIF of _refinery_](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_bae4fd401088.gif)\n\n> **_DEMO:_** You can interact with the application in a (mostly read-only) online playground. Check it out [here](https:\u002F\u002Fdemo.kern.ai)\n\n_refinery_ is a multi-repository project, you can find all integrated services in the architecture below. 
The app builds on top of [🤗 Hugging Face](https:\u002F\u002Fwww.huggingface.co) and [spaCy](https:\u002F\u002Fspacy.io\u002F) to leverage pre-built language models for your NLP tasks, as well as [qdrant](https:\u002F\u002Fgithub.com\u002Fqdrant\u002Fqdrant) for neural search.\n\n## Table of contents\n\n- [🧑‍💻 Why _refinery_?](#-why-refinery)\n  - [Enabling ideas of one-person-armies](#enabling-ideas-of-one-person-armies)\n  - [Extending your existing labeling approach](#extending-your-existing-labeling-approach)\n  - [Put structure into unstructured data](#put-structure-into-unstructured-data)\n  - [Pushing collaboration](#pushing-collaboration)\n  - [Open-source, and treating training data as a software artifact](#open-source-and-treating-training-data-as-a-software-artifact)\n  - [Integrations](#integrations)\n- [Your benefits](#your-benefits)\n- [How does Kern AI make money, if refinery is open-source?](#how-does-kern-ai-make-money-if-refinery-is-open-source)\n- [🤓 Features](#-features)\n  - [(Semi-)automated labeling workflow for NLP tasks](#semi-automated-labeling-workflow-for-nlp-tasks)\n  - [Extensive data management and monitoring](#extensive-data-management-and-monitoring)\n  - [Team workspaces in the managed version](#team-workspaces-in-the-managed-version)\n- [☕ Installation](#-installation)\n  - [From pip](#from-pip)\n  - [From repository](#from-repository)\n  - [Persisting data](#persisting-data)\n- [📘 Documentation and tutorials](#-documentation-and-tutorials)\n- [😵‍💫 Need help?](#-need-help)\n- [🪢 Community and contact](#-community-and-contact)\n- [🙌 Contributing](#-contributing)\n- [❓ FAQ](#-faq)\n  - [Concept questions](#concept-questions)\n  - [Technical questions](#technical-questions)\n  - [Service and hosting questions](#service-and-hosting-questions)\n- [🐍 Python SDK](#-python-sdk)\n- [🏠 Architecture](#-architecture)\n- [🏫 Glossary](#-glossary)\n- [👩‍💻👨‍💻 Team and contributors](#-team-and-contributors)\n- [🌟 Star History](#-star-history)\n- [📃 
License](#-license)\n\n## 🧑‍💻 Why _refinery_?\n\nThere are already many other tools available to build training data. Why did we decide to build _yet another one_?\n\n### Enabling ideas of one-person-armies\n\nWe believe that developers can have crazy ideas, and we want to lower the barrier for them to go for that idea. _refinery_ is designed to build labeled training data much faster, so that it takes you very little time to prototype an idea. We've received much love for exactly that, so make sure to give it a try for your next project.\n\n### Extending your existing labeling approach\n\n_refinery_ is more than a labeling tool. It has a built-in labeling editor, but its main advantages come with automation and data management. You can integrate any kind of heuristic to label what is possible automatically, and then focus on headache-causing subsets afterwards. Whether you do the labeling in _refinery_ or any other tool (even crowd labeled) doesn't matter!\n\n### Put structure into unstructured data\n\n_refinery_ is the tool that brings new perspectives into your data. You're working on multilingual, human-written texts? Via our integration to [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks), you can easily enrich your texts with metadata such as the detected language, sentence complexity and many more. You can use this both to analyze your data, but also to orchestrate your labeling workflow.\n\n### Pushing collaboration\n\nWhile doing so, we aim to improve the collaboration between engineers and subject matter experts (SMEs). In the past, we've seen how our application was being used in meetings to discuss label patterns in form of labeling functions and distant supervisors. We believe that data-centric AI is the best way to leverage collaboration.\n\n### Open-source, and treating training data as a software artifact\n\nWe hate the idea that there are still use cases in which the training data is just a plain CSV-file. 
That is okay if you _really_ just quickly want to prototype something at hand with a few records, but any serious software should be maintainable. We believe an open-source solution for training data management is what's needed here. _refinery_ is the tool helping you to document your data. That's how you treat training data as a software artifact.\n\n### Integrations\n\nLastly, _refinery_ supports [SDK actions](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python) like pulling and pushing data. Data-centric AI redefines labeling to be more than a one-time job by giving it an iterative workflow, so we aim to give you more power every day by providing end-to-end capabilities, growing the large-scale availability of high-quality training data. Use our SDK to program integrations with your existing landscapes.\n\n## Your benefits\n\nYou can automate tons of repetitive tasks, gain better insights into the data labeling workflow, receive an implicit documentation for your training data, and can ultimately build better models in shorter time.\n\nOur goal is to make training data building feel more like a programmatic and enjoyable task, instead of something tedious and repetitive. _refinery_ is our contribution to this goal. And we're constantly aiming to improve this contribution.\n\nIf you like what we're working on, please leave a ⭐!\n\n## How does Kern AI make money, if refinery is open-source?\n\nYou won't believe how often we get that question - and it is a fair one 🙂 Put short, the open-source version of _refinery_ is currently a single-user version, and you can get access to a multi-user environment with our commercial options. Additionally, we have commercial products on top of _refinery_, e.g. 
to use the _refinery_ automations as an actual realtime prediction API.\n\nGenerally, we are passionate about open-source and want to contribute as much as possible.\n\n## 🤓 Features\n\nFor a detailed overview of features, please look into our [docs](https:\u002F\u002Fdocs.kern.ai).\n\n### (Semi-)automated labeling workflow for NLP tasks\n\n- Both manual and programmatic for classifications and span-labeling\n- Integration with state-of-the-art libraries and frameworks\n- Creation and management of lookup lists\u002Fknowledge bases to support during labeling\n- Neural search-based retrieval of similar records and outliers\n- Sliceable labeling sessions to drill-down on specific subsets\n- Multiple labeling tasks possible per project\n- Rich library of ready-made automations in our open-source [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks) library\n\n### Extensive data management and monitoring\n\n- Best-in-class data management capabilities via our databrowser. Filter, sort and search your data e.g. by confidence, heuristic overlap, user, note, etc.\n- Integration with [🤗 Hugging Face](https:\u002F\u002Fwww.huggingface.co) to automatically create document- and token-level embeddings\n- JSON-based data model for up- and downloads\n- Overview of project metrics like confidence and label distributions and confusion matrix\n- Data accessible and extendable via our [Python SDK](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python)\n- Attribute modifications to extend your attributes (e.g. 
with sentence complexity metrics) in-place\n- Again, you can use [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks) to enrich your data with metadata\n\n### Team workspaces in the [managed version](https:\u002F\u002Fwww.kern.ai\u002Fpricing)\n\n- Allow multiple users to label your data with role-based access and minimized labeling views\n- Integrate crowd labeling workflows\n- Automated calculation of inter-annotator agreements\n\n## ☕ Installation\n\n### From pip\n\n```\npip install kern-refinery\n```\n\nOnce the library is installed, go to the directory where you want to store the data and run `refinery start`. This will automatically `git clone` this repository first if you haven't done so yet. To stop the server, run `refinery stop`.\n\n### From repository\n\n**TL;DR:**\n\n```\n$ git clone https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery.git\n$ cd refinery\n```\n\nIf you're on Mac\u002FLinux:\n\n```\n$ .\u002Fstart\n```\n\nIf you're on Windows:\n\n```\n$ start.bat\n```\n\nTo stop, type `.\u002Fstop` (Mac\u002FLinux) or `stop.bat`.\n\n_refinery_ consists of multiple services that need to be run together. To do so, we've set up a setup file, which will automatically pull and connect the respective services for you. The file is part of this repository, so you can just clone it and run `.\u002Fstart` (Mac\u002FLinux) or `start.bat` (Windows) in the repository. After some minutes (now is a good time to grab a coffee ☕), the setup is done and you can access `http:\u002F\u002Flocalhost:4455` in your browser. To stop the server, run `.\u002Fstop` (Mac\u002FLinux) or `.\u002Fstop.bat` (Windows).\n\n**You're ready to start! 🙌 🎉**\n\nIf you run into any issues during installation, please don't hesitate to reach out to us (see community section below).\n\n### Persisting data\n\nBy default, we store the data to the directory `refinery\u002Fpostgres-data`. 
If you want to change that path, you need to modify the variable `LOCAL_VOLUME` of the `start` script of your operating system. To remove data, simply delete the volume folder. **Make sure to delete only if you don't need the data any longer - this is irreversible!**\n\n## 📘 Documentation and tutorials\n\nThe best way to start with _refinery_ is our [**quick start**](https:\u002F\u002Fdocs.kern.ai\u002Frefinery\u002Fquickstart).\n\nYou can find extensive guides in our [docs](https:\u002F\u002Fdocs.kern.ai) and [tutorials](https:\u002F\u002Fwww.youtube.com\u002F@kern_ai\u002Fvideos) on our YouTube channel. We've also prepared a [repository with sample projects](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsample-projects) which you can clone.\n\nIf you need help writing your first labeling functions, look into our open-source content library [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks).\n\nYou can find our changelog [here](https:\u002F\u002Fchangelog.kern.ai).\n\n## 😵‍💫 Need help?\n\nNo worries, we've got you. If you have questions, reach out to us on [Discord](https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW), or [open a ticket](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fdiscussions\u002Fcategories\u002Fq-a) in the \"q&a\" category of our forum.\n\n## 🪢 Community and contact\n\nFeel free to join our [Discord](https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW), where we'll happily help you building your training data:\n\nWe send out a (mostly) weekly newsletter about recent findings in data-centric AI, product highlights in development and more. 
You can subscribe to the newsletter [here](https:\u002F\u002Fwww.kern.ai\u002Fnewsletter).\n\nAlso, you can follow us on [Twitter](https:\u002F\u002Ftwitter.com\u002FMeetKern) and [LinkedIn](https:\u002F\u002Fwww.linkedin.com\u002Fcompany\u002Fkern-ai).\n\n## 🙌 Contributing\n\nContributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. You can do so by providing feedback about [desired features and bugs](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues) you might detect.\n\nIf you actively want to participate in extending the code base, reach out to us. We'll explain you how the architecture is set up, so you can customize the application as you desire.\n\n## ❓ FAQ\n\n### Concept questions\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>What is a heuristic?\u003C\u002Fb>\u003C\u002Fsummary>\n    Heuristics are the ingredients for scaling your data labeling. They don't have to be 100% accurate, heuristics can be e.g. simple Python functions expressing some domain knowledge. When you add and run several of these heuristics, you create what is called a noisy label matrix, that is matched against the reference data that you manually labeled. This allows us to analyze correlations, conflicts, overlaps, the number of hits for a data set, and the accuracy of each heuristic.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>How can I build an active learning model?\u003C\u002Fb>\u003C\u002Fsummary>\n    We use pre-trained models to create embeddings in the first place. Once this is done, the embeddings are available in the application (both for building active learning heuristics and neural search). 
In our active learning IDE, you can then build a simple classification or extraction head on top of the embedding, and we'll manage then execution in a containerized environment.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>How do I know whether my heuristic is good?\u003C\u002Fb>\u003C\u002Fsummary>\n    A heuristic can be “good” with respect to both coverage and precision. For coverage there basically is no limitation at all, for precision we generally recommend some value above 70%, depending on how many heuristics you have. The more heuristics you have, the more overlaps and conflicts will be given, the better weak supervision can work.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>I have less than 1,000 records - Do I need this?\u003C\u002Fb>\u003C\u002Fsummary>\n    You can definitely use the system for smaller datasets, too! It not only shines via programmatic labeling, but also has a simple and beautiful UI. Go for it 😁\n\u003C\u002Fdetails>\n\n### Technical questions\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>Help!! I forgot my password!\u003C\u002Fb>\u003C\u002Fsummary>\n    No worries, you can send a reset link even on your local machine. However, the link isn't sent to your email, but to the mailhog. Access it via \u003Ca href=\"http:\u002F\u002Flocalhost:4436\">http:\u002F\u002Flocalhost:4436\u003C\u002Fa>.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>I want to install a library for my labeling function\u003C\u002Fb>\u003C\u002Fsummary>\n    For this, we need to change the requirements.txt of the \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-lf-exec-env\">lf-exec-env\u003C\u002Fa>, the containerized execution environment for your labeling functions. 
Please just \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\">open an issue\u003C\u002Fa>, and we'll integrate your library as soon as possible.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>Which data formats are supported?\u003C\u002Fb>\u003C\u002Fsummary>\n    We’ve structured our data formats around JSON, so you can upload most file types natively. This includes spreadsheets, text files, CSV data, generic JSON and many more.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>How can I upload data?\u003C\u002Fb>\u003C\u002Fsummary>\n    We use \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fpandas-dev\u002Fpandas\">pandas\u003C\u002Fa> internally for matching your data to our JSON-based data model. You can upload the data via our UI, or via our \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python\">Python SDK\u003C\u002Fa>.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>How can I download data, and what format does it have?\u003C\u002Fb>\u003C\u002Fsummary>\n    You can download your data in our UI or via the \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python\">Python SDK\u003C\u002Fa>, where we also provide e.g. adapters to \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FRasaHQ\u002Frasa\">Rasa\u003C\u002Fa>. The export looks something like this:\n\n    [\n        {\n            \"running_id\": \"0\",\n            \"headline\": \"T. 
Rowe Price (TROW) Dips More Than Broader Markets\",\n            \"date\": \"Jun-30-22 06:00PM\\u00a0\\u00a0\",\n            \"headline__sentiment__MANUAL\": null,\n            \"headline__sentiment__WEAK_SUPERVISION\": \"NEGATIVE\",\n            \"headline__sentiment__WEAK_SUPERVISION__confidence\": 0.62,\n            \"headline__entities__MANUAL\": null,\n            \"headline__entities__WEAK_SUPERVISION\": [\n                \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"O\", \"O\", \"O\", \"O\", \"O\"\n            ],\n            \"headline__entities__WEAK_SUPERVISION__confidence\": [\n                0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.00, 0.00, 0.00, 0.00, 0.00\n            ]\n        }\n    ]\n\n\u003C\u002Fdetails>\n\n### Service and hosting questions\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>Are there options for an enterprise on-prem solution?\u003C\u002Fb>\u003C\u002Fsummary>\n    If you're interested in running the multi-user version on your premises, please \u003Ca href=\"https:\u002F\u002Fwww.kern.ai\">reach out to us\u003C\u002Fa>. We can help you to set up the deployment and prepare your project(s) e.g. with workshops.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>I don't want to label myself. What are my options?\u003C\u002Fb>\u003C\u002Fsummary>\n    Do you want to outsource your labeling, and let your engineers use _refinery_ as a mission control for your training data? \u003Ca href=\"https:\u002F\u002Fwww.kern.ai\">Reach out to us\u003C\u002Fa>, so we can discuss how we can help you with your use case.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>How can I reach support?\u003C\u002Fb>\u003C\u002Fsummary>\n    In our open-source solution, you can reach out to us via \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW\">Discord\u003C\u002Fa>. 
For our managed version, you have an in-app chat to directly contact our support team.\n\u003C\u002Fdetails>\n\n## 🐍 Python SDK\n\nYou can extend your projects by using our [Python SDK](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python). With it, you can easily export labeled data of your current project and import new files both programmatically and via CLI (`rsdk pull` and `rsdk push \u003Cfile_name>`). It also comes with adapters, e.g. to [Rasa](https:\u002F\u002Fgithub.com\u002FRasaHQ\u002Frasa).\n\n## 🏠 Architecture\n\nOur architecture follows some main patterns:\n\n- Shared service database to efficiently transfer large data loads. To avoid redundant code in the services, we use submodules to share the data model\n- Containerized function execution for labeling functions, active learning and the record ide\n- Machine learning logic is implemented in stand-alone libraries (e.g. [sequence-learn](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsequence-learn))\n\n\u003C\u002Fbr>\n\n![Architecture _refinery_](architecture.svg)\n\n\u003Cp align=center>\u003Ci>Some edges are not displayed for simplicity's sake. \n\u003C\u002Fbr>\nThe color of the edges have no implicit meaning, and are only used for better readability.\u003C\u002Fi>\u003C\u002Fp>\n\n\u003C\u002Fbr>\n\n**Service overview (maintained by Kern AI)**\n| Service | Description |\n|--- |--- |\n| [ml-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ml-exec-env) | Execution environment for the active learning module. Containerized function as a service to build active learning models using scikit-learn and sequence-learn. |\n| [embedder](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-embedder) | Embedder for _refinery_. Manages the creation of document- and token-level embeddings using the embedders library. |\n| [weak-supervisor](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-weak-supervisor) | Weak supervision for _refinery_. 
Manages the integration of heuristics such as labeling functions, active learners or zero-shot classifiers. Uses the weak-nlp library for the actual integration logic and algorithms. |\n| [record-ide-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-record-ide-env) | Execution environment for the record IDE. Containerized function as a service to build record-specific \"quick-and-dirty\" code snippets for exploration and debugging. |\n| [config](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-config) | Configuration of _refinery_. Amongst others, this manages endpoints and available language models for spaCy. |\n| [tokenizer](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-tokenizer) | Tokenizer for _refinery_. Manages the creation and storage of spaCy tokens for text-based record attributes and supports multiple language models. |\n| [gateway](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-gateway) | Gateway for _refinery_. Manages incoming requests and holds the workflow logic. To interact with the gateway, the UI or Python SDK can be used. |\n| [authorizer](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-authorizer) | Evaluates whether a user has access to certain resources. |\n| [websocket](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-websocket) | Websocket module for refinery. Enables asynchronous notifications inside the application. |\n| [lf-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-lf-exec-env) | Execution environment for labeling functions. Containerized function as a service to execute user-defined Python scripts. |\n| [ac-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ac-exec-env) | Execution environment for attribute calulaction. Containerized function as a service to generate new attributes using Python scripts. 
|\n| [updater](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-updater) | Updater for _refinery_. Manages migration logic to new versions if required. |\n| [neural-search](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-neural-search) | Neural search for _refinery_. Manages similarity search powered by Qdrant and outlier detection, both based on vector representations of the project records. |\n| [zero-shot](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-zero-shot) | Zero-shot module for _refinery_. Enables the integration of 🤗 Hugging Face zero-shot classifiers as an off-the-shelf no-code heuristic. |\n| [entry](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-entry) | Login and registration screen for refinery. Implemented via Ory Kratos. |\n| [ui](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ui) | UI for _refinery_. Used to interact with the whole system; to find out how to best work with the system, check out our docs. |\n| [doc-ock](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-doc-ock) | Usage statistics collection for _refinery_. If users allow it, this collects product insight data used to optimize the user experience. |\n| [gateway-proxy](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-gateway-proxy) | Gateway proxy for _refinery_. Manages incoming requests and forwards them to the gateway. Used by the Python SDK. |\n| [parent-images](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-exec-env-parent-image) | Shared images used by _refinery_. Used to reduce the required space for _refinery_. _Not yet listed in architecture diagram_ |\n| [ac-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ac-exec-env) | Execution environment for attribute calculation in _refinery_. Containerized function as a service to build custom attributes derived from the original data. 
_Not yet listed in architecture diagram_ |\n| [alfred](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Falfred) | Controls the start process of the _refinery_ app. Named after Batman's butler Alfred. _Not yet listed in architecture diagram_|\n\n**Service overview (open-source 3rd party)**\n| Service | Description |\n|--- |--- |\n| [qdrant\u002Fqdrant](https:\u002F\u002Fgithub.com\u002Fqdrant\u002Fqdrant) | Qdrant - Vector Search Engine for the next generation of AI applications |\n| [postgres\u002Fpostgres](https:\u002F\u002Fgithub.com\u002Fpostgres\u002Fpostgres) | PostgreSQL: The World's Most Advanced Open Source Relational Database |\n| [minio\u002Fminio](https:\u002F\u002Fgithub.com\u002Fminio\u002Fminio) | Multi-Cloud ☁️ Object Storage |\n| [mailhog\u002FMailHog](https:\u002F\u002Fgithub.com\u002Fmailhog\u002FMailHog) | Web and API based SMTP testing |\n| [ory\u002Fkratos](https:\u002F\u002Fgithub.com\u002Fory\u002Fkratos) | Next-gen identity server (think Auth0, Okta, Firebase) with Ory-hardened authentication, MFA, FIDO2, TOTP, WebAuthn, profile management, identity schemas, social sign in, registration, account recovery, passwordless. Golang, headless, API-only - without templating or theming headaches. Available as a cloud service. |\n| [ory\u002Foathkeeper](https:\u002F\u002Fgithub.com\u002Fory\u002Foathkeeper) | A cloud native Identity & Access Proxy \u002F API (IAP) and Access Control Decision API that authenticates, authorizes, and mutates incoming HTTP(s) requests. Inspired by the BeyondCorp \u002F Zero Trust white paper. Written in Go. |\n\n**Integrations overview (maintained by Kern AI)**\n| Integration | Description |\n|--- |--- |\n| [refinery-python](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python) | Official Python SDK for Kern AI refinery. 
|\n| [sequence-learn](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsequence-learn) | With sequence-learn, you can build models for named entity recognition as quickly as if you were building a sklearn classifier. |\n| [embedders](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fembedders) | With embedders, you can easily convert your texts into sentence- or token-level embeddings within a few lines of code. Use cases for this include similarity search between texts, information extraction such as named entity recognition, or basic text classification. Integrates 🤗 Hugging Face transformer models |\n| [weak-nlp](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fweak-nlp) | With weak-nlp, you can integrate heuristics like labeling functions and active learners based on weak supervision. Automate data labeling and improve label quality. |\n\n**Integrations overview (open-source 3rd party)**\n| Integration | Description |\n|--- |--- |\n| [huggingface\u002Ftransformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) | 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX. |\n| [scikit-learn\u002Fscikit-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn\u002Fscikit-learn) | scikit-learn: machine learning in Python |\n| [explosion\u002FspaCy](https:\u002F\u002Fgithub.com\u002Fexplosion\u002FspaCy) | 💫 Industrial-strength Natural Language Processing (NLP) in Python |\n\n**Submodules overview**\n\nNot listed in the architecture, but for internal code management, we apply git submodules.\n| Submodule | Description |\n|--- |--- |\n| [submodule-model](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-submodule-model) | Data model for refinery. Manages entities and their access for multiple services, e.g. the gateway. |\n| [submodule-s3](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-submodule-s3) | S3 related AWS and Minio logic. 
|\n\n## 🏫 Glossary\n\n| Term                        | Meaning                                                                                                                                                                                                                                                                                                                                                 |\n| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| Weak supervision            | Technique\u002Fmethodology to integrate different kinds of noisy and imperfect heuristics like labeling functions. It can be used not only to automate data labeling, but generally as an approach to improve your existing label quality.                                                                                                                   |\n| Neural search               | Embedding-based approach to retrieve information; instead of telling a machine a set of constraints, neural search analyzes the vector space of data (encoded via e.g. pre-trained neural networks). Can be used e.g. to find nearest neighbors.                                                                                                        |\n| Active learning             | As data is labeled manually, a model is trained continuously to support the annotator. Can be used e.g. stand-alone, or as a heuristic for weak supervision.                                                                                                                                                                                            
|\n| Vector encoding (embedding) | Using pre-trained models such as transformers from [🤗 Hugging Face](https:\u002F\u002Fwww.huggingface.co), texts can be transformed into vector space. This is both helpful for neural search and active learning (in the latter case, simple classifiers can be applied on top of the embedding, which enables fast re-training on the vector representations). |\n\nMissing anything in the glossary? [Add the term](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues) in an issue with the tag \"enhancement\".\n\n\u003C!-- |   \t|   \t| -->\n\n## 👩‍💻👨‍💻 Team and contributors\n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fhenrikwenck\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465fb5b23739972c34498_Rectangle%20542.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Henrik Wenck\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjohanneshötter\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f4663f60ecc168136e5863_Rectangle%20545.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Johannes Hötter\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fanton-pullem-b028291ab\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465ee28293abb09cfb4f4_Rectangle%20547.jpg\" width=\"50px;\" 
alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Anton Pullem\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Flina-lumburovska-4b5250173\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F625eef007630e2379d85d12f_Lina.jpeg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Lina Lumburovska\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fmoritz-feuerpfeil\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f4660e28293a7cf7cfb563_Rectangle%20544.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Moritz Feuerpfeil\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fleonard-p%C3%BCttmann-4648231a9\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F629210dbfa42ae1cfa062b2e_Bild%20Leo.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Leo Püttmann\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fsimon-degraf-8aba731b5\u002F\">\n            \u003Cimg 
src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f466235b237362fec344ec_Rectangle%20546.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Simon Degraf\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Ffelix-kirsch\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F62cd22cd378c5d0f47944cf9_Felix.JPG\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Felix Kirsch\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjens-wittmeyer-9934a2231\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F6204e65e7187e9a2ffa03777_Jens.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Jens Wittmeyer\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fmikhailkochikov\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F6207c49264911697b5f58939_Mikhail.png\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Mikhail Kochikov\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca 
href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fsimon-witzke\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465e2831178b0b000731a_Rectangle%20543.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Simon Witzke\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fshamanth-shetty-276a8415a\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F624b013ae420cd98e9e38761_shamanth.png\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Shamanth Shetty\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fdivyanshu-katiyar-45ba03131\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fmedia.licdn.com\u002Fdms\u002Fimage\u002FC5603AQGCchIwJJ2X5g\u002Fprofile-displayphoto-shrink_100_100\u002F0\u002F1630708281678?e=1681948800&v=beta&t=v80garJleGF5M_FPLiLtpTPpWRha6n3HLD6IBzOwmyM\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>Divyanshu Katiyar\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## 🌟 Star History\n\n[![Star History Chart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_5e20e36865da.png)](https:\u002F\u002Fstar-history.com\u002F#code-kern-ai\u002Frefinery&Date)\n\n## 📃 License\n\n_refinery_ is licensed under the Apache License, Version 2.0. 
View a copy of the [License file](LICENSE).\n","![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_f115b18f2e89.png)\n\n\u003Cdiv align=\"center\">\n    \u003Cp>\u003Cb>数据科学家用于扩展、评估和维护自然语言数据的开源工具。\u003C\u002Fb>\u003C\u002Fp>\n    \u003Cp>\u003Cb>将训练数据视作软件构件。\u003C\u002Fb>\u003C\u002Fp>\n\u003C\u002Fdiv>\n\n\u003Cp align=center>\n    \u003Ca href=\"https:\u002F\u002Fpypi.org\u002Fproject\u002Fkern-refinery\u002F1.3.0\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fpypi-yellow.svg\" alt=\"pypi\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fblob\u002Fmaster\u002FLICENSE\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-Apache%202.0-success\" alt=\"Apache 2.0 License\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fdiscussions\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscussions-gray.svg?logo=github\" alt=\"GitHub Discussions\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDiscord-gray.svg?logo=discord\" alt=\"Discord\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Ftwitter.com\u002FMeetKern\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FTwitter-white.svg?logo=twitter\" alt=\"Twitter\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fcompany\u002Fkern-ai\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLinkedIn-0A66C2.svg?logo=linkedin\" alt=\"LinkedIn\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.youtube.com\u002F@kern_ai\u002Fvideos\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FYouTube-FF0000.svg?logo=youtube\" alt=\"YouTube\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fapp.kern.ai\u002F\">\u003Cimg 
src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FCloud-black.svg?logo=data:image\u002Fpng;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAeCAYAAABNChwpAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAALDSURBVHgBvVfBjRUxDLWzf+GCxJeQkLhNJ2wLFIC02wFIcNgG4IBAdMCBBrYD6IS5oYXLcoAL\u002FBg7iRM7Mx+ElMHSTDxOxn52nMRBKPR4vvwAQA8BELOE+EGQD8QAIYkRgjzMiwyl5WGh8NKXZFD6y1jpP0Jz15ONYzGNRSQtUZYrNBKBPPqn4RDRaSVYEuV\u002FpwYgeEVUYyAtHVW0ZqgoT+5Y4JYUZAMQdSBWr1GVkgGnfegVQZWRjxomScWOHZyw9IKqB8WdPK\u002FFG3lZUOkLdUqsHKrraIKg0RSd8s+uByAIyQSbzV8R4k1KwJIhWKOQEw+InxDq\u002FwlpKJoEOdKeRWcs2WefGoid9x2hm+n5\u002FfTiEQyg59ev37H6817upgBN+7eE+1ciaMmi4e8A+NC7zBlAbJS679S6HOgzFGF7Citcs45jIeCKvt2Tzy8nYb79\u002FM4Zm5dcjJGzPt7wPM0wkGTe7RJPq+D05PYn+bgXbhmEePXq\u002FtMLGEhsOKm3URAwwaLZksISUNlH\u002FhOl4HfhXwBzW\u002FBoKkd2b89vRGuoRmLoABydggjbkEbYroZdP2B0YkilxX5P1z++7k8C6hH98e7pnYsKgOqRm0M0MgJinJuJWOuv2OqEtw8uZ+lPzubDATajXBXh6iFXp2DLbaBURQZOo7BYerTNKqh7LAbnarBns44cvQpaeaaVdMOwy6j6+NP5sy9vUuLIT3w4Mcdtqgdj0iGyxAPkN+XRkQ5JdogHMbNnftIyrSt1GwBnuqwIZs4gKWyzljMYa6EqvC3ZiWIto0OJbL1okJb5fspdDuQ1Wopzbf2rITGHmivbiy0yhfiftvjQ1\u002FVaFVHRa0vwasR\u002FY7vM1TtFKompFPE26D2U1XtBVmwuIOhimMfRcmHZ61muQn122aC1CADOsEK0YBC05FeACzLXjxbHPu\u002FFapiV\u002FQ23LD\u002F8UGO+6AAAAABJRU5ErkJggg==\" alt=\"Kern AI\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdemo.kern.ai\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPlayground-white.svg\" alt=\"Playground\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fdocs.kern.ai\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDocs-blue.svg\" alt=\"Docs\">\u003C\u002Fa>\n    \u003Ca href=\"https:\u002F\u002Fwww.kern.ai\u002F\">\u003Cimg src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FWeb-white.svg\" alt=\"Website\">\u003C\u002Fa>\n\u003C\u002Fp>\n\n这些场景中，有没有哪一个让你觉得似曾相识？\n\n- 你正在独自进行一项自然语言处理相关的个人项目，但手头的标注数据不足以训练出一个效果良好的模型。\n- 你在团队中工作，已经有一些标注数据，但它们只是简单地存储在电子表格或文本文件里，你根本不清楚这些数据的质量究竟如何。\n- 
你们团队即将启动一个新项目，但资源有限（标注人员、预算、时间），现在需要弄清楚如何最有效地利用这些资源。\n\n如果是这样，那么refinery正是为你而生。refinery采用以数据为中心的方法，帮助你构建更优质的自然语言处理模型。它能够半自动化地完成标注任务，发现训练数据中的低质量子集，并在一个平台上对数据进行全面监控。\n\nrefinery并不会完全取代人工标注，但它能确保你宝贵的时间被高效利用。此外，refinery的开发者们目前正在与其他标注工具进行集成，方便你在不同方案之间灵活切换。\n\n![refinery展示动图](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_bae4fd401088.gif)\n\n> **_演示:_** 你可以在一个（大部分为只读）在线试用环境中与该应用互动。点击[这里](https:\u002F\u002Fdemo.kern.ai)体验。\n\nrefinery是一个多仓库项目，所有集成的服务都可以在下方的架构图中找到。该应用基于[🤗 Hugging Face](https:\u002F\u002Fwww.huggingface.co)和[spaCy](https:\u002F\u002Fspacy.io\u002F)构建，以便为你的自然语言处理任务提供预训练的语言模型支持；同时，还使用了[qdrant](https:\u002F\u002Fgithub.com\u002Fqdrant\u002Fqdrant)来进行神经网络搜索。\n\n## 目录\n\n- [🧑‍💻 为什么选择refinery？](#-why-refinery)\n  - [助力“单兵作战”的创意实现](#enabling-ideas-of-one-person-armies)\n  - [扩展现有的标注流程](#extending-your-existing-labeling-approach)\n  - [为非结构化数据注入结构](#put-structure-into-unstructured-data)\n  - [促进协作](#pushing-collaboration)\n  - [开源理念，将训练数据视为软件构件](#open-source-and-treating-training-data-as-a-software-artifact)\n  - [集成](#integrations)\n- [你的收益](#your-benefits)\n- [如果refinery是开源的，Kern AI是如何盈利的？](#how-does-kern-ai-make-money-if-refinery-is-open-source)\n- [🤓 功能特性](#-features)\n  - [(半)自动化自然语言处理任务标注流程](#semi-automated-labeling-workflow-for-nlp-tasks)\n  - [全面的数据管理和监控](#extensive-data-management-and-monitoring)\n  - [托管版中的团队工作空间](#team-workspaces-in-the-managed-version)\n- [☕ 安装指南](#-installation)\n  - [通过pip安装](#from-pip)\n  - [从源码安装](#from-repository)\n  - [数据持久化](#persisting-data)\n- [📘 文档与教程](#-documentation-and-tutorials)\n- [😵‍💫 需要帮助吗？](#-need-help)\n- [🪢 社区与联系方式](#-community-and-contact)\n- [🙌 贡献](#-contributing)\n- [❓ 常见问题解答](#-faq)\n  - [概念相关问题](#concept-questions)\n  - [技术相关问题](#technical-questions)\n  - [服务与托管相关问题](#service-and-hosting-questions)\n- [🐍 Python SDK](#-python-sdk)\n- [🏠 架构](#-architecture)\n- [🏫 术语表](#-glossary)\n- [👩‍💻👨‍💻 团队与贡献者](#-team-and-contributors)\n- [🌟 星标历史](#-star-history)\n- [📃 许可证](#-license)\n\n## 
🧑‍💻 为什么选择refinery？\n\n目前市面上已经有许多用于构建训练数据的工具。那么，我们为何还要开发“又一个”这样的工具呢？\n\n### 助力“单兵作战”的创意实现\n\n我们相信，开发者们总能迸发出天马行空的想法，而我们的目标就是降低他们实现这些想法的门槛。refinery旨在大幅加速标注数据的构建过程，让你只需花费极少的时间就能完成一个想法的原型验证。正因为这一点，refinery受到了广泛的好评。不妨在下一个项目中亲自试一试吧。\n\n### 扩展现有的标注方法\n\n_refinery_ 不仅仅是一个标注工具。它内置了标注编辑器，但其主要优势在于自动化和数据管理功能。你可以集成任何启发式规则，以自动标注尽可能多的内容，随后再专注于那些棘手的子集。无论你是在 _refinery_ 中进行标注，还是使用其他工具（甚至是众包标注），这都无关紧要！\n\n### 为非结构化数据赋予结构\n\n_refinery_ 是一款能为你的数据带来全新视角的工具。你在处理多语言的人工撰写文本吗？通过与 [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks) 的集成，你可以轻松地为文本添加元数据，例如检测到的语言、句子复杂度等众多信息。这些元数据既可用于数据分析，也可用于编排你的标注工作流。\n\n### 推动协作\n\n在此过程中，我们的目标是提升工程师与领域专家（SME）之间的协作效率。过去，我们曾看到客户在会议中利用我们的应用讨论标注模式，形式包括标注函数和远程监督等。我们相信，以数据为中心的AI正是充分发挥协作优势的最佳途径。\n\n### 开源，并将训练数据视为软件构件\n\n我们不认同目前仍有许多场景中，训练数据仅仅是一份普通的CSV文件。如果你确实只是想快速用少量数据原型化一个想法，这样也无妨；但任何严肃的软件项目都应当具备可维护性。我们认为，解决这一问题的关键在于开源的训练数据管理方案。_refinery_ 正是帮助你记录和管理数据的工具，从而将训练数据视作一种软件构件。\n\n### 集成\n\n最后，_refinery_ 支持 [SDK操作](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python)，例如数据的拉取和推送。以数据为中心的AI重新定义了标注流程，使其不再是一次性的任务，而是具有迭代性质的工作流。因此，我们致力于通过提供端到端的能力，不断提升你的工作效率，推动大规模高质量训练数据的可用性。你可以使用我们的SDK，与现有系统进行集成。\n\n## 您的收益\n\n你可以自动化大量重复性任务，更深入地洞察数据标注流程，获得关于训练数据的隐式文档，并最终在更短的时间内构建出更好的模型。\n\n我们的目标是让训练数据的构建过程更像一项程序化且令人愉悦的任务，而非繁琐而重复的工作。_refinery_ 正是我们为此做出的贡献。我们也始终致力于不断改进这一贡献。\n\n如果你喜欢我们的工作，请为我们点亮一颗星⭐！\n\n## 如果 _refinery_ 是开源的，Kern AI 如何盈利呢？\n\n这个问题我们经常被问到——而且确实是个合理的问题 🙂 简而言之，_refinery_ 的开源版本目前仅支持单用户使用，而通过我们的商业版，你可以获得多用户环境的支持。此外，我们还在 _refinery_ 的基础上推出了商业产品，例如将 _refinery_ 的自动化能力作为实时预测API来使用。\n\n总的来说，我们对开源充满热情，并希望尽最大努力做出贡献。\n\n## 🤓 功能特性\n\n如需了解详细的功能列表，请参阅我们的[文档](https:\u002F\u002Fdocs.kern.ai)。\n\n### （半）自动化NLP任务标注工作流\n\n- 支持分类和跨度标注的纯手动及程序化方式\n- 与最先进的一系列库和框架无缝集成\n- 可创建和管理查找表\u002F知识库，以辅助标注\n- 基于神经网络搜索的相似记录与异常值检索\n- 标注会话可切片，便于深入分析特定子集\n- 单个项目中可同时进行多项标注任务\n- 在我们的开源 [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks) 库中提供了丰富的现成自动化工具\n\n### 丰富的数据管理和监控功能\n\n- 通过我们的数据浏览器实现一流的数据管理能力。你可以按置信度、启发式重叠程度、用户、备注等条件对数据进行筛选、排序和搜索。\n- 与 [🤗 Hugging 
Face](https:\u002F\u002Fwww.huggingface.co) 集成，自动创建文档级和标记级嵌入。\n- 基于JSON的数据模型，支持数据的上传和下载。\n- 提供项目指标概览，包括置信度分布、标签分布以及混淆矩阵。\n- 数据可通过我们的 [Python SDK](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python) 进行访问和扩展。\n- 支持就地修改属性，例如添加句子复杂度指标。\n- 同样，你也可以使用 [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks) 为数据添加更多元数据。\n\n### 团队工作空间（托管版）\n\n- 允许多个用户基于角色权限和精简的标注界面共同标注数据。\n- 可集成众包标注流程。\n- 自动计算标注者间一致性指标。\n\n## ☕ 安装说明\n\n### 通过pip安装\n\n```\npip install kern-refinery\n```\n\n安装完成后，进入你希望存储数据的目录并运行 `refinery start`。如果尚未克隆仓库，该命令会先自动执行 `git clone`。要停止服务，只需运行 `refinery stop`。\n\n### 从仓库直接安装\n\n**简要步骤：**\n\n```\n$ git clone https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery.git\n$ cd refinery\n```\n\n如果你使用的是Mac或Linux系统：\n\n```\n$ .\u002Fstart\n```\n\n如果是Windows系统：\n\n```\n$ start.bat\n```\n\n要停止服务，输入 `.\u002Fstop`（Mac\u002FLinux）或 `stop.bat`。\n\n_refinery_ 由多个服务组成，需要协同运行。为此，我们准备了一个启动脚本，它可以自动拉取并连接各个服务。该脚本包含在仓库中，你只需克隆仓库，然后在仓库目录下运行 `.\u002Fstart`（Mac\u002FLinux）或 `start.bat`（Windows）。等待几分钟后（不妨趁机泡杯咖啡☕），部署完成，你就可以在浏览器中访问 `http:\u002F\u002Flocalhost:4455`。要停止服务，运行 `.\u002Fstop`（Mac\u002FLinux）或 `.\u002Fstop.bat`（Windows）。\n\n**现在你已经可以开始使用了！ 🙌 🎉**\n\n如果在安装过程中遇到任何问题，请随时联系我们（详见下方的社区部分）。\n\n### 数据持久化\n\n默认情况下，数据会存储在 `refinery\u002Fpostgres-data` 目录下。如果你想更改存储路径，需要修改操作系统 `start` 脚本中的 `LOCAL_VOLUME` 变量。若需删除数据，只需直接删除该数据目录即可。**请务必确认不再需要这些数据后再进行删除，因为此操作不可逆！**\n\n## 📘 文档与教程\n\n开始使用 _refinery_ 的最佳方式是我们的[快速入门](https:\u002F\u002Fdocs.kern.ai\u002Frefinery\u002Fquickstart)。\n\n您可以在我们的[文档](https:\u002F\u002Fdocs.kern.ai)以及 YouTube 频道上的[教程](https:\u002F\u002Fwww.youtube.com\u002F@kern_ai\u002Fvideos)中找到详尽的指南。我们还准备了一个包含示例项目的[仓库](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsample-projects)，您可以直接克隆。\n\n如果您在编写第一个标注函数时需要帮助，可以查看我们的开源内容库 [bricks](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fbricks)。\n\n我们的变更日志可以在这里找到：[changelog.kern.ai](https:\u002F\u002Fchangelog.kern.ai)。\n\n## 😵‍💫 需要帮助吗？\n\n别担心，我们随时为您提供支持。如果您有任何疑问，请通过 
[Discord](https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW) 联系我们，或者在我们的论坛“问答”分类下[提交工单](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fdiscussions\u002Fcategories\u002Fq-a)。\n\n## 🪢 社区与联系方式\n\n欢迎加入我们的 [Discord](https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW)，在那里我们将很乐意帮助您构建训练数据：\n\n我们每周（大致）会发送一封关于以数据为中心的人工智能最新发现、产品开发亮点等内容的简报。您可以通过此链接订阅简报：[www.kern.ai\u002Fnewsletter](https:\u002F\u002Fwww.kern.ai\u002Fnewsletter)。\n\n此外，您还可以关注我们在 [Twitter](https:\u002F\u002Ftwitter.com\u002FMeetKern) 和 [LinkedIn](https:\u002F\u002Fwww.linkedin.com\u002Fcompany\u002Fkern-ai) 上的账号。\n\n## 🙌 贡献\n\n贡献使开源社区成为一个令人惊叹的学习、启发和创造之地。您的任何贡献都将受到**高度赞赏**。您可以通过提供关于可能发现的[期望功能和 bug](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues)的反馈来参与其中。\n\n如果您希望积极参与代码库的扩展，请联系我们。我们会向您解释架构的设置方式，以便您可以根据自己的需求自定义应用程序。\n\n## ❓ 常见问题解答\n\n### 概念相关问题\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>什么是启发式方法？\u003C\u002Fb>\u003C\u002Fsummary>\n    启发式方法是规模化数据标注的核心要素。它们不必达到 100% 的准确率，例如，简单的 Python 函数就可以表达领域知识。当您添加并运行多个启发式方法时，会生成所谓的“噪声标签矩阵”，该矩阵会与您手动标注的参考数据进行比对。这使我们能够分析相关性、冲突、重叠情况、数据集中的命中次数，以及每个启发式方法的准确度。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>如何构建主动学习模型？\u003C\u002Fb>\u003C\u002Fsummary>\n    我们首先使用预训练模型来创建嵌入表示。一旦完成，这些嵌入就会在应用中可用（既可用于构建主动学习启发式方法，也用于神经搜索）。在我们的主动学习 IDE 中，您可以在嵌入之上构建一个简单的分类或抽取头，随后我们会在一个容器化环境中管理其执行。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>我如何判断我的启发式方法是否有效？\u003C\u002Fb>\u003C\u002Fsummary>\n    启发式方法可以从覆盖度和精确度两个方面来评估其优劣。对于覆盖度而言，基本上没有限制；而对于精确度，我们通常建议保持在 70% 以上，具体数值取决于您拥有的启发式方法数量。启发式方法越多，重叠和冲突的情况就越明显，弱监督的效果也就越好。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>我的数据量不足 1,000 条——还需要用这个工具吗？\u003C\u002Fb>\u003C\u002Fsummary>\n    当然，即使是较小的数据集也可以使用这套系统！它不仅在程序化标注方面表现出色，还拥有简洁美观的用户界面。不妨试试吧 😁\n\u003C\u002Fdetails>\n\n### 技术相关问题\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>救命！我忘记密码了！\u003C\u002Fb>\u003C\u002Fsummary>\n    别担心，即使在本地机器上也能发送重置链接。不过，该链接不会发送到您的邮箱，而是发送到 mailhog。您可以通过 \u003Ca 
href=\"http:\u002F\u002Flocalhost:4436\">http:\u002F\u002Flocalhost:4436\u003C\u002Fa> 访问它。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>我想为我的标注函数安装一个库\u003C\u002Fb>\u003C\u002Fsummary>\n    为此，我们需要修改用于标注函数容器化执行环境的 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-lf-exec-env\">lf-exec-env\u003C\u002Fa> 中的 requirements.txt 文件。请直接在 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\">GitHub 上提交一个问题\u003C\u002Fa>,我们会尽快将您的库集成进去。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>支持哪些数据格式？\u003C\u002Fb>\u003C\u002Fsummary>\n    我们的数据格式以 JSON 为核心，因此您可以原生上传大多数文件类型。这包括电子表格、文本文件、CSV 数据、通用 JSON 等等。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>如何上传数据？\u003C\u002Fb>\u003C\u002Fsummary>\n    内部我们使用 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fpandas-dev\u002Fpandas\">pandas\u003C\u002Fa> 将您的数据与我们的基于 JSON 的数据模型进行匹配。您可以通过我们的 UI 或 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python\">Python SDK\u003C\u002Fa> 上传数据。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>如何下载数据？数据是什么格式？\u003C\u002Fb>\u003C\u002Fsummary>\n    您可以在我们的 UI 中或通过 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python\">Python SDK\u003C\u002Fa> 下载数据。我们还提供了与 \u003Ca href=\"https:\u002F\u002Fgithub.com\u002FRasaHQ\u002Frasa\">Rasa\u003C\u002Fa> 等框架的适配器。导出的数据格式大致如下：\n\n    [\n        {\n            \"running_id\": \"0\",\n            \"headline\": \"T. 
Rowe Price (TROW) Dips More Than Broader Markets\",\n            \"date\": \"Jun-30-22 06:00PM\\u00a0\\u00a0\",\n            \"headline__sentiment__MANUAL\": null,\n            \"headline__sentiment__WEAK_SUPERVISION\": \"NEGATIVE\",\n            \"headline__sentiment__WEAK_SUPERVISION__confidence\": 0.62,\n            \"headline__entities__MANUAL\": null,\n            \"headline__entities__WEAK_SUPERVISION\": [\n                \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"STOCK\", \"O\", \"O\", \"O\", \"O\", \"O\"\n            ],\n            \"headline__entities__WEAK_SUPERVISION__confidence\": [\n                0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.00, 0.00, 0.00, 0.00, 0.00\n            ]\n        }\n    ]\n\n\u003C\u002Fdetails>\n\n### 服务与托管问题\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>是否有企业本地部署的解决方案选项？\u003C\u002Fb>\u003C\u002Fsummary>\n    如果您有兴趣在本地运行多用户版本，请\u003Ca href=\"https:\u002F\u002Fwww.kern.ai\">联系我们\u003C\u002Fa>。我们可以帮助您完成部署，并通过工作坊等方式为您的项目做好准备。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>我不想亲自进行标注，我有哪些选择？\u003C\u002Fb>\u003C\u002Fsummary>\n    您是否希望将标注工作外包出去，让您的工程师使用 _refinery_ 作为训练数据的任务控制中心？请\u003Ca href=\"https:\u002F\u002Fwww.kern.ai\">联系我们\u003C\u002Fa>,以便我们讨论如何根据您的用例为您提供支持。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n    \u003Csummary>\u003Cb>我该如何联系技术支持？\u003C\u002Fb>\u003C\u002Fsummary>\n    在我们的开源解决方案中，您可以透过\u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002Fqf4rGCEphW\">Discord\u003C\u002Fa>与我们取得联系。而对于我们的托管版本，则提供应用内聊天功能，方便您直接联系我们的支持团队。\n\u003C\u002Fdetails>\n\n## 🐍 Python SDK\n\n您可以通过使用我们的[Python SDK](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python)来扩展您的项目。借助它，您可以轻松地以编程方式或通过命令行（`rsdk pull` 和 `rsdk push \u003Cfile_name>`）导出当前项目的已标注数据并导入新文件。此外，它还配备了适配器，例如用于[Rasa](https:\u002F\u002Fgithub.com\u002FRasaHQ\u002Frasa)的适配器。\n\n## 🏠 架构\n\n我们的架构遵循以下主要模式：\n\n- 共享的服务数据库，用于高效传输大量数据。为了避免各服务中的代码冗余，我们使用子模块来共享数据模型。\n- 容器化的函数执行环境，用于标注函数、主动学习和记录 IDE。\n- 
机器学习逻辑被实现为独立的库（例如[sequence-learn](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsequence-learn)）。\n\n\u003C\u002Fbr>\n\n![Architecture _refinery_](architecture.svg)\n\n\u003Cp align=center>\u003Ci>为简化起见，部分边线未显示。 \n\u003C\u002Fbr>\n边线的颜色并无特殊含义，仅是为了提高可读性。\u003C\u002Fi>\u003C\u002Fp>\n\n\u003C\u002Fbr>\n\n**服务概览（由 Kern AI 维护）**\n| 服务 | 描述 |\n|--- |--- |\n| [ml-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ml-exec-env) | 主动学习模块的执行环境。容器化的函数即服务，用于利用 scikit-learn 和 sequence-learn 构建主动学习模型。 |\n| [embedder](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-embedder) | _refinery_ 的嵌入器。管理使用 embedders 库创建文档级和标记级嵌入的过程。 |\n| [weak-supervisor](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-weak-supervisor) | _refinery_ 的弱监督模块。管理启发式方法的集成，如标注函数、主动学习器或零样本分类器。实际的集成逻辑和算法则由 weak-nlp 库负责。 |\n| [record-ide-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-record-ide-env) | 记录 IDE 的执行环境。容器化的函数即服务，用于构建针对特定记录的“快速且粗糙”的代码片段，以供探索和调试使用。 |\n| [config](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-config) | _refinery_ 的配置服务。其中包括管理 spaCy 的端点及可用的语言模型等。 |\n| [tokenizer](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-tokenizer) | _refinery_ 的分词器。管理基于文本的记录属性的 spaCy 分词结果的创建与存储，并支持多种语言模型。 |\n| [gateway](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-gateway) | _refinery_ 的网关。负责管理传入请求并掌控工作流逻辑。可通过 UI 或 Python SDK 与网关交互。 |\n| [authorizer](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-authorizer) | 评估用户是否具有访问特定资源的权限。 |\n| [websocket](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-websocket) | _refinery_ 的 WebSocket 模块。支持应用内的异步通知功能。 |\n| [lf-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-lf-exec-env) | 标注函数的执行环境。容器化的函数即服务，用于执行用户自定义的 Python 脚本。 |\n| [ac-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ac-exec-env) | 属性计算的执行环境。容器化的函数即服务，用于通过 Python 脚本生成新的属性。 |\n| 
[updater](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-updater) | _refinery_ 的更新服务。必要时管理向新版本的迁移逻辑。 |\n| [neural-search](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-neural-search) | _refinery_ 的神经网络搜索模块。管理基于 Qdrant 的相似度搜索以及异常检测功能，两者均基于项目记录的向量表示。 |\n| [zero-shot](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-zero-shot) | _refinery_ 的零样本模块。允许将 🤗 Hugging Face 的零样本分类器作为现成的无代码启发式方法集成进来。 |\n| [entry](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-entry) | _refinery_ 的登录与注册页面。由 Ory Kratos 实现。 |\n| [ui](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ui) | _refinery_ 的用户界面。用于与整个系统交互；如需了解如何更好地使用该系统，请查阅我们的文档。 |\n| [doc-ock](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-doc-ock) | _refinery_ 的使用统计收集模块。在用户允许的情况下，会收集产品洞察数据，用于优化用户体验。 |\n| [gateway-proxy](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-gateway-proxy) | _refinery_ 的网关代理。管理传入请求并将它们转发至网关。由 Python SDK 使用。 |\n| [parent-images](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-exec-env-parent-image) | _refinery_ 共享的镜像。用于减少 _refinery_ 所需的存储空间。_尚未列入架构图中_ |\n| [ac-exec-env](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-ac-exec-env) | _refinery_ 中属性计算的执行环境。容器化的函数即服务，用于从原始数据中派生自定义属性。_尚未列入架构图中_ |\n| [alfred](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Falfred) | 控制 _refinery_ 应用程序的启动流程。以蝙蝠侠的管家阿尔弗雷德命名。_尚未列入架构图中_|\n\n**服务概览（开源第三方）**\n| 服务 | 描述 |\n|--- |--- |\n| [qdrant\u002Fqdrant](https:\u002F\u002Fgithub.com\u002Fqdrant\u002Fqdrant) | Qdrant - 面向下一代AI应用的向量搜索引擎 |\n| [postgres\u002Fpostgres](https:\u002F\u002Fgithub.com\u002Fpostgres\u002Fpostgres) | PostgreSQL：全球最先进的开源关系型数据库 |\n| [minio\u002Fminio](https:\u002F\u002Fgithub.com\u002Fminio\u002Fminio) | 多云 ☁️ 对象存储 |\n| [mailhog\u002FMailHog](https:\u002F\u002Fgithub.com\u002Fmailhog\u002FMailHog) | 基于Web和API的SMTP测试 |\n| [ory\u002Fkratos](https:\u002F\u002Fgithub.com\u002Fory\u002Fkratos) | 
新一代身份认证服务器（类似Auth0、Okta、Firebase），具备Ory强化的身份验证、多因素认证、FIDO2、TOTP、WebAuthn、个人资料管理、身份模式、社交登录、注册、账户恢复及无密码登录等功能。采用Go语言开发，无前端界面，仅提供API接口，无需处理模板或主题问题。也可作为云服务使用。 |\n| [ory\u002Foathkeeper](https:\u002F\u002Fgithub.com\u002Fory\u002Foathkeeper) | 一款云原生的身份与访问代理\u002FIAP及访问控制决策API，能够对传入的HTTP(s)请求进行身份验证、授权和修改。灵感来源于BeyondCorp\u002F零信任白皮书，使用Go语言编写。\n\n**集成概览（由Kern AI维护）**\n| 集成 | 描述 |\n|--- |--- |\n| [refinery-python](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-python) | Kern AI refinery 的官方Python SDK。 |\n| [sequence-learn](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fsequence-learn) | 使用sequence-learn，您可以像构建sklearn分类器一样快速搭建命名实体识别模型。 |\n| [embedders](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fembedders) | 通过embedders，您只需几行代码即可轻松将文本转换为句子级或词元级别的嵌入表示。应用场景包括文本间的相似性搜索、信息抽取（如命名实体识别）以及基础文本分类等。集成 🤗 Hugging Face 转换器模型。 |\n| [weak-nlp](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Fweak-nlp) | 使用weak-nlp，您可以整合基于弱监督的启发式方法，例如标签函数和主动学习算法。实现数据标注自动化并提升标签质量。 |\n\n**集成概览（开源第三方）**\n| 集成 | 描述 |\n|--- |--- |\n| [huggingface\u002Ftransformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) | 🤗 Transformers：面向PyTorch、TensorFlow和JAX的最先进机器学习框架。 |\n| [scikit-learn\u002Fscikit-learn](https:\u002F\u002Fgithub.com\u002Fscikit-learn\u002Fscikit-learn) | scikit-learn：Python中的机器学习工具库。 |\n| [explosion\u002FspaCy](https:\u002F\u002Fgithub.com\u002Fexplosion\u002FspaCy) | 💫 Python中的工业级自然语言处理（NLP）。 |\n\n**子模块概览**\n\n未在架构图中列出，但为了内部代码管理，我们采用了Git子模块。\n| 子模块 | 描述 |\n|--- |--- |\n| [submodule-model](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-submodule-model) | refinery的数据模型。管理多个服务（例如网关）所需的实体及其访问权限。 |\n| [submodule-s3](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery-submodule-s3) | 与S3相关的AWS和Minio逻辑。 |\n\n\n\n## 🏫 术语表\n\n| 术语                        | 含义                                                                                                                                                                                         
                                                                                                                                                        |\n| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 弱监督            | 一种技术\u002F方法论，用于整合各种噪声大、不完善的启发式规则，如标签函数。它不仅可以用来自动化数据标注，还可以作为一种通用的方法来提升现有标签的质量。                                                                                                                   |\n| 神经搜索               | 基于嵌入的检索方法；神经搜索不是向机器输入一组约束条件，而是分析数据的向量空间（通过预训练的神经网络编码）。可用于寻找最近邻等场景。                                                                                                        |\n| 主动学习             | 在人工标注数据的同时，持续训练模型以辅助标注人员。既可独立使用，也可作为弱监督的一种启发式方法。                                                                                                                                                                                            |\n| 向量编码（嵌入） | 利用来自[🤗 Hugging Face](https:\u002F\u002Fwww.huggingface.co)的预训练模型（如Transformer），可以将文本转换为向量空间表示。这既有助于神经搜索，也有助于主动学习（在后一种情况下，可以在嵌入之上应用简单的分类器，从而实现基于向量表示的快速再训练）。 |\n\n术语表中是否遗漏了某些内容？请在带有“enhancement”标签的议题中[添加该术语](https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues)。\n\n\u003C!-- |   \t|   \t| -->\n\n## 👩‍💻👨‍💻 团队与贡献者\n\n\u003Ctable>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fhenrikwenck\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465fb5b23739972c34498_Rectangle%20542.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>亨里克·文克\u003C\u002Fb>\n          
  \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjohanneshötter\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f4663f60ecc168136e5863_Rectangle%20545.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>约翰内斯·赫特尔\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fanton-pullem-b028291ab\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465ee28293abb09cfb4f4_Rectangle%20547.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>安东·普勒姆\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Flina-lumburovska-4b5250173\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F625eef007630e2379d85d12f_Lina.jpeg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>莉娜·伦布罗夫斯卡\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fmoritz-feuerpfeil\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f4660e28293a7cf7cfb563_Rectangle%20544.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            
\u003Csub>\n                \u003Cb>莫里茨·费尔普法伊尔\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fleonard-p%C3%BCttmann-4648231a9\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F629210dbfa42ae1cfa062b2e_Bild%20Leo.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>利奥·普特曼\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fsimon-degraf-8aba731b5\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f466235b237362fec344ec_Rectangle%20546.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>西蒙·德格拉夫\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n  \u003Ctr>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Ffelix-kirsch\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F62cd22cd378c5d0f47944cf9_Felix.JPG\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>菲利克斯·基尔希\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fjens-wittmeyer-9934a2231\u002F\">\n            \u003Cimg 
src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F6204e65e7187e9a2ffa03777_Jens.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>延斯·维特迈耶\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fmikhailkochikov\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F6207c49264911697b5f58939_Mikhail.png\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>米哈伊尔·科奇科夫\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fsimon-witzke\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F61f465e2831178b0b000731a_Rectangle%20543.jpg\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>西蒙·维茨克\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fshamanth-shetty-276a8415a\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fuploads-ssl.webflow.com\u002F61f321fd2dc7db10189dabdb\u002F624b013ae420cd98e9e38761_shamanth.png\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>沙曼斯·谢蒂\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n    \u003Ctd align=\"center\">\n        \u003Ca 
href=\"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fdivyanshu-katiyar-45ba03131\u002F\">\n            \u003Cimg src=\"https:\u002F\u002Fmedia.licdn.com\u002Fdms\u002Fimage\u002FC5603AQGCchIwJJ2X5g\u002Fprofile-displayphoto-shrink_100_100\u002F0\u002F1630708281678?e=1681948800&v=beta&t=v80garJleGF5M_FPLiLtpTPpWRha6n3HLD6IBzOwmyM\" width=\"50px;\" alt=\"\"\u002F>\n            \u003Cbr \u002F>\n            \u003Csub>\n                \u003Cb>迪万舒·卡蒂亚尔\u003C\u002Fb>\n            \u003C\u002Fsub>\n        \u003C\u002Fa>\n        \u003Cbr\u002F>\n    \u003C\u002Ftd>\n  \u003C\u002Ftr>\n\u003C\u002Ftable>\n\n## 🌟 星标历史\n\n[![星标历史图表](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_readme_5e20e36865da.png)](https:\u002F\u002Fstar-history.com\u002F#code-kern-ai\u002Frefinery&Date)\n\n## 📃 许可证\n\n_refinery_ 采用 Apache License, Version 2.0 许可证。请查看 [许可证文件](LICENSE)。","# Refinery 快速上手指南\n\nRefinery 是一款开源的自然语言数据管理工具，旨在帮助数据科学家以“数据为中心”的方式扩展、评估和维护训练数据。它将训练数据视为软件工件，支持半自动标注、低质量数据子集发现以及统一的数据监控。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**：Linux, macOS 或 Windows (推荐 WSL2)\n*   **Python 版本**：Python 3.8 或更高版本\n*   **包管理器**：pip\n*   **网络环境**：首次运行时需联网下载依赖及克隆仓库（国内用户若遇网络问题，建议配置全局代理或使用镜像源）\n\n## 安装步骤\n\n### 1. 安装核心库\n使用 pip 安装 `kern-refinery` 包。国内用户推荐使用清华或阿里镜像源以加速下载：\n\n```bash\npip install kern-refinery -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n### 2. 启动服务\n安装完成后，进入您希望存储数据的目录，执行以下命令启动 Refinery 服务：\n\n```bash\nrefinery start\n```\n\n> **注意**：如果是首次运行，该命令会自动执行 `git clone` 克隆必要的仓库资源。请确保当前目录具有写入权限且网络连接正常。\n\n### 3. 停止服务\n如需停止本地服务器，请在终端运行：\n\n```bash\nrefinery stop\n```\n\n## 基本使用\n\n启动成功后，Refinery 将在本地运行一个 Web 应用。\n\n1.  **访问界面**：\n    打开浏览器，访问默认地址：\n    `http:\u002F\u002Flocalhost:8080`\n    *(注：具体端口号请以终端启动日志输出为准)*\n\n2.  **创建项目**：\n    *   在仪表盘点击 **\"Create Project\"**。\n    *   选择任务类型（如文本分类 Classification 或 跨度标注 Span-labeling）。\n    *   上传您的数据集（支持 JSON 格式，或通过 Python SDK 导入）。\n\n3.  
**半自动标注工作流**：\n    *   **启发式标注**：利用内置的 \"Bricks\" 库或编写简单的规则函数，对数据进行批量预标注。\n    *   **人工修正**：进入标注编辑器，系统会优先展示低置信度或存在冲突的样本，供您进行人工校验和修正。\n    *   **数据监控**：在 \"Data Browser\" 中通过置信度、标签分布等指标过滤和查看数据质量。\n\n4.  **导出数据**：\n    标注完成后，可通过界面导出高质量的训练数据（JSON 格式），或使用 Python SDK 将数据推送到您的模型训练流水线中。\n\n---\n*更多高级功能（如团队协作、自定义自动化脚本、API 集成）请参考官方文档：https:\u002F\u002Fdocs.kern.ai*","某电商公司的算法团队正致力于构建一个智能客服意图识别模型，但面临标注数据稀缺且质量参差不齐的困境。\n\n### 没有 refinery 时\n- 训练数据散落在多个 Excel 表格和文本文件中，缺乏统一版本管理，团队成员经常混淆最新数据集。\n- 无法量化评估数据质量，模型效果不佳时，只能盲目重新标注大量数据，浪费宝贵的人力与预算。\n- 标注工作完全依赖人工手动完成，效率低下，导致项目上线周期被迫延长数周。\n- 难以发现数据中的偏差或低质量子集，模型在特定场景下频繁出现误判，却找不到根本原因。\n\n### 使用 refinery 后\n- 将训练数据视为软件工件进行统一管理，清晰追踪每一次数据迭代，彻底消除文件混乱问题。\n- 利用内置评估功能快速定位低质量数据子集，针对性地清洗和补充样本，显著提升了模型准确率。\n- 通过半自动化标注流程辅助人工，大幅减少重复劳动，让标注团队能专注于处理高难度的边缘案例。\n- 在单一平台实时监控数据分布与质量指标，迅速发现并修正数据偏差，确保模型在各种场景下表现稳定。\n\nrefinery 帮助团队从“盲目堆砌数据”转向“以数据为中心”的精细化运营，用更少的资源构建了更强大的 NLP 模型。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcode-kern-ai_refinery_bae4fd40.gif","code-kern-ai","Kern AI","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fcode-kern-ai_e67e70a1.png","Building data-centric open-source tools for NLP",null,"info@kern.ai","MeetKern","https:\u002F\u002Fwww.kern.ai","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai",[82,86,90],{"name":83,"color":84,"percentage":85},"Python","#3572A5",67.8,{"name":87,"color":88,"percentage":89},"Batchfile","#C1F12E",19.5,{"name":91,"color":92,"percentage":93},"Shell","#89e051",12.7,1468,73,"2026-04-15T00:37:37","Apache-2.0",4,"未说明",{"notes":101,"python":99,"dependencies":102},"该工具是一个多仓库项目，核心依赖 Hugging Face 和 spaCy 用于预训练语言模型，依赖 qdrant 用于神经搜索。安装方式支持 pip (kern-refinery) 或从源码克隆。开源版本目前主要为单用户模式。具体 Python 版本、操作系统及硬件资源需求在提供的 README 片段中未明确列出，通常此类 NLP 工具建议在使用 GPU 的环境下运行以获得最佳性能，但非强制要求。",[103,104,105],"Hugging Face 
(transformers)","spaCy","qdrant",[14,35,16,107],"其他",[109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128],"annotations","data-centric-ai","data-labeling","deep-learning","labeling","labeling-tool","machine-learning","natural-language-processing","neural-search","nlp","text-annotation","transformers","python","human-in-the-loop","spacy","artificial-intelligence","data-science","text-classification","active-learning","supervised-learning","2026-03-27T02:49:30.150509","2026-04-20T07:16:13.817231",[132,137,142,147,152],{"id":133,"question_zh":134,"answer_zh":135,"source_url":136},44042,"在 Windows 上自托管部署时，创建新项目无法上传文件且无报错，可能是什么原因？","这通常是因为 `start.bat` 脚本对系统语言有依赖。脚本默认搜索英文输出中的 \"IPv4 Address\" 来获取 IP 地址，但如果你的系统是德语或其他语言（例如德语显示为 \"IPv4-Adresse\"），脚本将无法找到 IP，导致 Minio 端点配置错误（如 `http:\u002F\u002F =:7053`）。\n\n解决方法：\n1. 打开克隆仓库根目录下的 `start.bat` 文件。\n2. 找到第 9 行左右，将搜索字符串修改为你系统语言对应的关键词。例如德语用户应修改为：\n   `set ip_address_string=\"IPv4-Adresse\"`\n3. 保存并重新运行 `start.bat`，确认输出中正确获取了 IP 地址后再尝试创建项目。","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\u002F64",{"id":138,"question_zh":139,"answer_zh":140,"source_url":141},44043,"导出记录数据时卡在 \"Prepare Download\"（准备下载）状态不动怎么办？","如果遇到导出记录时界面一直卡在 \"Prepare Download\" 状态，可以尝试完全停止并从终端重新启动 Refinery 服务。许多用户反馈重启后该问题即可解决。如果重启无效，可能是特定环境下的 Bug，建议查看相关日志或联系开发团队在社区 Discord 的办公时间进行同步调试。","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\u002F195",{"id":143,"question_zh":144,"answer_zh":145,"source_url":146},44044,"分类任务的弱学习器（Weak Learners）是否支持同时使用多个输入特征？","是的，从版本 1.3.0 开始已支持此功能。你可以使用“属性修改”（attribute modifications）功能来组合多个特征。\n\n具体做法是编写一个函数将多个字段（例如 `word_a` 和 `word_b`）拼接或处理成一个新的属性，然后对该新属性进行编码作为输入。示例代码如下：\n```python\ndef word_a_cat_word_b(record):\n    return str(record[\"word_a\"]) + 
str(record[\"word_b\"])\n```\n之后即可在弱学习器中选择这个新生成的组合作为输入特征。","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\u002F14",{"id":148,"question_zh":149,"answer_zh":150,"source_url":151},44045,"如何将数据导出为 Label Studio 格式以便导入？","Refinery 已经集成了导出到 Label Studio 的功能。在导出记录（Export records）时，现在可以选择特定的导出格式（包括 Label Studio 格式）。\n\n该功能允许你选择导出的内容（如仅数据、特定任务等）以及文件格式（.csv, .json 等）。相关的后端接口和 UI 更新已在 `export-refactoring` 分支及后续版本中合并，请确保你的 Refinery 版本已更新至包含此功能的最新版本（参考 PR #69 及相关更新）。","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\u002F143",{"id":153,"question_zh":154,"answer_zh":155,"source_url":156},44046,"如何在执行环境（Execution Environments）中添加自定义的 Python 库（如 langdetect, spacy 等）？","维护者会根据需求将常用库添加到执行环境的父镜像中。目前已确认添加或更新的库包括：\n- `tiktoken==0.4.0`\n- `psycopg2-binary` (更新至 2.9.7 或更高)\n- `spacy` 的日本语支持（通过 Dockerfile 中的 `RUN pip3 install spacy[ja]` 安装）\n- 移除了旧版的 `pypdf` 并更新了 `urllib3`\n\n如果你需要特定的库（如 `langdetect`, `phonenumbers`, `textblob` 等），可以提交 Feature Request，维护者会评估并将其加入基础镜像或通过 Dockerfile 指令安装。","https:\u002F\u002Fgithub.com\u002Fcode-kern-ai\u002Frefinery\u002Fissues\u002F166",[158,163,167,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,250],{"id":159,"version":160,"summary_zh":161,"released_at":162},351519,"v1.15.0","https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_15_0","2024-06-13T15:12:02",{"id":164,"version":165,"summary_zh":76,"released_at":166},351520,"v1.14.0","2024-04-18T13:50:13",{"id":168,"version":169,"summary_zh":76,"released_at":170},351521,"v1.13.1","2024-03-15T10:00:20",{"id":172,"version":173,"summary_zh":174,"released_at":175},351522,"v1.12.0","请查看我们的[变更日志](https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_12_0)","2023-09-20T12:48:26",{"id":177,"version":178,"summary_zh":179,"released_at":180},351523,"v1.11.0","查看我们的[变更日志](https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_11_0)","2023-07-10T10:35:57",{"id":182,"version":183,"summary_zh":184,"released_at":185},351524,"v1.10.1","添加了 doc 
ock 和神经搜索容器的缺失更改","2023-05-11T09:06:14",{"id":187,"version":188,"summary_zh":189,"released_at":190},351525,"v1.10.0","查看我们的[变更日志](https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_10_0)","2023-05-10T12:20:29",{"id":192,"version":193,"summary_zh":194,"released_at":195},351526,"v1.9.2","为异步调用添加弱监督修复程序","2023-05-02T11:42:51",{"id":197,"version":198,"summary_zh":199,"released_at":200},351527,"v1.9.1","包含使 bricks 集成工具与即将发布的 bricks 新版本兼容的更新。","2023-04-17T09:31:14",{"id":202,"version":203,"summary_zh":204,"released_at":205},351528,"v1.9.0","查看我们的[变更日志](https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_9_0)","2023-03-23T13:57:52",{"id":207,"version":208,"summary_zh":209,"released_at":210},351529,"v1.8.0","请查看我们的[变更日志](https:\u002F\u002Fchangelog.kern.ai\u002Frefinery\u002Fv1_8_0)","2023-02-16T10:55:34",{"id":212,"version":213,"summary_zh":214,"released_at":215},351530,"v1.7.1","- 为积木集成工具添加自动任务创建功能\n- 修复首次上传时自动创建属性的问题","2022-12-16T11:30:51",{"id":217,"version":218,"summary_zh":219,"released_at":220},351531,"v1.7.0","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv170-bricks-content-library-attribute-visibility-comment-filter)","2022-12-12T16:15:12",{"id":222,"version":223,"summary_zh":224,"released_at":225},351532,"v1.6.0","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv160-data-browser-enhancements-label-renaming-demo-overhaul)","2022-11-21T16:06:54",{"id":227,"version":228,"summary_zh":229,"released_at":230},351533,"v1.5.0","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv150-new-data-types-massive-size-reduction-and-nicer-looking-comments)","2022-11-02T16:55:12",{"id":232,"version":233,"summary_zh":234,"released_at":235},351534,"v1.4.0","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv140-comment-system-run-on-10-for-attributes)","2022-10-13T09:51:45",{"id":237,"version":238,"summary_zh":239,"released_at":240},351535,"v1.3.3","From now on we use specific 
docker container tags to enable safe switching between versions.","2022-09-29T10:07:58",{"id":242,"version":243,"summary_zh":244,"released_at":245},351536,"v1.3.2","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv130-in-app-calculation-of-new-attributes-new-chart)\r\n\r\nalso some minor bugfixes for the start logic to start everything in order","2022-09-28T14:45:12",{"id":247,"version":248,"summary_zh":244,"released_at":249},351537,"v1.3.1","2022-09-28T13:53:53",{"id":251,"version":252,"summary_zh":253,"released_at":254},351538,"v1.3.0","see our [changelog](https:\u002F\u002Fdocs.kern.ai\u002Fchangelog\u002Fv130-in-app-calculation-of-new-attributes-new-chart)","2022-09-27T17:24:22"]