[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-fmind--mlops-python-package":3,"tool-fmind--mlops-python-package":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":80,"owner_email":81,"owner_twitter":82,"owner_website":83,"owner_url":84,"languages":85,"stars":101,"forks":102,"last_commit_at":103,"license":104,"difficulty_score":23,"env_os":105,"env_gpu":106,"env_ram":106,"env_deps":107,"category_tags":121,"github_topics":122,"view_count":23,"oss_zip_url":81,"oss_zip_packed_at":81,"status":16,"created_at":132,"updated_at":133,"faqs":134,"releases":174},2737,"fmind\u002Fmlops-python-package","mlops-python-package","A comprehensive Python package template to kickstart and standardize your MLOps initiatives and data pipelines.","mlops-python-package 是一个专为机器学习运维（MLOps）打造的 Python 项目模板，旨在帮助团队快速启动并标准化数据管道与模型开发流程。它解决了 MLOps 实践中常见的代码结构混乱、工具链分散以及缺乏统一规范等痛点，让开发者无需从零搭建基础设施，即可拥有具备生产级质量的代码库。\n\n这款工具特别适合从事机器学习工程化的开发人员、数据科学家以及希望构建稳健 AI 平台的团队使用。其核心亮点在于“开箱即用”的最佳实践集成：在代码质量方面，内置了 Ruff 格式化、Mypy 类型检查及 Pytest 测试框架；在配置管理上，结合 OmegaConf 与 Pydantic 实现灵活且安全的参数验证；同时原生支持 MLflow 进行模型追踪与注册，并配备 GitHub Actions 自动化流水线。此外，它还提供了从数据校验（Pandera）到文档生成的完整工具链。通过 mlops-python-package，用户可以专注于核心算法与业务逻辑，轻松构建灵活、健壮且易于维护的机器学习系统。","# MLOps Python 
Package\n\n[![check.yml](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fcheck.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fcheck.yml)\n[![publish.yml](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fpublish.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fpublish.yml)\n[![Documentation](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdocumentation-available-brightgreen.svg)](https:\u002F\u002Ffmind.github.io\u002Fmlops-python-package\u002F)\n[![License](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002Ffmind\u002Fmlops-python-package)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fblob\u002Fmain\u002FLICENCE.txt)\n[![Release](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fv\u002Frelease\u002Ffmind\u002Fmlops-python-package)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Freleases)\n\n**This repository contains a Python code base with best practices designed to support your MLOps initiatives.**\n\nThe package leverages several [tools](#tools) and [tips](#tips) to make your MLOps experience as flexible, robust, productive as possible.\n\nYou can use this package as part of your MLOps toolkit or platform (e.g., Model Registry, Experiment Tracking, Realtime Inference, ...).\n\n**Related Resources**:\n\n- **[LLMOps Coding Package (Example)](https:\u002F\u002Fgithub.com\u002Fcallmesora\u002Fllmops-python-package\u002F)**: Example with best practices and tools to support your LLMOps projects.\n- **[MLOps Coding Course (Learning)](https:\u002F\u002Fgithub.com\u002FMLOps-Courses\u002Fmlops-coding-course)**: Learn how to create, develop, and maintain a state-of-the-art MLOps code base.\n- **[Cookiecutter MLOps Package 
(Template)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fcookiecutter-mlops-package)**: Start building and deploying Python packages and Docker images for MLOps tasks.\n- **[Agent Skills (Resource)](https:\u002F\u002Fgithub.com\u002FMLOps-Courses\u002Fmlops-coding-skills)**: Enhance your AI Agents with standardized skills for MLOps and coding.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ffmind_mlops-python-package_readme_ed9c70b7c1fb.png)\n\n# Table of Contents\n\n- [MLOps Python Package](#mlops-python-package)\n- [Table of Contents](#table-of-contents)\n- [Install](#install)\n  - [Prerequisites](#prerequisites)\n  - [Installation](#installation)\n  - [Next Steps](#next-steps)\n- [Usage](#usage)\n  - [Configuration](#configuration)\n  - [Execution](#execution)\n  - [Automation](#automation)\n  - [Workflows](#workflows)\n- [Tools](#tools)\n  - [Automation](#automation-1)\n    - [AI Assistant: Gemini Code Assist](#ai-assistant-gemini-code-assist)\n    - [Commits: Commitizen](#commits-commitizen)\n    - [Dependabot: Dependabot](#dependabot-dependabot)\n    - [Git Hooks: Pre-Commit](#git-hooks-pre-commit)\n    - [Tasks: Just](#tasks-just)\n  - [CI\u002FCD](#cicd)\n    - [Runner: GitHub Actions](#runner-github-actions)\n  - [CLI](#cli)\n    - [Parser: Argparse](#parser-argparse)\n    - [Logging: Loguru](#logging-loguru)\n  - [Code](#code)\n    - [Coverage: Coverage](#coverage-coverage)\n    - [Editor: VS Code](#editor-vs-code)\n    - [Formatting: Ruff](#formatting-ruff)\n    - [Quality: Ruff](#quality-ruff)\n    - [Security: Bandit](#security-bandit)\n    - [Testing: Pytest](#testing-pytest)\n    - [Typing: Mypy](#typing-mypy)\n    - [Versioning: Git](#versioning-git)\n  - [Configs](#configs)\n    - [Format: YAML](#format-yaml)\n    - [Parser: OmegaConf](#parser-omegaconf)\n    - [Reader: Cloudpathlib](#reader-cloudpathlib)\n    - [Validator: Pydantic](#validator-pydantic)\n  - [Data](#data)\n    - [Container: Pandas](#container-pandas)\n    - [Format: 
Parquet](#format-parquet)\n    - [Schema: Pandera](#schema-pandera)\n  - [Docs](#docs)\n    - [API: pdoc](#api-pdoc)\n    - [Format: Google](#format-google)\n    - [Hosting: GitHub Pages](#hosting-github-pages)\n  - [Model](#model)\n    - [Evaluation: Scikit-Learn Metrics](#evaluation-scikit-learn-metrics)\n    - [Format: Mlflow Model](#format-mlflow-model)\n    - [Registry: Mlflow Registry](#registry-mlflow-registry)\n    - [Tracking: Mlflow Tracking](#tracking-mlflow-tracking)\n  - [Package](#package)\n    - [Evolution: Changelog](#evolution-changelog)\n    - [Format: Wheel](#format-wheel)\n    - [Manager: uv](#manager-uv)\n    - [Runtime: Docker](#runtime-docker)\n  - [Programming](#programming)\n    - [Language: Python](#language-python)\n    - [Version: Uv](#version-uv)\n  - [Observability](#observability)\n    - [Reproducibility: Mlflow Project](#reproducibility-mlflow-project)\n    - [Monitoring : Mlflow Evaluate](#monitoring--mlflow-evaluate)\n    - [Alerting: Plyer](#alerting-plyer)\n    - [Lineage: Mlflow Dataset](#lineage-mlflow-dataset)\n    - [Explainability: SHAP](#explainability-shap)\n    - [Infrastructure: Mlflow System Metrics](#infrastructure-mlflow-system-metrics)\n- [Tips](#tips)\n  - [AI\u002FML Practices](#aiml-practices)\n    - [Data Catalog](#data-catalog)\n    - [Hyperparameter Optimization](#hyperparameter-optimization)\n    - [Data Splits](#data-splits)\n  - [Design Patterns](#design-patterns)\n    - [Directed-Acyclic Graph](#directed-acyclic-graph)\n    - [Program Service](#program-service)\n    - [Soft Coding](#soft-coding)\n    - [SOLID Principles](#solid-principles)\n    - [IO Separation](#io-separation)\n  - [Python Powers](#python-powers)\n    - [Context Manager](#context-manager)\n    - [Python Package](#python-package)\n  - [Software Engineering](#software-engineering)\n    - [Code Typing](#code-typing)\n    - [Config Typing](#config-typing)\n    - [Dataframe Typing](#dataframe-typing)\n    - [Object Oriented](#object-oriented)\n 
   - [Semantic Versioning](#semantic-versioning)\n  - [Testing Tricks](#testing-tricks)\n    - [Parallel Testing](#parallel-testing)\n    - [Test Fixtures](#test-fixtures)\n  - [VS Code](#vs-code)\n    - [Code Workspace](#code-workspace)\n    - [GitHub Copilot](#github-copilot)\n    - [VSCode VIM](#vscode-vim)\n- [Resources](#resources)\n  - [Python](#python)\n  - [AI\u002FML\u002FMLOps](#aimlmlops)\n\n# Install\n\nThis section details the requirements, actions, and next steps to kickstart your MLOps project.\n\n## Prerequisites\n\n- [Python>=3.13](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002F): to benefit from [the latest features and performance improvements](https:\u002F\u002Fdocs.python.org\u002F3\u002Fwhatsnew\u002F3.13.html)\n- [uv>=0.5.5](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F): to initialize the project [virtual environment](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fvenv.html) and its dependencies\n\n## Installation\n\n1. [Clone this GitHub repository](https:\u002F\u002Fdocs.github.com\u002Fen\u002Frepositories\u002Fcreating-and-managing-repositories\u002Fcloning-a-repository) on your computer\n\n```bash\n# with ssh (recommended)\n$ git clone git@github.com:fmind\u002Fmlops-python-package\n# with https\n$ git clone https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\n```\n\n2. [Run the project installation with uv](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F)\n\n```bash\ncd mlops-python-package\u002F\nuv sync\n```\n\n3. Adapt the code base to your desire\n\n## Next Steps\n\nGoing from there, there are dozens of ways to integrate this package to your MLOps platform.\n\nFor instance, you can use Databricks or AWS as your compute platform and model registry.\n\nIt's up to you to adapt the package code to the solution you target. 
Good luck champ!\n\n# Usage\n\nThis section explains how configure the project code and execute it on your system.\n\n## Configuration\n\nYou can add or edit config files in the `confs\u002F` folder to change the program behavior.\n\n```yaml\n# confs\u002Ftraining.yaml\njob:\n  KIND: TrainingJob\n  inputs:\n    KIND: ParquetReader\n    path: data\u002Finputs_train.parquet\n  targets:\n    KIND: ParquetReader\n    path: data\u002Ftargets_train.parquet\n```\n\nThis config file instructs the program to start a `TrainingJob` with 2 parameters:\n\n- `inputs`: dataset that contains the model inputs\n- `targets`: dataset that contains the model target\n\nYou can find all the parameters of your program in the `src\u002F[package]\u002Fjobs\u002F*.py` files.\n\nYou can also print the full schema supported by this package using `uv run bikes --schema`.\n\n## Execution\n\nThe project code can be executed with uv during your development:\n\n```bash\nuv run [package] confs\u002Ftuning.yaml\nuv run [package] confs\u002Ftraining.yaml\nuv run [package] confs\u002Fpromotion.yaml\nuv run [package] confs\u002Finference.yaml\nuv run [package] confs\u002Fevaluations.yaml\nuv run [package] confs\u002Fexplanations.yaml\n```\n\nIn production, you can build, ship, and run the project as a Python package:\n\n```bash\nuv build\nuv publish # optional\npython -m pip install [package]\n[package] confs\u002Finference.yaml\n```\n\nYou can also install and use this package as a library for another AI\u002FML project:\n\n```python\nfrom [package] import jobs\n\njob = jobs.TrainingJob(...)\nwith job as runner:\n    runner.run()\n```\n\n**Additional tips**:\n\n- You can pass extra configs from the command line using the `--extras` flag\n  - Use it to pass runtime values (e.g., a result from previous job executions)\n- You can pass several config files in the command-line to merge them from left to right\n  - You can define common configurations shared between jobs (e.g., model params)\n- The right job 
task will be selected automatically thanks to [Pydantic Discriminated Unions](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002Fconcepts\u002Funions\u002F#discriminated-unions)\n  - This is a great way to run any job supported by the application (training, tuning, ...)\n\n## Automation\n\nThis project includes several automation tasks to easily repeat common actions.\n\nYou can invoke the actions from the [command-line](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html) or [VS Code extension](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=nefrob.vscode-just-syntax).\n\n```bash\n# execute the project DAG\n$ just project\n# create a code archive\n$ just package\n# list other actions\n$ just\n```\n\n**Available tasks**:\n\n```toml\ndefault # display help information\n\n[check]\ncheck # run check tasks\ncheck-code # check code quality\ncheck-coverage numprocesses=\"auto\" cov_fail_under=\"80\" # check code coverage\ncheck-format # check code format\ncheck-security # check code security\ncheck-test numprocesses=\"auto\" # check unit tests\ncheck-type # check code typing\n\n[clean]\nclean # run clean tasks\nclean-build # clean build folders\nclean-cache # clean cache folder\nclean-constraints # clean constraints file\nclean-coverage # clean coverage files\nclean-docs # clean docs folder\nclean-environment # clean environment file\nclean-mlruns # clean mlruns folder\nclean-mypy # clean mypy folders\nclean-outputs # clean outputs folder\nclean-pytest # clean pytest cache\nclean-python # clean python caches\nclean-requirements # clean requirements file\nclean-ruff # clean ruff cache\nclean-venv # clean venv folder\n\n[commit]\ncommit-bump # bump package\ncommit-files # commit package\ncommit-info # get commit info\n\n[doc]\ndoc # run doc tasks\ndoc-build format=\"google\" output=\"docs\" # build documentation\ndoc-serve format=\"google\" port=\"8088\" # serve documentation\n\n[docker]\ndocker # run docker tasks\ndocker-build 
tag=\"latest\" # build docker image\ndocker-compose # start docker compose\ndocker-run tag=\"latest\" # run latest docker image\n\n[format]\nformat # run format tasks\nformat-import # format code import\nformat-source # format code source\n\n[install]\ninstall # run install tasks\ninstall-hooks # install git hooks\ninstall-project # install the project\ninstall-rulesets # install github rulesets\n\n[mlflow]\nmlflow # run mlflow tasks\nmlflow-doctor # run mlflow doctor\nmlflow-serve host=\"127.0.0.1\" port=\"5000\" uri=\".\u002Fmlruns\" # start mlflow server\n\n[package]\npackage # run package tasks\npackage-build constraints=\"constraints.txt\" # build python package\npackage-constraints constraints=\"constraints.txt\" # build package constraints\n\n[project]\nproject # run project tasks\nproject-environment # export environment file\nproject-requirements # export requirements file\nproject-run job # run project job using mlflow\n```\n\n## Workflows\n\nThis package supports two GitHub Workflows in `.github\u002Fworkflows`:\n\n- `check.yml`: validate the quality of the package on each Pull Request\n- `publish.yml`: build and publish the docs and packages on code release.\n\nYou can use and extend these workflows to automate repetitive package management tasks.\n\n# Tools\n\nThis sections motivates the use of developer tools to improve your coding experience.\n\n## Automation\n\nPre-defined actions to automate your project development.\n\n### AI Assistant: [Gemini Code Assist](https:\u002F\u002Fdevelopers.google.com\u002Fgemini-code-assist\u002Fdocs\u002Freview-github-code)\n\n- **Motivations**:\n  - Increase your coding productivity\n  - Get code suggestions and completions\n  - Reduce the time spent on reviewing code\n- **Limitations**:\n  - Can generate wrong code, reviews, or summaries\n\n### Commits: [Commitizen](https:\u002F\u002Fcommitizen-tools.github.io\u002Fcommitizen\u002F)\n\n- **Motivations**:\n  - Format your code commits\n  - Generate a standard 
changelog\n  - Integrate well with [SemVer](https:\u002F\u002Fsemver.org\u002F) and [PEP 440](https:\u002F\u002Fpeps.python.org\u002Fpep-0440\u002F)\n- **Limitations**:\n  - Learning curve for new users\n- **Alternatives**:\n  - Do It Yourself (DIY)\n\n### Dependabot: [Dependabot](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fcode-security\u002Fgetting-started\u002Fdependabot-quickstart-guide)\n\n- **Motivations**:\n  - Avoid security issues\n  - Avoid breaking changes\n  - Update your dependencies\n- **Limitations**:\n  - Can break your code\n- **Alternatives**:\n  - Do It Yourself (DIY)\n\n### Git Hooks: [Pre-Commit](https:\u002F\u002Fpre-commit.com\u002F)\n\n- **Motivations**:\n  - Check your code locally before a commit\n  - Avoid wasting resources on your CI\u002FCD\n  - Can perform extra actions (e.g., file cleanup)\n- **Limitations**:\n  - Add overhead before your commit\n- **Alternatives**:\n  - [Git Hooks](https:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2\u002FCustomizing-Git-Git-Hooks): less convenient to use\n\n### Tasks: [Just](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html)\n\n- **Motivations**:\n  - Automate project workflows\n  - Sane syntax compared to alternatives\n  - Good trade-off between power and simplicity\n- **Limitations**:\n  - Not familiar to most developers\n- **Alternatives**:\n  - [Make](https:\u002F\u002Fwww.gnu.org\u002Fsoftware\u002Fmake\u002Fmanual\u002Fmake.html): most popular, but awful syntax\n  - [PyInvoke](https:\u002F\u002Fwww.pyinvoke.org\u002F): pythonic, but verbose and less straightforward.\n\n## CI\u002FCD\n\nExecution of automated workflows on code push and releases.\n\n### Runner: [GitHub Actions](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Factions)\n\n- **Motivations**:\n  - Native on GitHub\n  - Simple workflow syntax\n  - Lots of configs if needed\n- **Limitations**:\n  - SaaS Service\n- **Alternatives**:\n  - [GitLab](https:\u002F\u002Fabout.gitlab.com\u002F): can be installed 
on-premise\n\n## CLI\n\nIntegrations with the Command-Line Interface (CLI) of your system.\n\n### Parser: [Argparse](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fargparse.html)\n\n- **Motivations**:\n  - Provide CLI arguments\n  - Included in Python runtime\n  - Sufficient for providing configs\n- **Limitations**:\n  - More verbose for advanced parsing\n- **Alternatives**:\n  - [Typer](https:\u002F\u002Ftyper.tiangolo.com\u002F): code typing for the win\n  - [Fire](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fpython-fire): simple but no typing\n  - [Click](https:\u002F\u002Fclick.palletsprojects.com\u002Fen\u002Flatest\u002F): more verbose\n\n### Logging: [Loguru](https:\u002F\u002Floguru.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n- **Motivations**:\n  - Show progress to the user\n  - Work fine out of the box\n  - Saner logging syntax\n- **Limitations**:\n  - Doesn't let you deviate from the base usage\n- **Alternatives**:\n  - [Logging](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Flogging.html): available by default, but feel dated\n\n## Code\n\nEdition, validation, and versioning of your project source code.\n\n### Coverage: [Coverage](https:\u002F\u002Fcoverage.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **Motivations**:\n  - Report code covered by tests\n  - Identify code path to test\n  - Show maturity to users\n- **Limitations**:\n  - None\n- **Alternatives**:\n - [Pytest Cov](https:\u002F\u002Fpytest-cov.readthedocs.io\u002Fen\u002Flatest\u002F) A Pytest plugin that uses `coverage.py` to measure code coverage.\n\n### Editor: [VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F)\n\n- **Motivations**:\n  - Open source\n  - Free, simple, open source\n  - Great plugins for Python development\n- **Limitations**:\n  - Require some configuration for Python\n- **Alternatives**:\n  - [PyCharm](https:\u002F\u002Fwww.jetbrains.com\u002Fpycharm\u002F): provide a lot, cost a lot\n  - [Vim](https:\u002F\u002Fwww.vim.org\u002F): I love 
it, but there is a VS Code plugin\n  - [Spacemacs](https:\u002F\u002Fwww.spacemacs.org\u002F): I love it even more, but not everybody loves LISP\n\n### Formatting: [Ruff](https:\u002F\u002Fdocs.astral.sh\u002Fruff\u002F)\n\n- **Motivations**:\n  - Super fast compared to others\n  - Don't waste time arranging your code\n  - Make your code more readable\u002Fmaintainable\n- **Limitations**:\n  - Still in version 0.x, but more and more adopted\n- **Alternatives**:\n  - [YAPF](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fyapf): more config options that you don't need\n  - [Isort](https:\u002F\u002Fpycqa.github.io\u002Fisort\u002F) + [Black](https:\u002F\u002Fblack.readthedocs.io\u002Fen\u002Fstable\u002F): slower and need two tools\n\n### Quality: [Ruff](https:\u002F\u002Fdocs.astral.sh\u002Fruff\u002F)\n\n- **Motivations**:\n  - Improve your code quality\n  - Super fast compared to others\n  - [Great integration with VS Code](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=charliermarsh.ruff)\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [PyLint](https:\u002F\u002Fwww.pylint.org\u002F): too slow and too complex system\n  - [Flake8](https:\u002F\u002Fflake8.pycqa.org\u002Fen\u002Flatest\u002F): too much plugins, I prefer Pylint in practice\n\n### Security: [Bandit](https:\u002F\u002Fbandit.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **Motivations**:\n  - Detect security issues\n  - Complement linting solutions\n  - Not to heavy to use and enable\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - None\n\n### Testing: [Pytest](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002F)\n\n- **Motivations**:\n  - Write tests or pay the price\n  - Super easy to write new test cases\n  - Tons of good plugins (xdist, sugar, cov, ...)\n- **Limitations**:\n  - Doesn't support parallel execution out of the box\n- **Alternatives**:\n  - [Unittest](https:\u002F\u002Fdocs.python.org\u002Ffr\u002F3\u002Flibrary\u002Funittest.html): more 
verbose, less fun\n\n### Typing: [Mypy](https:\u002F\u002Fmypy-lang.org\u002F)\n\n- **Motivations**:\n  - Static typing is cool!\n  - Communicate types to use\n  - Official type checker for Python\n- **Limitations**:\n  - Can have overhead for complex typing\n- **Alternatives**:\n  - [PyRight](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpyright): check big code base by MicroSoft\n  - [PyType](https:\u002F\u002Fgoogle.github.io\u002Fpytype\u002F): check big code base by Google\n  - [Pyre](https:\u002F\u002Fpyre-check.org\u002F): check big code base by Facebook\n\n### Versioning: [Git](https:\u002F\u002Fgit-scm.com\u002F)\n\n- **Motivations**:\n  - If you don't version your code, you are a fool\n  - Most popular source code manager (what else?)\n  - Provide hooks to perform automation on some events\n- **Limitations**:\n  - Git can be hard: \u003Chttps:\u002F\u002Fxkcd.com\u002F1597\u002F>\n- **Alternatives**:\n  - [Mercurial](https:\u002F\u002Fwww.mercurial-scm.org\u002F): loved it back then, but git is the only real option\n\n## Configs\n\nManage the configs files of your project to change executions.\n\n### Format: [YAML](https:\u002F\u002Fyaml.org\u002F)\n\n- **Motivations**:\n  - Change execution without changing code\n  - Readable syntax, support comments\n  - Allow to use OmegaConf \u003C3\n- **Limitations**:\n  - Not supported out of the box by Python\n- **Alternatives**:\n  - [JSON](https:\u002F\u002Fwww.json.org\u002Fjson-en.html): no comments, more verbose\n  - [TOML](https:\u002F\u002Ftoml.io\u002Fen\u002F): less suited to config merge\u002Fsharing\n\n### Parser: [OmegaConf](https:\u002F\u002Fomegaconf.readthedocs.io\u002Fen\u002F2.3_branch\u002F)\n\n- **Motivations**:\n  - Parse and merge YAML files\n  - Powerful, doesn't get in your way\n  - Achieve a lot with few lines of code\n- **Limitations**:\n  - Do not support remote files (e.g., s3, gcs, ...)\n    - You can combine it with 
[cloudpathlib](https:\u002F\u002Fcloudpathlib.drivendata.org\u002Fstable\u002F)\n- **Alternatives**:\n  - [Hydra](https:\u002F\u002Fhydra.cc\u002Fdocs\u002Fintro\u002F): powerful, but gets in your way\n  - [DynaConf](https:\u002F\u002Fwww.dynaconf.com\u002F): more suited for app development\n\n### Reader: [Cloudpathlib](https:\u002F\u002Fcloudpathlib.drivendata.org\u002Fstable\u002F)\n\n- **Motivations**:\n  - Read files from cloud storage\n  - Better integration with cloud platforms\n  - Support several platforms: AWS, GCP, and Azure\n- **Limitations**:\n  - Support of Python typing is not great at the moment\n- **Alternatives**:\n  - Cloud SDK (GCP, AWS, Azure, ...): vendor specific, overkill for this task\n\n### Validator: [Pydantic](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002F)\n\n- **Motivations**:\n  - Validate your config before execution\n  - Pydantic should be builtin (period)\n  - Super charge your Python class\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Dataclass](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fdataclasses.html): simpler, but much less powerful\n  - [Attrs](https:\u002F\u002Fwww.attrs.org\u002Fen\u002Fstable\u002F): no validation, less intuitive to use\n\n## Data\n\nDefine the datasets to provide data inputs and outputs.\n\n### Container: [Pandas](https:\u002F\u002Fpandas.pydata.org\u002F)\n\n- **Motivations**:\n  - Load data files in memory\n  - Lingua franca for Python\n  - Most popular options\n- **Limitations**:\n  - Lot of [gotchas](https:\u002F\u002Fwww.tutorialspoint.com\u002Fpython_pandas\u002Fpython_pandas_caveats_and_gotchas.htm)\n- **Alternatives**:\n  - [Polars](https:\u002F\u002Fwww.pola.rs\u002F): faster, saner, but less integrations\n  - [Pyspark](https:\u002F\u002Fspark.apache.org\u002Fdocs\u002Flatest\u002Fapi\u002Fpython\u002F): powerful, popular, distributed, so much overhead\n  - Dask, Ray, Modin, Vaex, ...: less integration (even if it looks like pandas)\n\n### Format: 
[Parquet](https:\u002F\u002Fparquet.apache.org\u002F)\n\n- **Motivations**:\n  - Store your data on disk\n  - Column-oriented (good for analysis)\n  - Much more efficient and saner than text based\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [CSV](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FComma-separated_values): human readable, but that's the sole benefit\n  - [Avro](https:\u002F\u002Favro.apache.org\u002F): good alternative for row-oriented workflow\n\n### Schema: [Pandera](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n- **Motivations**:\n  - Typing for dataframe\n  - Communicate data fields\n  - Support pandas and [others](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002Fsupported_libraries.html)\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Great Expectations](https:\u002F\u002Fgreatexpectations.io\u002F): powerful, but much more difficult to integrate\n\n## Docs\n\nGenerate and share the project documentations.\n\n### API: [pdoc](https:\u002F\u002Fpdoc.dev\u002F)\n\n- **Motivations**:\n  - Share docs with others\n  - Simple tool, only does API docs\n  - Get the job done, get out of your way\n- **Limitations**:\n  - Only support API docs (i.e., no custom docs)\n- **Alternatives**:\n  - [Sphinx](https:\u002F\u002Fwww.sphinx-doc.org\u002Fen\u002Fmaster\u002F): More complete, overkill for simple projects\n  - [Mkdocs](https:\u002F\u002Fwww.mkdocs.org\u002F): More complete, but requires more setup\n\n### Format: [Google](https:\u002F\u002Fgoogle.github.io\u002Fstyleguide\u002Fpyguide.html)\n\n- **Motivations**:\n  - Common style for docstrings\n  - Most writeable out of alternatives\n  - I often write a single line for simplicity\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Numpy](https:\u002F\u002Fnumpydoc.readthedocs.io\u002Fen\u002Flatest\u002Fformat.html): less writeable\n  - [Sphinx](https:\u002F\u002Fsphinx-rtd-tutorial.readthedocs.io\u002Fen\u002Flatest\u002Fdocstrings.html): 
baroque style\n\n### Hosting: [GitHub Pages](https:\u002F\u002Fpages.github.com\u002F)\n\n- **Motivations**:\n  - Easy to setup\n  - Free and simple\n  - Integrated with GitHub\n- **Limitations**:\n  - Only support static content\n- **Alternatives**:\n  - [ReadTheDocs](https:\u002F\u002Fabout.readthedocs.com\u002F?ref=readthedocs.com): provide more features\n\n## Model\n\nToolkit to handle machine learning models.\n\n### Evaluation: [Scikit-Learn Metrics](https:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fmodel_evaluation.html)\n\n- **Motivations**:\n  - Bring common metrics\n  - Avoid reinventing the wheel\n  - Avoid implementation mistakes\n- **Limitations**:\n  - Limited set of metric to be chosen\n- **Alternatives**:\n  - Implement your own: for custom metrics\n\n### Format: [Mlflow Model](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodels.html)\n\n- **Motivations**:\n  - Standard ML format\n  - Store model dependencies\n  - Strong community ecosystem\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Pickle](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fpickle.html): work out of the box, but less suited for big array\n  - [ONNX](https:\u002F\u002Fonnx.ai\u002F): great for deep learning, [no guaranteed compatibility for the rest](https:\u002F\u002Fonnxruntime.ai\u002Fdocs\u002Freference\u002Fcompatibility.html)\n\n### Registry: [Mlflow Registry](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodel-registry.html)\n\n- **Motivations**:\n  - Save and load models\n  - Separate production from consumption\n  - Popular, open source, work on local system\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Neptune.ai](https:\u002F\u002Fneptune.ai\u002F): SaaS solution\n  - [Weights and Biases](https:\u002F\u002Fwandb.ai\u002Fsite): SaaS solution\n\n### Tracking: [Mlflow Tracking](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Ftracking.html)\n\n- **Motivations**:\n  - Keep track of metrics and params\n  
- Allow to compare model performances\n  - Popular, open source, work on local system\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - [Neptune.ai](https:\u002F\u002Fneptune.ai\u002F): SaaS solution\n  - [Weights and Biases](https:\u002F\u002Fwandb.ai\u002Fsite): SaaS solution\n\n## Package\n\nDefine and build modern Python package.\n\n### Evolution: [Changelog](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FChangelog)\n\n- **Motivation**:\n  - Communicate changes to user\n  - Can be updated with [Commitizen](https:\u002F\u002Fcommitizen-tools.github.io\u002Fcommitizen\u002Fchangelog\u002F)\n  - Standardized with [Keep a Changelog](https:\u002F\u002Fkeepachangelog.com\u002F)\n- **Limitations**:\n  - None\n- **Alternatives**:\n  - None\n\n### Format: [Wheel](https:\u002F\u002Fpeps.python.org\u002Fpep-0427\u002F)\n\n- **Motivations**:\n  - [Has several advantages](https:\u002F\u002Frealpython.com\u002Fpython-wheels\u002F#advantages-of-python-wheels)\n  - Create source code archive\n  - Most modern Python format\n- **Limitations**:\n  - Doesn't ship with C\u002FC++ dependencies (e.g., CUDA)\n    - i.e., use Docker containers for this case\n- **Alternatives**:\n  - [Source](https:\u002F\u002Fdocs.python.org\u002F3\u002Fdistutils\u002Fsourcedist.html): older format, less powerful\n  - [Conda](https:\u002F\u002Fconda.io\u002Fprojects\u002Fconda\u002Fen\u002Flatest\u002Fuser-guide\u002Finstall\u002Findex.html): slow and hard to manage\n\n### Manager: [uv](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F)\n\n- **Motivations**:\n  - Define and build Python package\n  - Fast and compliant package manager\n  - Pack every metadata in a single static file\n- **Limitations**:\n  - Cannot add dependencies beyond Python (e.g., CUDA)\n    - i.e., use Docker container for this use case\n- **Alternatives**:\n  - [Setuptools](https:\u002F\u002Fdocs.python.org\u002F3\u002Fdistutils\u002Fsetupscript.html): dynamic file is slower and more risky\n  - 
[Poetry](https:\u002F\u002Fpython-poetry.org\u002F): previous solution of this package\n  - Pdm, Hatch, PipEnv: \u003Chttps:\u002F\u002Fxkcd.com\u002F1987\u002F>\n\n### Runtime: [Docker](https:\u002F\u002Fwww.docker.com\u002Fresources\u002Fwhat-container\u002F)\n\n- **Motivations**:\n  - Create isolated runtime\n  - Container is the de facto standard\n  - Package C\u002FC++ dependencies with your project\n- **Limitations**:\n  - Some company might block Docker Desktop, you should use alternatives\n- **Alternatives**:\n  - [Conda](https:\u002F\u002Fdocs.conda.io\u002Fen\u002Flatest\u002F): slow and heavy resolver\n\n## Programming\n\nSelect your programming environment.\n\n### Language: [Python](https:\u002F\u002Fwww.python.org\u002F)\n\n- **Motivations**:\n  - Great language for AI\u002FML projects\n  - Robust with additional tools\n  - Hundreds of great libs\n- **Limitations**:\n  - Slow without C bindings\n- **Alternatives**:\n  - [R](https:\u002F\u002Fwww.r-project.org\u002F): specific purpose language\n  - [Julia](https:\u002F\u002Fjulialang.org\u002F): specific purpose language\n\n### Version: [Uv](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002Fguides\u002Finstall-python\u002F)\n\n- **Motivations**:\n  - Switch between Python version\n  - Allow to select the best version\n  - Support global and local dispatch\n- **Limitations**:\n  - Require some shell configurations\n- **Alternatives**:\n  - Manual installation: time consuming\n  - [PyEnv](https:\u002F\u002Fgithub.com\u002Fpyenv\u002Fpyenv): shell-based, require more setup\n\n## Observability\n\n### Reproducibility: [Mlflow Project](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fprojects.html)\n\n- **Motivations**:\n  - Share common project formats.\n  - Ensure the project can be reused.\n  - Avoid randomness in project execution.\n- **Limitations**:\n  - Mlflow Project is best suited for small projects.\n- **Alternatives**:\n  - [DVC](https:\u002F\u002Fdvc.org\u002F): both data and models.\n  - 
[Metaflow](https:\u002F\u002Fmetaflow.org\u002F): focus on machine learning.\n  - **[Apache Airflow](https:\u002F\u002Fairflow.apache.org\u002F)**: for large scale projects.\n\n### Monitoring : [Mlflow Evaluate](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodel-evaluation\u002Findex.html)\n\n- **Motivations**:\n  - Compute the model metrics.\n  - Validate model with thresholds.\n  - Perform post-training evaluations.\n- **Limitations**:\n  - Mlflow Evaluate is less feature-rich than alternatives.\n- **Alternatives**:\n  - **[Giskard](https:\u002F\u002Fwww.giskard.ai\u002F)**: open-core and super complete.\n  - **[Evidently](https:\u002F\u002Fwww.evidentlyai.com\u002F)**: open-source with more metrics.\n  - [Arize AI](https:\u002F\u002Farize.com\u002F): more feature-rich but less flexible.\n  - [Grafana](https:\u002F\u002Fgrafana.com\u002F): you must do everything yourself.\n\n### Alerting: [Plyer](https:\u002F\u002Fgithub.com\u002Fkivy\u002Fplyer)\n\n- **Motivations**:\n  - Simple solution.\n  - Send notifications on system.\n  - Cross-system: Mac, Linux, Windows.\n- **Limitations**:\n  - Should not be used for large scale projects.\n- **Alternatives**:\n  - [Slack](https:\u002F\u002Fslack.com\u002F): for chat-oriented solutions.\n  - [Datadog](https:\u002F\u002Fwww.datadoghq.com\u002F): for infrastructure oriented solutions.\n\n### Lineage: [Mlflow Dataset](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Ftracking\u002Fdata-api.html)\n\n- **Motivations**:\n  - Store information in Mlflow.\n  - Track metadata about run datasets.\n  - Keep URI of the dataset source (e.g., website).\n- **Limitations**:\n  - Not as feature-rich as alternative solutions.\n- **Alternatives**:\n  - [Databricks Lineage](https:\u002F\u002Fdocs.databricks.com\u002Fen\u002Fadmin\u002Fsystem-tables\u002Flineage.html): limited to Databricks.\n  - [OpenLineage and Marquez](https:\u002F\u002Fmarquezproject.github.io\u002F): open-source and flexible.\n\n### Explainability: 
[SHAP](https:\u002F\u002Fshap.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **Motivations**:\n  - Most popular toolkit.\n  - Support various models (linear, model, ...).\n  - Integration with Mlflow through the [SHAP module](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fpython_api\u002Fmlflow.shap.html).\n- **Limitations**:\n  - Super slow on large datasets.\n  - Mlflow SHAP module is not mature enough.\n- **Alternatives**:\n  - [LIME](https:\u002F\u002Fgithub.com\u002Fmarcotcr\u002Flime): not maintained anymore.\n\n### Infrastructure: [Mlflow System Metrics](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fsystem-metrics\u002Findex.html)\n\n- **Motivations**:\n  - Track infrastructure information (RAM, CPU, ...).\n  - Integrated with Mlflow tracking.\n  - Provide hardware insights.\n- **Limitations**:\n  - Not as mature as alternative solutions.\n- **Alternatives**:\n  - [Datadog](https:\u002F\u002Fwww.datadoghq.com\u002F): popular and mature solution.\n\n# Tips\n\nThis section gives some tips and tricks to enrich the developer experience.\n\n## [AI\u002FML Practices](https:\u002F\u002Fmachinelearningmastery.com\u002F)\n\n### [Data Catalog](https:\u002F\u002Fdocs.kedro.org\u002Fen\u002Fstable\u002Fdata\u002Fdata_catalog.html)\n\n**You should decouple the pointer to your data from how to access it.**\n\nIn your code, you can refer to your dataset with a tag (e.g., `inputs`, `targets`).\n\nThis tag can then be associated with a reader\u002Fwriter implementation in a configuration file:\n\n```yaml\n  inputs:\n    KIND: ParquetReader\n    path: data\u002Finputs_train.parquet\n  targets:\n    KIND: ParquetReader\n    path: data\u002Ftargets_train.parquet\n```\n\nIn this package, the implementations are described in `src\u002F[package]\u002Fio\u002Fdatasets.py` and selected by `KIND`.\n\n### [Hyperparameter Optimization](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FHyperparameter_optimization)\n\n**You should select the best hyperparameters for your 
model using optimization search.**\n\nThe simplest projects can use a `sklearn.model_selection.GridSearchCV` to scan the whole search space.\n\nThis package provides a simple interface to this hyperparameter search facility in `src\u002F[package]\u002Futils\u002Fsearchers.py`.\n\nFor more complex projects, we recommend using more complex strategies (e.g., [Bayesian](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBayesian_optimization)) and software packages (e.g., [Optuna](https:\u002F\u002Foptuna.org\u002F)).\n\n### [Data Splits](https:\u002F\u002Fmachinelearningmastery.com\u002Fdifference-test-validation-datasets\u002F)\n\n**You should properly split your dataset into training, validation, and testing sets.**\n\n- *Training*: used for fitting the model parameters\n- *Validation*: used to find the best hyperparameters\n- *Testing*: used to evaluate the final model performance\n\nThe sets should be exclusive, and the testing set should never be used as training inputs!\n\nThis package provides a simple deterministic strategy implemented in `src\u002F[package]\u002Futils\u002Fsplitters.py`.\n\n## [Design Patterns](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_design_pattern)\n\n### [Directed-Acyclic Graph](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FDirected_acyclic_graph)\n\n**You should use a Directed-Acyclic Graph (DAG) to connect the steps of your ML pipeline.**\n\nA DAG can express the dependencies between steps while keeping the individual steps independent.\n\nThis package provides a DAG example in `tasks\u002Fproject.just`. 
The approach is based on [Just](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html) and is explained in the section on Automation above.\n\nIn production, we recommend using a scalable system such as [Airflow](https:\u002F\u002Fairflow.apache.org\u002F), [Dagster](https:\u002F\u002Fdagster.io\u002F), [Prefect](https:\u002F\u002Fwww.prefect.io\u002F), [Metaflow](https:\u002F\u002Fmetaflow.org\u002F), or [ZenML](https:\u002F\u002Fzenml.io\u002F).\n\n### [Program Service](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSystemd)\n\n**You should provide a global context for the execution of your program.**\n\nThere are several approaches such as [Singleton](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSingleton_pattern), [Global Variable](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FGlobal_variable), or [Component](https:\u002F\u002Fgithub.com\u002Fstuartsierra\u002Fcomponent).\n\nThis package takes inspiration from [Clojure mount](https:\u002F\u002Fgithub.com\u002Ftolitius\u002Fmount). It provides an implementation in `src\u002F[package]\u002Fio\u002Fservices.py`.\n\n### [Soft Coding](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftcoding)\n\n**You should separate the program implementation from the program configuration.**\n\nExposing configurations to users allows them to influence the execution behavior without code changes.\n\nThis package seeks to expose as many parameters as possible to the users in configurations stored in the `confs\u002F` folder.\n\n### [SOLID Principles](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSOLID)\n\n**You should implement the SOLID principles to make your code as flexible as possible.**\n\n- *Single responsibility principle*:  Class has one job to do. Each change in requirements can be done by changing just one class.\n- *Open\u002Fclosed principle*: Class is happy (open) to be used by others. 
Class is not happy (closed) to be changed by others.\n- *Liskov substitution principle*: Class can be replaced by any of its children. Children classes inherit parent's behaviours.\n- *Interface segregation principle*: When classes promise each other something, they should separate these promises (interfaces) into many small promises, so it's easier to understand.\n- *Dependency inversion principle*: When classes talk to each other in a very specific way, they both depend on each other to never change. Instead classes should use promises (interfaces, parents), so classes can change as long as they keep the promise.\n\nIn practice, this means you can implement software contracts with interfaces and swap the implementations.\n\nFor instance, you can implement several jobs in `src\u002F[package]\u002Fjobs\u002F*.py` and swap them in your configuration.\n\nTo learn more about the mechanism selected for this package, you can check the documentation for [Pydantic Tagged Unions](https:\u002F\u002Fdocs.pydantic.dev\u002Fdev-v2\u002Fusage\u002Ftypes\u002Funions\u002F#discriminated-unions-aka-tagged-unions).\n\n### [IO Separation](https:\u002F\u002Fen.wikibooks.org\u002Fwiki\u002FHaskell\u002FUnderstanding_monads\u002FIO)\n\n**You should separate the code interacting with the external world from the rest.**\n\nThe external world is messy and full of risks: missing files, permission issues, out of disk space ...\n\nTo isolate these risks, you can put all the related code in an `io` package and use interfaces.\n\n## [Python Powers](https:\u002F\u002Frealpython.com\u002F)\n\n### [Context Manager](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fcontextlib.html)\n\n**You should use Python context manager to control and enhance an execution.**\n\nPython provides contexts that can be used to extend a code block. 
For instance:\n\n```python\n# in src\u002F[package]\u002Fscripts.py\nwith job as runner:  # context\n    runner.run()  # run in context\n```\n\nThis pattern has the same benefit as [Monad](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMonad_(functional_programming)), a powerful programming pattern.\n\nThe package uses `src\u002F[package]\u002Fjobs\u002F*.py` to handle exceptions and services.\n\n### [Python Package](https:\u002F\u002Fpackaging.python.org\u002Fen\u002Flatest\u002Ftutorials\u002Fpackaging-projects\u002F)\n\n**You should create Python package to create both library and application for others.**\n\nUsing Python package for your AI\u002FML project has the following benefits:\n\n- Build code archive (i.e., wheel) that can be uploaded to Pypi.org\n- Install Python package as a library (e.g., like pandas)\n- Expose script entry points to run a CLI or a GUI\n\nTo build a Python package with uv, you simply have to type in a terminal:\n\n```bash\n# for all uv project\nuv build\n# for this project only\ninv packages\n```\n\n## [Software Engineering](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_engineering)\n\n### [Code Typing](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Ftyping.html)\n\n**You should type your Python code to make it more robust and explicit for your user.**\n\nPython provides the [typing module](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Ftyping.html) for adding type hints and [mypy](https:\u002F\u002Fmypy-lang.org\u002F) to check them.\n\n```python\n# in src\u002F[package]\u002Fcore\u002Fmodels.py\n@abc.abstractmethod\ndef fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> \"Model\":\n    \"\"\"Fit the model on the given inputs and target.\"\"\"\n\n@abc.abstractmethod\ndef predict(self, inputs: schemas.Inputs) -> schemas.Outputs:\n    \"\"\"Generate an output with the model for the given inputs.\"\"\"\n```\n\nThis code snippet clearly states the inputs and outputs of the method, both for 
the developer and the type checker.\n\nThe package aims to type every function and class to facilitate the developer experience and fix mistakes before execution.\n\n### [Config Typing](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002F)\n\n**You should type your configuration to avoid exceptions during the program execution.**\n\nPydantic allows you to define classes that can validate your configs during the program startup.\n\n```python\n# in src\u002F[package]\u002Futils\u002Fsplitters.py\nclass TrainTestSplitter(Splitter):\n    shuffle: bool = False  # required (time sensitive)\n    test_size: int | float = 24 * 30 * 2  # 2 months\n    random_state: int = 42\n```\n\nThis code snippet communicates the expected values and prevents avoidable errors.\n\nThe package combines both OmegaConf and Pydantic to parse YAML files and validate them as soon as possible.\n\n### [Dataframe Typing](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n**You should type your dataframe to communicate and validate their fields.**\n\nPandera supports dataframe typing for Pandas and other libraries like PySpark:\n\n```python\n# in src\u002Fpackage\u002Fschemas.py\nclass InputsSchema(Schema):\n    instant: papd.Index[papd.UInt32] = pa.Field(ge=0, check_name=True)\n    dteday: papd.Series[papd.DateTime] = pa.Field()\n    season: papd.Series[papd.UInt8] = pa.Field(isin=[1, 2, 3, 4])\n    yr: papd.Series[papd.UInt8] = pa.Field(ge=0, le=1)\n    mnth: papd.Series[papd.UInt8] = pa.Field(ge=1, le=12)\n    hr: papd.Series[papd.UInt8] = pa.Field(ge=0, le=23)\n    holiday: papd.Series[papd.Bool] = pa.Field()\n    weekday: papd.Series[papd.UInt8] = pa.Field(ge=0, le=6)\n    workingday: papd.Series[papd.Bool] = pa.Field()\n    weathersit: papd.Series[papd.UInt8] = pa.Field(ge=1, le=4)\n    temp: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    atemp: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    hum: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n 
   windspeed: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    casual: papd.Series[papd.UInt32] = pa.Field(ge=0)\n    registered: papd.Series[papd.UInt32] = pa.Field(ge=0)\n```\n\nThis code snippet defines the fields of the dataframe and some of their constraints.\n\nThe package encourages you to type every dataframe used in `src\u002F[package]\u002Fcore\u002Fschemas.py`.\n\n### [Object Oriented](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FObject-oriented_programming)\n\n**You should use Object-Oriented programming to benefit from [polymorphism](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPolymorphism_(computer_science)).**\n\nPolymorphism combined with SOLID Principles allows you to easily swap your code components.\n\n```python\nclass Reader(abc.ABC, pdt.BaseModel):\n\n    @abc.abstractmethod\n    def read(self) -> pd.DataFrame:\n        \"\"\"Read a dataframe from a dataset.\"\"\"\n```\n\nThis code snippet uses the [abc module](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fabc.html) to define code interfaces for a dataset with a read\u002Fwrite method.\n\nThe package defines class interfaces whenever possible to provide intuitive and replaceable parts for your AI\u002FML project.\n\n### [Semantic Versioning](https:\u002F\u002Fsemver.org\u002F)\n\n**You should use semantic versioning to communicate the level of compatibility of your releases.**\n\nSemantic Versioning (SemVer) provides a simple schema to communicate code changes. 
For package X.Y.Z:\n\n- *Major* (X): major release with breaking changes (i.e., imply actions from the user)\n- *Minor* (Y): minor release with new features (i.e., provide new capabilities)\n- *Patch* (Z): patch release to fix bugs (i.e., correct wrong behavior)\n\nUv and this package leverage Semantic Versioning to let developers control the speed of adoption for new releases.\n\n## [Testing Tricks](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_testing)\n\n### [Parallel Testing](https:\u002F\u002Fpytest-xdist.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n**You can run your tests in parallel to speed up the validation of your code base.**\n\nPytest can be extended with the [pytest-xdist plugin](https:\u002F\u002Fpytest-xdist.readthedocs.io\u002Fen\u002Fstable\u002F) for this purpose.\n\nThis package enables Pytest in its automation tasks by default.\n\n### [Test Fixtures](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002Fexplanation\u002Ffixtures.html)\n\n**You should define reusable objects and actions for your tests with [fixtures](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002Fexplanation\u002Ffixtures.html).**\n\nFixtures can prepare objects for your test cases, such as dataframes, models, files.\n\nThis package defines fixtures in `tests\u002Fconftest.py` to improve your testing experience.\n\n## [VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F)\n\n### [Code Workspace](https:\u002F\u002Fcode.visualstudio.com\u002Fdocs\u002Feditor\u002Fworkspaces)\n\n**You can use VS Code workspace to define configurations for your project.**\n\n[Code Workspace](https:\u002F\u002Fcode.visualstudio.com\u002Fdocs\u002Feditor\u002Fworkspaces) can enable features (e.g. 
formatting) and set the default interpreter.\n\n```json\n{\n \"settings\": {\n  \"editor.formatOnSave\": true,\n  \"python.defaultInterpreterPath\": \".venv\u002Fbin\u002Fpython\",\n    ...\n },\n}\n```\n\nThis package defines a workspace file that you can load from `[package].code-workspace`.\n\n### [GitHub Copilot](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Fcopilot)\n\n**You can use GitHub Copilot to increase your coding productivity by 30%.**\n\n[GitHub Copilot](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Fcopilot) has been a huge productivity boost thanks to its smart completion.\n\nYou should become familiar with the solution in less than a single coding session.\n\n### [VSCode VIM](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=vscodevim.vim)\n\n**You can use VIM keybindings to more efficiently navigate and modify your code.**\n\nLearning VIM is one of the best investments for a career in IT. It can make you 30% more productive.\n\nCompared to GitHub Copilot, VIM can take much more time to master. 
You can expect a ROI in less than a month.\n\n# Resources\n\nThis section provides resources for building packages for Python and AI\u002FML\u002FMLOps.\n\n## Python\n\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fkrzjoa\u002Fawesome-python-data-science#readme>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-ml-python>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-python>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-python-dev>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fvinta\u002Fawesome-python>\n\n## AI\u002FML\u002FMLOps\n\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fjosephmisiti\u002Fawesome-machine-learning>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fvisenger\u002Fawesome-mlops>\n","# MLOps Python 包\n\n[![check.yml](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fcheck.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fcheck.yml)\n[![publish.yml](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fpublish.yml\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Factions\u002Fworkflows\u002Fpublish.yml)\n[![Documentation](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Fdocumentation-available-brightgreen.svg)](https:\u002F\u002Ffmind.github.io\u002Fmlops-python-package\u002F)\n[![License](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Flicense\u002Ffmind\u002Fmlops-python-package)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fblob\u002Fmain\u002FLICENCE.txt)\n[![Release](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fv\u002Frelease\u002Ffmind\u002Fmlops-python-package)](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Freleases)\n\n**此仓库包含一个基于最佳实践的 Python 代码库，旨在支持您的 MLOps 计划。**\n\n该包利用多种 [工具](#tools) 和 [技巧](#tips) 来使您的 MLOps 体验尽可能灵活、稳健且高效。\n\n您可以将此包用作 
MLOps 工具箱或平台的一部分（例如，模型注册表、实验跟踪、实时推理等）。\n\n**相关资源**：\n\n- **[LLMOps 编码包（示例）](https:\u002F\u002Fgithub.com\u002Fcallmesora\u002Fllmops-python-package\u002F)**：包含最佳实践和工具的示例，用于支持您的 LLMOps 项目。\n- **[MLOps 编码课程（学习）](https:\u002F\u002Fgithub.com\u002FMLOps-Courses\u002Fmlops-coding-course)**：学习如何创建、开发和维护最先进的 MLOps 代码库。\n- **[Cookiecutter MLOps 包（模板）](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fcookiecutter-mlops-package)**：开始构建和部署用于 MLOps 任务的 Python 包和 Docker 镜像。\n- **[智能体技能（资源）](https:\u002F\u002Fgithub.com\u002FMLOps-Courses\u002Fmlops-coding-skills)**：通过标准化的 MLOps 和编码技能提升您的 AI 智能体能力。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ffmind_mlops-python-package_readme_ed9c70b7c1fb.png)\n\n# 目录\n\n- [MLOps Python 包](#mlops-python-package)\n- [目录](#table-of-contents)\n- [安装](#install)\n  - [先决条件](#prerequisites)\n  - [安装](#installation)\n  - [后续步骤](#next-steps)\n- [使用](#usage)\n  - [配置](#configuration)\n  - [执行](#execution)\n  - [自动化](#automation)\n  - [工作流](#workflows)\n- [工具](#tools)\n  - [自动化](#automation-1)\n    - [AI 助手：Gemini Code Assist](#ai-assistant-gemini-code-assist)\n    - [提交：Commitizen](#commits-commitizen)\n    - [依赖管理：Dependabot](#dependabot-dependabot)\n    - [Git 钩子：Pre-Commit](#git-hooks-pre-commit)\n    - [任务管理：Just](#tasks-just)\n  - [CI\u002FCD](#cicd)\n    - [运行器：GitHub Actions](#runner-github-actions)\n  - [CLI](#cli)\n    - [解析器：Argparse](#parser-argparse)\n    - [日志记录：Loguru](#logging-loguru)\n  - [代码](#code)\n    - [覆盖率：Coverage](#coverage-coverage)\n    - [编辑器：VS Code](#editor-vs-code)\n    - [格式化：Ruff](#formatting-ruff)\n    - [质量检查：Ruff](#quality-ruff)\n    - [安全扫描：Bandit](#security-bandit)\n    - [测试：Pytest](#testing-pytest)\n    - [类型检查：Mypy](#typing-mypy)\n    - [版本控制：Git](#versioning-git)\n  - [配置](#configs)\n    - [格式：YAML](#format-yaml)\n    - [解析器：OmegaConf](#parser-omegaconf)\n    - [文件路径管理：Cloudpathlib](#reader-cloudpathlib)\n    - [验证器：Pydantic](#validator-pydantic)\n  - [数据](#data)\n    - [容器：Pandas](#container-pandas)\n 
   - [格式：Parquet](#format-parquet)\n    - [数据校验：Pandera](#schema-pandera)\n  - [文档](#docs)\n    - [API 文档：pdoc](#api-pdoc)\n    - [格式：Google 风格指南](#format-google)\n    - [托管：GitHub Pages](#hosting-github-pages)\n  - [模型](#model)\n    - [评估：Scikit-Learn Metrics](#evaluation-scikit-learn-metrics)\n    - [格式：Mlflow Model](#format-mlflow-model)\n    - [注册表：Mlflow Registry](#registry-mlflow-registry)\n    - [跟踪：Mlflow Tracking](#tracking-mlflow-tracking)\n  - [打包](#package)\n    - [变更日志：Changelog](#evolution-changelog)\n    - [格式：Wheel](#format-wheel)\n    - [包管理器：uv](#manager-uv)\n    - [运行时环境：Docker](#runtime-docker)\n  - [编程](#programming)\n    - [语言：Python](#language-python)\n    - [版本管理：Uv](#version-uv)\n  - [可观测性](#observability)\n    - [可复现性：Mlflow Project](#reproducibility-mlflow-project)\n    - [监控：Mlflow Evaluate](#monitoring--mlflow-evaluate)\n    - [告警：Plyer](#alerting-plyer)\n    - [数据 lineage：Mlflow Dataset](#lineage-mlflow-dataset)\n    - [可解释性：SHAP](#explainability-shap)\n    - [基础设施指标：Mlflow System Metrics](#infrastructure-mlflow-system-metrics)\n- [提示](#tips)\n  - [AI\u002FML 实践](#aiml-practices)\n    - [数据目录](#data-catalog)\n    - [超参数优化](#hyperparameter-optimization)\n    - [数据划分](#data-splits)\n  - [设计模式](#design-patterns)\n    - [有向无环图](#directed-acyclic-graph)\n    - [程序服务](#program-service)\n    - [软编码](#soft-coding)\n    - [SOLID 原则](#solid-principles)\n    - [输入输出分离](#io-separation)\n  - [Python 功能](#python-powers)\n    - [上下文管理器](#context-manager)\n    - [Python 包](#python-package)\n  - [软件工程](#software-engineering)\n    - [代码类型检查](#code-typing)\n    - [配置类型检查](#config-typing)\n    - [DataFrame 类型检查](#dataframe-typing)\n    - [面向对象编程](#object-oriented)\n    - [语义版本控制](#semantic-versioning)\n  - [测试技巧](#testing-tricks)\n    - [并行测试](#parallel-testing)\n    - [测试夹具](#test-fixtures)\n  - [VS Code](#vs-code)\n    - [代码工作区](#code-workspace)\n    - [GitHub Copilot](#github-copilot)\n    - [VSCode VIM](#vscode-vim)\n- [资源](#resources)\n  - 
[Python](#python)\n  - [AI\u002FML\u002FMLOps](#aimlmlops)\n\n# 安装\n\n本节详细介绍了启动您的 MLOps 项目所需的条件、操作步骤及后续安排。\n\n## 先决条件\n\n- [Python>=3.13](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002F)：以充分利用 [最新功能和性能改进](https:\u002F\u002Fdocs.python.org\u002F3\u002Fwhatsnew\u002F3.13.html)\n- [uv>=0.5.5](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F)：用于初始化项目的 [虚拟环境](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fvenv.html)及其依赖项\n\n## 安装\n\n1. 将此 GitHub 仓库 [克隆到您的计算机](https:\u002F\u002Fdocs.github.com\u002Fen\u002Frepositories\u002Fcreating-and-managing-repositories\u002Fcloning-a-repository)\n\n```bash\n# 推荐使用 SSH\n$ git clone git@github.com:fmind\u002Fmlops-python-package\n# 或使用 HTTPS\n$ git clone https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\n```\n\n2. 使用 uv [运行项目安装](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F)\n\n```bash\ncd mlops-python-package\u002F\nuv sync\n```\n\n3. 根据您的需求调整代码库\n\n## 后续步骤\n\n在此基础上，您可以通过多种方式将此包集成到您的 MLOps 平台中。\n\n例如，您可以使用 Databricks 或 AWS 作为计算平台和模型注册表。\n\n具体如何调整包中的代码以适配您的目标解决方案，完全取决于您自己。祝您成功！\n\n# 使用方法\n\n本节介绍如何配置项目代码并在您的系统上执行它。\n\n## 配置\n\n您可以在 `confs\u002F` 文件夹中添加或编辑配置文件，以更改程序的行为。\n\n```yaml\n# confs\u002Ftraining.yaml\njob:\n  KIND: TrainingJob\n  inputs:\n    KIND: ParquetReader\n    path: data\u002Finputs_train.parquet\n  targets:\n    KIND: ParquetReader\n    path: data\u002Ftargets_train.parquet\n```\n\n此配置文件指示程序启动一个 `TrainingJob`，包含两个参数：\n\n- `inputs`: 包含模型输入的数据集\n- `targets`: 包含模型目标的数据集\n\n您可以在 `src\u002F[package]\u002Fjobs\u002F*.py` 文件中找到程序的所有参数。\n\n您还可以使用 `uv run bikes --schema` 打印该包支持的完整模式。\n\n## 执行\n\n在开发过程中，您可以使用 uv 来执行项目代码：\n\n```bash\nuv run [package] confs\u002Ftuning.yaml\nuv run [package] confs\u002Ftraining.yaml\nuv run [package] confs\u002Fpromotion.yaml\nuv run [package] confs\u002Finference.yaml\nuv run [package] confs\u002Fevaluations.yaml\nuv run [package] confs\u002Fexplanations.yaml\n```\n\n在生产环境中，您可以将项目构建、打包并作为 Python 包运行：\n\n```bash\nuv build\nuv publish # 可选\npython -m pip install 
[package]\n[package] confs\u002Finference.yaml\n```\n\n您也可以将此包安装为库，供其他 AI\u002FML 项目使用：\n\n```python\nfrom [package] import jobs\n\njob = jobs.TrainingJob(...)\nwith job as runner:\n    runner.run()\n```\n\n**附加提示**：\n\n- 您可以通过命令行使用 `--extras` 标志传递额外的配置\n  - 可用于传递运行时值（例如，先前作业执行的结果）\n- 您可以在命令行中传递多个配置文件，它们会从左到右合并\n  - 您可以定义作业之间共享的通用配置（例如，模型参数）\n- 由于 [Pydantic 的区分联合类型](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002Fconcepts\u002Funions\u002F#discriminated-unions)，将自动选择正确的作业任务\n  - 这是运行应用程序支持的任何作业（训练、调参等）的绝佳方式\n\n## 自动化\n\n此项目包含多项自动化任务，可轻松重复常见操作。\n\n您可以通过 [命令行](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html) 或 [VS Code 扩展](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=nefrob.vscode-just-syntax) 调用这些操作。\n\n```bash\n# 执行项目 DAG\n$ just project\n# 创建代码归档\n$ just package\n# 列出其他操作\n$ just\n```\n\n**可用任务**：\n\n```toml\ndefault # 显示帮助信息\n\n[check]\ncheck # 运行检查任务\ncheck-code # 检查代码质量\ncheck-coverage numprocesses=\"auto\" cov_fail_under=\"80\" # 检查代码覆盖率\ncheck-format # 检查代码格式\ncheck-security # 检查代码安全\ncheck-test numprocesses=\"auto\" # 检查单元测试\ncheck-type # 检查代码类型\n\n[clean]\nclean # 运行清理任务\nclean-build # 清理构建文件夹\nclean-cache # 清理缓存文件夹\nclean-constraints # 清理约束文件\nclean-coverage # 清理覆盖率文件\nclean-docs # 清理文档文件夹\nclean-environment # 清理环境文件\nclean-mlruns # 清理 mlruns 文件夹\nclean-mypy # 清理 mypy 文件夹\nclean-outputs # 清理输出文件夹\nclean-pytest # 清理 pytest 缓存\nclean-python # 清理 Python 缓存\nclean-requirements # 清理需求文件\nclean-ruff # 清理 ruff 缓存\nclean-venv # 清理 venv 文件夹\n\n[commit]\ncommit-bump # 提升包版本\ncommit-files # 提交包\ncommit-info # 获取提交信息\n\n[doc]\ndoc # 运行文档任务\ndoc-build format=\"google\" output=\"docs\" # 构建文档\ndoc-serve format=\"google\" port=\"8088\" # 提供文档服务\n\n[docker]\ndocker # 运行 Docker 任务\ndocker-build tag=\"latest\" # 构建 Docker 镜像\ndocker-compose # 启动 Docker Compose\ndocker-run tag=\"latest\" # 运行最新版 Docker 镜像\n\n[format]\nformat # 运行格式化任务\nformat-import # 格式化代码导入\nformat-source # 格式化代码源\n\n[install]\ninstall # 
运行安装任务\ninstall-hooks # 安装 Git 钩子\ninstall-project # 安装项目\ninstall-rulesets # 安装 GitHub 规则集\n\n[mlflow]\nmlflow # 运行 MLflow 任务\nmlflow-doctor # 运行 MLflow 医生\nmlflow-serve host=\"127.0.0.1\" port=\"5000\" uri=\".\u002Fmlruns\" # 启动 MLflow 服务器\n\n[package]\npackage # 运行打包任务\npackage-build constraints=\"constraints.txt\" # 构建 Python 包\npackage-constraints constraints=\"constraints.txt\" # 构建包约束\n\n[project]\nproject # 运行项目任务\nproject-environment # 导出环境文件\nproject-requirements # 导出需求文件\nproject-run job # 使用 MLflow 运行项目作业\n```\n\n## 工作流\n\n此包在 `.github\u002Fworkflows` 中支持两个 GitHub 工作流：\n\n- `check.yml`: 在每次 Pull Request 上验证包的质量\n- `publish.yml`: 在代码发布时构建并发布文档和包。\n\n您可以使用并扩展这些工作流来自动化重复性的包管理任务。\n\n# 工具\n\n本节旨在鼓励使用开发者工具来提升您的编码体验。\n\n## 自动化\n\n预定义的操作，用于自动化您的项目开发。\n\n### AI 助手：[Gemini Code Assist](https:\u002F\u002Fdevelopers.google.com\u002Fgemini-code-assist\u002Fdocs\u002Freview-github-code)\n\n- **动机**：\n  - 提高您的编码效率\n  - 获取代码建议和补全\n  - 减少审查代码的时间\n- **局限性**：\n  - 可能生成错误的代码、评论或摘要\n\n### 提交：[Commitizen](https:\u002F\u002Fcommitizen-tools.github.io\u002Fcommitizen\u002F)\n\n- **动机**：\n  - 格式化您的代码提交\n  - 生成标准的变更日志\n  - 与 [SemVer](https:\u002F\u002Fsemver.org\u002F) 和 [PEP 440](https:\u002F\u002Fpeps.python.org\u002Fpep-0440\u002F) 良好集成\n- **局限性**：\n  - 新用户的学习曲线较长\n- **替代方案**：\n  - 自己动手 (DIY)\n\n### Dependabot：[Dependabot](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fcode-security\u002Fgetting-started\u002Fdependabot-quickstart-guide)\n\n- **动机**：\n  - 避免安全问题\n  - 避免破坏性更改\n  - 更新您的依赖项\n- **局限性**：\n  - 可能破坏您的代码\n- **替代方案**：\n  - 自己动手 (DIY)\n\n### Git 钩子：[Pre-Commit](https:\u002F\u002Fpre-commit.com\u002F)\n\n- **动机**：\n  - 在提交前本地检查您的代码\n  - 避免在 CI\u002FCD 上浪费资源\n  - 可以执行额外的动作（例如，清理文件）\n- **局限性**：\n  - 在提交前增加开销\n- **替代方案**：\n  - [Git 钩子](https:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2\u002FCustomizing-Git-Git-Hooks)：使用起来不太方便\n\n### 任务：[Just](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html)\n\n- **动机**：\n  - 自动化项目工作流\n  - 语法清晰，优于其他工具\n  - 
在功能强大与简单易用之间取得了良好平衡\n- **局限性**：\n  - 大多数开发者并不熟悉\n- **替代方案**：\n  - [Make](https:\u002F\u002Fwww.gnu.org\u002Fsoftware\u002Fmake\u002Fmanual\u002Fmake.html)：最流行，但语法糟糕\n  - [PyInvoke](https:\u002F\u002Fwww.pyinvoke.org\u002F)：符合 Python 风格，但冗长且不够直观\n\n## CI\u002FCD\n\n在代码推送和发布时执行自动化工作流。\n\n### 运行器：[GitHub Actions](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Factions)\n\n- **动机**：\n  - 原生集成于 GitHub\n  - 工作流语法简单\n  - 如有需要可进行大量配置\n- **局限性**：\n  - SaaS 服务\n- **替代方案**：\n  - [GitLab](https:\u002F\u002Fabout.gitlab.com\u002F)：可部署在本地\n\n## CLI\n\n与系统命令行界面（CLI）的集成。\n\n### 解析器：[Argparse](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fargparse.html)\n\n- **动机**：\n  - 提供 CLI 参数\n  - 内置于 Python 运行时\n  - 对于提供配置已足够\n- **局限性**：\n  - 对于高级解析较为冗长\n- **替代方案**：\n  - [Typer](https:\u002F\u002Ftyper.tiangolo.com\u002F)：代码类型化更胜一筹\n  - [Fire](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fpython-fire)：简单但无类型化\n  - [Click](https:\u002F\u002Fclick.palletsprojects.com\u002Fen\u002Flatest\u002F)：更为冗长\n\n### 日志记录：[Loguru](https:\u002F\u002Floguru.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n- **动机**：\n  - 向用户展示进度\n  - 开箱即用，效果良好\n  - 日志语法更加清晰\n- **局限性**：\n  - 不允许偏离基础用法\n- **替代方案**：\n  - [Logging](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Flogging.html)：默认可用，但显得有些过时\n\n## 代码\n\n项目源代码的编辑、验证和版本控制。\n\n### 覆盖率：[Coverage](https:\u002F\u002Fcoverage.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **动机**：\n  - 报告被测试覆盖的代码\n  - 确定待测试的代码路径\n  - 向用户展示代码成熟度\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Pytest Cov](https:\u002F\u002Fpytest-cov.readthedocs.io\u002Fen\u002Flatest\u002F)：一个使用 `coverage.py` 来衡量代码覆盖率的 Pytest 插件。\n\n### 编辑器：[VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F)\n\n- **动机**：\n  - 开源\n  - 免费、简单且开源\n  - 拥有优秀的 Python 开发插件\n- **局限性**：\n  - 需要为 Python 进行一些配置\n- **替代方案**：\n  - [PyCharm](https:\u002F\u002Fwww.jetbrains.com\u002Fpycharm\u002F)：功能强大，但价格昂贵\n  - [Vim](https:\u002F\u002Fwww.vim.org\u002F)：我非常喜欢它，不过也有 VS Code 插件\n  - 
[Spacemacs](https:\u002F\u002Fwww.spacemacs.org\u002F)：我更喜欢它，但并非所有人都喜欢 LISP\n\n### 格式化：[Ruff](https:\u002F\u002Fdocs.astral.sh\u002Fruff\u002F)\n\n- **动机**：\n  - 相较于其他工具速度极快\n  - 不必浪费时间整理代码\n  - 使代码更具可读性和可维护性\n- **局限性**：\n  - 仍处于 0.x 版本，但采用率越来越高\n- **替代方案**：\n  - [YAPF](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fyapf)：配置选项过多，而你可能并不需要\n  - [Isort](https:\u002F\u002Fpycqa.github.io\u002Fisort\u002F) + [Black](https:\u002F\u002Fblack.readthedocs.io\u002Fen\u002Fstable\u002F)：速度较慢，且需使用两种工具\n\n### 质量：[Ruff](https:\u002F\u002Fdocs.astral.sh\u002Fruff\u002F)\n\n- **动机**：\n  - 提升代码质量\n  - 相较其他工具速度极快\n  - [与 VS Code 的出色集成](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=charliermarsh.ruff)\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [PyLint](https:\u002F\u002Fwww.pylint.org\u002F)：系统过于复杂且运行缓慢\n  - [Flake8](https:\u002F\u002Fflake8.pycqa.org\u002Fen\u002Flatest\u002F)：插件过多，实践中我更倾向于 Pylint\n\n### 安全性：[Bandit](https:\u002F\u002Fbandit.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **动机**：\n  - 检测安全问题\n  - 作为 linting 解决方案的补充\n  - 使用和启用都不算复杂\n- **局限性**：\n  - 无\n- **替代方案**：\n  - 无\n\n### 测试：[Pytest](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002F)\n\n- **动机**：\n  - 编写测试，否则将付出代价\n  - 极易编写新的测试用例\n  - 拥有大量优秀的插件（xdist、sugar、cov 等）\n- **局限性**：\n  - 默认不支持并行执行\n- **替代方案**：\n  - [Unittest](https:\u002F\u002Fdocs.python.org\u002Ffr\u002F3\u002Flibrary\u002Funittest.html)：语法更为冗长，趣味性较低\n\n### 类型检查：[Mypy](https:\u002F\u002Fmypy-lang.org\u002F)\n\n- **动机**：\n  - 静态类型检查很酷！\n  - 可以明确类型用途\n  - Python 的官方类型检查工具\n- **局限性**：\n  - 复杂类型检查可能会带来额外开销\n- **替代方案**：\n  - [PyRight](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpyright)：由微软负责大型代码库的检查\n  - [PyType](https:\u002F\u002Fgoogle.github.io\u002Fpytype\u002F)：由 Google 负责大型代码库的检查\n  - [Pyre](https:\u002F\u002Fpyre-check.org\u002F)：由 Facebook 负责大型代码库的检查\n\n### 版本控制：[Git](https:\u002F\u002Fgit-scm.com\u002F)\n\n- **动机**：\n  - 如果不进行版本控制，那真是愚蠢\n  - 最流行的源代码管理工具（还能有什么选择呢？）\n  - 提供钩子，可在特定事件发生时执行自动化操作\n- **局限性**：\n  - Git 
可能比较难掌握：\u003Chttps:\u002F\u002Fxkcd.com\u002F1597\u002F>\n- **替代方案**：\n  - [Mercurial](https:\u002F\u002Fwww.mercurial-scm.org\u002F)：过去很喜欢它，但现在 Git 才是唯一的选择\n\n## 配置\n\n管理项目的配置文件，以便调整执行行为。\n\n### 格式：[YAML](https:\u002F\u002Fyaml.org\u002F)\n\n- **动机**：\n  - 在不修改代码的情况下改变执行方式\n  - 语法易读，支持注释\n  - 可以使用 OmegaConf \u003C3\n- **局限性**：\n  - Python 默认不支持 YAML\n- **替代方案**：\n  - [JSON](https:\u002F\u002Fwww.json.org\u002Fjson-en.html)：无注释，语法更为冗长\n  - [TOML](https:\u002F\u002Ftoml.io\u002Fen\u002F)：不太适合配置合并或共享\n\n### 解析器：[OmegaConf](https:\u002F\u002Fomegaconf.readthedocs.io\u002Fen\u002F2.3_branch\u002F)\n\n- **动机**：\n  - 解析并合并 YAML 文件\n  - 功能强大，不会妨碍你的工作\n  - 几行代码就能完成大量任务\n- **局限性**：\n  - 不支持远程文件（如 s3、gcs 等）\n    - 可以将其与 [cloudpathlib](https:\u002F\u002Fcloudpathlib.drivendata.org\u002Fstable\u002F) 结合使用\n- **替代方案**：\n  - [Hydra](https:\u002F\u002Fhydra.cc\u002Fdocs\u002Fintro\u002F)：功能强大，但会干扰你的工作\n  - [DynaConf](https:\u002F\u002Fwww.dynaconf.com\u002F)：更适合应用程序开发\n\n### 文件读取器：[Cloudpathlib](https:\u002F\u002Fcloudpathlib.drivendata.org\u002Fstable\u002F)\n\n- **动机**：\n  - 从云存储中读取文件\n  - 与云平台的集成更好\n  - 支持多个平台：AWS、GCP 和 Azure\n- **局限性**：\n  - 目前对 Python 类型的支持还不够完善\n- **替代方案**：\n  - 云 SDK（GCP、AWS、Azure 等）：厂商专用，对于此任务来说过于复杂\n\n### 验证器：[Pydantic](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002F)\n\n- **动机**：\n  - 在执行前验证配置\n  - Pydantic 应该是内置的（就这么简单）\n  - 极大地增强你的 Python 类\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Dataclass](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fdataclasses.html)：更简单，但功能弱得多\n  - [Attrs](https:\u002F\u002Fwww.attrs.org\u002Fen\u002Fstable\u002F)：没有验证功能，使用起来也不太直观\n\n## 数据\n\n定义数据集，以提供数据输入和输出。\n\n### 容器：[Pandas](https:\u002F\u002Fpandas.pydata.org\u002F)\n\n- **动机**：\n  - 将数据文件加载到内存中\n  - Python 的通用数据交换格式\n  - 最流行的选择\n- **局限性**：\n  - 存在许多陷阱 [gotchas](https:\u002F\u002Fwww.tutorialspoint.com\u002Fpython_pandas\u002Fpython_pandas_caveats_and_gotchas.htm)\n- **替代方案**：\n  - [Polars](https:\u002F\u002Fwww.pola.rs\u002F)：更快、更安全，但集成较少\n  - 
[Pyspark](https:\u002F\u002Fspark.apache.org\u002Fdocs\u002Flatest\u002Fapi\u002Fpython\u002F)：功能强大、流行、分布式，但开销较大\n  - Dask、Ray、Modin、Vaex 等：集成度较低（尽管外观上与 Pandas 类似）\n\n### 格式：[Parquet](https:\u002F\u002Fparquet.apache.org\u002F)\n\n- **动机**：\n  - 将数据存储在磁盘上\n  - 列式存储（非常适合分析）\n  - 比基于文本的格式更高效、更安全\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [CSV](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FComma-separated_values)：人类可读，但这也是其唯一优点\n  - [Avro](https:\u002F\u002Favro.apache.org\u002F)：适合行式工作流的良好替代方案\n\n### 模式：[Pandera](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n- **动机**：\n  - 为 DataFrame 提供类型注解\n  - 明确数据字段的含义\n  - 支持 Pandas 及其他库 [others](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002Fsupported_libraries.html)\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Great Expectations](https:\u002F\u002Fgreatexpectations.io\u002F)：功能强大，但集成难度大得多\n\n## 文档\n\n生成并分享项目文档。\n\n### API：[pdoc](https:\u002F\u002Fpdoc.dev\u002F)\n\n- **动机**：\n  - 与他人共享文档\n  - 工具简单，仅用于生成 API 文档\n  - 快速完成任务，不拖后腿\n- **局限性**：\n  - 仅支持 API 文档（即无法生成自定义文档）\n- **替代方案**：\n  - [Sphinx](https:\u002F\u002Fwww.sphinx-doc.org\u002Fen\u002Fmaster\u002F)：功能更全面，但对于简单项目来说过于复杂\n  - [Mkdocs](https:\u002F\u002Fwww.mkdocs.org\u002F)：功能更全面，但设置更为繁琐\n\n### 格式：[Google](https:\u002F\u002Fgoogle.github.io\u002Fstyleguide\u002Fpyguide.html)\n\n- **动机**：\n  - 公认的 docstring 风格\n  - 是所有选项中最易编写的\n  - 为了简洁，我通常只写一行\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Numpy](https:\u002F\u002Fnumpydoc.readthedocs.io\u002Fen\u002Flatest\u002Fformat.html)：编写起来较困难\n  - [Sphinx](https:\u002F\u002Fsphinx-rtd-tutorial.readthedocs.io\u002Fen\u002Flatest\u002Fdocstrings.html)：风格过于繁复\n\n### 托管：[GitHub Pages](https:\u002F\u002Fpages.github.com\u002F)\n\n- **动机**：\n  - 设置简单\n  - 免费且便捷\n  - 与 GitHub 无缝集成\n- **局限性**：\n  - 仅支持静态内容\n- **替代方案**：\n  - [ReadTheDocs](https:\u002F\u002Fabout.readthedocs.com\u002F?ref=readthedocs.com)：提供更多功能\n\n## 模型\n\n处理机器学习模型的工具集。\n\n### 评估：[Scikit-Learn 
Metrics](https:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fmodel_evaluation.html)\n\n- **动机**：\n  - 提供常用指标\n  - 避免重复造轮子\n  - 避免实现错误\n- **局限性**：\n  - 可选择的指标种类有限\n- **替代方案**：\n  - 自行实现：用于自定义指标\n\n### 格式：[Mlflow Model](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodels.html)\n\n- **动机**：\n  - 标准化的机器学习模型格式\n  - 存储模型依赖项\n  - 拥有强大的社区生态\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Pickle](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fpickle.html)：开箱即用，但不适合大型数组\n  - [ONNX](https:\u002F\u002Fonnx.ai\u002F)：非常适合深度学习，但与其他框架的兼容性无法保证 [no guaranteed compatibility for the rest](https:\u002F\u002Fonnxruntime.ai\u002Fdocs\u002Freference\u002Fcompatibility.html)\n\n### 注册表：[Mlflow Registry](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodel-registry.html)\n\n- **动机**：\n  - 保存和加载模型\n  - 将生产环境与消费环境分离\n  - 流行、开源，可在本地系统上运行\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Neptune.ai](https:\u002F\u002Fneptune.ai\u002F)：SaaS 解决方案\n  - [Weights and Biases](https:\u002F\u002Fwandb.ai\u002Fsite)：SaaS 解决方案\n\n### 追踪：[Mlflow Tracking](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Ftracking.html)\n\n- **动机**：\n  - 跟踪指标和超参数\n  - 可以比较不同模型的表现\n  - 流行、开源，可在本地系统上运行\n- **局限性**：\n  - 无\n- **替代方案**：\n  - [Neptune.ai](https:\u002F\u002Fneptune.ai\u002F)：SaaS 解决方案\n  - [Weights and Biases](https:\u002F\u002Fwandb.ai\u002Fsite)：SaaS 解决方案\n\n## 包\n\n定义并构建现代 Python 包。\n\n### 变更日志：[Changelog](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FChangelog)\n\n- **动机**：\n  - 向用户传达变更信息\n  - 可以使用 [Commitizen](https:\u002F\u002Fcommitizen-tools.github.io\u002Fcommitizen\u002Fchangelog\u002F) 更新\n  - 遵循 [Keep a Changelog](https:\u002F\u002Fkeepachangelog.com\u002F) 的标准化格式\n- **局限性**：\n  - 无\n- **替代方案**：\n  - 无\n\n### 格式：[Wheel](https:\u002F\u002Fpeps.python.org\u002Fpep-0427\u002F)\n\n- **动机**：\n  - 具有多项优势 [has several advantages](https:\u002F\u002Frealpython.com\u002Fpython-wheels\u002F#advantages-of-python-wheels)\n  - 创建源代码归档\n  - 当前最现代的 Python 格式\n- **局限性**：\n  - 不包含 
C\u002FC++ 依赖项（例如 CUDA）\n    - 即在这种情况下应使用 Docker 容器\n- **替代方案**：\n  - [Source](https:\u002F\u002Fdocs.python.org\u002F3\u002Fdistutils\u002Fsourcedist.html)：较旧的格式，功能较弱\n  - [Conda](https:\u002F\u002Fconda.io\u002Fprojects\u002Fconda\u002Fen\u002Flatest\u002Fuser-guide\u002Finstall\u002Findex.html)：速度慢且难以管理\n\n### 管理器：[uv](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002F)\n\n- **动机**：\n  - 定义并构建 Python 包\n  - 快速且符合标准的包管理器\n  - 将所有元数据打包成一个静态文件\n- **局限性**：\n  - 无法添加 Python 之外的依赖项（例如 CUDA）\n    - 即在这种情况下应使用 Docker 容器\n- **替代方案**：\n  - [Setuptools](https:\u002F\u002Fdocs.python.org\u002F3\u002Fdistutils\u002Fsetupscript.html)：动态文件速度较慢且风险较高\n  - [Poetry](https:\u002F\u002Fpython-poetry.org\u002F)：该包的前身解决方案\n  - Pdm、Hatch、PipEnv：\u003Chttps:\u002F\u002Fxkcd.com\u002F1987\u002F>\n\n### 运行时：[Docker](https:\u002F\u002Fwww.docker.com\u002Fresources\u002Fwhat-container\u002F)\n\n- **动机**：\n  - 创建隔离的运行环境\n  - 容器已成为事实上的标准\n  - 可以将 C\u002FC++ 依赖项随项目一起打包\n- **局限性**：\n  - 有些公司可能会阻止使用 Docker Desktop，此时应考虑其他替代方案\n- **替代方案**：\n  - [Conda](https:\u002F\u002Fdocs.conda.io\u002Fen\u002Flatest\u002F)：解析速度慢且资源占用高\n\n## 编程\n\n选择你的编程环境。\n\n### 语言：[Python](https:\u002F\u002Fwww.python.org\u002F)\n\n- **动机**：\n  - 非常适合 AI\u002FML 项目\n  - 功能强大，拥有丰富的工具支持\n  - 数百个优秀的库\n- **局限性**：\n  - 如果没有 C 绑定，性能较慢\n- **替代方案**：\n  - [R](https:\u002F\u002Fwww.r-project.org\u002F)：专用语言\n  - [Julia](https:\u002F\u002Fjulialang.org\u002F)：专用语言\n\n### 版本：[Uv](https:\u002F\u002Fdocs.astral.sh\u002Fuv\u002Fguides\u002Finstall-python\u002F)\n\n- **动机**：\n  - 在不同 Python 版本之间切换\n  - 允许选择最佳版本\n  - 支持全局和局部调度\n- **局限性**：\n  - 需要进行一些 shell 配置\n- **替代方案**：\n  - 手动安装：耗时较长\n  - [PyEnv](https:\u002F\u002Fgithub.com\u002Fpyenv\u002Fpyenv)：基于 shell，需要更多设置\n\n## 可观测性\n\n### 可复现性：[Mlflow Project](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fprojects.html)\n\n- **动机**：\n  - 共享通用的项目格式\n  - 确保项目可以被重复使用\n  - 避免项目执行中的随机性\n- **局限性**：\n  - Mlflow Project 最适合小型项目\n- **替代方案**：\n  - [DVC](https:\u002F\u002Fdvc.org\u002F)：同时管理数据和模型\n  
- [Metaflow](https:\u002F\u002Fmetaflow.org\u002F)：专注于机器学习\n  - **[Apache Airflow](https:\u002F\u002Fairflow.apache.org\u002F)**：适用于大型项目\n\n### 监控：[Mlflow Evaluate](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fmodel-evaluation\u002Findex.html)\n\n- **动机**：\n  - 计算模型指标\n  - 使用阈值验证模型\n  - 进行训练后的评估\n- **局限性**：\n  - Mlflow Evaluate 的功能相比其他工具较为有限\n- **替代方案**：\n  - **[Giskard](https:\u002F\u002Fwww.giskard.ai\u002F)**：开源核心且功能非常全面\n  - **[Evidently](https:\u002F\u002Fwww.evidentlyai.com\u002F)**：开源工具，提供更丰富的指标\n  - [Arize AI](https:\u002F\u002Farize.com\u002F)：功能更强大，但灵活性较低\n  - [Grafana](https:\u002F\u002Fgrafana.com\u002F)：需要自行完成所有工作\n\n### 告警：[Plyer](https:\u002F\u002Fgithub.com\u002Fkivy\u002Fplyer)\n\n- **动机**：\n  - 解决方案简单\n  - 在系统上发送通知\n  - 跨平台：Mac、Linux、Windows\n- **局限性**：\n  - 不应用于大型项目\n- **替代方案**：\n  - [Slack](https:\u002F\u002Fslack.com\u002F)：面向聊天的解决方案\n  - [Datadog](https:\u002F\u002Fwww.datadoghq.com\u002F)：面向基础设施的解决方案\n\n### 血缘关系：[Mlflow Dataset](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Ftracking\u002Fdata-api.html)\n\n- **动机**：\n  - 将信息存储在 Mlflow 中\n  - 跟踪运行时数据集的元数据\n  - 保留数据集来源的 URI（例如网站）\n- **局限性**：\n  - 功能不如其他解决方案丰富\n- **替代方案**：\n  - [Databricks Lineage](https:\u002F\u002Fdocs.databricks.com\u002Fen\u002Fadmin\u002Fsystem-tables\u002Flineage.html)：仅限于 Databricks\n  - [OpenLineage 和 Marquez](https:\u002F\u002Fmarquezproject.github.io\u002F)：开源且灵活\n\n### 可解释性：[SHAP](https:\u002F\u002Fshap.readthedocs.io\u002Fen\u002Flatest\u002F)\n\n- **动机**：\n  - 最流行的工具包\n  - 支持多种模型（线性模型等）\n  - 可通过 [SHAP 模块](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fpython_api\u002Fmlflow.shap.html) 与 Mlflow 集成\n- **局限性**：\n  - 处理大规模数据集时速度极慢\n  - Mlflow SHAP 模块尚不成熟\n- **替代方案**：\n  - [LIME](https:\u002F\u002Fgithub.com\u002Fmarcotcr\u002Flime)：目前已不再维护\n\n### 基础设施：[Mlflow System Metrics](https:\u002F\u002Fmlflow.org\u002Fdocs\u002Flatest\u002Fsystem-metrics\u002Findex.html)\n\n- **动机**：\n  - 跟踪基础设施信息（RAM、CPU 等）\n  - 与 Mlflow 跟踪系统集成\n  - 提供硬件洞察\n- 
**局限性**：\n  - 功能不如其他解决方案成熟\n- **替代方案**：\n  - [Datadog](https:\u002F\u002Fwww.datadoghq.com\u002F)：流行且成熟的解决方案\n\n# 技巧\n\n本节提供一些技巧和窍门，以提升开发体验。\n\n## [AI\u002FML 实践](https:\u002F\u002Fmachinelearningmastery.com\u002F)\n\n### [数据目录](https:\u002F\u002Fdocs.kedro.org\u002Fen\u002Fstable\u002Fdata\u002Fdata_catalog.html)\n\n**应将数据指针与其访问方式解耦。**\n\n在代码中，您可以使用标签（如 `inputs`、`targets`）来引用数据集。\n\n然后，可以在配置文件中将这些标签与具体的读写器实现关联：\n\n```yaml\n  inputs:\n    KIND: ParquetReader\n    path: data\u002Finputs_train.parquet\n  targets:\n    KIND: ParquetReader\n    path: data\u002Ftargets_train.parquet\n```\n\n在此软件包中，实现位于 `src\u002F[package]\u002Fio\u002Fdatasets.py`，并通过 `KIND` 来选择。\n\n### [超参数优化](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FHyperparameter_optimization)\n\n**应使用优化搜索方法为模型选择最佳超参数。**\n\n对于最简单的项目，可以使用 `sklearn.model_selection.GridSearchCV` 来遍历整个搜索空间。\n\n此软件包在 `src\u002F[package]\u002Futils\u002Fsearchers.py` 中提供了该超参数搜索功能的简单接口。\n\n对于更复杂的项目，建议采用更复杂的策略（如 [贝叶斯优化](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBayesian_optimization)）和相应的软件包（如 [Optuna](https:\u002F\u002Foptuna.org\u002F)）。\n\n### [数据划分](https:\u002F\u002Fmachinelearningmastery.com\u002Fdifference-test-validation-datasets\u002F)\n\n**应将数据集合理划分为训练集、验证集和测试集。**\n\n- *训练集*：用于拟合模型参数\n- *验证集*：用于寻找最佳超参数\n- *测试集*：用于评估最终模型性能\n\n各集合应互斥，且测试集绝不能用作训练输入！\n\n此软件包在 `src\u002F[package]\u002Futils\u002Fsplitters.py` 中实现了一种简单的确定性划分策略。\n\n## [设计模式](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_design_pattern)\n\n### [有向无环图](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FDirected_acyclic_graph)\n\n**应使用有向无环图（DAG）连接您的 ML 流水线步骤。**\n\nDAG 可以表达步骤之间的依赖关系，同时保持每个步骤的独立性。\n\n此软件包在 `tasks\u002Fproject.just` 中提供了一个 DAG 示例。该方法基于 [Just](https:\u002F\u002Fjust.systems\u002Fman\u002Fen\u002Fintroduction.html)，并在上述自动化部分进行了说明。\n\n在生产环境中，我们建议使用可扩展的系统，如 
[Airflow](https:\u002F\u002Fairflow.apache.org\u002F)、[Dagster](https:\u002F\u002Fdagster.io\u002F)、[Prefect](https:\u002F\u002Fwww.prefect.io\u002F)、[Metaflow](https:\u002F\u002Fmetaflow.org\u002F) 或 [ZenML](https:\u002F\u002Fzenml.io\u002F)。\n\n### [程序服务](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSystemd)\n\n**应为程序的执行提供一个全局上下文。**\n\n有多种方法，如 [单例模式](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSingleton_pattern)、[全局变量](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FGlobal_variable) 或 [组件模式](https:\u002F\u002Fgithub.com\u002Fstuartsierra\u002Fcomponent)。\n\n此软件包受到 [Clojure mount](https:\u002F\u002Fgithub.com\u002Ftolitius\u002Fmount) 的启发，在 `src\u002F[package]\u002Fio\u002Fservices.py` 中提供了实现。\n\n### [软编码](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftcoding)\n\n**应将程序实现与程序配置分离。**\n\n向用户暴露配置可以让用户在不修改代码的情况下影响程序的行为。\n\n此软件包旨在将尽可能多的参数暴露给用户，并将其存储在 `confs\u002F` 文件夹中。\n\n### [SOLID 原则](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSOLID)\n\n**你应该实现 SOLID 原则，以使你的代码尽可能灵活。**\n\n- *单一职责原则*：一个类只负责一项职责。每当需求发生变化时，只需修改一个类即可。\n- *开闭原则*：类对被他人使用持开放态度；但对被他人修改则持封闭态度。\n- *里氏替换原则*：任何子类都可以替换其父类。子类继承了父类的行为。\n- *接口隔离原则*：当类之间相互承诺时，应将这些承诺（接口）拆分为多个小的、更易理解的接口。\n- *依赖倒置原则*：当类之间以非常具体的方式交互时，它们会相互依赖，从而导致难以更改。相反，类应该通过接口或抽象基类进行交互，这样即使类本身发生变化，只要遵守接口约定，就不会影响整体。\n\n在实践中，这意味着你可以使用接口来定义软件契约，并轻松切换其实现。\n\n例如，你可以在 `src\u002F[package]\u002Fjobs\u002F*.py` 中实现多个任务，并在配置中灵活切换它们。\n\n要了解更多关于本包所选机制的信息，可以查阅 [Pydantic 标记联合体](https:\u002F\u002Fdocs.pydantic.dev\u002Fdev-v2\u002Fusage\u002Ftypes\u002Funions\u002F#discriminated-unions-aka-tagged-unions) 的文档。\n\n### [IO 分离](https:\u002F\u002Fen.wikibooks.org\u002Fwiki\u002FHaskell\u002FUnderstanding_monads\u002FIO)\n\n**你应该将与外部世界交互的代码与其他部分分离。**\n\n外部环境往往杂乱无章且充满风险：文件缺失、权限问题、磁盘空间不足等。\n\n为了隔离这些风险，你可以将所有相关代码放入一个 `io` 包中，并使用接口来管理。\n\n## [Python 功能](https:\u002F\u002Frealpython.com\u002F)\n\n### [上下文管理器](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fcontextlib.html)\n\n**你应该使用 Python 上下文管理器来控制和增强代码的执行流程。**\n\nPython 
提供了上下文管理器，可用于扩展代码块的功能。例如：\n\n```python\n# 在 src\u002F[package]\u002Fscripts.py 中\nwith job as runner:  # 上下文\n    runner.run()  # 在上下文中执行\n```\n\n这种模式与功能强大的编程模式 [Monad](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FMonad_(functional_programming)) 具有相似的优势。\n\n该包使用 `src\u002F[package]\u002Fjobs\u002F*.py` 来处理异常和提供服务。\n\n### [Python 包](https:\u002F\u002Fpackaging.python.org\u002Fen\u002Flatest\u002Ftutorials\u002Fpackaging-projects\u002F)\n\n**你应该创建 Python 包，以便为他人提供库和应用程序。**\n\n在你的 AI\u002FML 项目中使用 Python 包具有以下优势：\n\n- 构建可上传到 PyPI 的代码归档（即 wheel 文件）。\n- 将 Python 包作为库安装（例如，像 pandas 一样）。\n- 暴露脚本入口点，以运行 CLI 或 GUI。\n\n使用 uv 构建 Python 包时，只需在终端中输入以下命令：\n\n```bash\n# 对于所有 uv 项目\nuv build\n# 仅针对该项目\njust package\n```\n\n## [软件工程](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_engineering)\n\n### [代码类型注解](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Ftyping.html)\n\n**你应该为你的 Python 代码添加类型注解，使其更加健壮并明确地向用户传达意图。**\n\nPython 提供了 [typing 模块](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Ftyping.html) 用于添加类型提示，并使用 [mypy](https:\u002F\u002Fmypy-lang.org\u002F) 来检查这些类型提示。\n\n```python\n# 在 src\u002F[package]\u002Fcore\u002Fmodels.py 中\n@abc.abstractmethod\ndef fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> \"Model\":\n    \"\"\"根据给定的输入和目标拟合模型。\"\"\"\n\n@abc.abstractmethod\ndef predict(self, inputs: schemas.Inputs) -> schemas.Outputs:\n    \"\"\"根据给定的输入生成输出。\"\"\"\n```\n\n这段代码清晰地说明了方法的输入和输出，既便于开发者理解，也便于类型检查工具验证。\n\n该包旨在为所有函数和类添加类型注解，以提升开发体验并在运行前发现错误。\n\n### [配置类型注解](https:\u002F\u002Fdocs.pydantic.dev\u002Flatest\u002F)\n\n**你应该为你的配置添加类型注解，以避免程序运行时出现异常。**\n\nPydantic 允许定义类，在程序启动时验证配置的有效性。\n\n```python\n# 在 src\u002F[package]\u002Futils\u002Fsplitters.py 中\nclass TrainTestSplitter(Splitter):\n    shuffle: bool = False  # 必填项（时间敏感）\n    test_size: int | float = 24 * 30 * 2  # 2 个月\n    random_state: int = 42\n```\n\n这段代码明确了预期的配置值，有助于避免本可以避免的错误。\n\n该包结合 OmegaConf 和 Pydantic，以尽早解析并验证 YAML 配置文件。\n\n### 
[数据框类型注解](https:\u002F\u002Fpandera.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n**你应该为你的数据框添加类型注解，以明确其字段并进行验证。**\n\nPandera 支持 Pandas 及其他库（如 PySpark）的数据框类型注解：\n\n```python\n# 在 src\u002F[package]\u002Fcore\u002Fschemas.py 中\nclass InputsSchema(Schema):\n    instant: papd.Index[papd.UInt32] = pa.Field(ge=0, check_name=True)\n    dteday: papd.Series[papd.DateTime] = pa.Field()\n    season: papd.Series[papd.UInt8] = pa.Field(isin=[1, 2, 3, 4])\n    yr: papd.Series[papd.UInt8] = pa.Field(ge=0, le=1)\n    mnth: papd.Series[papd.UInt8] = pa.Field(ge=1, le=12)\n    hr: papd.Series[papd.UInt8] = pa.Field(ge=0, le=23)\n    holiday: papd.Series[papd.Bool] = pa.Field()\n    weekday: papd.Series[papd.UInt8] = pa.Field(ge=0, le=6)\n    workingday: papd.Series[papd.Bool] = pa.Field()\n    weathersit: papd.Series[papd.UInt8] = pa.Field(ge=1, le=4)\n    temp: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    atemp: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    hum: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    windspeed: papd.Series[papd.Float16] = pa.Field(ge=0, le=1)\n    casual: papd.Series[papd.UInt32] = pa.Field(ge=0)\n    registered: papd.Series[papd.UInt32] = pa.Field(ge=0)\n```\n\n这段代码定义了数据框的字段及其约束条件。\n\n该包鼓励为 `src\u002F[package]\u002Fcore\u002Fschemas.py` 中使用的每个数据框添加类型注解。\n\n### [面向对象编程](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FObject-oriented_programming)\n\n**你应该使用面向对象编程，以充分利用 [多态性](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPolymorphism_(computer_science))。**\n\n结合 SOLID 原则，多态性使得代码组件的替换变得非常容易。\n\n```python\nclass Reader(abc.ABC, pdt.BaseModel):\n\n    @abc.abstractmethod\n    def read(self) -> pd.DataFrame:\n        \"\"\"从数据集中读取数据框。\"\"\"\n```\n\n这段代码使用 [abc 模块](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fabc.html) 定义了一个包含读写方法的数据集接口。\n\n该包尽可能多地定义类接口，以为你的 AI\u002FML 项目提供直观且可替换的组件。\n\n### [语义版本控制](https:\u002F\u002Fsemver.org\u002F)\n\n**你应该使用语义版本控制来传达你发布的版本之间的兼容性级别。**\n\n语义版本控制（SemVer）提供了一个简单的模式来传达代码变更。对于包 X.Y.Z：\n\n- 
*主版本*（X）：包含破坏性变更的主版本发布（即需要用户采取相应行动）\n- *次版本*（Y）：包含新功能的次版本发布（即提供了新的能力）\n- *修订版本*（Z）：用于修复 bug 的修订版本发布（即修正了错误行为）\n\nUv 和这个包都采用了语义版本控制，以便开发者能够控制新版本的采用速度。\n\n## [测试技巧](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSoftware_testing)\n\n### [并行测试](https:\u002F\u002Fpytest-xdist.readthedocs.io\u002Fen\u002Fstable\u002F)\n\n**你可以并行运行测试，以加快对代码库的验证速度。**\n\n为此，可以使用 [pytest-xdist 插件](https:\u002F\u002Fpytest-xdist.readthedocs.io\u002Fen\u002Fstable\u002F) 来扩展 Pytest。\n\n该包默认在其自动化任务中启用了 Pytest。\n\n### [测试夹具](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002Fexplanation\u002Ffixtures.html)\n\n**你应该使用 [夹具](https:\u002F\u002Fdocs.pytest.org\u002Fen\u002Flatest\u002Fexplanation\u002Ffixtures.html) 为你的测试定义可重用的对象和操作。**\n\n夹具可以为你的测试用例准备对象，例如数据框、模型、文件等。\n\n该包在 `tests\u002Fconftest.py` 中定义了夹具，以提升你的测试体验。\n\n## [VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F)\n\n### [代码工作区](https:\u002F\u002Fcode.visualstudio.com\u002Fdocs\u002Feditor\u002Fworkspaces)\n\n**你可以使用 VS Code 工作区来为你的项目定义配置。**\n\n[代码工作区](https:\u002F\u002Fcode.visualstudio.com\u002Fdocs\u002Feditor\u002Fworkspaces) 可以启用某些功能（如格式化）并设置默认解释器。\n\n```json\n{\n \"settings\": {\n  \"editor.formatOnSave\": true,\n  \"python.defaultInterpreterPath\": \".venv\u002Fbin\u002Fpython\",\n    ...\n },\n}\n```\n\n该包定义了一个工作区文件，你可以从 `[package].code-workspace` 加载它。\n\n### [GitHub Copilot](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Fcopilot)\n\n**你可以使用 GitHub Copilot 将你的编码效率提高 30%。**\n\n[GitHub Copilot](https:\u002F\u002Fgithub.com\u002Ffeatures\u002Fcopilot) 凭借其智能补全功能，极大地提升了开发效率。\n\n你只需一次编码实践，就能很快熟悉它的用法。\n\n### [VSCode VIM](https:\u002F\u002Fmarketplace.visualstudio.com\u002Fitems?itemName=vscodevim.vim)\n\n**你可以使用 VIM 键绑定更高效地导航和修改代码。**\n\n学习 VIM 是投身 IT 行业的一项绝佳投资。它能使你的工作效率提升 30%。\n\n与 GitHub Copilot 相比，掌握 VIM 需要更多时间。不过，你可以在一个月内看到回报。\n\n# 资源\n\n本节提供了构建 Python 以及 AI\u002FML\u002FMLOps 相关软件包的资源。\n\n## Python\n\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fkrzjoa\u002Fawesome-python-data-science#readme>\n- 
\u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-ml-python>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-python>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fml-tooling\u002Fbest-of-python-dev>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fvinta\u002Fawesome-python>\n\n## AI\u002FML\u002FMLOps\n\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fjosephmisiti\u002Fawesome-machine-learning>\n- \u003Chttps:\u002F\u002Fgithub.com\u002Fvisenger\u002Fawesome-mlops>","# MLOps Python Package 快速上手指南\n\n本指南旨在帮助开发者快速搭建基于最佳实践的 MLOps Python 项目环境，并运行核心任务。\n\n## 环境准备\n\n在开始之前，请确保您的系统满足以下要求：\n\n*   **操作系统**: Linux, macOS 或 Windows (WSL2 推荐)\n*   **Python**: 版本需 >= 3.13 (以利用最新性能特性)\n*   **uv**: 版本需 >= 0.5.5 (用于极速管理虚拟环境和依赖)\n    *   *安装 uv*: `curl -LsSf https:\u002F\u002Fastral.sh\u002Fuv\u002Finstall.sh | sh`\n    *   *国内加速*: 如遇网络问题，可配置 `UV_INDEX_URL` 环境变量指向国内镜像源（如清华源）：\n        ```bash\n        export UV_INDEX_URL=https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n        ```\n\n## 安装步骤\n\n### 1. 克隆项目\n将代码仓库克隆到本地：\n\n```bash\n# 推荐使用 SSH\ngit clone git@github.com:fmind\u002Fmlops-python-package.git\n# 或使用 HTTPS\ngit clone https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package.git\n```\n\n### 2. 初始化环境与同步依赖\n进入项目目录并使用 `uv` 同步环境。该命令会自动创建虚拟环境并安装所有必要的依赖包。\n\n```bash\ncd mlops-python-package\u002F\nuv sync\n```\n\n### 3. (可选) 安装 Git Hooks\n为了在提交代码时自动进行格式检查和质量验证，建议安装预提交钩子：\n\n```bash\nuv run just install-hooks\n```\n\n## 基本使用\n\n本项目通过 YAML 配置文件驱动不同的 MLOps 任务（如训练、调优、推理等）。\n\n### 1. 查看配置结构\n您可以查看 `confs\u002F` 目录下的配置文件，例如 `confs\u002Ftraining.yaml`：\n\n```yaml\n# confs\u002Ftraining.yaml\njob:\n  KIND: TrainingJob\n  inputs:\n    KIND: ParquetReader\n    path: data\u002Finputs_train.parquet\n  targets:\n    KIND: ParquetReader\n    path: data\u002Ftargets_train.parquet\n```\n\n### 2. 
运行任务\n使用 `uv run` 执行具体的 MLOps 任务。请将 `[package]` 替换为实际的项目包名（通常在 `pyproject.toml` 中定义）。\n\n**开发模式运行：**\n```bash\n# 运行模型调优\nuv run [package] confs\u002Ftuning.yaml\n\n# 运行模型训练\nuv run [package] confs\u002Ftraining.yaml\n\n# 运行模型推理\nuv run [package] confs\u002Finference.yaml\n\n# 运行模型评估\nuv run [package] confs\u002Fevaluations.yaml\n```\n\n**命令行传参技巧：**\n*   使用 `--extras` 传递运行时动态参数。\n*   支持同时传入多个配置文件，后者会覆盖前者配置（从左到右合并）。\n\n### 3. 常用自动化命令\n项目内置了 `just` 工具来简化常见操作：\n\n```bash\n# 运行完整的项目工作流 (DAG)\njust project\n\n# 构建 Python 分发包\njust package\n\n# 启动本地 MLflow 服务\njust mlflow-serve\n\n# 查看所有可用命令\njust\n```\n\n### 4. 作为库调用\n您也可以将此包作为库集成到其他 AI 项目中：\n\n```python\nfrom [package] import jobs\n\n# 实例化任务\njob = jobs.TrainingJob(...)\n\n# 执行任务\nwith job as runner:\n    runner.run()\n```","某金融科技公司的数据科学团队正在构建一个实时反欺诈模型，需要频繁迭代算法并满足严格的合规审计要求。\n\n### 没有 mlops-python-package 时\n- **环境混乱难复现**：每位成员自行配置依赖和目录结构，导致“在我机器上能跑”的代码无法在测试或生产环境中运行。\n- **质量管控靠人工**：缺乏统一的代码格式化、类型检查和安全性扫描流程，低级错误常流入生产环节，引发线上故障。\n- **模型追踪缺失**：实验参数、数据集版本和模型指标散落在本地笔记或临时文件中，无法满足审计对模型全生命周期可追溯的要求。\n- **部署流程繁琐**：从代码提交到生成 Docker 镜像缺乏自动化流水线，每次发布需手动打包，耗时且容易出错。\n\n### 使用 mlops-python-package 后\n- **标准化项目骨架**：直接套用预置的最佳实践模板，统一了配置管理（OmegaConf）、数据结构（Pandera）和日志规范，确保任何环境下一键复现。\n- **自动化质量门禁**：集成 Ruff、Mypy 和 Bandit 等工具至 Git 钩子与 CI 流程，自动拦截格式错误、类型不匹配及安全漏洞，代码质量显著提升。\n- **全链路模型治理**：内置 MLflow 支持，自动记录实验轨迹、注册模型版本并关联数据血缘，轻松生成符合合规要求的审计报告。\n- **一键持续交付**：基于 GitHub Actions 构建标准化流水线，实现从代码提交到 Wheel 包发布及 Docker 镜像构建的全自动化，发布效率提升数倍。\n\nmlops-python-package 通过提供一套工业级的标准模板与自动化工具链，将碎片化的 MLOps 实践转化为高效、可靠且可审计的工程体系。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ffmind_mlops-python-package_3267c32a.png","fmind","Médéric Hurier (Fmind)","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Ffmind_d51223e1.png","Freelancer • AI\u002FML Architect & Engineer • AI Agents & MLOps • GCP Professional Cloud Architect • PhD in AI & Computer 
Security","Fmind.dev","Luxembourg",null,"fmind_dev","https:\u002F\u002Ffmind.dev","https:\u002F\u002Fgithub.com\u002Ffmind",[86,90,94,98],{"name":87,"color":88,"percentage":89},"Jupyter Notebook","#DA5B0B",94.6,{"name":91,"color":92,"percentage":93},"Python","#3572A5",5.2,{"name":95,"color":96,"percentage":97},"Just","#384d54",0.3,{"name":99,"color":96,"percentage":100},"Dockerfile",0,1402,199,"2026-04-03T05:43:04","MIT","Linux, macOS, Windows","未说明",{"notes":108,"python":109,"dependencies":110},"该项目是一个通用的 MLOps Python 包模板，而非特定的深度学习模型。它使用 'uv' 作为包管理和虚拟环境工具（要求版本>=0.5.5）。配置采用 YAML 格式并通过 OmegaConf 解析。支持通过 Docker 容器化部署。由于是通用框架，具体的 GPU、内存和额外库需求取决于用户基于此模板开发的具体机器学习任务。",">=3.13",[111,112,113,114,115,116,117,118,119,120],"uv>=0.5.5","loguru","omegaconf","cloudpathlib","pydantic","pandas","pandera","scikit-learn","mlflow","shap",[13,54,51,15],[123,124,125,126,127,128,119,117,115,129,130,131],"mlops","python","automation","data-pipelines","data-science","machine-learning","data-engineering","machine-learning-operations","python-template","2026-03-27T02:49:30.150509","2026-04-06T05:19:33.722582",[135,140,145,149,154,159,164,169],{"id":136,"question_zh":137,"answer_zh":138,"source_url":139},12675,"是否推荐使用 pydantic-settings 替代 omegaconf + pydantic？","pydantic-settings 是一个很棒的工具，但在当前项目中直接替换需要重构整个配置系统，工作量较大。此外，目前尚不确定它是否能像现有方案那样合并 YAML 文件。如果确实需要加载多个 YAML 文件，可以通过变通方法在实例化前修改配置：\n```python\nSettings.model_config[\"yaml_file\"] = [\n    \".\u002Fexample4.yaml\",\n    \".\u002Fexample5.yaml\",\n    \".\u002Fexample2.yaml\",\n]\nsettings = Settings()\n```\n或者通过子类化并重写 model_config 来实现。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F14",{"id":141,"question_zh":142,"answer_zh":143,"source_url":144},12676,"运行训练任务时报错 'NotImplementedError: No usable implementation found!' 
是怎么回事？","该错误通常由 `plyer` 包在某些操作系统（如 macOS 或无头 Linux 环境）上找不到可用的通知实现引起。此问题已在后续版本（v3.0.0）中通过移除或修复相关通知逻辑解决。建议更新项目到最新版本，或参考 PR #30 的修改内容手动调整 `src\u002Fbikes\u002Fio\u002Fservices.py` 中的通知调用逻辑。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F16",{"id":146,"question_zh":147,"answer_zh":148,"source_url":139},12677,"如何在实例化 Pydantic Settings 时动态覆盖 yaml_file 配置？","目前官方不支持在实例化时直接覆盖 `yaml_file`（不像 env_file 那样方便）。可行的变通方案有两种：\n1. 在创建实例前直接修改类配置（涉及突变，不推荐但有效）：\n```python\nSettings.model_config[\"yaml_file\"] = [\".\u002Fconfig1.yaml\", \".\u002Fconfig2.yaml\"]\nsettings = Settings()\n```\n2. 创建一个子类并重写 `model_config`：\n```python\nclass MySettings(Settings):\n    model_config = {\"yaml_file\": [\".\u002Fcustom.yaml\"]}\nsettings = MySettings()\n```",{"id":150,"question_zh":151,"answer_zh":152,"source_url":153},12678,"项目是否支持自动设置 GitHub 规则集（Rulesets）？","是的，项目已支持自动设置 GitHub Rulesets。该功能已通过 Issue #46 提出并由 PR #47 实现合并。用户只需按照最新文档配置即可启用自动规则应用，无需手动在 GitHub 界面操作。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F46",{"id":155,"question_zh":156,"answer_zh":157,"source_url":158},12679,"如何构建确定性的 Python wheels 包？","项目已支持使用约束文件（constraints）构建确定性 wheels。该功能通过 Issue #43 引入，并由 PR #36 实现。构建时请确保使用项目提供的约束文件（如 `constraints.txt`），并在 poetry 或 pip 构建命令中引用它，以保证依赖版本一致性和构建可复现性。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F43",{"id":160,"question_zh":161,"answer_zh":162,"source_url":163},12680,"项目是否已从 PyInvoke 迁移到其他任务运行器？","是的，项目已从 PyInvoke 迁移到 Just 作为新的任务运行器。该变更通过 Issue #36 提出，并由 PR #42 完成。用户现在应使用 `just \u003Ctask-name>` 命令代替原来的 `invoke \u003Ctask-name>`。请查看项目根目录的 `justfile` 了解可用任务列表。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F36",{"id":165,"question_zh":166,"answer_zh":167,"source_url":168},12681,"项目是否已切换到 PyArrow 数据类型？","是的，项目已完成从原生 pandas 类型到 PyArrow 数据类型的切换。该改进通过 Issue #39 提出，并由 PR #44 实现。此举提升了数据处理性能和内存效率。用户无需额外配置，新版代码默认使用 PyArrow 
后端。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F39",{"id":170,"question_zh":171,"answer_zh":172,"source_url":173},12682,"如何获取最新的 GitHub Actions 版本配置？","项目已自动升级所有 GitHub Actions 到最新稳定版本。该更新通过 Issue #35 提出，并由 PR #41 合并。用户只需拉取最新代码即可享受更安全、更高效的 CI\u002FCD 流程，无需手动修改 `.github\u002Fworkflows` 中的 action 版本号。","https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F35",[175,180,185,190,195,200,205,209,213,217,221,225,229,233,237],{"id":176,"version":177,"summary_zh":178,"released_at":179},63070,"v4.1.0","## v4.1.0 (2025-03-05)\n\n[](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fblob\u002Fmain\u002FCHANGELOG.md#v410-2025-03-05)\n\n### 功能\n\n[](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fblob\u002Fmain\u002FCHANGELOG.md#feat)\n\n- **gemini**: 增加对 gemini 代码辅助的支持 (#51)\n- **dependabot**: 添加 dependabot 配置文件 (#50)\n- **github**: 添加默认规则集并进行安装 (#47)","2025-03-05T20:17:43",{"id":181,"version":182,"summary_zh":183,"released_at":184},63071,"v4.0.0","## v4.0.0 (2025-03-04)\n\n### 功能\n\n- **tasks**: 从 pyinvoke 切换到 just (#42)\n- **workflows**: 更新 GitHub Actions 版本 (#41)\n- **versions**: 提升 Python 和包的版本 (#40)\n- **mindmap**: 添加该包的思维导图 (#32)\n\n### 修复\n\n- **version**: 准备好进行版本升级\n- **datasets**: 修复 dtype 后端问题 (#44)\n\n### 重构\n\n- **cruft**: 更新至新模板版本","2025-03-04T21:24:27",{"id":186,"version":187,"summary_zh":188,"released_at":189},63072,"v3.0.0","# 功能特性\n\n- **[[FEAT] 用 UV 替代 Poetry #17](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F17)**\n- [[FEAT] 升级依赖包以支持 v3 版本 #18](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F18)**\n- [[FEAT] 更新 GitHub Actions 工作流 #21](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F21)**\n- [[FEAT] 改进 v3 版本的抽象层 #23](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F23)**\n- [[FEAT] 将检查任务拆分到 check GitHub Action 
中 #24](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F24)**\n\n# 修复\n\n- [[FIX] 无法运行训练任务 #16](https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fissues\u002F16)","2024-12-14T15:08:33",{"id":191,"version":192,"summary_zh":193,"released_at":194},63073,"v2.0.0","### 功能\n\n- **cruft**: 采用 cruft 并将其与 cookiecutter-mlops-package 关联起来","2024-07-28T18:31:48",{"id":196,"version":197,"summary_zh":198,"released_at":199},63074,"v1.1.1","## 变更内容\n\n* 可观测性，由 @fmind 在 https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fpull\u002F15 中实现\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Ffmind\u002Fmlops-python-package\u002Fcompare\u002Fv1.0.0...v1.1.0","2024-07-23T19:19:45",{"id":201,"version":202,"summary_zh":203,"released_at":204},63075,"v1.0.0","首次重大发布！","2024-03-19T20:56:57",{"id":206,"version":207,"summary_zh":81,"released_at":208},63076,"v0.9.0","2024-03-19T20:40:08",{"id":210,"version":211,"summary_zh":81,"released_at":212},63077,"v0.8.0","2024-03-18T20:53:12",{"id":214,"version":215,"summary_zh":81,"released_at":216},63078,"v0.7.0","2024-03-16T18:23:40",{"id":218,"version":219,"summary_zh":81,"released_at":220},63079,"v0.6.0","2024-03-16T18:22:48",{"id":222,"version":223,"summary_zh":81,"released_at":224},63080,"v0.5.0","2024-03-16T18:22:27",{"id":226,"version":227,"summary_zh":81,"released_at":228},63081,"v0.4.0","2024-03-16T18:21:28",{"id":230,"version":231,"summary_zh":81,"released_at":232},63082,"v0.3.0","2024-03-16T18:21:08",{"id":234,"version":235,"summary_zh":81,"released_at":236},63083,"v0.2.0","2024-03-16T18:20:51",{"id":238,"version":239,"summary_zh":81,"released_at":240},63084,"v0.1.0","2024-03-16T18:20:33"]