[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-alirezadir--Production-Level-Deep-Learning":3,"tool-alirezadir--Production-Level-Deep-Learning":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",156033,2,"2026-04-14T23:32:00",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":77,"owner_email":78,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":78,"stars":81,"forks":82,"last_commit_at":83,"license":78,"difficulty_score":84,"env_os":85,"env_gpu":86,"env_ram":85,"env_deps":87,"category_tags":101,"github_topics":102,"view_count":32,"oss_zip_url":78,"oss_zip_packed_at":78,"status":17,"created_at":115,"updated_at":116,"faqs":117,"releases":123},7613,"alirezadir\u002FProduction-Level-Deep-Learning","Production-Level-Deep-Learning","A guideline for building practical production-level deep learning systems to be deployed in real world applications. 
","Production-Level-Deep-Learning 是一份专注于构建可落地深度学习系统的工程指南。它旨在解决人工智能项目中普遍存在的“高失败率”痛点，帮助团队跨越从模型训练到真实世界部署的巨大鸿沟。许多项目往往因技术范围界定不清、缺乏明确的评估指标或难以进入生产环境而夭折，这份指南正是为了填补这一空白而生。\n\n该资源非常适合希望将算法转化为实际产品的开发者、机器学习工程师以及技术团队管理者。不同于仅关注模型准确率的学术教程，Production-Level-Deep-Learning 提供了全栈视角的解决方案，涵盖了数据管理（如标注策略与合成数据）、项目生命周期规划、优先级评估模型以及基础设施工具链选型等关键环节。其独特亮点在于整合了来自伯克利全栈深度学习训练营及业界专家的最佳实践，通过清晰的生命周期图示和心智模型，指导用户如何平衡“高影响力”与“低成本”，从而系统化地设计并交付稳健的深度学习应用。无论是准备面试的从业者，还是正在规划 AI 落地的团队，都能从中获得极具价值的实操指引。","# :bulb: A Guide to Production Level Deep Learning :clapper: :scroll:  :ferry:\n🇨🇳 Translation in [Chinese](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning\u002Fblob\u002Fmaster\u002Fother-languages\u002FChinese(Simplified).md)\n\n### :label: NEW: [Machine Learning Interviews](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FMachine-Learning-Interviews)\n\n:label: Note: All feedback and contribution are very welcome :blush:\n\nDeploying deep learning models in production can be challenging, as it is far beyond training models with good performance. Several distinct components need to be designed and developed in order to deploy a production level deep learning system (seen below):\n\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_c951f889b933.png\" title=\"\" width=\"95%\" height=\"95%\">\n\u003C\u002Fp>\n\nThis repo aims to be an engineering guideline for building production-level deep learning systems which will be deployed in real world applications. 
\n\nThe material presented here is borrowed from [Full Stack Deep Learning Bootcamp](https:\u002F\u002Ffullstackdeeplearning.com) (by [Pieter Abbeel](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~pabbeel\u002F) at UC Berkeley, [Josh Tobin](http:\u002F\u002Fjosh-tobin.com\u002F) at OpenAI, and [Sergey Karayev](https:\u002F\u002Fsergeykarayev.com\u002F) at Turnitin), [TFX workshop](https:\u002F\u002Fconferences.oreilly.com\u002Ftensorflow\u002Ftf-ca\u002Fpublic\u002Fschedule\u002Fdetail\u002F79327) by [Robert Crowe](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Frobert-crowe\u002F), and [Pipeline.ai](https:\u002F\u002Fpipeline.ai\u002F)'s [Advanced KubeFlow Meetup](https:\u002F\u002Fwww.meetup.com\u002FAdvanced-KubeFlow\u002F) by [Chris Fregly](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fcfregly\u002F).\n\n# Machine Learning Projects\nFun :flushed: fact: **85% of AI projects fail**. \u003Csup>[1](#fsdl)\u003C\u002Fsup> Potential reasons include: \n- Technically infeasible  or poorly scoped \n- Never make the leap to production \n- Unclear success criteria (metrics)\n- Poor team management \n  \n## 1. ML Projects lifecycle\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_5e0f2d9d6229.png\" title=\"\" width=\"95%\" height=\"95%\">\u003C\u002Fp>\n\n- Importance of understanding state of the art in your domain:\n  - Helps to understand what is possible \n  - Helps to know what to try next \n## 2. 
Mental Model for ML project \n  The two important factors to consider when defining and prioritizing ML projects:\n  - High Impact:\n    - Complex parts of your pipeline \n    - Where \"cheap prediction\" is valuable\n    - Where automating complicated manual process is valuable \n  - Low Cost:\n    - Cost is driven by: \n      - Data availability \n      - Performance requirements: costs tend to scale super-linearly in the accuracy requirement \n      - Problem difficulty: \n        - Some of the hard problems include: unsupervised learning, reinforcement learning, and certain categories of supervised learning \n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_5dcda0a20c8c.png\" title=\"\" width=\"90%\" height=\"90%\">\n\u003C\u002Fp>\n  \n# Full stack pipeline \n\nThe following figure represents a high level overview of different components in a production level deep learning system:\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_f514cc37052a.png\" title=\"\" width=\"100%\" height=\"100%\">\n\u003C\u002Fp>\nIn the following, we will go through each module and recommend toolsets and frameworks as well as best practices from practitioners that fit each component. \n\n## 1. Data Management \n### 1.1 Data Sources \n* Supervised deep learning requires a lot of labeled data\n* Labeling own data is costly! \n* Here are some resources for data: \n  * Open source data (good to start with, but not an advantage) \n  * Data augmentation (a MUST for computer vision, an option for NLP)\n  * Synthetic data (almost always worth starting with, esp. 
in NLP)\n### 1.2  Data Labeling \n* Requires: separate software stack (labeling platforms), temporary labor, and QC\n* Sources of labor for labeling: \n  * Crowdsourcing (Mechanical Turk): cheap and scalable, less reliable, needs QC\n  * Hiring own annotators: less QC needed, expensive, slow to scale \n  * Data labeling service companies:\n    * [FigureEight](https:\u002F\u002Fwww.figure-eight.com\u002F)  \n* Labeling platforms: \n  * [Diffgram](https:\u002F\u002Fdiffgram.com\u002F): Training Data Software (Computer Vision)\n  * [Prodigy](https:\u002F\u002Fprodi.gy\u002F): An annotation tool powered\nby active learning (by developers of Spacy), text and image \n  * [HIVE](https:\u002F\u002Fthehive.ai\u002F): AI as a Service platform for computer vision  \n  * [Supervisely](https:\u002F\u002Fsupervise.ly\u002F): entire computer vision platform \n  * [Labelbox](https:\u002F\u002Flabelbox.com\u002F): computer vision  \n  * [Scale](https:\u002F\u002Fscale.com\u002F) AI data platform (computer vision & NLP)\n\n    \n### 1.3. Data Storage \n* Data storage options: \n  * **Object store**: Store binary data (images, sound files, compressed texts) \n    * [Amazon S3](https:\u002F\u002Faws.amazon.com\u002Fs3\u002F) \n    * [Ceph](https:\u002F\u002Fceph.io\u002F) Object Store\n  * **Database**: Store metadata (file paths, labels, user activity, etc). \n    * [Postgres](https:\u002F\u002Fwww.postgresql.org\u002F) is the right choice for most of applications, with the best-in-class SQL and great support for unstructured JSON. \n  * **Data Lake**: to aggregate features which are not obtainable from database (e.g. logs)\n    * [Amazon Redshift](https:\u002F\u002Faws.amazon.com\u002Fredshift\u002F)\n  * **Feature Store**: store, access, and share machine learning features \n (Feature extraction could be computationally expensive and nearly impossible to scale, hence re-using features by different models and teams is a key to high performance ML teams). 
\n    * [FEAST](https:\u002F\u002Fgithub.com\u002Fgojek\u002Ffeast) (Google cloud, Open Source)\n    * [Michelangelo Palette](https:\u002F\u002Feng.uber.com\u002Fmichelangelo\u002F) (Uber)\n* Suggestion: At training time, copy data into a local or networked **filesystem** (NFS). \u003Csup>[1](#fsdl)\u003C\u002Fsup> \n\n### 1.4. Data Versioning \n* It's a \"MUST\" for deployed ML models:  \n  **Deployed ML models are part code, part data**. \u003Csup>[1](#fsdl)\u003C\u002Fsup>  No data versioning means no model versioning. \n* Data versioning platforms: \n  * [DVC](https:\u002F\u002Fdvc.org\u002F): Open source version control system for ML projects \n  * [Pachyderm](https:\u002F\u002Fwww.pachyderm.com\u002F): version control for data \n  * [Dolt](https:\u002F\u002Fgithub.com\u002Fdolthub\u002Fdolt): a SQL database with Git-like version control for data and schema\n    \n### 1.5. Data Processing \n* Training data for production models may come from different sources, including *Stored data in db and object stores*, *log processing*, and *outputs of other classifiers*.\n* There are dependencies between tasks, each needs to be kicked off after its dependencies are finished. For example, training on new log data, requires a preprocessing step before training. \n* Makefiles are not scalable. 
\"Workflow manager\"s become pretty essential in this regard.\n* **Workflow orchestration:**\n  * [Luigi](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fluigi) by Spotify\n  * [Airflow](https:\u002F\u002Fairflow.apache.org\u002F) by Airbnb: Dynamic, extensible, elegant, and scalable (the most widely used)\n      * DAG workflow \n      * Robust conditional execution: retry in case of failure  \n      * Pusher supports docker images with tensorflow serving \n      * Whole workflow in a single .py file \n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_102cac11c7d4.png\" title=\"\" width=\"65%\" height=\"65%\">\n   \u003C\u002Fp>\n   \n\n## 2. Development, Training, and Evaluation \n### 2.1. Software engineering\n* Winner language: Python\n* Editors:\n   * Vim\n   * Emacs  \n   * [VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F) (Recommended by the author): Built-in git staging and diff, Lint code, open projects remotely through ssh \n   * Notebooks: Great as starting point of the projects, hard to scale (fun fact: Netflix’s Notebook-Driven Architecture is an exception, which is entirely based on [nteract](https:\u002F\u002Fnteract.io\u002F) suites). \n      * [nteract](https:\u002F\u002Fnteract.io\u002F): a next-gen React-based UI for Jupyter notebooks\n      * [Papermill](https:\u002F\u002Fgithub.com\u002Fnteract\u002Fpapermill): is an [nteract](https:\u002F\u002Fnteract.io\u002F) library built for *parameterizing*, *executing*, and *analyzing* Jupyter Notebooks.\n      * [Commuter](https:\u002F\u002Fgithub.com\u002Fnteract\u002Fcommuter): another [nteract](https:\u002F\u002Fnteract.io\u002F) project which provides a read-only display of notebooks (e.g. 
from S3 buckets).\n   * [Streamlit](https:\u002F\u002Fstreamlit.io\u002F): interactive data science tool with applets\n * Compute recommendations \u003Csup>[1](#fsdl)\u003C\u002Fsup>:\n   * For *individuals* or *startups*: \n     * Development: a 4x Turing-architecture PC\n     * Training\u002FEvaluation: Use the same 4x GPU PC. When running many experiments, either buy shared servers or use cloud instances.\n   * For *large companies:* \n     * Development: Buy a 4x Turing-architecture PC per ML scientist or let them use V100 instances\n     * Training\u002FEvaluation: Use cloud instances with proper provisioning and handling of failures\n * Cloud Providers: \n   * GCP: option to connect GPUs to any instance + has TPUs \n   * AWS:  \n### 2.2. Resource Management \n  * Allocating free resources to programs \n  * Resource management options: \n    * Old school cluster job scheduler ( e.g. [Slurm](https:\u002F\u002Fslurm.schedmd.com\u002F) workload manager )\n    * Docker + Kubernetes\n    * Kubeflow \n    * [Polyaxon](https:\u002F\u002Fpolyaxon.com\u002F) (paid features)\n    \n### 2.3. DL Frameworks \n  * Unless having a good reason not to, use Tensorflow\u002FKeras or PyTorch. \u003Csup>[1](#fsdl)\u003C\u002Fsup> \n  * The following figure shows a comparison between different frameworks on how they stand for *\"developement\"* and *\"production\"*.  \n\n  \u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_ade5d50dba15.png\" title=\"\" width=\"95%\" height=\"95%\">\n   \u003C\u002Fp>\n\n  \n### 2.4. Experiment management\n\n* Development, training, and evaluation strategy:\n  * Always start **simple** \n    * Train a small model on a small batch. Only if it works, scale to larger data and models, and hyperparameter tuning!  
\n  * Experiment management tools: \n  * [Tensorboard](https:\u002F\u002Fwww.tensorflow.org\u002Ftensorboard)\n      * provides the visualization and tooling needed for ML experimentation  \n  * [Losswise](https:\u002F\u002Flosswise.com\u002F) (Monitoring for ML)\n  * [Comet](https:\u002F\u002Fwww.comet.ml\u002F): lets you track code, experiments, and results on ML projects\n  * [Weights & Biases](https:\u002F\u002Fwww.wandb.com\u002F): Record and visualize every detail of your research with easy collaboration \n  * [MLFlow Tracking](https:\u002F\u002Fwww.mlflow.org\u002Fdocs\u002Flatest\u002Ftracking.html#tracking): for logging parameters, code versions, metrics, and output files as well as visualization of the results.\n    * Automatic experiment tracking with one line of code in python\n    * Side by side comparison of experiments \n    * Hyper parameter tuning \n    * Supports Kubernetes based jobs \n    \n### 2.5. Hyperparameter Tuning \n  * Approaches: \n    * Grid search \n    * Random search \n    * Bayesian Optimization\n    * HyperBand and Asynchronous Successive Halving Algorithm (ASHA)\n    * Population-based Training\n\n  * Platforms: \n    * [RayTune](http:\u002F\u002Ftune.io\u002F): Ray Tune is a Python library for hyperparameter tuning at any scale (with  a focus on deep learning and deep reinforcement learning). Supports any machine learning framework, including PyTorch, XGBoost, MXNet, and Keras.\n    * [Katib](https:\u002F\u002Fgithub.com\u002Fkubeflow\u002Fkatib): Kubernete's Native System   for Hyperparameter Tuning and Neural Architecture Search, inspired by   [Google vizier](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002F research.google.com\u002Fja\u002F\u002Fpubs\u002Farchive\u002F  bcb15507f4b52991a0783013df4222240e942381.pdf) and supports multiple ML\u002FDL   frameworks (e.g. TensorFlow, MXNet, and PyTorch). 
\n    * [Hyperas](https:\u002F\u002Fmaxpumperla.com\u002Fhyperas\u002F): a simple wrapper around  hyperopt for Keras, with a simple template notation to define  hyper-parameter ranges to tune.\n    * [SIGOPT](https:\u002F\u002Fsigopt.com\u002F):  a scalable, enterprise-grade  optimization platform \n    * [Sweeps](https:\u002F\u002Fdocs.wandb.com\u002Flibrary\u002Fsweeps) from [Weights & Biases] (https:\u002F\u002Fwww.wandb.com\u002F): Parameters are not explicitly specified by a   developer. Instead they are approximated and learned by a machine   learning model.\n    * [Keras Tuner](https:\u002F\u002Fgithub.com\u002Fkeras-team\u002Fkeras-tuner): A hyperparameter tuner for Keras, specifically for tf.keras with TensorFlow 2.0.\n\n### 2.6. Distributed Training \n  * Data parallelism: Use it when iteration time is too long (both tensorflow and PyTorch support)\n    * [Ray Distributed Training](https:\u002F\u002Fray.readthedocs.io\u002Fen\u002Flatest\u002Fdistributed_training.html)\n  * Model parallelism: when model does not fit on a single GPU \n  * Other solutions: \n    * Horovod\n\n## 3. Troubleshooting [TBD]\n\n## 4. Testing and Deployment \n### 4.1. 
Testing and CI\u002FCD\nMachine Learning production software requires a more diverse set of test suites than traditional software:\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_984bbba7ff5f.png\" title=\"\" width=\"75%\" height=\"75%\">\n   \u003C\u002Fp>\n   \n* Unit and Integration Testing: \n   * Types of tests: \n     * Training system tests: testing training pipeline\n     * Validation tests: testing prediction system on validation set \n     * Functionality tests: testing prediction system on few important examples \n* Continuous Integration: Running tests after each new code change pushed to the repo \n * SaaS for continuous integration: \n    * [Argo](https:\u002F\u002Fargoproj.github.io\u002F): Open source Kubernetes native workflow engine for orchestrating parallel jobs (incudes workflows, events, CI and CD).\n    * [CircleCI](https:\u002F\u002Fcircleci.com\u002F): Language-Inclusive Support, Custom Environments, Flexible Resource Allocation, used by instacart, Lyft, and StackShare.\n    * [Travis CI](https:\u002F\u002Ftravis-ci.org\u002F)\n    * [Buildkite](https:\u002F\u002Fbuildkite.com\u002F): Fast and stable builds, Open source agent runs on almost any machine and architecture, Freedom to use your own  tools and services\n    * Jenkins: Old school build system  \n\n\n### 4.2. Web Deployment\n  * Consists of a **Prediction System** and a **Serving System**\n      * Prediction System: Process input data, make predictions \n      * Serving System (Web server): \n        * Serve prediction with scale in mind  \n        * Use REST API to serve prediction HTTP requests\n        * Calls the prediction system to respond \n  * Serving options: \n      * 1. Deploy to VMs, scale by adding instances \n      * 2. 
Deploy as containers, scale via orchestration \n          * Containers \n              * Docker \n          * Container Orchestration:\n              * Kubernetes (the most popular now)\n              * MESOS \n              * Marathon \n      * 3. Deploy code as a \"serverless function\"\n      * 4. Deploy via a **model serving** solution\n  * Model serving:\n      * Specialized web deployment for ML models\n      * Batches request for GPU inference \n      * Frameworks:\n         * Tensorflow serving \n         * MXNet Model server \n         * Clipper (Berkeley)\n         * SaaS solutions\n            * [Seldon](https:\u002F\u002Fwww.seldon.io\u002F): serve and scale models built in any framework on Kubernetes\n            * [Algorithmia](https:\u002F\u002Falgorithmia.com\u002F)\n   * Decision making: CPU or GPU? \n      * CPU inference:\n         * CPU inference is preferable if it meets the requirements.\n         * Scale by adding more servers, or going serverless. \n      * GPU inference: \n         * TF serving or Clipper \n         * Adaptive batching is useful \n  * (Bonus) Deploying Jupyter Notebooks:\n      * [Kubeflow Fairing](https:\u002F\u002Fgithub.com\u002Fkubeflow\u002Ffairing) is a hybrid deployment package that let's you deploy your *Jupyter notebook* codes! \n    \n### 4.5 Service Mesh and Traffic Routing \n* Transition from monolithic applications towards a distributed microservice architecture could be challenging. \n* A **Service mesh** (consisting of a network of microservices) reduces the complexity of such deployments, and eases the strain on development teams.\n  * [Istio](https:\u002F\u002Fistio.io\u002F): a service mesh to ease creation of  a network of deployed services with load balancing, service-to-service authentication, monitoring, with few or no code changes in service code. \n### 4.4. 
Monitoring:\n* Purpose of monitoring: \n   * Alerts for downtime, errors, and distribution shifts \n   * Catching service and data regressions \n* Cloud providers solutions are decent \n* [Kiali](https:\u002F\u002Fkiali.io\u002F):an observability console for Istio with service mesh configuration capabilities. It answers these questions: How are the microservices connected? How are they performing?\n\n#### Are we done?\n\u003Cp align=\"center\">\n   \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_a7e0a253941d.png\" title=\"\" width=\"65%\" height=\"65%\">\n\u003C\u002Fp>\n\n### 4.5. Deploying on Embedded and Mobile Devices  \n* Main challenge: memory footprint and compute constraints \n* Solutions: \n   * Quantization \n   * Reduced model size \n      * MobileNets \n   * Knowledge Distillation \n      * DistillBERT (for NLP)\n* Embedded and Mobile Frameworks: \n   * Tensorflow Lite\n   * PyTorch Mobile\n   * Core ML \n   * ML Kit \n   * FRITZ \n   * OpenVINO\n* Model Conversion:\n   * Open Neural Network Exchange (ONNX): open-source format for deep learning models \n### 4.6. 
All-in-one solutions\n   * Tensorflow Extended (TFX)\n   * Michelangelo (Uber)\n   * Google Cloud AI Platform \n   * Amazon SageMaker \n   * Neptune \n   * FLOYD \n   * Paperspace \n   * Determined AI \n   * Domino data lab \n\u003Cp align=\"center\">\n   \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_221a0c60d764.png\" title=\"\" width=\"100%\" height=\"100%\">\n\u003C\u002Fp>\n\n# Tensorflow Extended (TFX) \n[TBD]\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_2af13db65e2c.png\" title=\"\" width=\"95%\" height=\"95%\">\n\u003C\u002Fp>\n\n# Airflow and KubeFlow ML Pipelines \n[TBD]\n\u003Cp align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_7452dc68047c.png\" title=\"\" width=\"45%\" height=\"45%\">\n\u003C\u002Fp>\n\n\n## Other useful links: \n* [Lessons learned from building practical deep learning systems](https:\u002F\u002Fwww.slideshare.net\u002Fxamat\u002Flessons-learned-from-building-practical-deep-learning-systems)\n* [Machine Learning: The High Interest Credit Card of Technical Debt](https:\u002F\u002Fai.google\u002Fresearch\u002Fpubs\u002Fpub43146)\n \n## [Contributing](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning\u002Fblob\u002Fmaster\u002FCONTRIBUTING.md)\n\n## References: \n\n\u003Ca name=\"fsdl\">[1]\u003C\u002Fa>: [Full Stack Deep Learning Bootcamp](https:\u002F\u002Ffullstackdeeplearning.com\u002F), Nov 2019. \n\n\u003Ca name=\"pipe\">[2]\u003C\u002Fa>: [Advanced KubeFlow Workshop](https:\u002F\u002Fwww.meetup.com\u002FAdvanced-KubeFlow\u002F) by [Pipeline.ai](https:\u002F\u002Fpipeline.ai\u002F), 2019. 
\n\n\u003Ca name=\"pipe\">[3]\u003C\u002Fa>: [TFX: Real World Machine Learning in Production](https:\u002F\u002Fcdn.oreillystatic.com\u002Fen\u002Fassets\u002F1\u002Fevent\u002F298\u002FTFX_%20Production%20ML%20pipelines%20with%20TensorFlow%20Presentation.pdf)\n\n   \n    \n","# :bulb: 生产级深度学习指南 :clapper: :scroll:  :ferry:\n🇨🇳 中文翻译在 [这里](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning\u002Fblob\u002Fmaster\u002Fother-languages\u002FChinese(Simplified).md)\n\n### :label: 新增：[机器学习面试题](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FMachine-Learning-Interviews)\n\n:label: 注意：欢迎所有反馈和贡献 :blush:\n\n将深度学习模型部署到生产环境中可能充满挑战，因为这远远超出了训练出性能良好的模型这一阶段。为了部署一个生产级别的深度学习系统，需要设计和开发多个不同的组件（如下所示）：\n\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_c951f889b933.png\" title=\"\" width=\"95%\" height=\"95%\">\n\u003C\u002Fp>\n\n本仓库旨在为构建将在实际应用中部署的生产级深度学习系统提供工程指导。\n\n此处提供的内容借鉴自以下资源：[全栈深度学习训练营](https:\u002F\u002Ffullstackdeeplearning.com)（由加州大学伯克利分校的 [Pieter Abbeel](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~pabbeel\u002F)、OpenAI 的 [Josh Tobin](http:\u002F\u002Fjosh-tobin.com\u002F) 和 Turnitin 的 [Sergey Karayev](https:\u002F\u002Fsergeykarayev.com\u002F) 共同创办）、[Robert Crowe](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Frobert-crowe\u002F) 主持的 [TFX 工作坊](https:\u002F\u002Fconferences.oreilly.com\u002Ftensorflow\u002Ftf-ca\u002Fpublic\u002Fschedule\u002Fdetail\u002F79327)，以及 [Pipeline.ai](https:\u002F\u002Fpipeline.ai\u002F) 的 [高级 KubeFlow 聚会](https:\u002F\u002Fwww.meetup.com\u002FAdvanced-KubeFlow\u002F)（由 [Chris Fregly](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fcfregly\u002F) 组织）。\n\n# 机器学习项目\n有趣的是：**85% 的 AI 项目都会失败**。\u003Csup>[1](#fsdl)\u003C\u002Fsup> 可能的原因包括：\n- 技术上不可行或范围界定不清\n- 始终未能进入生产阶段\n- 成功标准（指标）不明确\n- 团队管理不善\n\n## 1. 
机器学习项目生命周期\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_5e0f2d9d6229.png\" title=\"\" width=\"95%\" height=\"95%\">\u003C\u002Fp>\n\n- 理解所在领域的最新进展的重要性：\n  - 有助于了解哪些是可行的\n  - 有助于知道下一步该尝试什么\n## 2. 机器学习项目的思维模型\n在定义和优先排序机器学习项目时，需要考虑两个重要因素：\n- 高影响力：\n  - 您流水线中的复杂部分\n  - 在“低成本预测”具有价值的地方\n  - 自动化复杂手动流程具有价值的地方\n- 低投入：\n  - 投入成本主要受以下因素驱动：\n    - 数据的可获得性\n    - 性能要求：成本通常会随着准确率要求的提高而呈超线性增长\n    - 问题的难度：\n      - 一些较为困难的问题包括：无监督学习、强化学习以及某些类型的监督学习\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_5dcda0a20c8c.png\" title=\"\" width=\"90%\" height=\"90%\">\n\u003C\u002Fp>\n  \n# 全栈式流水线\n\n下图展示了生产级深度学习系统中各个组件的高层次概览：\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_f514cc37052a.png\" title=\"\" width=\"100%\" height=\"100%\">\n\u003C\u002Fp>\n接下来，我们将逐一介绍每个模块，并推荐适合各组件的工具集、框架以及从业者的最佳实践。\n\n## 1. 
数据管理 \n### 1.1 数据来源 \n* 监督式深度学习需要大量标注数据\n* 自行标注数据成本高昂！\n* 以下是一些数据资源：\n  * 开源数据（适合作为起点，但并非优势）\n  * 数据增强（计算机视觉领域必不可少，自然语言处理领域则可选）\n  * 合成数据（几乎总是值得首先尝试，尤其是在自然语言处理领域）\n### 1.2 数据标注 \n* 需要独立的软件栈（标注平台）、临时劳动力和质量控制\n* 标注劳动力来源：\n  * 众包（Mechanical Turk）：廉价且可扩展，但可靠性较低，需进行质量控制\n  * 自聘标注员：所需质量控制较少，成本较高，扩展速度较慢\n  * 数据标注服务公司：\n    * [FigureEight](https:\u002F\u002Fwww.figure-eight.com\u002F)\n* 标注平台：\n  * [Diffgram](https:\u002F\u002Fdiffgram.com\u002F)：用于计算机视觉的训练数据软件\n  * [Prodigy](https:\u002F\u002Fprodi.gy\u002F)：基于主动学习的标注工具（由 Spacy 的开发者打造），适用于文本和图像\n  * [HIVE](https:\u002F\u002Fthehive.ai\u002F)：面向计算机视觉的 AI 即服务平台\n  * [Supervisely](https:\u002F\u002Fsupervise.ly\u002F)：完整的计算机视觉平台\n  * [Labelbox](https:\u002F\u002Flabelbox.com\u002F)：专注于计算机视觉\n  * [Scale](https:\u002F\u002Fscale.com\u002F)：AI 数据平台（涵盖计算机视觉和自然语言处理）\n\n    \n### 1.3 数据存储 \n* 数据存储选项：\n  * **对象存储**：用于存储二进制数据（图像、音频文件、压缩文本等）\n    * [Amazon S3](https:\u002F\u002Faws.amazon.com\u002Fs3\u002F)\n    * [Ceph](https:\u002F\u002Fceph.io\u002F) 对象存储\n  * **数据库**：用于存储元数据（文件路径、标签、用户活动等）。\n    * 对于大多数应用来说，[Postgres](https:\u002F\u002Fwww.postgresql.org\u002F) 是理想选择，它拥有业界一流的 SQL 功能，并且对非结构化 JSON 数据提供了很好的支持。\n  * **数据湖**：用于聚合无法从数据库中获取的特征（例如日志）。\n    * [Amazon Redshift](https:\u002F\u002Faws.amazon.com\u002Fredshift\u002F)\n  * **特征存储**：用于存储、访问和共享机器学习特征\n    （特征提取可能非常耗时且难以扩展，因此不同模型和团队之间共享特征是高性能机器学习团队的关键）。 \n    * [FEAST](https:\u002F\u002Fgithub.com\u002Fgojek\u002Ffeast)（Google Cloud，开源）\n    * [Michelangelo Palette](https:\u002F\u002Feng.uber.com\u002Fmichelangelo\u002F)（Uber）\n* 建议：在训练时，将数据复制到本地或网络文件系统（NFS）中。\u003Csup>[1](#fsdl)\u003C\u002Fsup> \n\n### 1.4 数据版本控制 \n* 对已部署的机器学习模型而言，这是“必须”的：\n  **已部署的机器学习模型既是代码的一部分，也是数据的一部分**。\u003Csup>[1](#fsdl)\u003C\u002Fsup> 如果没有数据版本控制，也就没有模型版本控制。\n* 数据版本控制平台：\n  * [DVC](https:\u002F\u002Fdvc.org\u002F)：面向机器学习项目的开源版本控制系统\n  * [Pachyderm](https:\u002F\u002Fwww.pachyderm.com\u002F)：用于数据的版本控制\n  * [Dolt](https:\u002F\u002Fgithub.com\u002Fdolthub\u002Fdolt)：一种带有类似 Git 版本控制功能的 SQL 
数据库，可用于数据和模式的版本管理\n\n### 1.5. 数据处理\n* 生产模型的训练数据可能来自不同来源，包括 *数据库和对象存储中的存储数据*、*日志处理* 以及 *其他分类器的输出*。\n* 任务之间存在依赖关系，每个任务都需要在其依赖项完成后才能启动。例如，在新日志数据上进行训练之前，必须先完成预处理步骤。\n* Makefile 不具备可扩展性。因此，“工作流管理器”在此变得非常重要。\n* **工作流编排：**\n  * Spotify 的 [Luigi](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fluigi)\n  * Airbnb 的 [Airflow](https:\u002F\u002Fairflow.apache.org\u002F)：动态、可扩展、优雅且广泛使用\n      * DAG 工作流\n      * 强大的条件执行功能：失败时可重试\n      * Pusher 支持带有 TensorFlow Serving 的 Docker 镜像\n      * 整个工作流可以放在一个 .py 文件中\n\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_102cac11c7d4.png\" title=\"\" width=\"65%\" height=\"65%\">\n   \u003C\u002Fp>\n   \n\n## 2. 开发、训练与评估 \n### 2.1. 软件工程\n* 主流语言：Python\n* 编辑器：\n   * Vim\n   * Emacs  \n   * [VS Code](https:\u002F\u002Fcode.visualstudio.com\u002F)（作者推荐）：内置 Git 暂存和差异查看功能，支持代码检查，并可通过 SSH 远程打开项目\n   * 笔记本：作为项目的起点非常棒，但难以扩展（有趣的是，Netflix 的笔记本驱动架构是一个例外，它完全基于 [nteract](https:\u002F\u002Fnteract.io\u002F) 套件）。\n      * [nteract](https:\u002F\u002Fnteract.io\u002F)：基于 React 的下一代 Jupyter 笔记本用户界面\n      * [Papermill](https:\u002F\u002Fgithub.com\u002Fnteract\u002Fpapermill)：一个用于 *参数化*、*执行* 和 *分析* Jupyter 笔记本的 [nteract](https:\u002F\u002Fnteract.io\u002F) 库。\n      * [Commuter](https:\u002F\u002Fgithub.com\u002Fnteract\u002Fcommuter)：另一个 [nteract](https:\u002F\u002Fnteract.io\u002F) 项目，提供只读的笔记本显示功能（例如从 S3 存储桶中加载）。\n   * [Streamlit](https:\u002F\u002Fstreamlit.io\u002F)：具有应用程序的小型交互式数据科学工具\n * 计算资源建议 \u003Csup>[1](#fsdl)\u003C\u002Fsup>：\n   * 对于 *个人* 或 *初创公司*：\n     * 开发：配备 4 块 Turing 架构 GPU 的电脑\n     * 训练\u002F评估：使用同一台配备 4 块 GPU 的电脑。如果需要运行大量实验，可以购买共享服务器或使用云实例。\n   * 对于 *大型公司*：\n     * 开发：为每位机器学习科学家配备一台配备 4 块 Turing 架构 GPU 的电脑，或者允许他们使用 V100 实例\n     * 训练\u002F评估：使用云实例，并做好适当的资源配置和故障处理\n * 云服务提供商：\n   * GCP：可以选择将 GPU 连接到任何实例，并且还提供 TPU\n   * AWS：  \n### 2.2. 
资源管理 \n  * 将空闲资源分配给程序\n  * 资源管理选项：\n    * 传统集群作业调度器（例如 [Slurm](https:\u002F\u002Fslurm.schedmd.com\u002F) 工作负载管理器）\n    * Docker + Kubernetes\n    * Kubeflow \n    * [Polyaxon](https:\u002F\u002Fpolyaxon.com\u002F)（付费功能）\n    \n### 2.3. 深度学习框架 \n  * 除非有充分理由不使用，否则应选择 Tensorflow\u002FKeras 或 PyTorch。\u003Csup>[1](#fsdl)\u003C\u002Fsup> \n  * 下图展示了不同框架在“开发”和“生产”方面的比较。\n\n  \u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_ade5d50dba15.png\" title=\"\" width=\"95%\" height=\"95%\">\n   \u003C\u002Fp>\n\n  \n### 2.4. 实验管理\n\n* 开发、训练和评估策略：\n  * 始终从 **简单** 开始\n    * 先用小批量数据训练一个小模型。只有当它有效时，再逐步扩大到更大的数据集和模型，并进行超参数调优！  \n  * 实验管理工具：\n  * [Tensorboard](https:\u002F\u002Fwww.tensorflow.org\u002Ftensorboard)\n      * 提供 ML 实验所需的可视化和工具支持\n  * [Losswise](https:\u002F\u002Flosswise.com\u002F)（ML 监控）\n  * [Comet](https:\u002F\u002Fwww.comet.ml\u002F)：允许您跟踪代码、实验和 ML 项目的成果\n  * [Weights & Biases](https:\u002F\u002Fwww.wandb.com\u002F)：轻松协作记录并可视化研究的每一个细节\n  * [MLFlow Tracking](https:\u002F\u002Fwww.mlflow.org\u002Fdocs\u002Flatest\u002Ftracking.html#tracking)：用于记录参数、代码版本、指标和输出文件，并对结果进行可视化。\n    * 只需一行 Python 代码即可自动跟踪实验\n    * 并排比较多个实验\n    * 超参数调优\n    * 支持基于 Kubernetes 的作业\n    \n### 2.5. 
超参数调优 \n  * 方法：\n    * 网格搜索\n    * 随机搜索\n    * 贝叶斯优化\n    * HyperBand 和异步连续减半算法 (ASHA)\n    * 基于种群的训练\n\n  * 平台：\n    * [RayTune](http:\u002F\u002Ftune.io\u002F)：Ray Tune 是一个用于任意规模超参数调优的 Python 库（专注于深度学习和深度强化学习）。支持所有机器学习框架，包括 PyTorch、XGBoost、MXNet 和 Keras。\n    * [Katib](https:\u002F\u002Fgithub.com\u002Fkubeflow\u002Fkatib)：Kubernetes 原生的超参数调优和神经架构搜索系统，灵感来源于 Google vizier（参考文献：[Google vizier](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fja\u002F\u002Fpubs\u002Farchive\u002Fbcb15507f4b52991a0783013df4222240e942381.pdf)），支持多种 ML\u002FDL 框架（如 TensorFlow、MXNet 和 PyTorch）。\n    * [Hyperas](https:\u002F\u002Fmaxpumperla.com\u002Fhyperas\u002F)：一个简单的 Keras 超参数优化封装库，采用简洁的模板语法来定义超参数范围。\n    * [SIGOPT](https:\u002F\u002Fsigopt.com\u002F)：一个可扩展的企业级优化平台\n    * [Sweeps](https:\u002F\u002Fdocs.wandb.com\u002Flibrary\u002Fsweeps) 来自 [Weights & Biases](https:\u002F\u002Fwww.wandb.com\u002F)：开发者无需显式指定参数，而是由机器学习模型自动近似并学习这些参数。\n    * [Keras Tuner](https:\u002F\u002Fgithub.com\u002Fkeras-team\u002Fkeras-tuner)：一个专门针对 tf.keras 和 TensorFlow 2.0 的 Keras 超参数调优工具。\n\n### 2.6. 分布式训练 \n  * 数据并行：当迭代时间过长时使用（Tensorflow 和 PyTorch 都支持）\n    * [Ray 分布式训练](https:\u002F\u002Fray.readthedocs.io\u002Fen\u002Flatest\u002Fdistributed_training.html)\n  * 模型并行：当模型无法放入单个 GPU 时使用\n  * 其他解决方案：\n    * Horovod\n\n## 3. 故障排除 [待定]\n\n## 4. 测试与部署\n\n### 4.1. 
测试与 CI\u002FCD\n机器学习生产级软件比传统软件需要更丰富的测试套件：\n\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_984bbba7ff5f.png\" title=\"\" width=\"75%\" height=\"75%\">\n   \u003C\u002Fp>\n   \n* 单元测试与集成测试： \n   * 测试类型： \n     * 训练系统测试：测试训练流水线\n     * 验证测试：在验证集上测试预测系统\n     * 功能测试：在少数重要示例上测试预测系统 \n* 持续集成：每次向代码库推送新代码变更后运行测试 \n * 持续集成的 SaaS 工具： \n    * [Argo](https:\u002F\u002Fargoproj.github.io\u002F)：开源的 Kubernetes 原生工作流引擎，用于编排并行任务（包括工作流、事件、CI 和 CD）。\n    * [CircleCI](https:\u002F\u002Fcircleci.com\u002F)：支持多种语言、自定义环境、灵活的资源分配，被 Instacart、Lyft 和 StackShare 等公司使用。\n    * [Travis CI](https:\u002F\u002Ftravis-ci.org\u002F)\n    * [Buildkite](https:\u002F\u002Fbuildkite.com\u002F)：构建快速且稳定，开源代理几乎可在任何机器和架构上运行，允许自由使用自有工具和服务。\n    * Jenkins：老牌构建系统  \n\n\n### 4.2. Web 部署\n  * 包括 **预测系统** 和 **服务系统**\n      * 预测系统：处理输入数据，生成预测结果\n      * 服务系统（Web 服务器）： \n        * 以可扩展性为目标提供预测服务  \n        * 使用 REST API 处理预测 HTTP 请求\n        * 调用预测系统进行响应 \n  * 服务选项： \n      * 1. 部署到虚拟机，通过增加实例来扩展规模 \n      * 2. 以容器形式部署，通过编排技术实现扩展 \n          * 容器 \n              * Docker \n          * 容器编排：\n              * Kubernetes（目前最流行）\n              * MESOS \n              * Marathon \n      * 3. 将代码部署为“无服务器函数”\n      * 4. 
通过 **模型服务** 解决方案进行部署\n  * 模型服务：\n      * 面向机器学习模型的专业化 Web 部署\n      * 批量处理请求以进行 GPU 推理 \n      * 框架：\n         * Tensorflow Serving \n         * MXNet Model Server \n         * Clipper（伯克利团队）\n         * SaaS 解决方案\n            * [Seldon](https:\u002F\u002Fwww.seldon.io\u002F)：可在 Kubernetes 上部署并扩展任何框架构建的模型\n            * [Algorithmia](https:\u002F\u002Falgorithmia.com\u002F)\n   * 决策：CPU 还是 GPU？ \n      * CPU 推理：\n         * 如果能满足需求，优先选择 CPU 推理。\n         * 可通过增加服务器数量或采用无服务器架构来扩展规模。 \n      * GPU 推理： \n         * TF Serving 或 Clipper \n         * 自适应批处理非常有用 \n  * （附加）部署 Jupyter Notebook：\n      * [Kubeflow Fairing](https:\u002F\u002Fgithub.com\u002Fkubeflow\u002Ffairing) 是一个混合部署工具包，可以让你部署你的 *Jupyter Notebook* 代码！ \n    \n### 4.3. 服务网格与流量路由 \n* 从单体应用向分布式微服务架构过渡可能充满挑战。 \n* **服务网格**（由一组微服务组成的网络）能够降低此类部署的复杂性，并减轻开发团队的压力。\n  * [Istio](https:\u002F\u002Fistio.io\u002F)：一种服务网格，可在不修改或仅少量修改服务代码的情况下，轻松创建具有负载均衡、服务间身份验证和监控功能的服务网络。 \n### 4.4. 监控：\n* 监控的目的： \n   * 对宕机、错误和分布漂移发出警报 \n   * 捕捉服务和数据回归问题 \n* 云服务商提供的解决方案相当不错 \n* [Kiali](https:\u002F\u002Fkiali.io\u002F)：Istio 的可观测性控制台，具备服务网格配置能力。它可以回答以下问题：微服务之间是如何连接的？它们的性能如何？\n\n#### 我们完成了吗？\n\u003Cp align=\"center\">\n   \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_a7e0a253941d.png\" title=\"\" width=\"65%\" height=\"65%\">\n\u003C\u002Fp>\n\n### 4.5. 在嵌入式和移动设备上部署  \n* 主要挑战：内存占用和计算资源限制 \n* 解决方案： \n   * 量化 \n   * 减小模型尺寸 \n      * MobileNets \n   * 知识蒸馏 \n      * DistilBERT（用于 NLP）\n* 嵌入式和移动框架： \n   * Tensorflow Lite\n   * PyTorch Mobile\n   * Core ML \n   * ML Kit \n   * FRITZ \n   * OpenVINO\n* 模型转换：\n   * 开放神经网络交换格式（ONNX）：深度学习模型的开源格式 \n### 4.6. 
一体化解决方案\n   * Tensorflow Extended (TFX)\n   * Michelangelo（Uber）\n   * Google Cloud AI Platform \n   * Amazon SageMaker \n   * Neptune \n   * FLOYD \n   * Paperspace \n   * Determined AI \n   * Domino 数据实验室 \n\u003Cp align=\"center\">\n   \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_221a0c60d764.png\" title=\"\" width=\"100%\" height=\"100%\">\n\u003C\u002Fp>\n\n# Tensorflow Extended (TFX) \n[TBD]\n\u003Cp align=\"center\">\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_2af13db65e2c.png\" title=\"\" width=\"95%\" height=\"95%\">\n\u003C\u002Fp>\n\n# Airflow 和 KubeFlow ML 流水线 \n[TBD]\n\u003Cp align=\"center\">\n    \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_readme_7452dc68047c.png\" title=\"\" width=\"45%\" height=\"45%\">\n\u003C\u002Fp>\n\n\n## 其他有用链接： \n* [构建实用深度学习系统的经验教训](https:\u002F\u002Fwww.slideshare.net\u002Fxamat\u002Flessons-learned-from-building-practical-deep-learning-systems)\n* [机器学习：技术债务中的高息信用卡](https:\u002F\u002Fai.google\u002Fresearch\u002Fpubs\u002Fpub43146)\n \n## [贡献指南](https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning\u002Fblob\u002Fmaster\u002FCONTRIBUTING.md)\n\n## 参考文献： \n\n\u003Ca name=\"fsdl\">[1]\u003C\u002Fa>: [全栈深度学习训练营](https:\u002F\u002Ffullstackdeeplearning.com\u002F)，2019 年 11 月。 \n\n\u003Ca name=\"pipe\">[2]\u003C\u002Fa>: [高级 KubeFlow 研讨会](https:\u002F\u002Fwww.meetup.com\u002FAdvanced-KubeFlow\u002F)由 [Pipeline.ai](https:\u002F\u002Fpipeline.ai\u002F) 组织，2019 年。 \n\n\u003Ca name=\"pipe\">[3]\u003C\u002Fa>: [TFX：生产环境中的真实世界机器学习](https:\u002F\u002Fcdn.oreillystatic.com\u002Fen\u002Fassets\u002F1\u002Fevent\u002F298\u002FTFX_%20Production%20ML%20pipelines%20with%20TensorFlow%20Presentation.pdf)","# Production-Level-Deep-Learning 快速上手指南\n\n> **注意**：`Production-Level-Deep-Learning` 
并非一个可直接安装运行的软件包或库，而是一份**工程实践指南与知识库**。它汇总了构建生产级深度学习系统所需的组件、工具链推荐及最佳实践。\n>\n> 本指南将指导你如何基于该仓库的建议，搭建一套完整的生产级深度学习开发环境。\n\n## 1. 环境准备\n\n在开始构建生产级流水线之前，请确保你的开发环境满足以下基础要求。\n\n### 系统要求\n*   **操作系统**: Linux (推荐 Ubuntu 20.04\u002F22.04) 或 macOS。生产环境通常部署在 Linux 服务器上。\n*   **硬件资源**:\n    *   **开发阶段**: 建议配备至少 4 块 Turing 架构（如 RTX 2080\u002F3080）或同等性能的 GPU 工作站。\n    *   **训练\u002F评估**: 对于大规模实验，建议使用云实例（AWS, GCP）或共享集群，需支持 Docker 和 Kubernetes。\n*   **编程语言**: Python 3.8+ (业界标准)。\n\n### 前置依赖\n你需要安装以下核心工具以支撑指南中推荐的各个模块（数据管理、工作流编排、实验追踪等）：\n\n*   **版本控制**: `git`\n*   **容器化**: `docker` 及 `docker-compose`\n*   **编排工具**: `kubectl` (若使用 Kubernetes\u002FKubeflow)\n*   **包管理**: `pip` 或 `conda`\n*   **编辑器**: VS Code (推荐，支持远程 SSH 开发) 或 Vim\u002FEmacs\n\n**国内加速建议**：\n*   **Python 包**: 配置清华源或阿里源加速 `pip` 安装。\n    ```bash\n    pip config set global.index-url https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n    ```\n*   **Docker 镜像**: 配置阿里云或网易云镜像加速器。\n*   **Git 克隆**: 若访问 GitHub 缓慢，可使用 `hub.fastgit.org` 等镜像地址。\n\n## 2. 安装步骤\n\n由于本项目是资料库而非软件，\"安装\"过程实质上是**获取指南内容**并**部署推荐的核心工具链**。\n\n### 第一步：获取指南源码\n克隆仓库到本地，以便查阅详细的架构图和工具列表。\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning.git\ncd Production-Level-Deep-Learning\n```\n\n### 第二步：部署核心工具链 (示例)\n根据指南中的 \"Full stack pipeline\" 建议，以下是构建最小可行生产环境所需的关键组件安装命令。\n\n#### 1. 数据版本控制 (DVC)\n用于管理模型中的数据版本，确保“代码 + 数据”可复现。\n```bash\npip install dvc[s3] \n# 初始化当前项目\ndvc init\n# 配置远程存储 (以 S3 为例，国内可用阿里云 OSS 兼容接口)\ndvc remote add -d myremote s3:\u002F\u002Fmy-bucket\u002Fpath\n```\n\n#### 2. 工作流编排 (Airflow)\n用于管理数据处理和训练任务的依赖关系 (DAG)。\n```bash\n# 设置 Airflow 主目录\nexport AIRFLOW_HOME=~\u002Fairflow\n# 安装 Apache Airflow (建议指定版本)\npip install apache-airflow==2.7.0\n# 初始化数据库\nairflow db init\n# 创建管理员用户\nairflow users create \\\n    --username admin \\\n    --firstname Admin \\\n    --lastname User \\\n    --role Admin \\\n    --email admin@example.com \\\n    --password admin\n```\n\n#### 3. 
实验管理 (MLflow)\n用于记录参数、指标和模型产物。\n```bash\npip install mlflow\n# 启动本地 MLflow 服务\nmlflow ui --host 0.0.0.0 --port 5000\n```\n\n#### 4. 超参数调优 (Ray Tune)\n用于规模化超参数搜索。\n```bash\npip install ray[tune]\n```\n\n## 3. 基本使用\n\n本项目的“使用”是指参照其**思维模型**和**工具链组合**来启动你的机器学习项目。\n\n### 场景：启动一个新的 CV\u002FNLP 项目\n\n#### 1. 项目规划 (Mental Model)\n参考仓库中的 `images\u002Fprioritize.png`，在写代码前确认：\n*   **高影响力**: 该任务是否处于流程瓶颈？自动化是否能显著降低成本？\n*   **低成本**: 数据是否可得？性能要求是否在合理范围内？\n\n#### 2. 数据管理流程\n不要直接使用原始数据训练。遵循指南建议的流水线：\n1.  **存储**: 将二进制数据（图片\u002F音频）存入对象存储 (如 MinIO\u002FS3)，元数据存入 PostgreSQL。\n2.  **标注**: 使用 Labelbox 或 Prodigy 进行标注，或采用主动学习策略。\n3.  **版本化**: 使用 DVC 跟踪数据集变更。\n    ```bash\n    # 添加数据到 DVC 跟踪\n    dvc add data\u002Fraw_images\n    # 提交到 git (此时只提交 .dvc 文件)\n    git add data\u002Fraw_images.dvc .gitignore\n    git commit -m \"Add initial dataset v1\"\n    ```\n\n#### 3. 构建训练流水线 (Workflow)\n使用 Airflow 定义 DAG，确保预处理完成后才触发训练。\n*   **简单示例逻辑** (伪代码):\n    ```python\n    # airflow_dag.py\n    preprocess_task = BashOperator(task_id='preprocess', bash_command='python preprocess.py')\n    train_task = BashOperator(task_id='train', bash_command='python train.py')\n    \n    preprocess_task >> train_task\n    ```\n\n#### 4. 实验与调优\n在训练脚本中集成 MLflow 和 Ray Tune。\n*   **代码集成示例**:\n    ```python\n    import mlflow\n    import ray\n    from ray import tune\n\n    # 开启实验记录\n    mlflow.set_experiment(\"production_cv_model\")\n\n    def train_model(config):\n        with mlflow.start_run():\n            # 记录参数\n            mlflow.log_params(config)\n            # ... 执行训练逻辑 ...\n            accuracy = 0.95 \n            # 记录指标\n            mlflow.log_metric(\"accuracy\", accuracy)\n            return {\"accuracy\": accuracy}\n\n    # 启动超参搜索\n    analysis = tune.run(\n        train_model,\n        config={\n            \"lr\": tune.grid_search([0.001, 0.01]),\n            \"batch_size\": tune.choice([32, 64])\n        },\n        num_samples=5\n    )\n    ```\n\n#### 5. 
模型部署准备\n当实验成功后，利用指南推荐的 **TensorFlow Serving** 或 **TorchServe** (配合 Docker\u002FKubernetes) 进行模型封装，而非直接导出脚本。\n\n---\n*提示：详细的技术选型对比图（如框架对比、基础设施概览）请查看本仓库 `images\u002F` 目录下的原始图表。*","某电商初创团队正试图将实验室中准确率高达 95% 的商品图像识别模型部署到真实的移动端购物应用中，以自动化处理海量用户上传的晒单图片。\n\n### 没有 Production-Level-Deep-Learning 时\n- **项目夭折率高**：团队盲目启动高难度无监督学习项目，因数据稀缺且定义模糊，导致 85% 的 AI 尝试在数月后无果而终。\n- **数据瓶颈难破**：缺乏系统的数据增强与合成策略，过度依赖昂贵的人工标注，导致训练数据成本激增且覆盖场景不足。\n- **架构缺失混乱**：仅关注模型训练代码，忽视数据管理、监控及流水线设计，导致模型无法从实验环境平滑迁移至生产环境。\n- **评估标准模糊**：未建立清晰的业务成功指标，技术团队与产品团队对“模型可用”的认知存在巨大偏差，造成资源浪费。\n\n### 使用 Production-Level-Deep-Learning 后\n- **精准立项避坑**：依据其提供的“高影响力 - 低成本”优先级的思维模型，团队转而聚焦于数据充足的监督学习任务，确保项目快速落地。\n- **高效数据策略**：采纳指南中关于数据增强和合成数据的最佳实践，在不增加人工成本的前提下大幅扩充了训练集，提升了模型泛化能力。\n- **全栈工程落地**：参照其全栈流水线架构图，构建了包含数据版本控制、自动化测试及在线监控的完整工程体系，实现了模型的稳定上线。\n- **明确成功准则**：利用其生命周期管理建议，在项目初期就定义了与技术性能挂钩的业务指标，确保了研发方向与商业价值的高度对齐。\n\nProduction-Level-Deep-Learning 将深度学习从单纯的算法实验转化为可落地、可维护的工业级系统工程，显著降低了真实场景下的部署风险。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Falirezadir_Production-Level-Deep-Learning_09b10252.png","alirezadir","Alireza Dirafzoon","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Falirezadir_817fe708.jpg","Ex-Meta (MultiModal AI for AR\u002FVR), Ex-Samsung (AR\u002FAI)  | 🎶 Founder Sol8","Ex. 
Meta ","San Francisco, CA",null,"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Falirezadir\u002F","https:\u002F\u002Fgithub.com\u002Falirezadir",4620,683,"2026-04-14T07:05:26",4,"未说明","开发推荐 4x Turing 架构显卡或 V100 实例；训练\u002F评估建议使用云实例（支持 GPU\u002FTPU）；具体显存和 CUDA 版本未说明",{"notes":88,"python":89,"dependencies":90},"本项目主要是一份构建生产级深度学习系统的工程指南和资源列表，而非单一的可执行代码库，因此没有统一的安装依赖。文中建议个人开发者使用 4x Turing 架构 PC 进行开发，大型公司可为每位科学家配备此类设备或使用 V100 云实例。数据管理建议使用对象存储（如 S3）和数据库（如 Postgres）。工作流编排推荐 Airflow 或 Luigi。实验管理推荐 Tensorboard、W&B 或 MLFlow。超参数调优推荐 RayTune 或 Katib。","3.x (文中指出 Python 为获胜语言，但未指定具体小版本)",[91,92,93,94,95,96,97,98,99,100],"TensorFlow","Keras","PyTorch","Airflow","Luigi","DVC","MLFlow","RayTune","Kubeflow","Docker",[14,13,16,15],[103,104,105,106,107,108,109,110,111,112,113,114],"machine-learning","deep-learning","pipeline","scalable-applications","production-system","tfx","kubeflow","artificial-intelligence","ai","practical-machine-learning","deployment","system-design","2026-03-27T02:49:30.150509","2026-04-15T08:09:37.139528",[118],{"id":119,"question_zh":120,"answer_zh":121,"source_url":122},34101,"为什么在某些情况下 CPU 推理比 GPU 更好？","如果 CPU 推理能够满足性能要求（例如延迟），且不需要进行批量推理，那么优先选择 CPU 会更便宜且开销更小。否则，特别是在需要批量推理的场景下，应选择 GPU 推理。","https:\u002F\u002Fgithub.com\u002Falirezadir\u002FProduction-Level-Deep-Learning\u002Fissues\u002F2",[]]