[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-jayinai--data-science-question-answer":3,"tool-jayinai--data-science-question-answer":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",154349,2,"2026-04-13T23:32:16",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":73,"owner_location":73,"owner_email":73,"owner_twitter":73,"owner_website":73,"owner_url":76,"languages":77,"stars":82,"forks":83,"last_commit_at":84,"license":85,"difficulty_score":86,"env_os":87,"env_gpu":88,"env_ram":88,"env_deps":89,"category_tags":92,"github_topics":94,"view_count":32,"oss_zip_url":73,"oss_zip_packed_at":73,"status":17,"created_at":102,"updated_at":103,"faqs":104,"releases":105},7357,"jayinai\u002Fdata-science-question-answer","data-science-question-answer","A repo for data science related questions and answers","data-science-question-answer 是一个专注于数据科学领域的问答知识库，旨在为从业者提供面试准备指南，并帮助初学者快速掌握核心概念。它主要解决了学习者在面对海量理论知识时难以抓住重点、缺乏系统性复习材料以及面试前不知如何高效梳理知识广度的痛点。\n\n这份资源特别适合即将参加数据科学或机器学习岗位面试的开发者、希望转行进入该领域的新人，以及需要快速回顾基础概念的从业人员。需要注意的是，官方已标记此仓库为“弃用”状态，并引导用户关注其最新的进阶项目，但其中涵盖的基础分类依然具有参考价值。\n\n其独特亮点在于强调“知识广度”而非深度钻研，定位为速查手册而非深层教材。内容覆盖简历优化技巧（如如何量化项目成果）、SQL 连接操作详解、主流框架（如 
Spark）速览，以及统计学与机器学习核心议题（如交叉验证、正则化、偏差方差权衡等）。此外，它倡导社区共建模式，鼓励用户通过提交 Pull Request 共同完善内容，体现了开源协作的精神。对于想要快速构建数据科学知识体系的用户来说，这是一份实用且友好的入门指引。","# This repo is deprecated, check out the latest [Nailing Machine Learning Concepts](https:\u002F\u002Fgithub.com\u002Fjayinai\u002Fnail-ml-concept)\n\nThe purpose of this repo is twofold:\n\n* To help you (data science practitioners) prepare for data science related interviews\n* To introduce to people who don't know but want to learn some basic data science concepts\n\nThe focus is on the knowledge breadth so this is more of a quick reference rather than an in-depth study material. If you want to learn a specific topic in detail please refer to other content or reach out and I'd love to point you to materials I found useful.\n\nI might add some topics from time to time but hey, this should also be a community effort, right? Any pull request is welcome!\n\nHere are the categories:\n\n* [Resume](#resume)\n* [SQL](#sql)\n* [Tools and Framework](#tools-and-framework)\n* [Statistics and ML In General](#statistics-and-ml-in-general)\n* [Supervised Learning](#supervised-learning)\n* [Unsupervised Learning](#unsupervised-learning)\n* [Reinforcement Learning](#reinforcement-learning)\n* [Natural Language Processing](#natural-language-processing)\n* [System](#system)\n\n## Resume\n\nThe only advice I can give about resume is to indicate your past data science \u002F machine learning projects in a specific, **quantifiable** way. Consider the following two statements:\n\n> Trained a machine learning system\n\nand\n\n> Designed and deployed a deep learning model to recognize objects using Keras, Tensorflow, and Node.js. The model has 1\u002F30 model size, 1\u002F3 training time, 1\u002F5 inference time, and 2x faster convergence compared with traditional neural networks (e.g, ResNet)\n\nThe second is much better because it quantifies your contribution and also highlights specific technologies you used (and therefore have expertise in). 
This would require you to log what you've done during experiments. But don't exaggerate.\n\nSpend some time going over your resume \u002F past projects to make sure you explain them well.\n\n\n## SQL\n\n* [Difference between joins](#difference-between-joins)\n\n\n### Difference between joins\n\n* **(INNER) JOIN**: Returns records that have matching values in both tables\n* **LEFT (OUTER) JOIN**: Return all records from the left table, and the matched records from the right table\n* **RIGHT (OUTER) JOIN**: Return all records from the right table, and the matched records from the left table\n* **FULL (OUTER) JOIN**: Return all records when there is a match in either left or right table\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_dd3ca26789d4.png)\n\n[back to top](#data-science-question-answer)\n\n\n## Tools and Framework\n\nThe resources here are only meant to help you brush up on the topics rather than making you an expert.\n\n* [Spark](#spark)\n\n### Spark\n\nUsing PySpark API.\n\n* The best resource is of course [Spark's documentation](https:\u002F\u002Fspark.apache.org\u002Fdocs\u002Flatest\u002F). Take a thorough review of the topics\n* If you are really time constrained, scan the Spark's documentation and check [PySpark cheat sheet](https:\u002F\u002Fs3.amazonaws.com\u002Fassets.datacamp.com\u002Fblog_assets\u002FPySpark_Cheat_Sheet_Python.pdf) for the basics\n\n\n[back to top](#data-science-question-answer)\n\n\n## Statistics and ML In General\n\n* [Project Workflow](#project-workflow)\n* [Cross Validation](#cross-validation)\n* [Feature Importance](#feature-importance)\n* [Mean Squared Error vs. 
Mean Absolute Error](#mean-squared-error-vs.-mean-absolute-error)\n* [L1 vs L2 regularization](#l1-vs-l2-regularization)\n* [Correlation vs Covariance](#correlation-vs-covariance)\n* [Would adding more data address underfitting](#would-adding-more-data-address-underfitting)\n* [Activation Function](#activation-function)\n* [Bagging](#bagging)\n* [Stacking](#stacking)\n* [Generative vs discriminative](#generative-vs-discriminative)\n* [Parametric vs Nonparametric](#parametric-vs-nonparametric)\n* [Recommender System](#recommender-system)\n\n### Project Workflow\n\nGiven a data science \u002F machine learning project, what steps should we follow? Here's\nhow I would tackle it:\n\n* **Specify business objective.** Are we trying to win more customers, achieve higher satisfaction, or gain more revenues?\n* **Define problem.** What is the specific gap in your ideal world and the real one that requires machine learning to fill? Ask questions that can be addressed using your data and predictive modeling (ML algorithms).\n* **Create a common sense baseline.** But before you resort to ML, set up a baseline to solve the problem as if you know zero data science. You may be amazed at how effective this baseline is. It can be as simple as recommending the top N popular items or other rule-based logic. This baseline can also serve as a good benchmark for ML algorithms.\n* **Review ML literature.** To avoid reinventing the wheel and get inspired on what techniques \u002F algorithms are good at addressing the questions using our data.\n* **Set up a single-number metric.** What it means to be successful - high accuracy, lower error, or bigger AUC - and how do you measure it? The metric has to align with high-level goals, most often the success of your business. Set up a single-number against which all models are measured.\n* **Do exploratory data analysis (EDA).** Play with the data to get a general idea of data type, distribution, variable correlation, facets etc. 
This step would involve a lot of plotting.\n* **Partition data.** Validation set should be large enough to detect differences between the models you are training; test set should be large enough to indicate the overall performance of the final model; training set, needless to say, the larger the merrier.\n* **Preprocess.** This would include data integration, cleaning, transformation, reduction, discretization and more.\n* **Engineer features.** Coming up with features is difficult, time-consuming, requires expert knowledge. Applied machine learning is basically feature engineering. This step usually involves feature selection and creation, using domain knowledge. Can be minimal for deep learning projects.\n* **Develop models.** Choose which algorithm to use, what hyperparameters to tune, which architecture to use etc.\n* **Ensemble.** Ensemble can usually boost performance, depending on the correlations of the models\u002Ffeatures. So it’s always a good idea to try out. But be open-minded about making tradeoff - some ensemble are too complex\u002Fslow to put into production.\n* **Deploy model.** Deploy models into production for inference.\n* **Monitor model.** Monitor model performance, and collect feedbacks.\n* **Iterate.** Iterate the previous steps. Data science tends to be an iterative process, with new and improved models being developed over time.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_5970d96ba71a.png)\n\n\n[back to top](#data-science-question-answer)\n\n### Cross Validation\n\nCross-validation is a technique to evaluate predictive models by partitioning the original sample into a training set to train the model, and a validation set to evaluate it. For example, a k-fold cross validation divides the data into k folds (or partitions), trains on each k-1 fold, and evaluate on the remaining 1 fold. 
This results in k models\u002Fevaluations, which can be averaged to get an overall model performance.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_a9f0e2fd07fc.png)\n\n[back to top](#data-science-question-answer)\n\n\n### Feature Importance\n\n* In linear models, feature importance can be calculated by the scale of the coefficients\n* In tree-based methods (such as random forest), important features are likely to appear closer to the root of the tree.  We can get a feature's importance for random forest by computing the average depth at which it appears across all trees in the forest.\n\n[back to top](#data-science-question-answer)\n\n\n### Mean Squared Error vs. Mean Absolute Error\n\n* **Similarity**: both measure the average model prediction error; range from 0 to infinity; the lower the better\n* Mean Squared Error (MSE) gives higher weights to large error (e.g., being off by 10 is MORE THAN TWICE as bad as being off by 5), whereas Mean Absolute Error (MAE) assigns equal weights (being off by 10 is just twice as bad as being off by 5)\n* MSE is continuously differentiable, MAE is not (where y_pred == y_true)\n\n[back to top](#data-science-question-answer)\n\n\n### L1 vs L2 regularization\n\n* **Similarity**: both L1 and L2 regularization **prevent overfitting** by shrinking (imposing a penalty) on the coefficients\n* **Difference**: L2 (Ridge) shrinks all the coefficients by the same proportions but eliminates none, while L1 (Lasso) can shrink some coefficients to zero, performing variable selection.\n* **Which to choose**: If all the features are correlated with the label, ridge outperforms lasso, as the coefficients are never zero in ridge. 
If only a subset of features are correlated with the label, lasso outperforms ridge as in lasso model some coefficient can be shrunken to zero.\n* In Graph (a), the black square represents the feasible region of the L1 regularization while graph (b) represents the feasible region for L2 regularization. The contours in the plots represent different loss values (for the unconstrained regression model ). The feasible point that minimizes the loss is more likely to happen on the coordinates on graph (a) than on graph (b) since graph (a) is more **angular**.  This effect amplifies when your number of coefficients increases, i.e. from 2 to 200. The implication of this is that the L1 regularization gives you sparse estimates. Namely, in a high dimensional space, you got mostly zeros and a small number of non-zero coefficients.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_11f828e59231.png)\n\n[back to top](#data-science-question-answer)\n\n\n### Correlation vs Covariance\n\n* Both determine the relationship and measure the dependency between two random variables\n* Correlation is when the change in one item may result in the change in the another item, while covariance is when two items vary together (joint variability)\n* Covariance is nothing but a measure of correlation. On the contrary, correlation refers to the scaled form of covariance\n* Range: correlation is between -1 and +1, while covariance lies between negative infinity and infinity.\n\n\n[back to top](#data-science-question-answer)\n\n\n### Would adding more data address underfitting\n\nUnderfitting happens when a model is not complex enough to learn well from the data. It is the problem of model rather than data size. 
So a potential way to address underfitting is to increase the model complexity (e.g., to add higher order coefficients for linear model, increase depth for tree-based methods, add more layers \u002F number of neurons for neural networks etc.)\n\n[back to top](#data-science-question-answer)\n\n\n### Activation Function\n\nFor neural networks\n\n* Non-linearity: ReLU is often used. Use Leaky ReLU (a small positive gradient for negative input, say, `y = 0.01x` when x \u003C 0) to address dead ReLU issue\n* Multi-class: softmax\n* Binary: sigmoid\n* Regression: linear\n\n[back to top](#data-science-question-answer)\n\n### Bagging\n\nTo address overfitting, we can use an ensemble method called bagging (bootstrap aggregating),\nwhich reduces the variance of the meta learning algorithm. Bagging can be applied\nto decision tree or other algorithms.\n\nHere is a [great illustration](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fauto_examples\u002Fensemble\u002Fplot_bias_variance.html#sphx-glr-auto-examples-ensemble-plot-bias-variance-py) of a single estimator vs. bagging.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_6d99cf8b0cbd.png)\n\n* Bagging is when sampling is performed *with* replacement. 
When sampling is performed *without* replacement, it's called pasting.\n* Bagging is popular due to its boost for performance, but also due to that individual learners can be trained in parallel and scale well\n* Ensemble methods work best when the learners are as independent from one another as possible\n* Voting: soft voting (predict probability and average over all individual learners) often works better than hard voting\n* out-of-bag instances can act as a validation set for bagging\n\n[back to top](#data-science-question-answer)\n\n\n### Stacking\n\n* Instead of using trivial functions (such as hard voting) to aggregate the predictions from individual learners, train a model to perform this aggregation\n* First split the training set into two subsets: the first subset is used to train the learners in the first layer\n* Next the first layer learners are used to make predictions (meta features) on the second subset, and those predictions are used to train another model (to obtain the weights of different learners) in the second layer\n* We can train multiple models in the second layer, but this entails subsetting the original dataset into 3 parts\n\n![stacking](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_707ee16ce9b1.jpg)\n\n[back to top](#data-science-question-answer)\n\n\n### Generative vs discriminative\n\n* Discriminative algorithms model *p(y|x; w)*, that is, given the dataset and learned\nparameter, what is the probability of y belonging to a specific class. A discriminative algorithm\ndoesn't care about how the data was generated, it simply categorizes a given example\n* Generative algorithms try to model *p(x|y)*, that is, the distribution of features given\nthat it belongs to a certain class. 
A generative algorithm models how the data was\ngenerated.\n\n> Given a training set, an algorithm like logistic regression or\n> the perceptron algorithm (basically) tries to find a straight line—that is, a\n> decision boundary—that separates the elephants and dogs. Then, to classify\n> a new animal as either an elephant or a dog, it checks on which side of the\n> decision boundary it falls, and makes its prediction accordingly.\n\n> Here’s a different approach. First, looking at elephants, we can build a\n> model of what elephants look like. Then, looking at dogs, we can build a\n> separate model of what dogs look like. Finally, to classify a new animal, we\n> can match the new animal against the elephant model, and match it against\n> the dog model, to see whether the new animal looks more like the elephants\n> or more like the dogs we had seen in the training set.\n\n[back to top](#data-science-question-answer)\n\n\n### Parametric vs Nonparametric\n\n* A learning model that summarizes data with a set of parameters of fixed size (independent of the number of training examples) is called a parametric model.\n* A model where the number of parameters is not determined prior to training. Nonparametric does not mean that they have no parameters. On the contrary, nonparametric models (can) become more and more complex with an increasing amount of data.\n\n[back to top](#data-science-question-answer)\n\n\n### Recommender System\n\n* I put recommend system here since technically it falls neither under supervised nor unsupervised learning\n* A recommender system seeks to predict the 'rating' or 'preference' a user would give to items and then recommend items accordingly\n* Content based recommender systems recommends items similar to those a given user has liked in the past, based on either explicit (ratings, like\u002Fdislike button) or implicit (viewed\u002Ffinished an article) feedbacks. 
Content based recommenders work solely with the past interactions of a given user and do not take other users into consideration.\n* Collaborative filtering is based on past interactions of the whole user base. There are two Collaborative filtering approaches: **item-based** or **user-based**\n  - user-based: for user u, a score for an unrated item is produced by combining the ratings of users similar to u.\n  - item-based:  a rating (u, i) is produced by looking at the set of items similar to i (interaction similarity), then the ratings by u of similar items are combined into a predicted rating\n* In recommender systems traditionally matrix factorization methods are used, although recently there are also deep learning based methods\n* Cold start and sparse matrix can be issues for recommender systems\n* Widely used in movies, news, research articles, products, social tags, music, etc.\n\n![cf](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_850010f2cd3e.gif)\n\n[back to top](#data-science-question-answer)\n\n\n## Supervised Learning\n\n* [Linear regression](#linear-regression)\n* [Logistic regression](#logistic-regression)\n* [Naive Bayes](#naive-bayes)\n* [KNN](#knn)\n* [SVM](#svm)\n* [Decision tree](#decision-tree)\n* [Random forest](#random-forest)\n* [Boosting Tree](#boosting-tree)\n* [MLP](#mlp)\n* [CNN](#cnn)\n* [RNN and LSTM](#rnn-and-lstm)\n\n### Linear regression\n\n* How to learn the parameter: minimize the cost function\n* How to minimize cost function: gradient descent\n* Regularization:\n    - L1 (Lasso): can shrink certain coef to zero, thus performing feature selection\n    - L2 (Ridge): shrink all coef with the same proportion; almost always outperforms L1\n    - Elastic Net: combined L1 and L2 priors as regularizer\n* Assumes linear relationship between features and the label\n* Can add polynomial and interaction features to add 
non-linearity\n\n![lr](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_0d6b770c8a41.png)\n\n[back to top](#data-science-question-answer)\n\n\n### Logistic regression\n\n* Generalized linear model (GLM) for binary classification problems\n* Apply the sigmoid function to the output of linear models, squeezing the target\nto range [0, 1]\n* Threshold to make prediction: usually if the output > .5, prediction 1; otherwise prediction 0\n* A special case of softmax function, which deals with multi-class problems\n\n[back to top](#data-science-question-answer)\n\n\n### Naive Bayes\n\n* Naive Bayes (NB) is a supervised learning algorithm based on applying [Bayes' theorem](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBayes%27_theorem)\n* It is called naive because it builds the naive assumption that each feature\nare independent of each other\n* NB can make different assumptions (i.e., data distributions, such as Gaussian,\nMultinomial, Bernoulli)\n* Despite the over-simplified assumptions, NB classifier works quite well in real-world\napplications, especially for text classification (e.g., spam filtering)\n* NB can be extremely fast compared to more sophisticated methods\n\n[back to top](#data-science-question-answer)\n\n### KNN\n\n* Given a data point, we compute the K nearest data points (neighbors) using certain\ndistance metric (e.g., Euclidean metric). For classification, we take the majority label\nof neighbors; for regression, we take the mean of the label values.\n* Note for KNN we don't train a model; we simply compute during\ninference time. 
This can be computationally expensive since each of the test example\nneed to be compared with every training example to see how close they are.\n* There are approximation methods can have faster inference time by\npartitioning the training data into regions (e.g., [annoy](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fannoy))\n* When K equals 1 or other small number the model is prone to overfitting (high variance), while\nwhen K equals number of data points or other large number the model is prone to underfitting (high bias)\n\n![KNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_d867ce7400ee.png)\n\n[back to top](#data-science-question-answer)\n\n\n### SVM\n\n* Can perform linear, nonlinear, or outlier detection (unsupervised)\n* Large margin classifier: using SVM we not only have a decision boundary, but want the boundary\nto be as far from the closest training point as possible\n* The closest training examples are called support vectors, since they are the points\nbased on which the decision boundary is drawn\n* SVMs are sensitive to feature scaling\n\n![svm](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_675fedee7173.png)\n\n[back to top](#data-science-question-answer)\n\n\n### Decision tree\n\n* Non-parametric, supervised learning algorithms\n* Given the training data, a decision tree algorithm divides the feature space into\nregions. For inference, we first see which\nregion does the test data point fall in, and take the mean label values (regression)\nor the majority label value (classification).\n* **Construction**: top-down, chooses a variable to split the data such that the\ntarget variables within each region are as homogeneous as possible. 
Two common\nmetrics: gini impurity or information gain, won't matter much in practice.\n* Advantage: simple to understand & interpret, mirrors human decision making\n* Disadvantage:\n    - can overfit easily (and generalize poorly) if we don't limit the depth of the tree\n    - can be non-robust: A small change in the training data can lead to a totally different tree\n    - instability: sensitive to training set rotation due to its orthogonal decision boundaries\n\n![decision tree](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_c557af4b34c5.gif)\n\n[back to top](#data-science-question-answer)\n\n\n### Random forest\n\nRandom forest improves bagging further by adding some randomness. In random forest,\nonly a subset of features are selected at random to construct a tree (while often not subsample instances).\nThe benefit is that random forest **decorrelates** the trees.\n\nFor example, suppose we have a dataset. There is one very predictive feature, and a couple\nof moderately predictive features. In bagging trees, most of the trees\nwill use this very predictive feature in the top split, and therefore making most of the trees\nlook similar, **and highly correlated**. Averaging many highly correlated results won't lead\nto a large reduction in variance compared with uncorrelated results.\nIn random forest for each split we only consider a subset of the features and therefore\nreduce the variance even further by introducing more uncorrelated trees.\n\nI wrote a [notebook](assets\u002Fbag-rf-var.ipynb) to illustrate this point.\n\nIn practice, tuning random forest entails having a large number of trees (the more the better, but\nalways consider computation constraint). Also, tune `min_samples_leaf` (The minimum number of\nsamples at the leaf node) to control the tree size and overfitting. 
Always cross validate the parameters.\n\n[back to top](#data-science-question-answer)\n\n\n### Boosting Tree\n\n**How it works**\n\nBoosting builds on weak learners, and in an iterative fashion. In each iteration,\na new learner is added, while all existing learners are kept unchanged. All learners\nare weighted based on their performance (e.g., accuracy), and after a weak learner\nis added, the data are re-weighted: examples that are misclassified gain more weights,\nwhile examples that are correctly classified lose weights. Thus, future weak learners\nfocus more on examples that previous weak learners misclassified.\n\n**Difference from random forest (RF)**\n\n* RF grows trees **in parallel**, while Boosting is sequential\n* RF reduces variance, while Boosting reduces errors by reducing bias\n\n**XGBoost (Extreme Gradient Boosting)**\n\n> XGBoost uses a more regularized model formalization to control overfitting, which gives it better performance\n\n[back to top](#data-science-question-answer)\n\n\n### MLP\n\nA feedforward neural network of multiple layers. In each layer we\ncan have multiple neurons, and each of the neuron in the next layer is a linear\u002Fnonlinear\ncombination of the all the neurons in the previous layer. In order to train the network\nwe back propagate the errors layer by layer. In theory MLP can approximate any functions.\n\n![mlp](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_88a70583fb61.jpg)\n\n[back to top](#data-science-question-answer)\n\n\n### CNN\n\nThe Conv layer is the building block of a Convolutional Network. The Conv layer consists\nof a set of learnable filters (such as 5 * 5 * 3, width * height * depth). During the forward\npass, we slide (or more precisely, convolve) the filter across the input and compute the dot\nproduct. 
Learning again happens when the network back propagates the error layer by layer.\n\nInitial layers capture low-level features such as angle and edges, while later\nlayers learn a combination of the low-level features in the previous layers\nand can therefore represent higher level features, such as shape and object parts.\n\n![CNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_e53bb12da14b.jpg)\n\n[back to top](#data-science-question-answer)\n\n\n### RNN and LSTM\n\nRNN is another paradigm of neural network where we have different layers of cells,\nand each cell not only takes as input the cell from the previous layer, but also the previous\ncell within the same layer. This gives RNN the power to model sequence.\n\n![RNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_2a37bd4e9b12.jpeg)\n\nThis seems great, but in practice RNN barely works due to exploding\u002Fvanishing gradient, which\nis caused by a series of multiplication of the same matrix. 
To solve this, we can use\na variation of RNN, called long short-term memory (LSTM), which is capable of learning\nlong-term dependencies.\n\nThe math behind LSTM can be pretty complicated, but intuitively LSTM introduce\n\n* input gate\n* output gate\n* forget gate\n* memory cell (internal state)\n\nLSTM resembles human memory: it forgets old stuff (old internal state * forget gate)\nand learns from new input (input node * input gate)\n\n![lstm](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_66c92451008d.png)\n\n[back to top](#data-science-question-answer)\n\n\n## Unsupervised Learning\n\n* [Clustering](#clustering)\n* [Principal Component Analysis](#principal-component-analysis)\n* [Autoencoder](#autoencoder)\n* [Generative Adversarial Network](#generative-adversarial-network)\n\n### Clustering\n\n* Clustering is a unsupervised learning algorithm that groups data in such\na way that data points in the same group are more similar to each other than to\nthose from other groups\n* Similarity is usually defined using a distance measure (e.g, Euclidean, Cosine, Jaccard, etc.)\n* The goal is usually to discover the underlying structure within the data (usually high dimensional)\n* The most common clustering algorithm is K-means, where we define K (the number of clusters)\nand the algorithm iteratively finds the cluster each data point belongs to\n\n[scikit-learn](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html) implements many clustering algorithms. 
Below is a comparison adapted from its page.\n\n![clustering](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_a0db7d1bd529.png)\n\n[back to top](#data-science-question-answer)\n\n\n### Principal Component Analysis\n\n* Principal Component Analysis (PCA) is a dimension reduction technique that projects\nthe data into a lower-dimensional space\n* PCA uses Singular Value Decomposition (SVD), which is a matrix factorization method\nthat decomposes a matrix into three smaller matrices (more details of SVD [here](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSingular-value_decomposition))\n* PCA finds top N principal components, which are dimensions along which the data vary\n(spread out) the most. Intuitively, the more spread out the data along a specific dimension,\nthe more information is contained, thus the more important this dimension is for the\npattern recognition of the dataset\n* PCA can be used as a pre-step for data visualization: reducing high-dimensional data\ninto 2D or 3D. An alternative dimensionality reduction technique is [t-SNE](https:\u002F\u002Flvdmaaten.github.io\u002Ftsne\u002F)\n\nHere is a visual explanation of PCA\n\n![pca](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_57dc89507f6d.gif)\n\n[back to top](#data-science-question-answer)\n\n\n\n### Autoencoder\n\n* The aim of an autoencoder is to learn a representation (encoding) for a set of data\n* An autoencoder always consists of two parts, the encoder and the decoder. 
The encoder would find a lower-dimensional representation (latent variable) of the original input, while the decoder is used to reconstruct from the lower-dimension vector such that the distance between the original and reconstruction is minimized\n* Can be used for data denoising and dimensionality reduction\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_c0a13af7e7d7.png)\n\n\n### Generative Adversarial Network\n\n* Generative Adversarial Network (GAN) is an unsupervised learning algorithm that also has a supervised flavor: using supervised loss as part of training\n* GAN typically has two major components: the **generator** and the **discriminator**. The generator tries to generate \"fake\" data (e.g., images or sentences) that fool the discriminator into thinking that they're real, while the discriminator tries to distinguish between real and generated data. It's a fight between the two players thus the name adversarial, and this fight drives both sides to improve until \"fake\" data are indistinguishable from the real data\n* How does it work, intuitively\n\t- The generator takes a **random** input and generates a sample of data\n\t- The discriminator then either takes the generated sample or a real data sample, and tries to predict whether the input is real or generated (i.e., solving a binary classification problem)\n\t- Given a truth score range of [0, 1], ideally we'd love to see the discriminator give a low score to generated data but a high score to real data. On the other hand, we also want to see the generated data fool the discriminator. 
And this paradox drives both sides to become stronger\n* How does it work, from a training perspective\n\t- Without training, the generator creates 'garbage' data only while the discriminator is too 'innocent' to tell the difference between fake and real data\n\t- Usually we would first train the discriminator with both real (label 1) and generated data (label 0) for N epochs so it would have a good judgement of what is real vs. fake\n\t- Then we **set the discriminator non-trainable**, and train the generator. Even though the discriminator is non-trainable at this stage, we still use it as a classifier so that **error signals can be back propagated and therefore enable the generator to learn**\n\t- The above two steps would continue in turn until both sides cannot be improved further\n* Here are some [tips and tricks to make GANs work](https:\u002F\u002Fgithub.com\u002Fsoumith\u002Fganhacks)\n* One caveat is that the **adversarial part is only auxiliary: The end goal of using GAN is to generate data that even experts cannot tell if it's real or fake**\n\n![gan](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_718e5a3f0ed4.jpg)\n\n[back to top](#data-science-question-answer)\n\n\n## Reinforcement Learning\n\n[TODO]\n\n## Natural Language Processing\n\n* [Tokenization](#tokenization)\n* [Stemming and lemmatization](#stemming-and-lemmatization)\n* [N-gram](#n-gram)\n* [Bag of Words](#bag-of-words)\n* [word2vec](#word2vec)\n\n\n### Tokenization\n\n* Tokenization is the process of converting a sequence of characters into a sequence of tokens\n* Consider this example: `The quick brown fox jumped over the lazy dog`. In this case each word (separated by space) would be a token\n* Sometimes tokenization doesn't have a definitive answer. For instance, `O'Neill` can be tokenized to `o` and `neill`, `oneill`, or `o'neill`.\n* In some cases tokenization requires language-specific knowledge. 
For example, it doesn't make sense to tokenize `aren't` into `aren` and `t`\n* For a more detailed treatment of tokenization please check [here](https:\u002F\u002Fnlp.stanford.edu\u002FIR-book\u002Fhtml\u002Fhtmledition\u002Ftokenization-1.html)\n\n[back to top](#data-science-question-answer)\n\n### Stemming and lemmatization\n\n* The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form\n* Stemming usually refers to a crude heuristic process that chops off the ends of words\n* Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words\n* If confronted with the token `saw`, stemming might return just `s`, whereas lemmatization would attempt to return either `see` or `saw` depending on whether the use of the token was as a verb or a noun\n* For a more detailed treatment please check [here](https:\u002F\u002Fnlp.stanford.edu\u002FIR-book\u002Fhtml\u002Fhtmledition\u002Fstemming-and-lemmatization-1.html)\n\n[back to top](#data-science-question-answer)\n\n\n### N gram\n\n* n-gram is a contiguous sequence of n items from a given sample of text or speech\n* An n-gram of size 1 is referred to as a \"unigram\"; size 2 is a \"bigram\" size 3 is a \"trigram\". 
Larger sizes are sometimes referred to by the value of n in modern language, e.g., \"four-gram\", \"five-gram\", and so on.\n* Consider this example: `The quick brown fox jumped over the lazy dog.`\n  - bigram would be `the quick`, `quick brown`, `brown fox`, ..., i.e, every two consecutive words (or tokens)\n  - trigram would be `the quick brown`, `quick brown fox`, `brown fox jumped`, ..., i.e., every three consecutive words (or tokens)\n* ngram model models sequence, i.e., predicts next word (n) given previous words (1, 2, 3, ..., n-1)\n* multiple gram (bigram and above) captures **context**\n* to choose n in n-gram requires experiments and making tradeoff between stability of the estimate against its appropriateness. Rule of thumb: trigram is a common choice with large training corpora (millions of words), whereas a bigram is often used with smaller ones.\n* n-gram can be used as features for machine learning and downstream NLP tasks\n\n[back to top](#data-science-question-answer)\n\n\n### Bag of Words\n\n* Why? Machine learning models cannot work with raw text directly; rather, they take numerical values as input.\n* Bag of words (BoW) builds a **vocabulary** of all the unique words in our dataset, and associate a unique index to each word in the vocabulary\n* It is called a \"bag\" of words, because it is a representation that completely ignores the order of words\n* Consider this example of two sentences: (1) `John likes to watch movies, especially horor movies.`, (2) `Mary likes movies too.` We would first build a vocabulary of unique words (all lower cases and ignoring punctuations): `[john, likes, to, watch, movies, especially, horor, mary, too]`. Then we can represent each sentence using term frequency, i.e, the number of times a term appears. 
So (1) would be `[1, 1, 1, 1, 2, 1, 1, 0, 0]`, and (2) would be `[0, 1, 0, 0, 1, 0, 0, 1, 1]`\n* A common alternative to the use of dictionaries is the [hashing trick](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FFeature_hashing), where words are directly mapped to indices with a hashing function\n* As the vocabulary grows bigger (tens of thousands), the vector to represent short sentences \u002F documents becomes sparse (almost all zeros)\n\n[back to top](#data-science-question-answer)\n\n### word2vec\n\n* Shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words\n* Takes as input a large corpus, and produces a vector space, typically of several hundred\ndimensions, and each word in the corpus is assigned a vector in the space\n* The key idea is **context**: words that occur often in the same context should have similar\u002Fopposite\nmeanings.\n* Two flavors\n    - continuous bag of words (CBOW): the model predicts the current word given a window of surrounding context words\n    - skip gram: predicts the surrounding context words using the current word\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_936c70c9f198.png)\n\n[back to top](#data-science-question-answer)\n\n\n## System\n\n* [Cron job](#cron-job)\n* [Linux](#linux)\n\n### Cron job\n\nThe software utility **cron** is a **time-based job scheduler** in Unix-like computer operating systems. People who set up and maintain software environments use cron to schedule jobs (commands or shell scripts) to run periodically at fixed times, dates, or intervals. 
It typically automates system maintenance or administration -- though its general-purpose nature makes it useful for things like downloading files from the Internet and downloading email at regular intervals.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_fd8cf847cbb9.png)\n\nTools:\n* [Apache Airflow](https:\u002F\u002Fairflow.apache.org\u002F)\n\n[back to top](#data-science-question-answer)\n\n\n### Linux\n\nUsing **Ubuntu** as an example.\n\n* Become root: `sudo su`\n* Install package: `sudo apt-get install \u003Cpackage>`\n\n[back to top](#data-science-question-answer)\n\n\nConfession: some images are adopted from the internet without proper credit. If you are the author and this would be an issue for you, please let me know.\n","# 此仓库已弃用，请查看最新的 [Nailing Machine Learning Concepts](https:\u002F\u002Fgithub.com\u002Fjayinai\u002Fnail-ml-concept)\n\n本仓库的目的有两个：\n\n* 帮助数据科学从业者准备相关面试\n* 向不了解但希望学习数据科学基础知识的人介绍一些基本概念\n\n重点在于知识的广度，因此这更像是一份快速参考手册，而非深入的学习资料。如果你希望详细学习某个特定主题，请参考其他资源，或者联系我，我很乐意为你推荐我曾觉得有用的材料。\n\n我可能会不时添加一些新内容，不过这也应该是一项社区共同参与的工作，对吧？欢迎任何 Pull Request！\n\n以下是分类目录：\n\n* [简历](#resume)\n* [SQL](#sql)\n* [工具与框架](#tools-and-framework)\n* [统计学与机器学习概论](#statistics-and-ml-in-general)\n* [监督学习](#supervised-learning)\n* [无监督学习](#unsupervised-learning)\n* [强化学习](#reinforcement-learning)\n* [自然语言处理](#natural-language-processing)\n* [系统](#system)\n\n## 简历\n\n关于简历，我能给出的唯一建议就是以具体、**可量化**的方式描述你过去的数据科学\u002F机器学习项目。请对比以下两种表述：\n\n> 训练了一个机器学习系统\n\n和\n\n> 使用 Keras、TensorFlow 和 Node.js 设计并部署了一套深度学习模型来识别物体。该模型的参数量仅为传统神经网络（如 ResNet）的 1\u002F30，训练时间缩短至 1\u002F3，推理速度提升至 5 倍，且收敛速度提高了 2 倍。\n\n第二种表述要好得多，因为它量化了你的贡献，并突出了你所使用的技术（从而表明你的专业能力）。这就要求你在实验过程中记录下自己所做的工作。不过也不要夸大其词。\n\n花些时间仔细检查你的简历或过往项目，确保能够清晰地解释它们。\n\n## SQL\n\n* [JOIN 的区别](#difference-between-joins)\n\n\n### JOIN 的区别\n\n* **(INNER) JOIN**：返回两个表中匹配值的记录\n* **LEFT (OUTER) JOIN**：返回左表中的所有记录，以及右表中匹配的记录\n* **RIGHT (OUTER) JOIN**：返回右表中的所有记录，以及左表中匹配的记录\n* **FULL (OUTER) 
JOIN**：当左表或右表中有匹配时，返回所有记录\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_dd3ca26789d4.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n## 工具与框架\n\n这里的资源仅用于帮助你复习相关主题，而不是让你成为专家。\n\n* [Spark](#spark)\n\n### Spark\n\n使用 PySpark API。\n\n* 最好的资源当然是 [Spark 的官方文档](https:\u002F\u002Fspark.apache.org\u002Fdocs\u002Flatest\u002F)。请仔细阅读相关章节。\n* 如果时间非常紧张，可以快速浏览 Spark 文档，并查阅 [PySpark 备忘录](https:\u002F\u002Fs3.amazonaws.com\u002Fassets.datacamp.com\u002Fblog_assets\u002FPySpark_Cheat_Sheet_Python.pdf)，了解基础知识。\n\n\n[返回顶部](#data-science-question-answer)\n\n\n## 统计学与机器学习概论\n\n* [项目流程](#project-workflow)\n* [交叉验证](#cross-validation)\n* [特征重要性](#feature-importance)\n* [均方误差 vs. 平均绝对误差](#mean-squared-error-vs.-mean-absolute-error)\n* [L1 正则化 vs. L2 正则化](#l1-vs-l2-regularization)\n* [相关性 vs. 协方差](#correlation-vs-covariance)\n* [增加数据是否能解决欠拟合问题](#would-adding-more-data-address-underfitting)\n* [激活函数](#activation-function)\n* [Bagging](#bagging)\n* [Stacking](#stacking)\n* [生成式模型 vs. 判别式模型](#generative-vs-discriminative)\n* [参数化模型 vs. 
非参数化模型](#parametric-vs-nonparametric)\n* [推荐系统](#recommender-system)\n\n### 项目流程\n\n对于一个数据科学\u002F机器学习项目，我们应该遵循哪些步骤呢？以下是我通常的做法：\n\n* **明确业务目标。** 我们的目标是吸引更多客户、提高满意度，还是增加收入？\n* **定义问题。** 在理想状态与现实之间，究竟存在怎样的差距需要通过机器学习来弥补？提出那些能够利用现有数据和预测建模（机器学习算法）来解决的问题。\n* **建立常识基线。** 在求助于机器学习之前，先尝试在完全不懂数据科学的情况下解决问题，看看效果如何。你可能会惊讶地发现，这种基线方法往往非常有效，比如推荐最受欢迎的前 N 个商品，或者基于规则的逻辑。这个基线也可以作为评估机器学习模型性能的良好基准。\n* **回顾机器学习文献。** 避免重复造轮子，并从中获得灵感，了解哪些技术或算法适合用我们的数据来解决当前问题。\n* **设定单一指标。** 成功意味着什么——高准确率、低误差，还是更高的 AUC——又该如何衡量？这个指标必须与高层目标保持一致，通常是与业务成功相关的指标。设定一个统一的数值指标，用来评估所有模型的表现。\n* **进行探索性数据分析 (EDA)。** 通过玩转数据，大致了解数据类型、分布、变量之间的相关性、分面等信息。这一步通常涉及大量的可视化操作。\n* **划分数据集。** 验证集应足够大，以便检测不同模型之间的差异；测试集也应足够大，以反映最终模型的整体性能；而训练集，则自然是越大越好。\n* **数据预处理。** 包括数据集成、清洗、转换、降维、离散化等操作。\n* **特征工程。** 构建特征是一项困难且耗时的工作，需要丰富的专业知识。实际上，应用机器学习的核心就是特征工程。这一步通常涉及特征选择和构造，需要结合领域知识。对于深度学习项目，特征工程可能相对简单。\n* **开发模型。** 选择合适的算法、调整超参数、设计网络架构等。\n* **集成模型。** 集成通常可以提升性能，但这取决于各个模型或特征之间的相关性。因此，尝试集成总是值得的。不过也要做好心理准备，因为有些集成方案过于复杂或运行缓慢，难以投入生产。\n* **部署模型。** 将模型部署到生产环境中进行推理。\n* **监控模型。** 监控模型的表现，并收集反馈。\n* **迭代。** 重复上述步骤。数据科学往往是一个迭代的过程，随着时间推移会不断开发出新的、更优秀的模型。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_5970d96ba71a.png)\n\n\n[返回顶部](#data-science-question-answer)\n\n### 交叉验证\n\n交叉验证是一种评估预测模型的技术，它将原始样本划分为训练集和验证集，其中训练集用于训练模型，验证集用于评估模型。例如，k折交叉验证会将数据分成k个折叠（或分区），每次使用k-1个折叠进行训练，并在剩下的1个折叠上进行评估。这样可以得到k个模型\u002F评估结果，然后对这些结果取平均，从而获得模型的整体性能。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_a9f0e2fd07fc.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 特征重要性\n\n* 在线性模型中，特征重要性可以通过系数的大小来计算。\n* 在基于树的方法中（如随机森林），重要的特征通常会出现在树的更靠近根节点的位置。对于随机森林，我们可以通过计算该特征在森林中所有树上的平均出现深度来衡量其重要性。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 均方误差与平均绝对误差\n\n* **相似性**：两者都用于衡量模型预测误差的平均值；取值范围从0到无穷大；数值越小越好。\n* 均方误差（MSE）对较大的误差赋予更高的权重（例如，误差为10比误差为5糟糕两倍以上），而平均绝对误差（MAE）则对所有误差赋予相同的权重（误差为10只是误差为5的两倍糟糕）。\n* 
均方误差是连续可导的，而平均绝对误差则不是（当预测值等于真实值时）。\n\n[返回顶部](#data-science-question-answer)\n\n\n### L1正则化与L2正则化\n\n* **相似性**：L1和L2正则化都通过收缩（施加惩罚）系数来**防止过拟合**。\n* **差异**：L2（岭回归）会按相同比例缩小所有系数，但不会将任何系数置零；而L1（套索回归）可以将部分系数缩小到零，从而实现变量选择。\n* **如何选择**：如果所有特征都与标签相关，岭回归的表现优于套索回归，因为岭回归中的系数永远不会为零。如果只有部分特征与标签相关，则套索回归的表现更好，因为它可以将某些系数缩小到零。\n* 图(a)中的黑色正方形代表L1正则化的可行域，而图(b)则代表L2正则化的可行域。图中的等高线表示不同的损失值（针对无约束的回归模型）。由于图(a)的形状更加**棱角分明**，因此使损失最小化的可行点更可能出现在图(a)的坐标上，而不是图(b)上。当系数数量增加时，这种效应会更加明显，例如从2个增加到200个。这意味着L1正则化会产生稀疏估计，即在高维空间中，大多数系数为零，只有少数非零系数。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_11f828e59231.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 相关性与协方差\n\n* 两者都用于确定两个随机变量之间的关系并衡量它们的依赖程度。\n* 相关性是指一个变量的变化可能导致另一个变量的变化，而协方差则是指两个变量共同变化的程度（联合变异）。\n* 协方差本质上就是一种相关性的度量。相反，相关性则是协方差的标准化形式。\n* 取值范围：相关性介于-1到+1之间，而协方差的取值范围则是负无穷到正无穷。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 增加数据是否能解决欠拟合问题？\n\n欠拟合是指模型不够复杂，无法很好地从数据中学习的情况。这主要是模型本身的问题，而非数据量不足所致。因此，解决欠拟合的一种方法是提高模型的复杂度（例如，为线性模型添加更高次的项，为基于树的方法增加深度，为神经网络增加层数或神经元数量等）。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 激活函数\n\n对于神经网络：\n\n* 非线性：常使用ReLU。为了解决ReLU“死亡”问题，可以使用Leaky ReLU（对负输入赋予一个小的正梯度，例如当x \u003C 0时，y = 0.01x）。\n* 多分类：softmax。\n* 二分类：sigmoid。\n* 回归：线性。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 装袋法\n\n为了应对过拟合问题，我们可以使用一种称为装袋法（Bootstrap Aggregating）的集成方法，它可以降低元学习算法的方差。装袋法可以应用于决策树或其他算法。\n\n这里有一个很好的示例展示了单个估计器与装袋法的区别：[链接](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fauto_examples\u002Fensemble\u002Fplot_bias_variance.html#sphx-glr-auto-examples-ensemble-plot-bias-variance-py)。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_6d99cf8b0cbd.png)\n\n* 装袋法是在有放回的情况下进行抽样。如果没有放回地进行抽样，则称为粘贴法。\n* 装袋法之所以受欢迎，不仅因为它能够提升模型性能，还因为各个基学习器可以并行训练且具有良好的可扩展性。\n* 集成方法的效果最好是在各个基学习器尽可能相互独立的情况下。\n* 投票方式：软投票（预测概率并取所有基学习器的平均值）通常比硬投票效果更好。\n* 装袋法中未被选中的样本可以用作验证集。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 堆叠法\n\n* 
不再使用简单的聚合方法（如硬投票）来综合各个基学习器的预测结果，而是训练一个模型来进行这种聚合。\n* 首先将训练集分成两个子集：第一个子集用于训练第一层的基学习器。\n* 然后利用第一层的基学习器对第二个子集进行预测（生成元特征），并将这些预测结果用于训练第二层的其他模型（以确定不同基学习器的权重）。\n* 我们可以在第二层训练多个模型，但这需要将原始数据集进一步分成三份。\n\n![堆叠法](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_707ee16ce9b1.jpg)\n\n[返回顶部](#data-science-question-answer)\n\n### 生成式与判别式\n\n* 判别式算法建模的是 *p(y|x; w)*，即在给定数据集和学习到的参数的情况下，y 属于某一特定类别的概率。判别式算法并不关心数据是如何生成的，它只是简单地对给定的样本进行分类。\n* 生成式算法则试图建模 *p(x|y)*，即在已知样本属于某一类别时，特征的分布情况。生成式算法是对数据生成过程的建模。\n\n> 给定一个训练集，像逻辑回归或感知机这样的算法（基本上）会尝试找到一条直线——也就是决策边界——来将大象和狗分开。然后，为了将一个新的动物分类为大象或狗，它会检查该动物落在决策边界的哪一侧，并据此做出预测。\n\n> 这里还有另一种方法。首先，通过观察大象，我们可以建立一个关于大象外观的模型。接着，通过观察狗，我们也可以建立一个关于狗外观的独立模型。最后，为了对一个新的动物进行分类，我们可以将这个新动物与大象模型进行匹配，同时也与狗模型进行匹配，以判断这个新动物更像我们在训练集中见过的那些大象，还是更像那些狗。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 参数化与非参数化\n\n* 使用一组大小固定（与训练样本数量无关）的参数来总结数据的学习模型称为参数化模型。\n* 在非参数化模型中，参数的数量在训练之前是不确定的。非参数化并不意味着它们没有参数。相反，随着数据量的增加，非参数化模型可能会变得越来越复杂。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 推荐系统\n\n* 我把推荐系统放在这里，因为从技术上讲，它既不属于监督学习，也不属于无监督学习。\n* 推荐系统旨在预测用户对物品的“评分”或“偏好”，并据此推荐相应的物品。\n* 基于内容的推荐系统会根据用户过去喜欢过的物品，推荐与其相似的其他物品，这些反馈可以是显式的（如评分、点赞\u002F不喜欢按钮）或隐式的（如浏览或完成一篇文章）。基于内容的推荐仅依赖于单个用户的过往行为，而不考虑其他用户的信息。\n* 协同过滤则基于整个用户群体的过往行为。协同过滤有两种方法：**基于物品的**或**基于用户的**：\n  - 基于物品的：对于用户 u，未评分物品的得分是通过结合与 u 相似用户的评分来计算的。\n  - 基于用户的：评分 (u, i) 是通过查看与物品 i 相似的物品集合（交互相似性），再将 u 对这些相似物品的评分综合起来得到的预测评分。\n* 传统上，推荐系统多采用矩阵分解方法，但近年来也有基于深度学习的方法。\n* 冷启动问题和稀疏矩阵可能是推荐系统面临的一些挑战。\n* 推荐系统广泛应用于电影、新闻、研究论文、产品、社交标签、音乐等领域。\n\n![cf](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_850010f2cd3e.gif)\n\n[返回顶部](#data-science-question-answer)\n\n\n## 监督学习\n\n* [线性回归](#linear-regression)\n* [逻辑回归](#logistic-regression)\n* [朴素贝叶斯](#naive-bayes)\n* [K近邻](#knn)\n* [支持向量机](#svm)\n* [决策树](#decision-tree)\n* [随机森林](#random-forest)\n* [提升树](#boosting-tree)\n* [多层感知器](#mlp)\n* [卷积神经网络](#cnn)\n* [循环神经网络与LSTM](#rnn-and-lstm)\n\n### 线性回归\n\n* 如何学习参数：最小化损失函数。\n* 
如何最小化损失函数：梯度下降法。\n* 正则化：\n    - L1（Lasso）：可以将某些系数缩减至零，从而实现特征选择。\n    - L2（Ridge）：以相同的比例缩减所有系数；通常性能优于 L1。\n    - 弹性网络：将 L1 和 L2 正则化相结合。\n* 假设特征与标签之间存在线性关系。\n* 可以添加多项式特征和交互特征以引入非线性。\n\n![lr](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_0d6b770c8a41.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 逻辑回归\n\n* 用于二分类问题的广义线性模型（GLM）。\n* 将 sigmoid 函数应用于线性模型的输出，使目标值被压缩到 [0, 1] 范围内。\n* 预测时的阈值：通常如果输出 > 0.5，则预测为 1；否则预测为 0。\n* 它是 softmax 函数的一个特例，后者用于处理多分类问题。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 朴素贝叶斯\n\n* 朴素贝叶斯（NB）是一种基于 [贝叶斯定理](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FBayes%27_theorem) 的监督学习算法。\n* 称之为“朴素”，是因为它假设每个特征之间相互独立。\n* NB 可以做出不同的假设（即数据分布，如高斯分布、多项式分布、伯努利分布等）。\n* 尽管假设过于简化，朴素贝叶斯分类器在实际应用中表现相当不错，尤其是在文本分类领域（例如垃圾邮件过滤）。\n* 与更复杂的方法相比，朴素贝叶斯的速度非常快。\n\n[返回顶部](#data-science-question-answer)\n\n### K近邻\n\n* 给定一个数据点，我们使用某种距离度量（如欧几里得距离）计算出其 K 个最近的数据点（邻居）。对于分类任务，我们取邻居中多数的标签；对于回归任务，我们取标签值的平均值。\n* 需要注意的是，KNN 并不训练模型，而是在推理时直接进行计算。这可能会导致较高的计算开销，因为每个测试样本都需要与每一个训练样本进行比较，以确定它们之间的距离。\n* 有一些近似方法可以通过将训练数据划分为多个区域来加快推理速度（例如 [annoy](https:\u002F\u002Fgithub.com\u002Fspotify\u002Fannoy)）。\n* 当 K 等于 1 或其他较小的数值时，模型容易过拟合（方差较大）；而当 K 等于数据点总数或其他较大的数值时，模型则容易欠拟合（偏差较大）。\n\n![KNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_d867ce7400ee.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 支持向量机\n\n* 可以执行线性分类、非线性分类，或异常值检测（无监督）。\n* 大间隔分类器：使用 SVM 不仅可以得到决策边界，还可以使该边界尽可能远离最近的训练点。\n* 最接近的训练样本被称为支持向量，因为决策边界的绘制正是基于这些点。\n* SVM 对特征缩放较为敏感。\n\n![svm](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_675fedee7173.png)\n\n[返回顶部](#data-science-question-answer)\n\n### 决策树\n\n* 非参数、监督学习算法\n* 给定训练数据后，决策树算法会将特征空间划分为若干区域。在推理时，我们首先确定测试数据点落在哪个区域，然后取该区域的平均标签值（回归）或多数标签值（分类）。\n* **构建**：自顶向下，选择一个变量来分割数据，使得每个区域内的目标变量尽可能同质化。常用的两种指标是基尼不纯度或信息增益，但在实际应用中差异不大。\n* 优点：易于理解和解释，贴近人类的决策过程。\n* 缺点：\n    - 如果不限制树的深度，容易过拟合（泛化能力差）。\n    - 不够稳健：训练数据的微小变化可能导致完全不同的树结构。\n    - 
不稳定性：由于其正交的决策边界，对训练集的旋转较为敏感。\n\n![决策树](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_c557af4b34c5.gif)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 随机森林\n\n随机森林通过引入随机性进一步改进了袋装法。在随机森林中，每次构建树时只随机选择一部分特征（而通常不会对样本进行子采样）。这样做的好处是使各棵树之间 **去相关化**。\n\n例如，假设我们有一个数据集，其中有一个非常有预测性的特征，以及几个中等预测性的特征。在袋装树中，大多数树会在根节点使用这个非常有预测性的特征进行分裂，因此这些树看起来非常相似，**并且高度相关**。然而，对大量高度相关的结果取平均，并不能像对不相关的结果那样显著降低方差。\n\n而在随机森林中，每次分裂时只考虑部分特征，从而通过引入更多不相关的树来进一步降低方差。\n\n我编写了一个 [笔记本](assets\u002Fbag-rf-var.ipynb) 来说明这一点。\n\n在实践中，调参随机森林时需要设置较大的树数量（越多越好，但也要考虑计算资源的限制）。此外，还需要调整 `min_samples_leaf` 参数（叶节点上的最小样本数），以控制树的大小和过拟合问题。始终要对参数进行交叉验证。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 提升树\n\n**工作原理**\n\n提升方法基于弱学习器，采用迭代的方式构建模型。在每一轮迭代中，都会添加一个新的弱学习器，而现有的所有弱学习器保持不变。每个弱学习器会根据其表现（如准确率）被赋予相应的权重。当新弱学习器加入后，数据会被重新加权：那些被错误分类的样本权重会增加，而正确分类的样本权重会减少。这样一来，后续的弱学习器会更加关注之前弱学习器未能正确分类的样本。\n\n**与随机森林的区别**\n\n* 随机森林是 **并行** 构建树，而提升树是 **串行** 的。\n* 随机森林主要降低方差，而提升树则通过减少偏差来降低误差。\n\n**XGBoost（极端梯度提升）**\n\n> XGBoost 使用了更正则化的模型形式来控制过拟合，因此性能更好。\n\n[返回顶部](#data-science-question-answer)\n\n\n### 多层感知机（MLP）\n\n一种多层前馈神经网络。每一层可以包含多个神经元，下一层的每个神经元都是上一层所有神经元的线性或非线性组合。为了训练网络，我们采用逐层反向传播误差的方法。理论上，MLP 可以逼近任意函数。\n\n![mlp](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_88a70583fb61.jpg)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 卷积神经网络（CNN）\n\n卷积层是卷积神经网络的基本构成单元。卷积层由一组可学习的滤波器组成（例如 5×5×3，宽度×高度×深度）。在前向传播过程中，我们会将滤波器在整个输入上滑动（更准确地说，进行卷积运算），并计算点积。网络通过逐层反向传播误差来完成学习。\n\n早期的卷积层主要捕捉低层次特征，如角度和边缘；而随着层数的增加，它们会学习到更高层次的特征，比如形状和物体的部分结构。\n\n![CNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_e53bb12da14b.jpg)\n\n[返回顶部](#data-science-question-answer)\n\n\n### 循环神经网络（RNN）与长短期记忆网络（LSTM）\n\nRNN 是另一种神经网络范式，它由多层细胞组成，每一层的细胞不仅接收来自上一层的输入，还会接收本层之前的细胞状态作为输入。这使得 RNN 具备建模序列的能力。\n\n![RNN](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_2a37bd4e9b12.jpeg)\n\n听起来很棒，但实际上 RNN 
往往难以有效工作，因为存在梯度爆炸或梯度消失的问题，这是由一系列相同矩阵的乘法造成的。为了解决这个问题，我们可以使用 RNN 的一种变体——长短期记忆网络（LSTM），它能够学习长期依赖关系。\n\nLSTM 的数学原理可能比较复杂，但从直观上看，LSTM 引入了以下机制：\n\n* 输入门\n* 输出门\n* 忘记门\n* 记忆细胞（内部状态）\n\nLSTM 类似于人类的记忆：它会忘记旧的信息（旧的内部状态乘以忘记门），同时从新的输入中学习（输入节点乘以输入门）。\n\n![lstm](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_66c92451008d.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n## 无监督学习\n\n* [聚类](#clustering)\n* [主成分分析](#principal-component-analysis)\n* [自编码器](#autoencoder)\n* [生成对抗网络](#generative-adversarial-network)\n\n### 聚类\n\n* 聚类是一种无监督学习算法，它将数据分组，使得同一组内的数据点彼此相似度高于与其他组数据点的相似度。\n* 相似度通常通过距离度量来定义（如欧氏距离、余弦相似度、Jaccard 系数等）。\n* 其目标通常是发现数据中的潜在结构（通常是高维数据）。\n* 最常见的聚类算法是 K-means，用户需要指定聚类的数量 K，算法会迭代地为每个数据点分配所属的聚类。\n\n[scikit-learn](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html) 实现了多种聚类算法。以下是其官网页面上的一张对比图。\n\n![clustering](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_a0db7d1bd529.png)\n\n[返回顶部](#data-science-question-answer)\n\n### 主成分分析\n\n* 主成分分析（PCA）是一种降维技术，它将数据投影到低维空间。\n* PCA 使用奇异值分解（SVD），这是一种矩阵分解方法，可以将一个矩阵分解为三个较小的矩阵（关于 SVD 的更多详细信息请参见 [这里](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSingular-value_decomposition)）。\n* PCA 会找到前 N 个主成分，这些主成分是数据变化（分散）最大的方向。直观地说，数据在某个特定维度上越分散，所包含的信息就越多，因此这个维度对于数据集的模式识别就越重要。\n* PCA 可以用作数据可视化的预处理步骤：将高维数据降维到 2D 或 3D。另一种降维技术是 [t-SNE](https:\u002F\u002Flvdmaaten.github.io\u002Ftsne\u002F)。\n\n以下是 PCA 的可视化解释：\n\n![pca](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_57dc89507f6d.gif)\n\n[返回顶部](#data-science-question-answer)\n\n\n\n### 自编码器\n\n* 自编码器的目标是为一组数据学习一种表示（编码）。\n* 自编码器通常由两部分组成：编码器和解码器。编码器会找到原始输入的低维表示（潜在变量），而解码器则用于从这个低维向量中重建原始数据，使得重建数据与原始数据之间的距离最小化。\n* 可用于数据去噪和降维。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_c0a13af7e7d7.png)\n\n\n### 生成对抗网络\n\n* 生成对抗网络（GAN）是一种无监督学习算法，同时也带有监督学习的特性：在训练过程中会使用监督损失。\n* GAN 
通常有两个主要组成部分：**生成器**和**判别器**。生成器试图生成“假”数据（例如图像或句子），以欺骗判别器使其认为这些数据是真实的；而判别器则试图区分真实数据和生成的数据。两者之间是一种对抗关系，因此得名“对抗”，这种对抗促使双方不断改进，直到“假”数据与真实数据无法区分。\n* 直观的工作原理：\n\t- 生成器接收一个**随机**输入，并生成一个数据样本。\n\t- 判别器随后会接收到生成的样本或真实的数据样本，尝试预测输入是真实数据还是生成的数据（即解决一个二分类问题）。\n\t- 在 [0, 1] 的真值评分范围内，理想情况下，我们希望判别器对生成的数据给出较低的分数，而对真实数据给出较高的分数。另一方面，我们也希望生成的数据能够成功欺骗判别器。这种矛盾推动双方不断强化自身。\n* 从训练的角度来看，工作原理如下：\n\t- 如果没有经过训练，生成器只会生成“垃圾”数据，而判别器则过于“天真”，无法区分真假数据。\n\t- 通常我们会先用真实数据（标签为 1）和生成数据（标签为 0）对判别器进行 N 个 epoch 的训练，使其能够较好地判断什么是真实数据，什么是生成数据。\n\t- 然后我们将**判别器设置为不可训练**，并开始训练生成器。尽管此时判别器处于不可训练状态，我们仍然将其作为分类器来使用，这样**误差信号就可以反向传播，从而使生成器得以学习**。\n\t- 上述两个步骤会交替进行，直到双方都无法再进一步提升。\n* 以下是一些使 GAN 能够有效工作的[技巧和窍门](https:\u002F\u002Fgithub.com\u002Fsoumith\u002Fganhacks)。\n* 需要注意的一点是，**对抗部分只是辅助性的：使用 GAN 的最终目标是生成连专家都难以分辨真假的数据**。\n\n![gan](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_718e5a3f0ed4.jpg)\n\n[返回顶部](#data-science-question-answer)\n\n\n## 强化学习\n\n[待办]\n\n## 自然语言处理\n\n* [分词](#tokenization)\n* [词干提取和词形还原](#stemming-and-lemmatization)\n* [N 元语法](#ngram)\n* [词袋模型](#bag-of-words)\n* [word2vec](#word2vec)\n\n\n### 分词\n\n* 分词是将字符序列转换为标记序列的过程。\n* 举个例子：`The quick brown fox jumped over the lazy dog`。在这种情况下，每个单词（由空格分隔）就是一个标记。\n* 有时分词并没有明确的答案。例如，`O'Neill` 可以被分词为 `o` 和 `neill`、`oneill` 或 `o'neill`。\n* 在某些情况下，分词需要特定于语言的知识。例如，将 `aren't` 分词为 `aren` 和 `t` 就没有意义。\n* 如需更详细的分词介绍，请参阅 [这里](https:\u002F\u002Fnlp.stanford.edu\u002FIR-book\u002Fhtml\u002Fhtmledition\u002Ftokenization-1.html)。\n\n[返回顶部](#data-science-question-answer)\n\n### 词干提取和词形还原\n\n* 词干提取和词形还原的目的都是将一个词的不同屈折形式以及有时相关的派生形式归约为一个共同的基础形式。\n* 词干提取通常指一种粗略的启发式过程，通过截断词尾来简化词形。\n* 词形还原通常是指利用词汇表和词的形态学分析来进行更规范的操作。\n* 如果面对标记 `saw`，词干提取可能会直接返回 `s`，而词形还原则会根据该标记是用作动词还是名词，尝试返回 `see` 或 `saw`。\n* 如需更详细的介绍，请参阅 [这里](https:\u002F\u002Fnlp.stanford.edu\u002FIR-book\u002Fhtml\u002Fhtmledition\u002Fstemming-and-lemmatization-1.html)。\n\n[返回顶部](#data-science-question-answer)\n\n\n### N 元语法\n\n* n 元语法是从给定文本或语音样本中连续选取的 n 个元素组成的序列。\n* 大小为 1 的 n 
元语法称为“一元语法”；大小为 2 的称为“二元语法”，大小为 3 的称为“三元语法”。更大的 n 值有时会用现代语言中的数字来表示，例如“四元语法”、“五元语法”等。\n* 举个例子：`The quick brown fox jumped over the lazy dog.`\n  - 二元语法将是 `the quick`、`quick brown`、`brown fox`……，即每两个连续的单词（或标记）。\n  - 三元语法将是 `the quick brown`、`quick brown fox`、`brown fox jumped`……，即每三个连续的单词（或标记）。\n* n 元语法模型是对序列的建模，即根据前面的 1、2、3……n-1 个词来预测下一个词（第 n 个词）。\n* 多元语法（二元及以上）能够捕捉**上下文**。\n* 选择 n 元语法中的 n 值需要通过实验，在估计的稳定性与适用性之间做出权衡。经验法则：对于大型训练语料库（数百万词），三元语法是一个常见的选择；而对于较小的语料库，则常使用二元语法。\n* n 元语法可以用作机器学习和下游自然语言处理任务的特征。\n\n[返回顶部](#data-science-question-answer)\n\n### 词袋模型\n\n* 为什么？机器学习模型无法直接处理原始文本；它们需要以数值作为输入。\n* 词袋模型（BoW）会构建一个包含数据集中所有唯一词汇的**词汇表**，并为词汇表中的每个词分配一个唯一的索引。\n* 它被称为“词袋”，是因为这种表示方法完全忽略了单词的顺序。\n* 举个例子：有两句话：(1) `John likes to watch movies, especially horor movies.`，(2) `Mary likes movies too.`。首先我们会构建一个不区分大小写且忽略标点符号的唯一词汇表：`[john, likes, to, watch, movies, especially, horor, mary, too]`。然后可以用词频来表示每句话，即某个词出现的次数。因此，句子 (1) 可表示为 `[1, 1, 1, 1, 2, 1, 1, 0, 0]`，而句子 (2) 则是 `[0, 1, 0, 0, 1, 0, 0, 1, 1]`。\n* 使用字典的一种常见替代方案是[哈希技巧](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FFeature_hashing)，它通过哈希函数将单词直接映射到索引。\n* 随着词汇表规模的增大（达到数万级别），用于表示短句或文档的向量会变得非常稀疏（几乎全是零）。\n\n[返回顶部](#data-science-question-answer)\n\n### word2vec\n\n* 是一种浅层的两层神经网络，经过训练可以捕捉词语的语言学上下文。\n* 输入是一个大型语料库，输出是一个通常由几百维组成的向量空间，语料库中的每个词都会被映射到这个空间中的一个向量。\n* 其核心思想是**上下文**：经常出现在相同上下文中的词，其含义应该相近或相反。\n* 主要有两种形式：\n    - 连续词袋模型（CBOW）：根据当前词周围的上下文窗口预测当前词。\n    - 跳字模型（Skip-Gram）：根据当前词预测周围的上下文词。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_936c70c9f198.png)\n\n[返回顶部](#data-science-question-answer)\n\n\n## 系统\n\n* [Cron 作业](#cron-job)\n* [Linux](#linux)\n\n### Cron 作业\n\n软件工具 **cron** 是类 Unix 操作系统中的一种**基于时间的任务调度器**。负责搭建和维护软件环境的人员使用 cron 来安排任务（命令或 Shell 脚本）在固定的时间、日期或间隔周期性地运行。它通常用于自动化系统维护或管理——尽管其通用性也使其适用于诸如定期从互联网下载文件或定时收发电子邮件等任务。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_readme_fd8cf847cbb9.png)\n\n工具：\n* [Apache 
Airflow](https:\u002F\u002Fairflow.apache.org\u002F)\n\n[返回顶部](#data-science-question-answer)\n\n\n### Linux\n\n以 **Ubuntu** 为例。\n\n* 切换到 root 用户：`sudo su`\n* 安装软件包：`sudo apt-get install \u003Cpackage>`\n\n[返回顶部](#data-science-question-answer)\n\n\n忏悔：部分图片来自网络，未注明原作者。如果您是这些图片的作者，并认为这会对您造成困扰，请告知我。","# data-science-question-answer 快速上手指南\n\n> **⚠️ 重要提示**：本仓库（`data-science-question-answer`）已**弃用**。作者建议转向最新的替代项目：**[Nailing Machine Learning Concepts](https:\u002F\u002Fgithub.com\u002Fjayinai\u002Fnail-ml-concept)**。\n>\n> 本指南基于原仓库内容整理，旨在帮助数据科学从业者准备面试或快速复习核心概念。本项目主要为**知识库\u002F文档类资源**，无需复杂的环境配置或安装过程。\n\n## 环境准备\n\n本项目本质是一个包含面试题解、概念解析和最佳实践的 Markdown 文档集合，因此对系统环境要求极低。\n\n*   **操作系统**：Windows, macOS, Linux 均可。\n*   **前置依赖**：\n    *   **Git**：用于克隆代码仓库。\n    *   **浏览器** 或 **Markdown 阅读器**：用于查看内容（如 VS Code, Typora, 或直接在 GitHub 网页端浏览）。\n    *   *(可选)* **Python & Jupyter**：如果你希望运行文中提到的部分代码示例（如 PySpark, Scikit-learn 相关），建议安装 Anaconda 或 Miniconda。\n\n## 安装步骤\n\n由于这是一个文档型仓库，\"安装\"即为克隆代码到本地。\n\n1.  **克隆仓库**\n    打开终端（Terminal 或 CMD），执行以下命令：\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fjayinai\u002Fdata-science-question-answer.git\n    ```\n\n    *国内用户加速方案（如果直接克隆速度慢）：*\n    ```bash\n    # 使用 Gitee 镜像（如果有）或通过代理加速\n    git clone https:\u002F\u002Fgitee.com\u002Fmirror\u002Fdata-science-question-answer.git \n    # 注：若官方无 Gitee 镜像，建议使用 git clone --depth=1 来减少下载量\n    git clone --depth=1 https:\u002F\u002Fgithub.com\u002Fjayinai\u002Fdata-science-question-answer.git\n    ```\n\n2.  **进入目录**\n    ```bash\n    cd data-science-question-answer\n    ```\n\n3.  **查看内容**\n    *   **方式 A（推荐）**：直接在 GitHub 网页版浏览，体验最佳（含目录跳转和图片渲染）。\n    *   **方式 B**：在本地使用 VS Code 打开文件夹，安装 `Markdown Preview Enhanced` 插件进行预览。\n    *   **方式 C**：直接使用文本编辑器打开 `README.md` 文件。\n\n## 基本使用\n\n本工具的核心用法是**按需查阅**特定主题的知识要点。以下是几个典型的使用场景示例：\n\n### 1. 
准备简历优化\n在 `Resume` 章节，学习如何将项目经历量化。\n*   **错误示范**：`Trained a machine learning system`\n*   **正确示范**：`Designed and deployed a deep learning model to recognize objects using Keras, Tensorflow, and Node.js. The model has 1\u002F30 model size, 1\u002F3 training time, 1\u002F5 inference time, and 2x faster convergence compared with traditional neural networks.`\n*   **操作**：阅读文档中关于简历的建议，检查并修改你自己的简历描述。\n\n### 2. 复习 SQL 连接区别\n在 `SQL` 章节快速回顾 Join 的类型。\n*   **操作**：搜索 `Difference between joins`。\n*   **核心知识点**：\n    *   `INNER JOIN`: 返回两表匹配的记录。\n    *   `LEFT JOIN`: 返回左表所有记录及右表匹配记录。\n    *   `RIGHT JOIN`: 返回右表所有记录及左表匹配记录。\n    *   `FULL JOIN`: 返回任一表有匹配的所有记录。\n\n### 3. 理解机器学习工作流\n在 `Statistics and ML In General` -> `Project Workflow` 章节，掌握标准的数据科学项目流程。\n*   **关键步骤速查**：\n    1.  **Specify business objective**: 明确业务目标（如增加营收）。\n    2.  **Define problem**: 定义具体问题。\n    3.  **Create a common sense baseline**: 建立常识基线（如推荐 Top N 热门商品）。\n    4.  **EDA**: 探索性数据分析。\n    5.  **Feature Engineering**: 特征工程（核心环节）。\n    6.  **Model Development & Ensemble**: 模型开发与集成。\n    7.  **Deploy & Monitor**: 部署与监控。\n\n### 4. 辨析核心概念 (面试高频)\n利用文档快速对比易混淆概念，例如：\n*   **L1 vs L2 正则化**：\n    *   L1 (Lasso): 可将系数压缩为 0，用于特征选择，产生稀疏解。\n    *   L2 (Ridge): 按比例缩小系数但不为 0，防止过拟合。\n*   **MSE vs MAE**：\n    *   MSE: 对大误差惩罚更重（平方级），连续可导。\n    *   MAE: 对误差线性惩罚，鲁棒性更强但不可导点较多。\n*   **欠拟合对策**：\n    *   增加数据量**不能**解决欠拟合。\n    *   解决方法：增加模型复杂度（如增加树深度、神经网络层数、多项式特征等）。\n\n### 5. 
查阅 Spark 速查表\n在 `Tools and Framework` -> `Spark` 章节。\n*   **操作**：文档提供了 [PySpark Cheat Sheet](https:\u002F\u002Fs3.amazonaws.com\u002Fassets.datacamp.com\u002Fblog_assets\u002FPySpark_Cheat_Sheet_Python.pdf) 的链接，适合时间紧迫时快速扫描基础 API。\n\n---\n*注：本指南仅涵盖基础查阅流程。如需深入某个具体算法（如 Stacking, Bagging, NLP 等），请直接翻阅仓库对应的 Markdown 章节。*","一位刚转行数据科学的求职者正在紧急备战下周的技术面试，同时需要优化简历以突出项目亮点。\n\n### 没有 data-science-question-answer 时\n- 简历描述空洞，只写“训练过机器学习系统”，缺乏量化指标和技术栈细节，难以吸引面试官注意。\n- 面对\"SQL 连接类型区别”或\"L1 与 L2 正则化差异”等基础概念题，需翻阅多本厚书或搜索零散博客，复习效率极低。\n- 对 Spark 等框架仅停留在理论层面，缺乏快速查阅的速查表（Cheat Sheet），无法在短时间内梳理 API 核心用法。\n- 缺乏系统性的知识广度梳理，容易在“生成式与判别式模型”等宏观对比问题上逻辑混乱，回答不够专业。\n\n### 使用 data-science-question-answer 后\n- 参考简历章节，将项目经历改写为“使用 Keras 和 Tensorflow 部署深度学习模型，推理速度提升 5 倍”，具体量化成果并明确技术栈。\n- 直接查阅 SQL 和统计学章节，快速获取内连接与外连接的清晰定义及图示，几分钟内掌握面试高频考点。\n- 利用工具提供的 PySpark 速查表和官方文档指引，迅速重温关键 API，建立起从理论到代码的快速映射。\n- 通过浏览监督学习、无监督学习等分类目录，构建完整的知识图谱，确保在回答宏观概念对比时条理清晰、覆盖全面。\n\ndata-science-question-answer 通过提供结构化的速查指南和实战建议，帮助从业者高效填补知识盲区并显著提升面试竞争力。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjayinai_data-science-question-answer_a9f0e2fd.png","jayinai",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fjayinai_7fb89f6d.png","Tech Lead, Machine Learning Engineer","https:\u002F\u002Fgithub.com\u002Fjayinai",[78],{"name":79,"color":80,"percentage":81},"Jupyter Notebook","#DA5B0B",100,2418,648,"2026-04-08T04:35:38","MIT",1,"","未说明",{"notes":90,"python":88,"dependencies":91},"该仓库已废弃（deprecated），作者建议转向新的项目 'Nailing Machine Learning Concepts'。当前内容主要为数据科学面试准备的概念性问答和快速参考指南（涵盖 SQL、统计学、机器学习理论等），并非可运行的软件工具或代码库，因此无需特定的操作系统、GPU、内存或依赖库环境。",[],[16,14,93],"其他",[95,96,97,98,99,100,101],"data-science","machine-learning","deep-learning","sql","statistics","system","reinforcement-learning","2026-03-27T02:49:30.150509","2026-04-14T12:28:04.029202",[],[]]