[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-andrewekhalel--MLQuestions":3,"tool-andrewekhalel--MLQuestions":65},[4,23,32,40,49,57],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":22},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85052,2,"2026-04-08T11:03:08",[13,14,15,16,17,18,19,20,21],"图像","数据工具","视频","插件","Agent","其他","语言模型","开发框架","音频","ready",{"id":24,"name":25,"github_repo":26,"description_zh":27,"stars":28,"difficulty_score":29,"last_commit_at":30,"category_tags":31,"status":22},5784,"funNLP","fighting41love\u002FfunNLP","funNLP 是一个专为中文自然语言处理（NLP）打造的超级资源库，被誉为\"NLP 民工的乐园”。它并非单一的软件工具，而是一个汇集了海量开源项目、数据集、预训练模型和实用代码的综合性平台。\n\n面对中文 NLP 领域资源分散、入门门槛高以及特定场景数据匮乏的痛点，funNLP 提供了“一站式”解决方案。这里不仅涵盖了分词、命名实体识别、情感分析、文本摘要等基础任务的标准工具，还独特地收录了丰富的垂直领域资源，如法律、医疗、金融行业的专用词库与数据集，甚至包含古诗词生成、歌词创作等趣味应用。其核心亮点在于极高的全面性与实用性，从基础的字典词典到前沿的 BERT、GPT-2 模型代码，再到高质量的标注数据和竞赛方案，应有尽有。\n\n无论是刚刚踏入 NLP 领域的学生、需要快速验证想法的算法工程师，还是从事人工智能研究的学者，都能在这里找到急需的“武器弹药”。对于开发者而言，它能大幅减少寻找数据和复现模型的时间；对于研究者，它提供了丰富的基准测试资源和前沿技术参考。funNLP 以开放共享的精神，极大地降低了中文自然语言处理的开发与研究成本，是中文 AI 社区不可或缺的宝藏仓库。",79857,1,"2026-04-08T20:11:31",[19,14,18],{"id":33,"name":34,"github_repo":35,"description_zh":36,"stars":37,"difficulty_score":29,"last_commit_at":38,"category_tags":39,"status":22},5773,"cs-video-courses","Developer-Y\u002Fcs-video-courses","cs-video-courses 是一个精心整理的计算机科学视频课程清单，旨在为自学者提供系统化的学习路径。它汇集了全球知名高校（如加州大学伯克利分校、新南威尔士大学等）的完整课程录像，涵盖从编程基础、数据结构与算法，到操作系统、分布式系统、数据库等核心领域，并深入延伸至人工智能、机器学习、量子计算及区块链等前沿方向。\n\n面对网络上零散且质量参差不齐的教学资源，cs-video-courses 解决了学习者难以找到成体系、高难度大学级别课程的痛点。该项目严格筛选内容，仅收录真正的大学层级课程，排除了碎片化的简短教程或商业广告，确保用户能接触到严谨的学术内容。\n\n这份清单特别适合希望夯实计算机基础的开发者、需要补充特定领域知识的研究人员，以及渴望像在校生一样系统学习计算机科学的自学者。其独特的技术亮点在于分类极其详尽，不仅包含传统的软件工程与网络安全，还细分了生成式 AI、大语言模型、计算生物学等新兴学科，并直接链接至官方视频播放列表，让用户能一站式获取高质量的教育资源，免费享受世界顶尖大学的课堂体验。",79792,"2026-04-08T22:03:59",[18,13,14,20],{"id":41,"name":42,"github_repo":43,"description_zh":44,"stars":45,"difficulty_score":46,"last_commit_at":47,"category_tags":48,"status":22},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,3,"2026-04-04T04:44:48",[17,13,20,19,18],{"id":50,"name":51,"github_repo":52,"description_zh":53,"stars":54,"difficulty_score":46,"last_commit_at":55,"category_tags":56,"status":22},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 Windows、Linux、macOS 等多个系统上运行，并灵活适配 CPU、GPU、NPU 
等各类硬件。作为一个轻量级且社区活跃的开源项目，PaddleOCR 既能满足快速集成的需求，也能支撑前沿的视觉语言研究，是处理文字识别任务的理想选择。",75229,"2026-04-09T11:17:25",[19,13,20,18],{"id":58,"name":59,"github_repo":60,"description_zh":61,"stars":62,"difficulty_score":29,"last_commit_at":63,"category_tags":64,"status":22},3215,"awesome-machine-learning","josephmisiti\u002Fawesome-machine-learning","awesome-machine-learning 是一份精心整理的机器学习资源清单，汇集了全球优秀的机器学习框架、库和软件工具。面对机器学习领域技术迭代快、资源分散且难以甄选的痛点，这份清单按编程语言（如 Python、C++、Go 等）和应用场景（如计算机视觉、自然语言处理、深度学习等）进行了系统化分类，帮助使用者快速定位高质量项目。\n\n它特别适合开发者、数据科学家及研究人员使用。无论是初学者寻找入门库，还是资深工程师对比不同语言的技术选型，都能从中获得极具价值的参考。此外，清单还延伸提供了免费书籍、在线课程、行业会议、技术博客及线下聚会等丰富资源，构建了从学习到实践的全链路支持体系。\n\n其独特亮点在于严格的维护标准：明确标记已停止维护或长期未更新的项目，确保推荐内容的时效性与可靠性。作为机器学习领域的“导航图”，awesome-machine-learning 以开源协作的方式持续更新，旨在降低技术探索门槛，让每一位从业者都能高效地站在巨人的肩膀上创新。",72149,"2026-04-03T21:50:24",[20,18],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":81,"owner_email":82,"owner_twitter":80,"owner_website":80,"owner_url":83,"languages":80,"stars":84,"forks":85,"last_commit_at":86,"license":80,"difficulty_score":29,"env_os":87,"env_gpu":88,"env_ram":88,"env_deps":89,"category_tags":92,"github_topics":80,"view_count":10,"oss_zip_url":80,"oss_zip_packed_at":80,"status":22,"created_at":93,"updated_at":94,"faqs":95,"releases":96},5878,"andrewekhalel\u002FMLQuestions","MLQuestions","Machine Learning and Computer Vision Engineer - Technical Interview Questions","MLQuestions 是一个专为机器学习和计算机视觉工程师打造的面试题库，旨在帮助求职者高效备战技术面试。它收录了涵盖偏差与方差权衡、梯度下降原理、过拟合与欠拟合应对策略等核心概念的 65 道经典面试题，并近期扩展了自然语言处理（NLP）领域的专项问题。\n\n对于许多致力于进入顶尖科技公司或初创企业的开发者而言，技术面试往往充满挑战，难以系统性地复习关键知识点。MLQuestions 通过整理高频考题并提供权威解答链接，解决了备考资源分散、重点不明的痛点，让用户能够针对性地巩固理论基础与工程实践知识。此外，项目还推荐了统计学、机器学习系统设计等专业书籍及模拟面试资源，构建了完整的备考生态。\n\n这款工具非常适合正在寻找机器学习、深度学习或计算机视觉相关岗位的工程师、研究人员以及应届毕业生使用。无论你是希望检验自身知识盲区，还是想要系统梳理算法原理，MLQuestions 都能提供清晰的学习路径。其内容紧跟 2026 年行业趋势，不仅关注传统模型理论，也涵盖了生产级系统设计的实战思考，是提升面试竞争力的实用助手。","# 65 Machine Learning Interview Questions 2026\nA collection of technical interview questions for machine learning and computer vision engineering positions.\n\n### Recently added: [Natural Language Processing (NLP) Interview Questions 2026](https:\u002F\u002Fgithub.com\u002Fandrewekhalel\u002FMLQuestions\u002Ftree\u002Fmaster\u002FNLP)\n\n## Preparation Resources\n1. [ML Engineer Interview Course](https:\u002F\u002Fwww.tryexponent.com\u002Fcourses\u002Fml-engineer?ref=zjgwmje&tap_s=5026306-8f044e)\n1. [Mock ML Interview](https:\u002F\u002Fwww.tryexponent.com\u002Fcoaching?ref=zjgwmje&tap_s=5026306-8f044e&category=mock_interviews&src=nav&skill=ml): Get ready for your next interview by practicing with ML engineers from top tech companies and startups.\n1. [All of Statistics: A Concise Course in Statistical Inference](https:\u002F\u002Famzn.to\u002F3r87WGa) by Larry Wasserman\n2. [Machine Learning](https:\u002F\u002Famzn.to\u002F3RdiFK3) by Tom Mitchell\n3. 
[Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications](https:\u002F\u002Famzn.to\u002F3LiVgD2) by Chip Huyen\n\n---\nThis page is sponsored by [Jobbyo](https:\u002F\u002Fjobbyo.ai\u002F?linkId=lp_801223&sourceId=akhalel&tenantId=jobbyoai):\n\u003Cp align=center style=\"font-style:italic;\">\"If you’re focused on interview prep, let \u003Ca href=\"https:\u002F\u002Fjobbyo.ai\u002F?linkId=lp_801223&sourceId=akhalel&tenantId=jobbyoai\">Jobbyo\u003C\u002Fa> handle the busywork of applying.\nIt automates applications and keeps your job search organized while you stay sharp\"\u003C\u002Fp>\n\nUse promo code \"MLQUESTIONS\" at checkout to get 20% off your Premium subscription for 3 months!\n\n## Questions\n\n#### 1) What's the trade-off between bias and variance? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n\nIf our model is too simple and has very few parameters then it may have high bias and low variance. On the other hand, if our model has a large number of parameters then it’s going to have high variance and low bias. So we need to find the right balance without overfitting or underfitting the data. [[src]](https:\u002F\u002Ftowardsdatascience.com\u002Funderstanding-the-bias-variance-tradeoff-165e6942b229)\n\n#### 2) What is gradient descent? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n[[Answer]](https:\u002F\u002Fmachinelearningmastery.com\u002Fgradient-descent-for-machine-learning\u002F)\n\nGradient descent is an optimization algorithm used to find the values of parameters (coefficients) of a function (f) that minimizes a cost function (cost).\n\nGradient descent is best used when the parameters cannot be calculated analytically (e.g. using linear algebra) and must be searched for by an optimization algorithm.\n\n#### 3) Explain over- and under-fitting and how to combat them? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n[[Answer]](https:\u002F\u002Ftowardsdatascience.com\u002Foverfitting-vs-underfitting-a-complete-example-d05dd7e19765)\n\nML\u002FDL models essentially learn a relationship between their given inputs (called training features) and objective outputs (called labels). Regardless of the quality of the learned relation (function), its performance on a test set (a collection of data different from the training input) is subject to investigation.\n\nMost ML\u002FDL models have trainable parameters which will be learned to build that input-output relationship. Based on the number of parameters each model has, they can be sorted from more flexible (more parameters) to less flexible (fewer parameters).\n\nThe problem of underfitting arises when the flexibility of a model (its number of parameters) is not adequate to capture the underlying pattern in a training dataset. Overfitting, on the other hand, arises when the model is too flexible for the underlying pattern. In the latter case it is said that the model has “memorized” the training data.\n\nAn example of underfitting is estimating a second order polynomial (quadratic function) with a first order polynomial (a simple line). Similarly, estimating a line with a 10th order polynomial would be an example of overfitting.
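\n\nA minimal NumPy sketch of the polynomial example above (sample size, degrees, and noise level are illustrative assumptions):\n\n```\nimport numpy as np\n\n# Noisy samples from a quadratic ground truth\nrng = np.random.default_rng(0)\nx = np.linspace(-1, 1, 20)\ny = x**2 + rng.normal(0, 0.05, x.size)\n\nunderfit = np.polyfit(x, y, deg=1)   # a line cannot capture the curvature\ngood = np.polyfit(x, y, deg=2)       # matches the true model\noverfit = np.polyfit(x, y, deg=10)   # flexible enough to memorize the noise\n```\n\n\n#### 4) How do you combat the curse of dimensionality? 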
[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n\n - Feature selection (manual or via statistical methods)\n - Principal Component Analysis (PCA)\n - Multidimensional Scaling\n - Locally linear embedding  \n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fwhy-and-how-to-get-rid-of-the-curse-of-dimensionality-right-with-breast-cancer-dataset-7d528fb5f6c0)\n\n#### 5) What is regularization, why do we use it, and give some examples of common methods? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nA technique that discourages learning a more complex or flexible model, so as to avoid the risk of overfitting. \nExamples:\n - Ridge (L2 norm)\n - Lasso (L1 norm)  \nThe obvious *disadvantage* of **ridge** regression is model interpretability. It will shrink the coefficients for the least important predictors very close to zero, but it will never make them exactly zero. In other words, the *final model will include all predictors*. However, in the case of the **lasso**, the L1 penalty has the effect of forcing some of the coefficient estimates to be *exactly equal* to zero when the tuning parameter λ is sufficiently large. Therefore, the lasso method also performs variable selection and is said to yield sparse models.\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fregularization-in-machine-learning-76441ddcf99a)\n\n#### 6) Explain Principal Component Analysis (PCA)? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n[[Answer]](https:\u002F\u002Ftowardsdatascience.com\u002Fa-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c)\n\nPrincipal Component Analysis (PCA) is a dimensionality reduction technique used in machine learning to reduce the number of features in a dataset while retaining as much information as possible. It works by identifying the directions (principal components) in which the data varies the most, and projecting the data onto a lower-dimensional subspace along these directions.\n\n#### 7) Why is ReLU better and more often used than Sigmoid in Neural Networks? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n\n* Computation efficiency:\n  As ReLU is a simple threshold, the forward and backward passes are faster.\n* Reduced likelihood of vanishing gradient:\n  The gradient of ReLU is 1 for positive values and 0 for negative values, while the Sigmoid activation saturates quickly (gradients close to 0) as the input magnitude grows, leading to vanishing gradients.\n* Sparsity:\n  Sparsity happens when the input of ReLU is negative. This means fewer neurons are firing (sparse activation) and the network is lighter. \n\n\n[[src1]](https:\u002F\u002Fmedium.com\u002Fthe-theory-of-everything\u002Funderstanding-activation-functions-in-neural-networks-9491262884e0) [[src2]](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F126238\u002Fwhat-are-the-advantages-of-relu-over-sigmoid-function-in-deep-neural-networks)
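\n\n\n\n#### 8) Given stride S and kernel sizes for each layer of a (1-dimensional) CNN, create a function to compute the [receptive field](https:\u002F\u002Fwww.quora.com\u002FWhat-is-a-receptive-field-in-a-convolutional-neural-network) of a particular node in the network. This is just finding how many input nodes actually connect through to a neuron in a CNN. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nThe receptive field is the portion of the input that can influence a given output node. For a single convolutional layer with a filter of size k, a neuron's receptive field is simply k, times any input dimension the filter does not reduce: for a 32x32x3 image and a 5x5 filter, a first-layer neuron sees a 5x5x3 region. For nodes deeper in the network, each layer with kernel size k widens the field by (k - 1) times the product of the strides of all earlier layers.\n\nA minimal sketch of the requested function (the layer configuration in the example is an illustrative assumption):\n\n```\ndef receptive_field(layers):\n    # layers: list of (kernel_size, stride) pairs, ordered from the input side\n    rf, jump = 1, 1           # field size and step between adjacent nodes, in input units\n    for k, s in layers:\n        rf += (k - 1) * jump  # each layer widens the field by (k-1) input-spaced steps\n        jump *= s             # strides compound multiplicatively\n    return rf\n\nprint(receptive_field([(5, 2), (3, 2), (3, 1)]))  # -> 17\n```\n\n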
#### 9) Implement [connected components](http:\u002F\u002Faishack.in\u002Ftutorials\u002Flabelling-connected-components-example\u002F) on an image\u002Fmatrix. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n\n#### 10) Implement a sparse matrix class in C++. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[Answer]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Fsparse-matrix-representation\u002F)\n\n#### 11) Create a function to compute an [integral image](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSummed-area_table), and create another function to get area sums from the integral image. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[Answer]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Fsubmatrix-sum-queries\u002F)\n\n#### 12) How would you remove outliers when trying to estimate a flat plane from noisy samples? [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nRandom sample consensus (RANSAC) is an iterative method to estimate parameters of a mathematical model from a set of observed data that contains outliers, when outliers are to be accorded no influence on the values of the estimates.\n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FRandom_sample_consensus)
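\n\nA minimal RANSAC sketch for the plane-fitting case (threshold, iteration count, and the three-point plane fit are illustrative assumptions):\n\n```\nimport numpy as np\n\ndef ransac_plane(points, n_iters=200, threshold=0.05, seed=0):\n    # points: (N, 3) array; returns a boolean mask of inliers to the best plane found\n    rng = np.random.default_rng(seed)\n    best_inliers = np.zeros(len(points), dtype=bool)\n    for _ in range(n_iters):\n        sample = points[rng.choice(len(points), 3, replace=False)]\n        normal = np.cross(sample[1] - sample[0], sample[2] - sample[0])\n        norm = np.linalg.norm(normal)\n        if norm == 0:                       # degenerate (collinear) sample\n            continue\n        normal = normal \u002F norm\n        d = -normal @ sample[0]\n        dist = np.abs(points @ normal + d)  # point-to-plane distance (unit normal)\n        inliers = dist \u003C threshold\n        if inliers.sum() > best_inliers.sum():\n            best_inliers = inliers\n    return best_inliers\n```\n\n#### 13) How does [CBIR](https:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fpublications\u002F2013\u002Farandjelovic13\u002Farandjelovic13.pdf) work? [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[Answer]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FContent-based_image_retrieval)\nContent-based image retrieval is the concept of using images to gather metadata on their content. Compared to the current image retrieval approach based on the keywords associated with the images, this technique generates its metadata from computer vision techniques to extract the relevant information that will be used during the querying step. Many approaches are possible, from feature detection to retrieve keywords to the usage of CNNs to extract dense features that will be associated with a known distribution of keywords. 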
\n\nWith this last approach, we care less about what is shown in the image and more about the similarity between the metadata generated by a known image and a list of known labels and\u002For tags projected into this metadata space.\n\n#### 14) How does image registration work? Sparse vs. dense [optical flow](http:\u002F\u002Fwww.ncorr.com\u002Fdownload\u002Fpublications\u002Fbakerunify.pdf) and so on. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n#### 15) Describe how convolution works. What about if your inputs are grayscale vs RGB imagery? What determines the shape of the next layer? [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)] \nIn a convolutional neural network (CNN), the convolution operation is applied to the input image using a small matrix called a kernel or filter. The kernel slides over the image in small steps, called strides, and performs element-wise multiplications with the corresponding elements of the image and then sums up the results. The output of this operation is called a feature map.\n\nWhen the input is RGB (or has more than 3 channels) the sliding window will be a sliding cube. The shape of the next layer is determined by kernel size, number of kernels, stride, padding, and dilation.\n\n[[src1]](https:\u002F\u002Fdev.to\u002Fsandeepbalachandran\u002Fmachine-learning-convolution-with-color-images-2p41) [[src2]](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F70231487\u002Foutput-dimensions-of-convolution-in-pytorch)\n\n#### 16) Talk me through how you would create a 3D model of an object from imagery and depth sensor measurements taken at all angles around the object. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nThere are two popular methods for 3D reconstruction:\n* Structure from Motion (SfM) [[src]](https:\u002F\u002Fwww.mathworks.com\u002Fhelp\u002Fvision\u002Fug\u002Fstructure-from-motion.html)\n\n* Multi-View Stereo (MVS) [[src]](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Zwwty2qPNs8)\n\nSfM is better suited for creating models of large scenes while MVS is better suited for creating models of small objects.\n\n\n#### 17) Implement SQRT(const double & x) without using any special functions, just fundamental arithmetic. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nThe Taylor series can be used to provide an approximation of sqrt(x):\n\n[[Answer]](https:\u002F\u002Fmath.stackexchange.com\u002Fquestions\u002F732540\u002Ftaylor-series-of-sqrt1x-using-sigma-notation)
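\n\nAnother common approach is Newton's iteration x_{n+1} = (x_n + a \u002F x_n) \u002F 2, which converges quadratically to sqrt(a). A minimal sketch (tolerance and starting guess are illustrative assumptions):\n\n```\ndef my_sqrt(a, tol=1e-12):\n    # Newton's method on f(x) = x*x - a, using only basic arithmetic; assumes a >= 0\n    x = a if a > 1 else 1.0   # crude but safe starting guess\n    while abs(x * x - a) > tol * max(a, 1.0):\n        x = 0.5 * (x + a \u002F x)\n    return x\n```\n\n#### 18) Reverse a bitstring. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nIf you are using Python 3:\n\n```\ndata = b'\\xAD\\xDE\\xDE\\xC0'\nmy_data = bytearray(data)\nmy_data.reverse()   # note: this reverses the byte order only\n# to reverse the individual bits as well (32 bits for this 4-byte example):\nn = int.from_bytes(data, 'big')\nreversed_bits = int(format(n, '032b')[::-1], 2).to_bytes(4, 'big')\n```\n#### 19) Implement non maximal suppression as efficiently as you can. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\nNon-Maximum Suppression (NMS) is a technique used to eliminate multiple detections of the same object in a given image.\nTo solve that, first sort the bounding boxes based on their scores (N log N). 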
Starting with the box with the highest score, remove boxes whose overlap metric (IoU) with it is greater than a certain threshold (N^2).\n\nTo optimize this solution you can use special data structures to query for overlapping boxes, such as an R-tree or k-d tree (N log N).\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fnon-maxima-suppression-139f7e00f0b5)\n\n#### 20) Reverse a linked list in place. [[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[Answer]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Freverse-a-linked-list\u002F)\n\n#### 21) What is data normalization and why do we need it? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nData normalization is a very important preprocessing step, used to rescale values to fit in a specific range and to assure better convergence during backpropagation. In general, it boils down to subtracting each feature's mean and dividing by its standard deviation. If we don't do this then some of the features (those with high magnitude) will be weighted more in the cost function (if a higher-magnitude feature changes by 1%, then that change is pretty big, but for smaller features it's quite insignificant). Data normalization makes all features weighted equally.\n\n#### 22) Why do we use convolutions for images rather than just FC layers? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nFirstly, convolutions preserve, encode, and actually use the spatial information from the image. If we used only FC layers we would have no relative spatial information. Secondly, Convolutional Neural Networks (CNNs) have a partially built-in translation invariance, since each convolution kernel acts as its own filter\u002Ffeature detector.\n\n#### 23) What makes CNNs translation invariant? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nAs explained above, each convolution kernel acts as its own filter\u002Ffeature detector. So let's say you're doing object detection: it doesn't matter where in the image the object is, since we're going to apply the convolution in a sliding window fashion across the entire image anyway.\n\n#### 24) Why do we have max-pooling in classification CNNs? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nMax-pooling in a CNN allows you to reduce computation since your feature maps are smaller after the pooling. You don't lose too much semantic information since you're taking the maximum activation. There's also a theory that max-pooling contributes a bit to giving CNNs more translation invariance. Check out this great video from Andrew Ng on the [benefits of max-pooling](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fconvolutional-neural-networks\u002Flecture\u002FhELHk\u002Fpooling-layers).
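\n\nA tiny NumPy sketch of 2x2 max-pooling on a 4x4 feature map (the sizes are illustrative assumptions):\n\n```\nimport numpy as np\n\nx = np.arange(16.0).reshape(4, 4)\npooled = x.reshape(2, 2, 2, 2).max(axis=(1, 3))  # (4, 4) -> (2, 2), keeping each block's maximum\n```\n\n#### 25) Why do segmentation CNNs typically have an encoder-decoder style \u002F structure? 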
[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nThe encoder CNN can basically be thought of as a feature extraction network, while the decoder uses that information to predict the image segments by \"decoding\" the features and upscaling to the original image size.\n\n#### 26) What is the significance of Residual Networks? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nThe main thing that residual connections did was allow for direct feature access from previous layers. This makes information propagation throughout the network much easier. One very interesting paper about this shows how using local skip connections gives the network a type of ensemble multi-path structure, giving features multiple paths to propagate throughout the network.\n\n#### 27) What is batch normalization and why does it work? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nTraining Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. The idea is then to normalize the inputs of each layer in such a way that they have a mean output activation of zero and standard deviation of one. This is done for each individual mini-batch at each layer, i.e. compute the mean and variance of that mini-batch alone, then normalize. This is analogous to how the inputs to networks are standardized. How does this help? We know that normalizing the inputs to a network helps it learn. But a network is just a series of layers, where the output of one layer becomes the input to the next. That means we can think of any layer in a neural network as the first layer of a smaller subsequent network. Thought of as a series of neural networks feeding into each other, we normalize the output of one layer before applying the activation function, and then feed it into the following layer (sub-network).\n\n#### 28) Why would you use many small convolutional kernels such as 3x3 rather than a few large ones? [[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\nThis is very well explained in the [VGGNet paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.1556.pdf). There are 2 reasons: First, you can use several smaller kernels rather than a few large ones to get the same receptive field and capture more spatial context, but with the smaller kernels you are using fewer parameters and computations. Secondly, because with smaller kernels you will be using more filters, you'll be able to use more activation functions and thus have a more discriminative mapping function being learned by your CNN.
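\n\nA quick parameter count behind the first point (C channels in and out, biases ignored; C = 64 is an illustrative assumption):\n\n```\nC = 64\none_5x5 = 5 * 5 * C * C        # 102400 parameters\ntwo_3x3 = 2 * (3 * 3 * C * C)  # 73728 parameters, same 5x5 receptive field\n```\n\n#### 29) Why do we need a validation set and test set? What is the difference between them? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nWhen training a model, we divide the available data into three separate sets:\n\n - The training dataset is used for fitting the model’s parameters. However, the accuracy that we achieve on the training set is not reliable for predicting if the model will be accurate on new samples.\n - The validation dataset is used to measure how well the model does on examples that weren’t part of the training dataset. 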
The metrics computed on the validation data can be used to tune the hyperparameters of the model. However, every time we evaluate on the validation data and make decisions based on those scores, we are leaking information from the validation data into our model. The more evaluations, the more information is leaked. So we can end up overfitting to the validation data, and once again the validation score won’t be reliable for predicting the behaviour of the model in the real world.\n - The test dataset is used to measure how well the model does on previously unseen examples. It should only be used once we have tuned the parameters using the validation set.\n\nSo if we omit the test set and only use a validation set, the validation score won’t be a good estimate of the generalization of the model.\n\n#### 30) What is stratified cross-validation and when should we use it? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nCross-validation is a technique for dividing data between training and validation sets. In typical cross-validation this split is done randomly. But in stratified cross-validation, the split preserves the ratio of the categories on both the training and validation datasets.\n\nFor example, if we have a dataset with 10% of category A and 90% of category B, and we use stratified cross-validation, we will have the same proportions in training and validation. In contrast, if we use simple cross-validation, in the worst case we may find that there are no samples of category A in the validation set.\n\nStratified cross-validation may be applied in the following scenarios:\n\n - On a dataset with multiple categories. The smaller the dataset and the more imbalanced the categories, the more important it will be to use stratified cross-validation.\n - On a dataset with data of different distributions. For example, in a dataset for autonomous driving, we may have images taken during the day and at night. If we do not ensure that both types are present in training and validation, we will have generalization problems.\n\n#### 31) Why do ensembles typically have higher scores than individual models? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nAn ensemble is the combination of multiple models to create a single prediction. The key idea for making better predictions is that the models should make different errors. That way the errors of one model will be compensated by the right guesses of the other models and thus the score of the ensemble will be higher.\n\nWe need diverse models for creating an ensemble. Diversity can be achieved by:\n - Using different ML algorithms. For example, you can combine logistic regression, k-nearest neighbors, and decision trees (see the sketch after this answer).\n - Using different subsets of the data for training. This is called bagging.\n - Giving a different weight to each of the samples of the training set. If this is done iteratively, weighting the samples according to the errors of the ensemble, it’s called boosting.\n\nMany winning solutions to data science competitions are ensembles. However, in real-life machine learning projects, engineers need to find a balance between execution time and accuracy.
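\n\nA minimal scikit-learn sketch of the first diversity option, combining different algorithms by majority vote (dataset and hyperparameters are illustrative assumptions):\n\n```\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import VotingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\n\nX, y = make_classification(n_samples=500, random_state=0)\nensemble = VotingClassifier([\n    ('lr', LogisticRegression(max_iter=1000)),\n    ('knn', KNeighborsClassifier()),\n    ('tree', DecisionTreeClassifier()),\n])  # the majority vote compensates for the individual models' different errors\nprint(ensemble.fit(X, y).score(X, y))\n```\n\n#### 32) What is an imbalanced dataset? Can you list some ways to deal with it? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nAn imbalanced dataset is one that has different proportions of target categories. 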
For example, a dataset with medical images where we have to detect some illness will typically have many more negative samples than positive samples—say, 98% of images are without the illness and 2% of images are with the illness.\n\nThere are different options to deal with imbalanced datasets:\n - Oversampling or undersampling. Instead of sampling with a uniform distribution from the training dataset, we can use other distributions so the model sees a more balanced dataset.\n - Data augmentation. We can add data in the less frequent categories by modifying existing data in a controlled way. In the example dataset, we could flip the images with illnesses, or add noise to copies of the images in such a way that the illness remains visible.\n - Using appropriate metrics. In the example dataset, if we had a model that always made negative predictions, it would achieve an accuracy of 98%. Metrics such as precision, recall, and the F-score describe the performance of the model better when using an imbalanced dataset.\n\n#### 33) Can you explain the differences between supervised, unsupervised, and reinforcement learning? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nIn supervised learning, we train a model to learn the relationship between input data and output data. We need to have labeled data to be able to do supervised learning.\n\nWith unsupervised learning, we only have unlabeled data. The model learns a representation of the data. Unsupervised learning is frequently used to initialize the parameters of the model when we have a lot of unlabeled data and a small fraction of labeled data. We first train an unsupervised model and, after that, we use the weights of the model to train a supervised model.\n\nIn reinforcement learning, the model has some input data and a reward depending on the output of the model. The model learns a policy that maximizes the reward. Reinforcement learning has been applied successfully to strategic games such as Go and even classic Atari video games.\n\n#### 34) What is data augmentation? Can you give some examples? [[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\nData augmentation is a technique for synthesizing new data by modifying existing data in such a way that the target is not changed, or it is changed in a known way.\n\nComputer vision is one of the fields where data augmentation is very useful. There are many modifications that we can do to images:\n - Resize\n - Horizontal or vertical flip\n - Rotate\n - Add noise\n - Deform\n - Modify colors\n\nEach problem needs a customized data augmentation pipeline. For example, on OCR, doing flips will change the text and won’t be beneficial; however, resizes and small rotations may help (a minimal sketch follows question 35).\n\n#### 35) What is Turing test? [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nThe Turing test is a method to test a machine’s ability to match human-level intelligence. A machine that passes the test is considered intelligent. Yet a machine could be viewed as intelligent without sufficiently knowing about people to mimic a human.
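\n\nA minimal NumPy sketch of a few of the augmentations listed in question 34 (image size and noise scale are illustrative assumptions):\n\n```\nimport numpy as np\n\nimg = np.random.rand(32, 32, 3)            # stand-in for a real image\nflipped = img[:, ::-1, :]                  # horizontal flip\nrotated = np.rot90(img, k=1, axes=(0, 1))  # 90-degree rotation\nnoisy = np.clip(img + np.random.normal(0, 0.05, img.shape), 0, 1)  # add noise\n```\n\n#### 36) What is Precision?  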
\nPrecision (also called positive predictive value) is the fraction of relevant instances among the retrieved instances.  \nPrecision = true positive \u002F (true positive + false positive)  \n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPrecision_and_recall)\n\n#### 37) What is Recall?  \nRecall (also known as sensitivity) is the fraction of relevant instances that have been retrieved over the total amount of relevant instances.  \nRecall = true positive \u002F (true positive + false negative)  \n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPrecision_and_recall)\n\n#### 38) Define F1-score. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nIt is the harmonic mean of precision and recall. It takes both false positives and false negatives into account, and is used to measure a model’s performance.  \nF1-Score = 2 * (precision * recall) \u002F (precision + recall)\n\n#### 39) What is cost function? [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nA cost function is a scalar function which quantifies the error of the neural network: the lower the cost, the better the network. E.g., on the MNIST dataset, if the input image is the digit 2 and the neural network wrongly predicts it to be 3, the cost function records that error.\n\n#### 40) List different activation neurons or functions. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - Linear Neuron\n - Binary Threshold Neuron\n - Stochastic Binary Neuron\n - Sigmoid Neuron\n - Tanh function\n - Rectified Linear Unit (ReLU)\n\n#### 41) Define Learning Rate.\nLearning rate is a hyper-parameter that controls how much we are adjusting the weights of our network with respect to the loss gradient. [[src](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FLearning_rate)]\n\n#### 42) What is Momentum (w.r.t NN optimization)?\nMomentum lets the optimization algorithm remember its last step, and adds some proportion of it to the current step. This way, even if the algorithm is stuck in a flat region, or a small local minimum, it can get out and continue towards the true minimum. [[src]](https:\u002F\u002Fwww.quora.com\u002FWhat-is-the-difference-between-momentum-and-learning-rate)\n\n#### 43) What is the difference between Batch Gradient Descent and Stochastic Gradient Descent?\nBatch gradient descent computes the gradient using the whole dataset. This is great for convex, or relatively smooth error manifolds. In this case, we move somewhat directly towards an optimum solution, either local or global. Additionally, batch gradient descent, given an annealed learning rate, will eventually find the minimum located in its basin of attraction.\n\nStochastic gradient descent (SGD) computes the gradient using a single sample. SGD works well (not perfectly, but better than batch gradient descent) for error manifolds that have lots of local maxima\u002Fminima. In this case, the somewhat noisier gradient calculated using the reduced number of samples tends to jerk the model out of local minima into a region that hopefully is more optimal. [[src]](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F49528\u002Fbatch-gradient-descent-versus-stochastic-gradient-descent)
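\n\nA minimal sketch contrasting the two update loops on a least-squares objective (learning rates and epoch counts are illustrative assumptions):\n\n```\nimport numpy as np\n\ndef batch_gd(X, y, w, lr=0.1, epochs=100):\n    for _ in range(epochs):\n        w -= lr * X.T @ (X @ w - y) \u002F len(y)   # one step from the full dataset\n    return w\n\ndef sgd(X, y, w, lr=0.01, epochs=100, seed=0):\n    rng = np.random.default_rng(seed)\n    for _ in range(epochs):\n        for i in rng.permutation(len(y)):       # noisier steps, one sample at a time\n            w -= lr * X[i] * (X[i] @ w - y[i])\n    return w\n```\n\n#### 44) Epoch vs. Batch vs. 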
Iteration.\n - **Epoch**: one forward pass and one backward pass of **all** the training examples  \n - **Batch**: the set of examples processed together in one forward\u002Fbackward pass  \n - **Iteration**: one forward\u002Fbackward pass over a single batch; one epoch therefore takes (number of training examples \u002F batch size) iterations  \n\n#### 45) What is vanishing gradient? [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nAs we add more and more hidden layers, backpropagation becomes less and less useful in passing information to the lower layers. In effect, as information is passed back, the gradients begin to vanish and become small relative to the weights of the network.\n\n#### 46) What are dropouts? [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nDropout is a simple way to prevent a neural network from overfitting. It is the dropping out of some of the units in a neural network. It is similar to the natural reproduction process, where nature produces offspring by combining distinct genes (dropping out others) rather than strengthening their co-adaptation.\n\n#### 47) Define LSTM. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nLong Short-Term Memory networks are explicitly designed to address the long-term dependency problem, by maintaining a state that determines what to remember and what to forget.\n\n#### 48) List the key components of LSTM. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - Gates (forget, memory, update & read)\n - tanh(x) (values between -1 and 1)\n - Sigmoid(x) (values between 0 and 1)\n\n#### 49) List the variants of RNN. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - LSTM: Long Short Term Memory\n - GRU: Gated Recurrent Unit\n - End to End Network\n - Memory Network\n\n#### 50) What is Autoencoder, name few applications. [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\nAn autoencoder is basically used to learn a compressed form of given data. A few applications include:\n - Data denoising\n - Dimensionality reduction\n - Image reconstruction\n - Image colorization\n\n#### 51) What are the components of GAN? [[src](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - Generator\n - Discriminator\n\n#### 52) What's the difference between boosting and bagging?\nBoosting and bagging are similar, in that they are both ensembling techniques, where a number of weak learners (classifiers\u002Fregressors that are barely better than guessing) combine (through averaging or max vote) to create a strong learner that can make accurate predictions. Bagging means that you take bootstrap samples (with replacement) of your data set and each sample trains a (potentially) weak learner. Boosting, on the other hand, uses all data to train each learner, but instances that were misclassified by the previous learners are given more weight so that subsequent learners give more focus to them during training. [[src]](https:\u002F\u002Fwww.quora.com\u002FWhats-the-difference-between-boosting-and-bagging)
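\n\nA minimal scikit-learn sketch of both ideas on the same weak learner (a depth-1 decision stump; dataset and estimator counts are illustrative assumptions):\n\n```\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\nfrom sklearn.tree import DecisionTreeClassifier\n\nX, y = make_classification(n_samples=500, random_state=0)\nstump = DecisionTreeClassifier(max_depth=1)                   # a weak learner\nbag = BaggingClassifier(stump, n_estimators=50).fit(X, y)     # bootstrap samples\nboost = AdaBoostClassifier(stump, n_estimators=50).fit(X, y)  # reweights misclassified samples\n```\n\n#### 53) Explain how a ROC curve works. 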
[[src]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\nThe ROC curve is a graphical representation of the contrast between the true positive rate and the false positive rate at various thresholds. It’s often used as a proxy for the trade-off between the sensitivity of the model (true positives) and the fall-out, or the probability that it will trigger a false alarm (false positives).\n\n#### 54) What’s the difference between Type I and Type II error? [[src]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\nType I error is a false positive, while Type II error is a false negative. Briefly stated, Type I error means claiming something has happened when it hasn’t, while Type II error means that you claim nothing is happening when in fact something is.\nA clever way to think about this is to think of Type I error as telling a man he is pregnant, while Type II error means you tell a pregnant woman she isn’t carrying a baby.\n\n#### 55) What’s the difference between a generative and discriminative model? [[src]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\nA generative model will learn categories of data while a discriminative model will simply learn the distinction between different categories of data. Discriminative models will generally outperform generative models on classification tasks.\n\n#### 56) Instance-Based Versus Model-Based Learning.\n\n - **Instance-based Learning**: The system learns the examples by heart, then generalizes to new cases using a similarity measure.\n\n - **Model-based Learning**: Another way to generalize from a set of examples is to build a model of these examples, then use that model to make predictions.\n[[src]](https:\u002F\u002Fmedium.com\u002F@sanidhyaagrawal08\u002Fwhat-is-instance-based-and-model-based-learning-s1e10-8e68364ae084)\n\n\n#### 57) When to use a Label Encoding vs. One Hot Encoding?\n\nThis question generally depends on your dataset and the model which you wish to apply. But still, a few points to note before choosing the right encoding technique for your model:\n\nWe apply One-Hot Encoding when:\n\n- The categorical feature is not ordinal (like country names)\n- The number of categories is small, so one-hot encoding can be effectively applied\n\nWe apply Label Encoding when:\n\n- The categorical feature is ordinal (like Jr. kg, Sr. kg, Primary school, high school)\n- The number of categories is quite large, as one-hot encoding can lead to high memory consumption\n\n[[src]](https:\u002F\u002Fwww.analyticsvidhya.com\u002Fblog\u002F2020\u002F03\u002Fone-hot-encoding-vs-label-encoding-using-scikit-learn\u002F)\n\n#### 58) What is the difference between LDA and PCA for dimensionality reduction?\n\nBoth LDA and PCA are linear transformation techniques: LDA is supervised whereas PCA is unsupervised – PCA ignores class labels.\n\nWe can picture PCA as a technique that finds the directions of maximal variance. In contrast to PCA, LDA attempts to find a feature subspace that maximizes class separability.\n\n[[src]](https:\u002F\u002Fsebastianraschka.com\u002Ffaq\u002Fdocs\u002Flda-vs-pca.html)
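\n\nA minimal scikit-learn sketch of the contrast (the Iris dataset is an illustrative choice):\n\n```\nfrom sklearn.datasets import load_iris\nfrom sklearn.decomposition import PCA\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n\nX, y = load_iris(return_X_y=True)\nX_pca = PCA(n_components=2).fit_transform(X)                             # unsupervised: ignores y\nX_lda = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)  # supervised: maximizes class separability\n```\n\n#### 59) What is t-SNE?\n\nt-Distributed Stochastic Neighbor Embedding (t-SNE) is an unsupervised, non-linear technique primarily used for data exploration and visualizing high-dimensional data. 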
In simpler terms, t-SNE gives you a feel or intuition for how the data is arranged in a high-dimensional space. \n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fan-introduction-to-t-sne-with-python-example-5a3a293108d1)\n\n#### 60) What is the difference between t-SNE and PCA for dimensionality reduction?\n\nThe first thing to note is that PCA was developed in 1933 while t-SNE was developed in 2008. A lot has changed in the world of data science since 1933, mainly in the realm of compute and size of data. Second, PCA is a linear dimension reduction technique that seeks to maximize variance and preserves large pairwise distances. In other words, things that are different end up far apart. This can lead to poor visualization, especially when dealing with non-linear manifold structures. Think of a manifold structure as any geometric shape like a cylinder, ball, or curve.\n\nt-SNE differs from PCA by preserving only small pairwise distances or local similarities, whereas PCA is concerned with preserving large pairwise distances to maximize variance.\n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fan-introduction-to-t-sne-with-python-example-5a3a293108d1)\n\n#### 61) What is UMAP?\n\nUMAP (Uniform Manifold Approximation and Projection) is a novel manifold learning technique for dimension reduction. UMAP is constructed from a theoretical framework based in Riemannian geometry and algebraic topology. The result is a practical scalable algorithm that applies to real world data.\n\n[[src]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.03426#:~:text=UMAP%20)\n\n#### 62) What is the difference between t-SNE and UMAP for dimensionality reduction?\n\nThe biggest difference between the output of UMAP when compared with t-SNE is this balance between local and global structure - UMAP is often better at preserving global structure in the final projection. This means that the inter-cluster relations are potentially more meaningful than in t-SNE. However, it's important to note that, because UMAP and t-SNE both necessarily warp the high-dimensional shape of the data when projecting to lower dimensions, any given axis or distance in lower dimensions still isn’t directly interpretable in the way of techniques such as PCA.\n\n[[src]](https:\u002F\u002Fpair-code.github.io\u002Funderstanding-umap\u002F)\n\n#### 63) How does a random number generator (e.g. Python's rand() function) work?\nIt generates pseudo-random numbers deterministically from an initial seed. There are several well-known algorithms; a classic one is the linear congruential generator, see the link below for further information.\n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FLinear_congruential_generator)
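\n\nA minimal sketch of a linear congruential generator (the multiplier and increment are the classic Numerical Recipes constants, used here as an illustrative choice):\n\n```\ndef lcg(seed, a=1664525, c=1013904223, m=2**32):\n    # x_{n+1} = (a * x_n + c) mod m\n    x = seed\n    while True:\n        x = (a * x + c) % m\n        yield x \u002F m                                 # scale to [0, 1)\n\ngen = lcg(seed=42)\nprint([round(next(gen), 4) for _ in range(3)])      # same seed -> same sequence\n```\n\n#### 64) Given that we want to evaluate the performance of 'n' different machine learning models on the same data, why would the following splitting mechanism be incorrect:\n```\nimport numpy as np\nimport pandas as pd\n\ndef get_splits():\n    df = pd.DataFrame(...)\n    rnd = np.random.rand(len(df))\n    train = df[ rnd \u003C 0.8 ]\n    valid = df[ (rnd >= 0.8) & (rnd \u003C 0.9) ]\n    test = df[ rnd >= 0.9 ]\n\n    return train, valid, test\n\n# Model 1\n\nfrom sklearn.tree import DecisionTreeClassifier\ntrain, valid, test = get_splits()\n...\n\n# Model 2\n\nfrom sklearn.linear_model import LogisticRegression\ntrain, valid, test = get_splits()\n...\n```\nnp.random.rand() draws different values each time it is called, so if we run the splitting mechanism again, the 80% of the rows we get will be different from the ones we got the first time it was run. 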
This presents an issue as we need to compare the performance of our models on the same test set. In order to ensure reproducible and consistent sampling we would have to set the random seed in advance or store the data once it is split. Alternatively, we could simply set the 'random_state' parameter in sklearn's train_test_split() function in order to get the same train, validation and test sets across different executions. \n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fwhy-do-we-set-a-random-state-in-machine-learning-models-bb2dc68d8431#:~:text=In%20Scikit%2Dlearn%2C%20the%20random,random%20state%20instance%20from%20np.)\n\n\n#### 65) What is the difference between Bayesian vs frequentist statistics? [[src]](https:\u002F\u002Fwww.kdnuggets.com\u002F2022\u002F10\u002Fnlp-interview-questions.html)\nFrequentist statistics is a framework that focuses on estimating population parameters using sample statistics, and providing point estimates and confidence intervals.\n\nBayesian statistics, on the other hand, is a framework that uses prior knowledge and information to update beliefs about a parameter or hypothesis, and provides probability distributions for parameters.\n\nThe main difference is that Bayesian statistics incorporates prior knowledge and beliefs into the analysis, while frequentist statistics doesn't.\n\n#### 66) What is the basic difference between LSTM and Transformers? [[src]](https:\u002F\u002Fblog.finxter.com\u002Ftransformer-vs-lstm\u002F#:~:text=LSTM%20models%20consist%20of%20RNN,feed-forward%20neural%20network%20components.)\nLSTM (Long Short-Term Memory) models consist of RNN cells designed to store and manipulate information across time steps more efficiently. In contrast, Transformer models contain a stack of encoder and decoder layers, each consisting of self-attention and feed-forward neural network components. \n\n#### 67) What are R-CNNs? [[src]](https:\u002F\u002Ftowardsdatascience.com\u002Flearn-rcnns-with-this-toy-dataset-be19dce380ec)\nR-CNN (Region-based Convolutional Neural Network) models are used in object detection tasks in computer vision. The R-CNN approach combines region proposal techniques and convolutional neural networks (CNNs) to identify and locate objects within an image. (The abbreviation RCNN is also sometimes used for recurrent convolutional models, which make predictions over a sequence of images, more commonly known as video.)\n\n\n## Contributions\nContributions are most welcome.\n 1. Fork the repository.\n 2. Commit your *questions* or *answers*.\n 3. Open a **pull request**.\n","# 65道机器学习面试题 2026\n这是一份针对机器学习和计算机视觉工程师职位的技术面试题合集。\n\n### 最新添加：[自然语言处理（NLP）面试题 2026](https:\u002F\u002Fgithub.com\u002Fandrewekhalel\u002FMLQuestions\u002Ftree\u002Fmaster\u002FNLP)\n\n## 备考资源\n1. [机器学习工程师面试课程](https:\u002F\u002Fwww.tryexponent.com\u002Fcourses\u002Fml-engineer?ref=zjgwmje&tap_s=5026306-8f044e)\n1. [模拟机器学习面试](https:\u002F\u002Fwww.tryexponent.com\u002Fcoaching?ref=zjgwmje&tap_s=5026306-8f044e&category=mock_interviews&src=nav&skill=ml)：与来自顶尖科技公司和初创企业的机器学习工程师一起练习，为你的下一次面试做好准备。\n1. [《统计学大全：统计推断简明教程》](https:\u002F\u002Famzn.to\u002F3r87WGa) 作者：拉里·瓦瑟曼\n2. [《机器学习》](https:\u002F\u002Famzn.to\u002F3RdiFK3) 作者：汤姆·米切尔\n3. 
[《设计机器学习系统：面向生产就绪应用的迭代式流程》](https:\u002F\u002Famzn.to\u002F3LiVgD2) 作者：奇普·休恩\n\n---\n本页面由 [Jobbyo](https:\u002F\u002Fjobbyo.ai\u002F?linkId=lp_801223&sourceId=akhalel&tenantId=jobbyoai) 赞助：\n\u003Cp align=center style=\"font-style:italic;\">“如果你专注于面试准备，那就让 \u003Ca href=\"https:\u002F\u002Fjobbyo.ai\u002F?linkId=lp_801223&sourceId=akhalel&tenantId=jobbyoai\">Jobbyo\u003C\u002Fa> 来处理繁琐的申请工作吧。\n它会自动完成申请，并在你保持高效备考的同时，帮你整理好整个求职过程。”\u003C\u002Fp>\n\n结账时使用优惠码“MLQUESTIONS”，即可享受高级订阅3个月的20%折扣！\n\n## 面试题\n\n#### 1) 偏差与方差之间存在怎样的权衡？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n\n如果我们的模型过于简单，参数很少，那么它可能会有较高的偏差和较低的方差。相反，如果模型拥有大量参数，则会表现出高方差和低偏差。因此，我们需要在不过度拟合或欠拟合数据的情况下，找到一个合适的平衡点。[[来源]](https:\u002F\u002Ftowardsdatascience.com\u002Funderstanding-the-bias-variance-tradeoff-165e6942b229)\n\n#### 2) 什么是梯度下降法？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n[[答案]](https:\u002F\u002Fmachinelearningmastery.com\u002Fgradient-descent-for-machine-learning\u002F)\n\n梯度下降法是一种优化算法，用于寻找使成本函数（cost）最小化的函数（f）参数（系数）值。\n\n当参数无法通过解析方法（例如线性代数）计算得出，而必须借助优化算法来搜索时，梯度下降法是最适用的方法。\n\n#### 3) 解释过拟合与欠拟合现象，以及如何应对它们？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n[[答案]](https:\u002F\u002Ftowardsdatascience.com\u002Foverfitting-vs-underfitting-a-complete-example-d05dd7e19765)\n\n机器学习\u002F深度学习模型本质上是在其输入（称为训练特征）和目标输出（称为标签）之间学习一种关系。无论所学习的关系（函数）质量如何，其在测试集（与训练数据不同的数据集合）上的表现都需要进一步评估。\n\n大多数机器学习\u002F深度学习模型都具有可训练的参数，这些参数将被学习以构建输入-输出关系。根据每个模型拥有的参数数量，可以将其分为更灵活（参数更多）和较不灵活（参数较少）两类。\n\n欠拟合问题出现在模型的灵活性（即参数数量）不足以捕捉训练数据中的潜在模式时。而过拟合则发生在模型对潜在模式过于敏感、过于灵活的情况下。在这种情况下，我们说模型已经“记住”了训练数据。\n\n一个欠拟合的例子是用一阶多项式（一条直线）去拟合二阶多项式（二次函数）。类似地，用十阶多项式去拟合一条直线则属于过拟合。\n\n#### 4) 如何应对维度灾难？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n\n - 特征选择（手动或通过统计方法）\n - 主成分分析（PCA）\n - 多维尺度分析\n - 局部线性嵌入  \n[[来源]](https:\u002F\u002Ftowardsdatascience.com\u002Fwhy-and-how-to-get-rid-of-the-curse-of-dimensionality-right-with-breast-cancer-dataset-7d528fb5f6c0)\n\n#### 5) 什么是正则化？我们为什么要使用正则化？请列举一些常见的正则化方法。[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n正则化是一种抑制学习复杂或灵活模型的技术，旨在避免过拟合的风险。\n示例：\n - 岭回归（L2范数）\n - Lasso回归（L1范数）  \n**岭回归**的一个明显缺点是模型的可解释性较差。它会将不太重要的预测变量的系数压缩到非常接近于零，但永远不会使其精确等于零。换句话说，最终模型中会包含所有预测变量。然而，在 **Lasso** 的情况下，由于L1惩罚的作用，当调节参数λ足够大时，某些系数估计值会被强制变为*完全等于零*。因此，Lasso方法同时具备变量选择功能，能够生成稀疏模型。\n[[来源]](https:\u002F\u002Ftowardsdatascience.com\u002Fregularization-in-machine-learning-76441ddcf99a)\n\n#### 6) 请解释主成分分析（PCA）？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n[[答案]](https:\u002F\u002Ftowardsdatascience.com\u002Fa-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c)\n\n主成分分析（PCA）是一种降维技术，常用于机器学习中，旨在减少数据集中的特征数量，同时尽可能保留数据信息。其原理是识别数据变化最大的方向（主成分），并将数据投影到沿这些方向的低维子空间上。\n\n#### 7) 为什么ReLU在神经网络中比Sigmoid更好，也更常用？[[来源]](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)\n\n * 计算效率：\n   由于ReLU是一个简单的阈值函数，前向传播和反向传播的速度都会更快。\n * 减少梯度消失的可能性：\n   ReLU的梯度在正值时为1，在负值时为0；而Sigmoid激活函数则会在输入稍有变化时迅速饱和（梯度接近于0），从而导致梯度消失。\n * 稀疏性：\n   
当ReLU的输入为负值时，就会产生稀疏性。这意味着只有少数神经元会被激活（稀疏激活），从而使网络更加轻量。\n\n[[src1]](https:\u002F\u002Fmedium.com\u002Fthe-theory-of-everything\u002Funderstanding-activation-functions-in-neural-networks-9491262884e0) [[src2]](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F126238\u002Fwhat-are-the-advantages-of-relu-over-sigmoid-function-in-deep-neural-networks)\n\n\n\n#### 8) 给定一维卷积神经网络中每一层的步长 S 和卷积核大小，编写一个函数来计算网络中某个节点的[感受野](https:\u002F\u002Fwww.quora.com\u002FWhat-is-a-receptive-field-in-a-convolutional-neural-network)。这实际上就是确定有多少输入节点会真正连接到 CNN 中的一个神经元。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n感受野是指能够影响某个输出节点的输入区域。\n\n对于单个卷积层，若滤波器大小为 k，则该层神经元的感受野就是 k，再乘以未被滤波器缩减的输入维度：例如对一张 32x32x3 的图像使用 5x5 的滤波器，第一层神经元看到的就是一个 5x5x3 的区域。对于更深层的节点，感受野会逐层扩大：每经过一层卷积核大小为 k 的层，感受野就增加 (k - 1) 乘以之前各层步长的乘积。\n\n#### 9) 在一张图像或矩阵上实现[连通组件](http:\u002F\u002Faishack.in\u002Ftutorials\u002Flabelling-connected-components-example\u002F)的标记。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n\n#### 10) 用 C++ 实现一个稀疏矩阵类。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[答案]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Fsparse-matrix-representation\u002F)\n\n#### 11) 编写一个函数来计算[积分图像](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FSummed-area_table)，并再编写一个函数从积分图像中获取区域和。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[答案]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Fsubmatrix-sum-queries\u002F)\n\n#### 12) 当你试图从噪声样本中估计一个平面时，你会如何去除异常值？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n随机抽样一致性（RANSAC）是一种迭代方法，用于从包含异常值的观测数据集中估计数学模型的参数，同时确保这些异常值不会影响估计结果。\n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FRandom_sample_consensus)\n\n\n\n#### 13) [CBIR](https:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fpublications\u002F2013\u002Farandjelovic13\u002Farandjelovic13.pdf) 是如何工作的？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[答案]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FContent-based_image_retrieval)\n基于内容的图像检索是指利用图像本身的内容来提取元数据的概念。与目前基于图像关联关键词的检索方式不同，这种方法通过计算机视觉技术提取相关信息，并在查询阶段使用这些信息进行检索。实现方式多种多样，可以从特征检测提取关键词，到使用卷积神经网络提取密集特征，并将其映射到已知的关键词分布中。\n\n采用后一种方法时，我们不再关注图像中具体显示的内容，而是关注由已知图像生成的元数据与目标标签或标记之间的相似性。\n\n#### 14) 图像配准是如何工作的？稀疏与稠密[光流](http:\u002F\u002Fwww.ncorr.com\u002Fdownload\u002Fpublications\u002Fbakerunify.pdf)等。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n#### 15) 描述卷积是如何工作的。输入是灰度图像和 RGB 图像时有什么区别？决定下一层形状的因素有哪些？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n在卷积神经网络（CNN）中，卷积操作是通过一个称为卷积核或滤波器的小矩阵对输入图像进行处理的。卷积核以固定步长滑过图像，与图像对应位置的元素逐点相乘并求和，最终得到的结果称为特征图。\n\n当输入是 RGB（或多于 3 
#### 12) 当你试图从噪声样本中估计一个平面时，你会如何去除异常值？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n随机抽样一致性（RANSAC）是一种迭代方法，用于从包含异常值的观测数据集中估计数学模型的参数，同时确保这些异常值不会影响估计结果。\n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FRandom_sample_consensus)\n\n\n\n#### 13) [CBIR](https:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fpublications\u002F2013\u002Farandjelovic13\u002Farandjelovic13.pdf) 是如何工作的？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[答案]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FContent-based_image_retrieval)\n基于内容的图像检索是指利用图像本身的内容来提取元数据的概念。与目前基于图像关联关键词的检索方式不同，这种方法通过计算机视觉技术提取相关信息，并在查询阶段使用这些信息进行检索。实现方式多种多样，可以从特征检测提取关键词，到使用卷积神经网络提取密集特征，并将其映射到已知的关键词分布中。\n\n采用后一种方法时，我们不再关注图像中具体显示的内容，而是关注由已知图像生成的元数据与目标标签或标记之间的相似性。\n\n#### 14) 图像配准是如何工作的？稀疏与稠密[光流](http:\u002F\u002Fwww.ncorr.com\u002Fdownload\u002Fpublications\u002Fbakerunify.pdf)等。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n#### 15) 描述卷积是如何工作的。输入是灰度图像与 RGB 图像时有什么区别？决定下一层形状的因素有哪些？[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n在卷积神经网络（CNN）中，卷积操作是通过一个称为卷积核或滤波器的小矩阵对输入图像进行处理的。卷积核以固定步长滑过图像，与图像对应位置的元素逐点相乘并求和，最终得到的结果称为特征图。\n\n当输入是 RGB（或多于 3 个通道）图像时，滑动窗口将变成一个三维立方体。下一层的形状由卷积核大小、卷积核数量、步长、填充以及膨胀率等因素决定。\n[[src1]](https:\u002F\u002Fdev.to\u002Fsandeepbalachandran\u002Fmachine-learning-convolution-with-color-images-2p41)[[src2]](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F70231487\u002Foutput-dimensions-of-convolution-in-pytorch)\n\n#### 16) 请描述一下如何根据物体周围各个角度拍摄的图像和深度传感器测量数据，创建该物体的 3D 模型。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n3D 重建主要有两种流行的方法：\n* 运动恢复结构法（SfM）[[src]](https:\u002F\u002Fwww.mathworks.com\u002Fhelp\u002Fvision\u002Fug\u002Fstructure-from-motion.html)\n\n* 多视角立体视觉法（MVS）[[src]](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Zwwty2qPNs8)\n\nSfM 更适合构建大型场景的模型，而 MVS 则更适合构建小型物体的模型。\n\n\n#### 17) 不使用任何特殊函数，仅利用基本算术运算，实现 SQRT(const double & x)。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n可以使用泰勒级数来近似计算 sqrt(x)：\n\n[[答案]](https:\u002F\u002Fmath.stackexchange.com\u002Fquestions\u002F732540\u002Ftaylor-series-of-sqrt1x-using-sigma-notation)\n
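除泰勒级数外，另一种只用到加减乘除的常见思路是牛顿迭代法（注意：这是一个替代方案的极简草稿，并非上面链接给出的方法）：\n\n```\n# 牛顿迭代法近似 sqrt(x)：反复取 g 与 x\u002Fg 的平均值，g 将收敛到 sqrt(x)\ndef my_sqrt(x, eps=1e-12):\n    if x \u003C 0:\n        raise ValueError('negative input')\n    if x == 0:\n        return 0.0\n    g = x if x >= 1 else 1.0          # 初始猜测\n    while abs(g * g - x) > eps * x:\n        g = 0.5 * (g + x \u002F g)      # 每次迭代有效位数大约翻倍\n    return g\n\nprint(my_sqrt(2.0))  # 约 1.4142135623730951\n```\n\n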
#### 18) 反转一个位串。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n如果你使用的是 Python 3：\n\n```\n# 注意：这里反转的是字节顺序；若要求按位反转，还需再翻转每个字节内部的位\ndata = b'\\xAD\\xDE\\xDE\\xC0'\nmy_data = bytearray(data)\nmy_data.reverse()\n```\n#### 19) 尽可能高效地实现非极大值抑制。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n非极大值抑制（NMS）是一种用于消除对同一图像中同一对象的多次重复检测的技术。\n解决方法是首先按照边界框的得分进行排序（时间复杂度为 N logN），然后从得分最高的框开始，移除那些与已保留框重叠程度（IoU）超过一定阈值的框（时间复杂度为 N^2）。\n\n为了优化这个解决方案，你可以使用支持区域查询的特殊数据结构（比如 R 树或 KD 树）来查找重叠的边界框，将复杂度降至 N logN。\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fnon-maxima-suppression-139f7e00f0b5)\n
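作为参照，下面给出未经优化的 O(N^2) 贪心版本的一个极简草稿（假设 boxes 中每个框的格式为 [x1, y1, x2, y2]，scores 为对应得分；函数与参数命名均为示意）：\n\n```\n# 贪心 NMS：按得分降序遍历，仅保留与所有已保留框 IoU 低于阈值的框\ndef nms(boxes, scores, iou_thresh=0.5):\n    def iou(a, b):\n        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])\n        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])\n        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)\n        union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter\n        return inter \u002F union if union > 0 else 0.0\n\n    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)\n    keep = []\n    for i in order:\n        if all(iou(boxes[i], boxes[k]) \u003C iou_thresh for k in keep):\n            keep.append(i)\n    return keep  # 返回被保留框的下标\n```\n\n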
#### 20) 将链表原地反转。[[src](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fcomputervision\u002Fcomments\u002F7gku4z\u002Ftechnical_interview_questions_in_cv\u002F)]\n\n[[答案]](https:\u002F\u002Fwww.geeksforgeeks.org\u002Freverse-a-linked-list\u002F)\n\n#### 21) 什么是数据归一化？为什么我们需要它？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n数据归一化是非常重要的预处理步骤，用于将数值缩放到特定范围，以确保在反向传播过程中更好地收敛。通常，这涉及到对每个数据点减去其均值，并除以其标准差。如果不进行归一化，某些数值较大的特征会在损失函数中被赋予过高的权重：一个高数值特征变化1%，其绝对变化量很大，而数值较小的特征发生同样的相对变化则微不足道。通过数据归一化，所有特征的影响会变得相对均衡。\n\n#### 22) 为什么我们在处理图像时使用卷积层而不是全连接层？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n首先，卷积操作能够保留、编码并有效利用图像中的空间信息。如果我们只使用全连接层，就无法获取任何相对的空间信息。其次，卷积神经网络（CNN）具有一定的平移不变性，因为每个卷积核都充当自己的滤波器或特征检测器。\n\n#### 23) 是什么使 CNN 具有平移不变性？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n如上所述，每个卷积核都充当自己的滤波器或特征检测器。因此，假设你在进行目标检测，目标在图像中的位置并不重要，因为我们无论如何都会以滑动窗口的方式在整个图像上应用卷积操作。\n\n#### 24) 为什么分类 CNN 中要使用最大池化层？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n最大池化层在 CNN 中的作用是减少计算量，因为在池化之后，特征图的尺寸会变小。同时，由于我们取的是最大激活值，语义信息并不会丢失太多。此外，还有观点认为，最大池化也有助于增强 CNN 的平移不变性。可以参考吴恩达关于[最大池化的优点](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fconvolutional-neural-networks\u002Flecture\u002FhELHk\u002Fpooling-layers)的精彩视频。\n\n#### 25) 为什么分割 CNN 通常采用编码器-解码器的结构？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n编码器 CNN 可以被视为一个特征提取网络，而解码器则利用这些特征来预测图像的各个分割区域，通过“解码”特征并将分辨率恢复到原始图像大小。\n\n#### 26) 残差网络的意义是什么？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n残差连接的主要作用是允许信息直接从之前的层传递过来，从而使得信息在网络中的传播变得更加容易。一篇非常有趣的论文指出，使用局部跳跃连接可以使网络形成一种多路径集成结构，让特征可以通过多种路径在网络中传播。\n\n#### 27) 什么是批量归一化？为什么它有效？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n训练深度神经网络的一个复杂之处在于，随着前面各层参数的变化，每一层输入的分布也会随之改变。因此，批量归一化的想法就是对每一层的输入进行标准化，使其激活的均值为零、标准差为一。这一过程是在每个单独的小批次上完成的，即仅针对该小批次计算均值和方差，然后进行归一化。这类似于对网络输入进行标准化的做法。那么，这样做有什么好处呢？我们知道，对网络输入进行归一化有助于模型的学习。而网络本身是由一系列层组成的，前一层的输出会成为下一层的输入，因此可以把神经网络中的任意一层看作后续更小子网络的第一层。这样，当我们对一层的输出进行归一化后再应用激活函数，并将其输入到下一层（子网络）时，就能实现逐层的归一化。\n\n#### 28) 为什么我们会使用多个小卷积核（如3x3），而不是少数几个大卷积核？[[src](http:\u002F\u002Fhouseofbots.com\u002Fnews-detail\u002F2849-4-data-science-and-machine-learning-interview-questions)]\n这一点在[VGGNet论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.1556.pdf)中有很好的解释。原因有两个：首先，使用多个小卷积核可以获得与少数大卷积核相同的感受野，并捕捉更多的空间上下文信息，但小卷积核所需的参数和计算量更少。其次，由于使用小卷积核意味着堆叠更多层，因此可以应用更多的激活函数，从而使 CNN 学习到更具区分性的映射函数。\n\n#### 29) 为什么我们需要验证集和测试集？它们之间有什么区别？[[src](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]\n在训练模型时，我们会将可用数据分为三个独立的集合：\n\n - 训练集用于拟合模型的参数。然而，在训练集上获得的准确率并不能可靠地预测模型在新样本上的表现。\n - 验证集用于评估模型在未参与训练的数据上的表现。基于验证集计算的指标可以用来调整模型的超参数。但是，每次我们评估验证集并根据其结果做出决策时，都会将验证集的信息泄露到模型中。评估次数越多，泄露的信息就越多。最终可能导致模型过拟合验证集，这样一来，验证集的表现同样无法可靠地预测模型在现实世界中的行为。\n - 测试集用于评估模型在从未见过的数据上的表现。它应该仅在我们已经使用验证集调优完参数之后才被使用。\n\n因此，如果我们省略测试集而只使用验证集，那么验证集上的得分将无法很好地估计模型的泛化能力。\n\n#### 30) 什么是分层交叉验证？我们什么时候应该使用它？[[来源](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]  \n交叉验证是一种将数据划分为训练集和验证集的技术。在普通的交叉验证中，这种划分是随机进行的。而在分层交叉验证中，划分会保持训练集和验证集中各类别比例的一致性。\n\n例如，如果我们有一个数据集，其中类别A占10%，类别B占90%，如果使用分层交叉验证，那么训练集和验证集中的类别比例将与整体分布保持一致（同为10%和90%）。相反，如果使用普通的交叉验证，在最坏的情况下，验证集中可能完全没有类别A的样本。\n\n分层交叉验证可以应用于以下场景：\n\n- 数据集中包含多个类别时。数据集越小、类别分布越不均衡，使用分层交叉验证就越重要。\n- 数据集中包含不同分布的数据时。例如，在自动驾驶的数据集中，可能会同时包含白天和夜晚拍摄的图像。如果不确保这两种类型的数据都出现在训练集和验证集中，就可能导致模型泛化能力不足。\n
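以 scikit-learn 为例的一个极简用法草稿（X、y 为任意特征矩阵和标签向量，数值仅作演示）：\n\n```\n# 分层 K 折交叉验证：每一折的划分都保持类别比例与整体一致\nimport numpy as np\nfrom sklearn.model_selection import StratifiedKFold\n\nX = np.arange(20).reshape(10, 2)   # 10 个样本，2 个特征\ny = np.array([0] * 8 + [1] * 2)    # 不均衡标签：80% 对 20%\n\nskf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)\nfor train_idx, val_idx in skf.split(X, y):\n    # 每一折的验证集中都恰好包含 1 个正样本，比例与整体一致\n    print(train_idx, val_idx)\n```\n\n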
#### 31) 为什么集成模型通常比单个模型的得分更高？[[来源](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]  \n集成模型是通过结合多个模型来生成单一预测的方法。其核心思想在于，各个模型应犯不同的错误。这样，一个模型的错误就会被其他模型正确的预测所弥补，从而使得集成模型的整体得分更高。\n\n创建集成模型需要多样化的子模型。实现多样性的方法包括：\n- 使用不同的机器学习算法。例如，可以将逻辑回归、K近邻和决策树结合起来。\n- 在训练时使用数据的不同子集。这被称为装袋法（bagging）。\n- 为训练集中的每个样本赋予不同的权重。如果以迭代的方式进行，并根据集成模型的误差动态调整样本权重，则称为提升法（boosting）。\n\n许多数据科学竞赛的获奖方案都是集成模型。然而，在实际的机器学习项目中，工程师需要在运行时间和准确性之间找到平衡。\n\n#### 32) 什么是不平衡数据集？你能列举一些处理它的方法吗？[[来源](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]  \n不平衡数据集是指目标类别比例差异较大的数据集。例如，在医学图像数据集中，如果我们要检测某种疾病，通常负样本的数量会远远多于正样本——比如98%的图像没有该疾病，而只有2%的图像有该疾病。\n\n处理不平衡数据集有不同的方法：\n- 过采样或欠采样。我们可以不从训练集中采用均匀分布的采样方式，而是使用其他分布方式，使模型看到更加均衡的数据集。\n- 数据增强。我们可以通过对现有数据进行可控的修改来增加少数类别的样本数量。例如，在上述数据集中，可以将带有疾病的图像进行翻转，或者在图像副本上添加噪声，但要确保疾病仍然可见。\n- 使用合适的评估指标。在上述数据集中，如果模型总是预测为阴性，其准确率也会达到98%。但在处理不平衡数据集时，精确率、召回率和F1分数等指标能够更好地反映模型的性能。\n\n#### 33) 请解释监督学习、无监督学习和强化学习之间的区别？[[来源](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]  \n在监督学习中，我们训练模型来学习输入数据与输出数据之间的关系。进行监督学习的前提是有标注好的数据。\n\n而在无监督学习中，我们只有未标注的数据。模型会学习数据的表示形式。无监督学习常用于在大量未标注数据和少量标注数据的情况下初始化模型参数：首先训练一个无监督模型，然后利用该模型的权重来训练一个监督模型。\n\n强化学习中，模型接收输入数据，并根据其输出获得奖励。模型会学习一种能够最大化奖励的策略。强化学习已成功应用于围棋等策略游戏，甚至经典的雅达利电子游戏。\n\n#### 34) 什么是数据增强？你能举几个例子吗？[[来源](https:\u002F\u002Fwww.toptal.com\u002Fmachine-learning\u002Finterview-questions)]  \n数据增强是一种通过修改现有数据来合成新数据的技术，且修改后的数据不会改变目标内容，或者改变方式是已知的。\n\n计算机视觉领域是数据增强非常有用的一个方向。我们可以对图像进行多种修改：\n- 调整大小\n- 水平或垂直翻转\n- 旋转\n- 添加噪声\n- 变形\n- 调整颜色\n\n每种问题都需要定制化的数据增强流程。例如，在光学字符识别（OCR）任务中，翻转会改变文本内容，反而不利于模型；而调整大小和小幅旋转则可能有所帮助。\n\n#### 35) 什么是图灵测试？[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]  \n图灵测试是一种检验机器能否表现出与人类相当的智能行为的方法：由人类评审与机器进行对话，如果评审无法可靠地将机器与真人区分开来，机器就被认为通过了测试。然而，即使机器表现得像人类一样，也可能并不真正理解对话内容，仅仅是在模仿而已。\n\n#### 36) 什么是精确率？  \n精确率（也称为阳性预测值）是指检索到的相关实例占所有检索结果的比例。  \n精确率 = 真正例 \u002F (真正例 + 假正例)  \n[[来源]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPrecision_and_recall)\n\n#### 37) 什么是召回率？  \n召回率（也称为灵敏度）是指所有相关实例中被正确检索出来的实例所占的比例。  \n召回率 = 真正例 \u002F (真正例 + 假负例)  \n[[来源]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FPrecision_and_recall)\n\n#### 38) 请定义F1分数。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]  \n它是精确率和召回率的加权平均值，同时考虑了假正例和假负例的影响。F1分数常用于衡量模型的性能。  \nF1分数 = 2 * (精确率 * 召回率) \u002F (精确率 + 召回率)\n\n#### 39) 什么是损失函数？[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n损失函数是一个标量函数，用于量化神经网络的误差程度。损失函数值越低，神经网络的表现越好。例如，在MNIST数据集中对图像进行分类时，输入图像是数字2，但神经网络错误地预测为3，此时损失函数就会给出一个较高的误差值来反映这一错误。\n\n#### 40) 列举不同的激活函数或神经元类型。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - 线性神经元\n - 二值阈值神经元\n - 随机二值神经元\n - Sigmoid神经元\n - Tanh函数\n - 整流线性单元（ReLU）\n\n#### 41) 定义学习率。\n学习率是一个超参数，用于控制我们在梯度下降过程中根据损失梯度调整网络权重的程度。[[来源](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FLearning_rate)]\n\n#### 42) 动量（在神经网络优化中）是什么？\n动量使优化算法记住上一步的方向，并将其按一定比例加到当前步骤中。这样，即使算法陷入平坦区域或局部最小值，它也能跳出并继续向真正的最小值前进。[[来源]](https:\u002F\u002Fwww.quora.com\u002FWhat-is-the-difference-between-momentum-and-learning-rate)\n\n#### 43) 批量梯度下降和随机梯度下降有什么区别？\n批量梯度下降使用整个数据集来计算梯度。这对于凸的或相对平滑的误差曲面非常有效。在这种情况下，我们可以较为直接地朝着局部或全局最优解前进。此外，如果学习率逐渐降低，批量梯度下降最终会找到其吸引域内的最小值。\n\n随机梯度下降（SGD）则使用单个样本来计算梯度。SGD在具有大量局部最大\u002F最小值的误差曲面上表现更好（虽然不能说“好”，但相比批量梯度下降确实更优）。在这种情况下，由于样本数量较少而产生的较嘈杂的梯度，往往能将模型从局部最小值中“拉出”，进入一个更有希望的更优区域。[[来源]](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F49528\u002Fbatch-gradient-descent-versus-stochastic-gradient-descent)\n\n#### 44) 轮次（epoch）、批次（batch）与迭代（iteration）的区别。\n - **轮次**：对**所有**训练样本完成一次前向传播和一次反向传播  \n - **批次**：在一次前向和反向传播中一起处理的一组样本，其样本数量即批次大小  \n - **迭代**：处理一个批次即为一次迭代；一个轮次所包含的迭代次数等于训练样本总数除以批次大小\n\n#### 45) 什么是梯度消失问题？[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n随着隐藏层的不断增加，反向传播传递信息到较低层的能力越来越弱。实际上，当信息被反向传播时，梯度会逐渐消失，相对于网络的权重变得非常小。\n\n#### 46) 什么是丢弃法？[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n丢弃法是一种简单有效的防止神经网络过拟合的方法。它通过随机丢弃神经网络中的部分单元来实现。这类似于自然界的繁殖过程：自然界通过组合不同的基因（即丢弃其他基因）来产生后代，而不是强化基因之间的协同适应。\n\n#### 47) 定义LSTM。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n长短期记忆网络——专门设计用来解决长期依赖问题，通过维护一个状态来决定记住什么和忘记什么。\n\n#### 48) 列举LSTM的关键组成部分。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - 门控机制（遗忘门、记忆门、更新门和读取门）\n - tanh(x)（取值范围为-1到1）\n - Sigmoid(x)（取值范围为0到1）\n\n#### 49) 列举RNN的变体。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - LSTM：长短期记忆网络\n - GRU：门控循环单元\n - 端到端网络\n - 记忆网络\n\n#### 50) 什么是自编码器？列举几个应用。[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n自编码器主要用于学习给定数据的压缩表示。其应用包括：\n - 数据去噪\n - 降维\n - 图像重建\n - 图像着色\n\n#### 51) GAN由哪些组件构成？[[来源](https:\u002F\u002Fintellipaat.com\u002Finterview-question\u002Fartificial-intelligence-interview-questions\u002F)]\n - 生成器\n - 判别器\n\n#### 52) 提升法和装袋法有什么区别？\n提升法和装袋法都是集成学习技术，它们通过结合多个弱学习器（分类器或回归器，其性能仅略高于随机猜测）来形成一个强学习器，从而做出准确的预测。装袋法是对数据集进行有放回的自助采样，并用每个采样子集各训练一个（可能较弱的）学习器。而提升法则使用全部数据来训练每个学习器，但之前被错误分类的样本会被赋予更高的权重，以便后续学习器在训练时更加关注这些样本。[[来源]](https:\u002F\u002Fwww.quora.com\u002FWhats-the-difference-between-boosting-and-bagging)\n\n#### 53) 解释ROC曲线的工作原理。[[来源]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\nROC曲线是以图形方式展示不同阈值下真正例率与假正例率之间对比关系的工具。它常被用作模型灵敏度（真正例）与误报率（假正例）之间权衡的指标。\n\n#### 54) 第一类错误和第二类错误有什么区别？[[来源]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\n第一类错误是假阳性，而第二类错误是假阴性。简而言之，第一类错误是指明明没有发生某件事却声称发生了，而第二类错误则是指明明有事情发生却声称没有。可以这样理解：第一类错误就像告诉一个男人他怀孕了，而第二类错误则像是告诉一位孕妇她并没有怀孕。\n\n#### 55) 生成模型和判别模型有什么区别？[[来源]](https:\u002F\u002Fwww.springboard.com\u002Fblog\u002Fmachine-learning-interview-questions\u002F)\n生成模型会学习数据的类别分布，而判别模型则只学习不同类别之间的区分特征。通常情况下，判别模型在分类任务上的表现优于生成模型。\n\n#### 56) 基于实例的学习与基于模型的学习。\n\n- **基于实例的学习**：系统通过记忆训练样本进行学习，然后使用相似性度量将知识泛化到新样本上。\n\n- **基于模型的学习**：另一种从一组示例中进行泛化的方法是构建这些示例的模型，然后利用该模型来进行预测。这种方法称为基于模型的学习。\n[[src]](https:\u002F\u002Fmedium.com\u002F@sanidhyaagrawal08\u002Fwhat-is-instance-based-and-model-based-learning-s1e10-8e68364ae084)\n\n\n#### 57) 何时使用标签编码与独热编码？\n\n这个问题通常取决于你的数据集以及你希望应用的模型。不过，在为模型选择合适的编码技术之前，仍有一些需要注意的点：\n\n我们使用独热编码的情况包括：\n\n- 分类特征是非有序的（例如国家名称这类没有内在顺序的类别）；\n- 类别取值的数量较少，因此可以高效地应用独热编码。\n\n我们使用标签编码的情况包括：\n\n- 分类特征是有序的（如幼儿班、小学低年级、小学高年级、中学）；\n- 类别数量较多，因为独热编码可能导致较高的内存消耗。\n\n[[src]](https:\u002F\u002Fwww.analyticsvidhya.com\u002Fblog\u002F2020\u002F03\u002Fone-hot-encoding-vs-label-encoding-using-scikit-learn\u002F)\n\n#### 58) LDA和PCA在降维方面有什么区别？\n\nLDA和PCA都是线性变换技术：LDA是有监督的，而PCA则是无监督的——PCA会忽略类别标签。\n\n我们可以把PCA理解为一种寻找数据方差最大方向的技术。与PCA不同，LDA则试图找到能够最大化类别可分性的特征子空间。\n\n[[src]](https:\u002F\u002Fsebastianraschka.com\u002Ffaq\u002Fdocs\u002Flda-vs-pca.html)\n\n#### 59) 什么是t-SNE？\n\nt分布随机邻域嵌入（t-SNE）是一种无监督的非线性技术，主要用于数据探索和可视化高维数据。简单来说，t-SNE可以帮助你直观地感受数据在高维空间中的分布情况。\n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fan-introduction-to-t-sne-with-python-example-5a3a293108d1)\n\n#### 60) t-SNE和PCA在降维方面有什么区别？\n\n首先需要指出的是，PCA诞生于1933年，而t-SNE则是在2008年才被提出。自1933年以来，数据科学领域发生了巨大变化，尤其是在计算能力和数据规模方面。其次，PCA是一种线性降维技术，它旨在最大化方差并保留较大的成对距离。换句话说，原本差异较大的数据点在降维后仍然会相距较远。这可能会导致较差的可视化效果，特别是在处理非线性流形结构时。所谓流形结构，可以理解为任何几何形状，比如圆柱体、球体、曲线等。\n\n相比之下，t-SNE只保留较小的成对距离或局部相似性，而PCA则关注于保留较大的成对距离以最大化方差。\n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fan-introduction-to-t-sne-with-python-example-5a3a293108d1)\n\n#### 61) 什么是UMAP？\n\nUMAP（均匀流形近似与投影）是一种新颖的流形学习降维技术。UMAP基于黎曼几何和代数拓扑的理论框架构建而成，最终形成了一种实用且可扩展的算法，适用于真实世界的数据。\n\n[[src]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.03426#:~:text=UMAP%20)\n\n#### 62) t-SNE和UMAP在降维方面有什么区别？\n\nUMAP与t-SNE输出结果最大的区别在于其对局部与全局结构的平衡——UMAP往往更能保持最终投影中的全局结构。这意味着簇间关系可能比t-SNE更为有意义。然而，需要注意的是，由于UMAP和t-SNE在将高维数据投影到低维空间时都会不可避免地扭曲数据的原始形状，因此在低维空间中的任何坐标轴或距离都无法像PCA那样直接解释。\n\n[[src]](https:\u002F\u002Fpair-code.github.io\u002Funderstanding-umap\u002F)\n\n#### 63) 随机数生成器是如何工作的？例如Python中的rand()函数？\n它根据种子以确定性的方式生成伪随机数，常见的算法包括线性同余生成器等。更多信息请参见以下链接：\n[[src]](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FLinear_congruential_generator)\n\n#### 64) 假设我们要在同一数据集上评估n个不同的机器学习模型的性能，为什么以下划分机制是不正确的呢？\n```\ndef get_splits():\n    df = pd.DataFrame(...)\n    rnd = np.random.rand(len(df))\n    train = df[ rnd \u003C 0.8 ]\n    valid = df[ (rnd >= 0.8) & (rnd \u003C 0.9) ]\n    test = df[ rnd >= 0.9 ]\n\n    return train, valid, test\n\n#模型1\n\nfrom sklearn.tree import DecisionTreeClassifier\ntrain, valid, test = get_splits()\n...\n\n#模型2\n\nfrom sklearn.linear_model import LogisticRegression\ntrain, valid, test = get_splits()\n...\n```\nnp.random.rand() 每次调用都会生成一组全新的随机数，因此再次调用划分函数时，落入训练集的那80%的数据行会与第一次不同，也就是说每个模型实际上是在不同的测试集上被评估的。这就带来了一个问题：我们需要在相同的测试集上比较各个模型的性能。为了确保采样的可重复性和一致性，我们必须提前设置随机种子，或者在数据划分完成后将其保存下来。另外，也可以直接在sklearn的train_test_split()函数中设置`random_state`参数，这样就能在不同次执行中获得完全一致的训练集、验证集和测试集。\n\n[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Fwhy-do-we-set-a-random-state-in-machine-learning-models-bb2dc68d8431#:~:text=In%20Scikit%2Dlearn%2C%20the%20random,random%20state%20instance%20from%20np.)\n
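一个修正后的划分草稿（示意：固定随机种子后，多次调用会得到完全相同的划分，便于在同一测试集上比较 n 个模型）：\n\n```\n# 通过固定种子保证可复现：每个模型都在同一份 train\u002Fvalid\u002Ftest 上评估\nimport numpy as np\nimport pandas as pd\n\ndef get_splits(df, seed=42):\n    rnd = np.random.RandomState(seed).rand(len(df))  # 种子固定后，rnd 每次相同\n    train = df[rnd \u003C 0.8]\n    valid = df[(rnd >= 0.8) & (rnd \u003C 0.9)]\n    test = df[rnd >= 0.9]\n    return train, valid, test\n```\n\n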
#### 65) 贝叶斯统计与频率派统计有何区别？ [[src]](https:\u002F\u002Fwww.kdnuggets.com\u002F2022\u002F10\u002Fnlp-interview-questions.html)\n频率派统计是一种以样本统计量来估计总体参数，并提供点估计和置信区间的框架。\n\n而贝叶斯统计则是一种利用先验知识和信息来更新对某个参数或假设的信念，并为参数提供概率分布的框架。\n\n两者的主要区别在于：贝叶斯统计将先验知识和信念纳入分析过程，而频率派统计则不考虑这些因素。\n\n\n#### 66) LSTM与Transformer的基本区别是什么？ [[src]](https:\u002F\u002Fblog.finxter.com\u002Ftransformer-vs-lstm\u002F#:~:text=LSTM%20models%20consist%20of%20RNN,feed-forward%20neural%20network%20components.)\nLSTM（长短期记忆网络）由RNN单元组成，这些单元经过设计，能够在时间步之间更高效地存储和操作信息。相比之下，Transformer模型包含一系列编码器和解码器层，每一层都由自注意力机制和前馈神经网络组件构成。\n\n#### 67) 什么是RCNN？[[src]](https:\u002F\u002Ftowardsdatascience.com\u002Flearn-rcnns-with-this-toy-dataset-be19dce380ec)\nRCNN 在文献中可能指两类不同的模型。一类是循环卷积网络（Recurrent CNN），专门设计用于对图像序列（通常也称为视频）进行预测；另一类是目标检测中更常见的 R-CNN（基于区域的卷积神经网络），它将区域建议技术与卷积神经网络相结合，用于识别和定位图像中的目标。\n\n\n## 贡献\n我们非常欢迎各位的贡献。\n1. Fork（复刻）本仓库。\n2. 提交你的*问题*或*答案*。\n3. 打开**拉取请求**。","# MLQuestions 快速上手指南\n\n**工具简介**\nMLQuestions 是一个开源的机器学习与计算机视觉面试题库，收录了 65+ 道核心技术问题及解答，涵盖偏差方差权衡、梯度下降、过拟合处理、CNN 原理、正则化等关键知识点。本指南旨在帮助开发者快速访问并利用该资源进行技术准备。\n\n## 环境准备\n\n本项目为纯文本\u002FMarkdown 格式的问答集合，**无需安装任何软件环境或依赖库**即可阅读。\n\n*   **系统要求**：任意操作系统（Windows, macOS, Linux）。\n*   **前置依赖**：\n    *   Web 浏览器（推荐 Chrome, Edge, Firefox）用于在线浏览。\n    *   或 Git 命令行工具（可选，用于本地克隆仓库）。\n    *   Markdown 阅读器（可选，用于本地查看 `.md` 文件）。\n\n## 安装步骤\n\n你可以选择在线直接阅读，或将内容克隆到本地离线查阅。\n\n### 方式一：在线浏览（推荐）\n直接访问 GitHub 仓库页面，无需任何操作：\n[https:\u002F\u002Fgithub.com\u002Fandrewekhalel\u002FMLQuestions](https:\u002F\u002Fgithub.com\u002Fandrewekhalel\u002FMLQuestions)\n\n### 方式二：本地克隆\n如果你希望离线阅读或贡献内容，请使用以下命令克隆仓库：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fandrewekhalel\u002FMLQuestions.git\ncd MLQuestions\n```\n\n> **提示**：如果在国内访问 GitHub 速度较慢，可尝试使用国内代码托管平台（如 Gitee）搜索同名镜像项目，或使用加速代理配置 Git。\n\n## 基本使用\n\n### 1. 浏览核心面试题\n进入项目根目录后，打开 `README.md` 文件即可看到按顺序排列的面试题目。每个问题下方通常附有简要解答思路及外部详细解答链接（如 Towards Data Science, GeeksforGeeks 等）。\n\n**示例：查看“偏差与方差的权衡”问题**\n在文件中定位到 `#### 1) What's the trade-off between bias and variance?`，阅读其下方的解释：\n> 如果模型过于简单且参数很少，则可能具有高偏差和低方差。反之，如果模型参数过多，则具有高方差和低偏差。我们需要找到合适的平衡点，避免过拟合和欠拟合。\n\n### 2. 专项领域复习\n该项目包含特定领域的子模块，例如自然语言处理（NLP）。\n\n**访问 NLP 专题：**\n*   **在线**：点击 README 中的 \"Natural Language Processing (NLP) Interview Questions 2026\" 链接。\n*   **本地**：进入子目录查看相关文件：\n    ```bash\n    cd NLP\n    ls\n    # 使用任意文本编辑器打开对应的 .md 文件\n    code . \n    ```\n\n### 3. 
结合推荐资源深度学习\nREADME 中列出了经典的备考书籍和课程，建议配合题目进行系统性学习：\n*   **统计基础**：*All of Statistics* (Larry Wasserman)\n*   **机器学习理论**：*Machine Learning* (Tom Mitchell)\n*   **工程实践**：*Designing Machine Learning Systems* (Chip Huyen)\n\n你可以利用搜索引擎或图书馆获取这些书籍的中文版或英文版，针对题库中提到的概念（如 PCA、RANSAC、ReLU vs Sigmoid）进行深入研读。","资深算法工程师李明正在备战一家头部科技公司的机器学习岗位面试，需要在短时间内系统梳理计算机视觉与深度学习的核心考点。\n\n### 没有 MLQuestions 时\n- 复习范围模糊，只能在海量博客和论坛中盲目搜索“偏置与方差”、“梯度下降”等基础概念，难以确定考察深度。\n- 缺乏针对最新趋势（如 2026 年新增的 NLP 方向）的整理，容易遗漏大模型时代的关键面试题。\n- 只能被动阅读零散答案，缺乏模拟实战的问答逻辑，导致在解释“过拟合与欠拟合”时条理不清，无法展现工程化思维。\n- 准备过程耗时巨大，大量时间浪费在筛选低质量资料上，挤压了手写代码和系统设计练习的时间。\n\n### 使用 MLQuestions 后\n- 直接获取涵盖 65+ 道精选题目的清单，从基础的统计推断到复杂的 CV\u002FNLP 场景，复习边界清晰明确。\n- 紧跟技术前沿，利用最新补充的 NLP 专项题库，精准覆盖大语言模型相关的面试热点，消除知识盲区。\n- 参照提供的标准解答思路与权威教材链接（如 Chip Huyen 的著作），将零散知识点串联成系统的回答框架，表达更具逻辑性。\n- 高效利用碎片时间聚焦核心考点，节省下的精力可用于配合推荐的模拟面试服务进行实战演练，显著提升自信。\n\nMLQuestions 将原本杂乱无章的备考过程转化为一条清晰、高效且紧跟前沿的通关路径，帮助候选人用最短时间构建起扎实的技术防御体系。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fandrewekhalel_MLQuestions_66f3b0f8.png","andrewekhalel","Andrew Khalel","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fandrewekhalel_951c6c17.jpg","Work hard 💻   Play hard 💃",null,"Madrid","andrew_emel@hotmail.com","https:\u002F\u002Fgithub.com\u002Fandrewekhalel",4553,743,"2026-04-09T02:47:20","","未说明",{"notes":90,"python":88,"dependencies":91},"该项目并非可运行的 AI 软件工具，而是一个包含机器学习与计算机视觉面试题的文档集合（Markdown\u002F文本格式）。因此，它不需要特定的操作系统、GPU、内存、Python 环境或任何依赖库。用户只需通过浏览器查看或使用任意文本编辑器阅读即可。",[],[18],"2026-03-27T02:49:30.150509","2026-04-09T21:34:03.791636",[],[]]