[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-justmarkham--DAT8":3,"tool-justmarkham--DAT8":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":80,"owner_email":81,"owner_twitter":81,"owner_website":82,"owner_url":83,"languages":84,"stars":97,"forks":98,"last_commit_at":99,"license":81,"difficulty_score":23,"env_os":100,"env_gpu":101,"env_ram":101,"env_deps":102,"category_tags":113,"github_topics":114,"view_count":23,"oss_zip_url":81,"oss_zip_packed_at":81,"status":16,"created_at":133,"updated_at":134,"faqs":135,"releases":136},2104,"justmarkham\u002FDAT8","DAT8","General Assembly's 2015 Data Science course in Washington, DC","DAT8 是通用集会（General Assembly）于 2015 年在华盛顿特区开设的数据科学课程全套开源资料库。它并非一个可直接运行的软件工具，而是一套结构完整、循序渐进的教学资源，旨在帮助学习者从零开始掌握数据科学的核心技能。\n\n这套资料系统地解决了数据科学入门难、知识碎片化的问题。课程内容涵盖从命令行操作、版本控制、数据清洗、探索性分析、可视化，到机器学习算法（如线性回归、逻辑回归、KNN、决策树、聚类等）、模型评估、自然语言处理及实战项目等全流程。所有材料按课时整理，配合讲师 Kevin Markham 的详细讲解与代码示例，让抽象概念变得具体可操作。\n\nDAT8 特别适合希望系统学习数据科学的初学者、转行者或自学者，尤其是具备基础编程兴趣但缺乏完整学习路径的开发者、分析师或研究人员。其独特亮点在于由知名数据教育者主导，内容经过实际课堂验证，并支持通过 Binder 一键在线运行笔记，无需本地配置即可体验部分实验环境。此外，课程还整合了丰富的 Python 学习资源推荐，帮助用户夯实基础。无论是想构建知识体系，还是寻找教学参考，DAT8 都是一份值得深入挖掘的宝贵资产。","## DAT8 Course Repository\n\nCourse materials for [General Assembly's Data Science course](https:\u002F\u002Fgeneralassemb.ly\u002Feducation\u002Fdata-science\u002Fwashington-dc\u002F) in Washington, 
DC (8\u002F18\u002F15 - 10\u002F29\u002F15).\n\n**Instructor:** Kevin Markham ([Data School blog](http:\u002F\u002Fwww.dataschool.io\u002F), [email newsletter](http:\u002F\u002Fwww.dataschool.io\u002Fsubscribe\u002F), [YouTube channel](https:\u002F\u002Fwww.youtube.com\u002Fuser\u002Fdataschool))\n\n[![Binder](http:\u002F\u002Fmybinder.org\u002Fbadge.svg)](http:\u002F\u002Fmybinder.org\u002Frepo\u002Fjustmarkham\u002FDAT8)\n\nTuesday | Thursday\n--- | ---\n8\u002F18: [Introduction to Data Science](#class-1-introduction-to-data-science) | 8\u002F20: [Command Line, Version Control](#class-2-command-line-and-version-control)\n8\u002F25: [Data Reading and Cleaning](#class-3-data-reading-and-cleaning) | 8\u002F27: [Exploratory Data Analysis](#class-4-exploratory-data-analysis)\n9\u002F1: [Visualization](#class-5-visualization) | 9\u002F3: [Machine Learning](#class-6-machine-learning)\n9\u002F8: [Getting Data](#class-7-getting-data) | 9\u002F10: [K-Nearest Neighbors](#class-8-k-nearest-neighbors)\n9\u002F15: [Basic Model Evaluation](#class-9-basic-model-evaluation) | 9\u002F17: [Linear Regression](#class-10-linear-regression)\n9\u002F22: [First Project Presentation](#class-11-first-project-presentation) | 9\u002F24: [Logistic Regression](#class-12-logistic-regression)\n9\u002F29: [Advanced Model Evaluation](#class-13-advanced-model-evaluation) | 10\u002F1: [Naive Bayes and Text Data](#class-14-naive-bayes-and-text-data)\n10\u002F6: [Natural Language Processing](#class-15-natural-language-processing) | 10\u002F8: [Kaggle Competition](#class-16-kaggle-competition)\n10\u002F13: [Decision Trees](#class-17-decision-trees) | 10\u002F15: [Ensembling](#class-18-ensembling)\n10\u002F20: [Advanced scikit-learn, Clustering](#class-19-advanced-scikit-learn-and-clustering) | 10\u002F22: [Regularization, Regex](#class-20-regularization-and-regular-expressions)\n10\u002F27: [Course Review](#class-21-course-review-and-final-project-presentation) | 10\u002F29: [Final Project 
Presentation](#class-22-final-project-presentation)\n\n\u003C!--\n### Before the Course Begins\n* Install [Git](http:\u002F\u002Fgit-scm.com\u002Fdownloads).\n* Create an account on the [GitHub](https:\u002F\u002Fgithub.com\u002F) website.\n    * It is not necessary to download \"GitHub for Windows\" or \"GitHub for Mac\"\n* Install the [Anaconda distribution](http:\u002F\u002Fcontinuum.io\u002Fdownloads) of Python 2.7x.\n    * If you choose not to use Anaconda, here is a list of the [Python packages](other\u002Fpython_packages.md) you will need to install during the course.\n* We would like to check the setup of your laptop before the course begins:\n    * You can have your laptop checked before the intermediate Python workshop on Tuesday 8\u002F11 (5:30pm-6:30pm), at the [15th & K Starbucks](http:\u002F\u002Fwww.yelp.com\u002Fbiz\u002Fstarbucks-washington-15) on Saturday 8\u002F15 (1pm-3pm), or before class on Tuesday 8\u002F18 (5:30pm-6:30pm).\n    * Alternatively, you can walk through the [setup checklist](other\u002Fsetup_checklist.md) yourself.\n* Once you receive an email invitation from Slack, join our \"DAT8 team\" and add your photo.\n* Practice Python using the resources below.\n-->\n\n### Python Resources\n* [Codecademy's Python course](http:\u002F\u002Fwww.codecademy.com\u002Fen\u002Ftracks\u002Fpython): Good beginner material, including tons of in-browser exercises.\n* [Dataquest](https:\u002F\u002Fwww.dataquest.io): Uses interactive exercises to teach Python in the context of data science.\n* [Google's Python Class](https:\u002F\u002Fdevelopers.google.com\u002Fedu\u002Fpython\u002F): Slightly more advanced, including hours of useful lecture videos and downloadable exercises (with solutions).\n* [Introduction to Python](http:\u002F\u002Fintrotopython.org\u002F): A series of IPython notebooks that do a great job explaining core Python concepts and data structures.\n* [Python for Informatics](http:\u002F\u002Fwww.pythonlearn.com\u002Fbook.php): A very 
beginner-oriented book, with associated [slides](https:\u002F\u002Fdrive.google.com\u002Ffolderview?id=0B7X1ycQalUnyal9yeUx3VW81VDg&usp=sharing) and [videos](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLlRFEj9H3Oj4JXIwMwN1_ss1Tk8wZShEJ).\n* [A Crash Course in Python for Scientists](http:\u002F\u002Fnbviewer.ipython.org\u002Fgist\u002Frpmuller\u002F5920182): Read through the Overview section for a very quick introduction to Python.\n* [Python 2.7 Quick Reference](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fpython-reference\u002Fblob\u002Fmaster\u002Freference.py): My beginner-oriented guide that demonstrates Python concepts through short, well-commented examples.\n* [Beginner](code\u002F00_python_beginner_workshop.py) and [intermediate](code\u002F00_python_intermediate_workshop.py) workshop code: Useful for review and reference.\n* [Python Tutor](http:\u002F\u002Fpythontutor.com\u002F): Allows you to visualize the execution of Python code.\n\n\u003C!--\n### Submission Forms\n* [Feedback form](http:\u002F\u002Fbit.ly\u002Fdat8feedback)\n* [Homework and project submissions](http:\u002F\u002Fbit.ly\u002Fdat8homework)\n-->\n\n### [Course project](project\u002FREADME.md)\n\n### [Comparison of machine learning models](other\u002Fmodel_comparison.md)\n\n### [Comparison of model evaluation procedures and metrics](other\u002Fmodel_evaluation_comparison.md)\n\n### [Advice for getting better at data science](other\u002Fadvice.md)\n\n### [Additional resources](#additional-resources-1)\n\n-----\n\n### Class 1: Introduction to Data Science\n* Course overview ([slides](slides\u002F01_course_overview.pdf))\n* Introduction to data science ([slides](slides\u002F01_intro_to_data_science.pdf))\n* Discuss the course project: [requirements](project\u002FREADME.md) and [example projects](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples)\n* Types of data ([slides](slides\u002F01_types_of_data.pdf)) and [public data 
sources](project\u002Fpublic_data.md)\n* Welcome from General Assembly staff\n\n**Homework:**\n* Work through GA's friendly [command line tutorial](http:\u002F\u002Fgeneralassembly.github.io\u002Fprework\u002Fcommand-line\u002F#\u002F) using Terminal (Linux\u002FMac) or Git Bash (Windows).\n* Read through this [command line reference](code\u002F02_command_line.md), and complete the pre-class exercise at the bottom. (There's nothing you need to submit once you're done.)\n* Watch videos 1 through 8 (21 minutes) of [Introduction to Git and GitHub](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PL5-da3qGB5IBLMp7LtN8Nc3Efd4hJq0kD), or read sections 1.1 through 2.2 of [Pro Git](http:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2).\n* If your laptop has any setup issues, please work with us to resolve them by Thursday. If your laptop has not yet been checked, you should come early on Thursday, or just walk through the [setup checklist](other\u002Fsetup_checklist.md) yourself (and let us know you have done so).\n\n**Resources:**\n* For a useful look at the different types of data scientists, read [Analyzing the Analyzers](http:\u002F\u002Fcdn.oreillystatic.com\u002Foreilly\u002Fradarreport\u002F0636920029014\u002FAnalyzing_the_Analyzers.pdf) (32 pages).\n* For some thoughts on what it's like to be a data scientist, read these short posts from [Win-Vector](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2012\u002F09\u002Fon-being-a-data-scientist\u002F) and [Datascope Analytics](http:\u002F\u002Fdatascopeanalytics.com\u002Fwhat-we-think\u002F2014\u002F07\u002F31\u002Fsix-qualities-of-a-great-data-scientist).\n* Quora has a [data science topic FAQ](https:\u002F\u002Fwww.quora.com\u002FData-Science) with lots of interesting Q&A.\n* Keep up with local data-related events through the Data Community DC [event calendar](http:\u002F\u002Fwww.datacommunitydc.org\u002Fcalendar) or [weekly 
newsletter](http:\u002F\u002Fwww.datacommunitydc.org\u002Fnewsletter).\n\n-----\n\n### Class 2: Command Line and Version Control\n* Slack tour\n* Review the command line pre-class exercise ([code](code\u002F02_command_line.md))\n* Git and GitHub ([slides](slides\u002F02_git_github.pdf))\n* Intermediate command line\n\n**Homework:**\n* Complete the [command line homework assignment](homework\u002F02_command_line_chipotle.md) with the Chipotle data.\n* Review the code from the [beginner](code\u002F00_python_beginner_workshop.py) and [intermediate](code\u002F00_python_intermediate_workshop.py) Python workshops. If you don't feel comfortable with any of the content (excluding the \"requests\" and \"APIs\" sections), you should spend some time this weekend practicing Python:\n    * [Introduction to Python](http:\u002F\u002Fintrotopython.org\u002F) does a great job explaining Python essentials and includes tons of example code.\n    * If you like learning from a book, [Python for Informatics](http:\u002F\u002Fwww.pythonlearn.com\u002Fhtml-270\u002F) has useful chapters on strings, lists, and dictionaries.\n    * If you prefer interactive exercises, try these lessons from [Codecademy](http:\u002F\u002Fwww.codecademy.com\u002Fen\u002Ftracks\u002Fpython): \"Python Lists and Dictionaries\" and \"A Day at the Supermarket\".\n    * If you have more time, try missions 2 and 3 from [DataQuest's Learning Python](https:\u002F\u002Fwww.dataquest.io\u002Fcourse\u002Flearning-python) course.\n    * If you've already mastered these topics and want more of a challenge, try solving [Python Challenge](http:\u002F\u002Fwww.pythonchallenge.com\u002F) number 1 (decoding a message) and send me your code in Slack.\n* To give you a framework for thinking about your project, watch [What is machine learning, and how does it work?](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=elojMnjn4kk) (10 minutes). 
(This is the [IPython notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F01_machine_learning_intro.ipynb) shown in the video.) Alternatively, read [A Visual Introduction to Machine Learning](http:\u002F\u002Fwww.r2d3.us\u002Fvisual-intro-to-machine-learning-part-1\u002F), which focuses on a specific machine learning model called decision trees.\n* **Optional:** Browse through some more [example student projects](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples), which may help to inspire your own project!\n\n**Git and Markdown Resources:**\n* [Pro Git](http:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2) is an excellent book for learning Git. Read the first two chapters to gain a deeper understanding of version control and basic commands.\n* If you want to practice a lot of Git (and learn many more commands), [Git Immersion](http:\u002F\u002Fgitimmersion.com\u002F) looks promising.\n* If you want to understand how to contribute on GitHub, you first have to understand [forks and pull requests](http:\u002F\u002Fwww.dataschool.io\u002Fsimple-guide-to-forks-in-github-and-git\u002F).\n* [GitRef](http:\u002F\u002Fgitref.org\u002F) is my favorite reference guide for Git commands, and [Git quick reference for beginners](http:\u002F\u002Fwww.dataschool.io\u002Fgit-quick-reference-for-beginners\u002F) is a shorter guide with commands grouped by workflow.\n* [Cracking the Code to GitHub's Growth](https:\u002F\u002Fgrowthhackers.com\u002Fgrowth-studies\u002Fgithub) explains why GitHub is so popular among developers.\n* [Markdown Cheatsheet](https:\u002F\u002Fgithub.com\u002Fadam-p\u002Fmarkdown-here\u002Fwiki\u002FMarkdown-Cheatsheet) provides a thorough set of Markdown examples with concise explanations. 
GitHub's [Mastering Markdown](https:\u002F\u002Fguides.github.com\u002Ffeatures\u002Fmastering-markdown\u002F) is a simpler and more attractive guide, but is less comprehensive.\n\n**Command Line Resources:**\n* If you want to go much deeper into the command line, [Data Science at the Command Line](http:\u002F\u002Fshop.oreilly.com\u002Fproduct\u002F0636920032823.do) is a great book. The [companion website](http:\u002F\u002Fdatascienceatthecommandline.com\u002F) provides installation instructions for a \"data science toolbox\" (a virtual machine with many more command line tools), as well as a long reference guide to popular command line tools.\n* If you want to do more at the command line with CSV files, try out [csvkit](http:\u002F\u002Fcsvkit.readthedocs.org\u002F), which can be installed via `pip`.\n\n-----\n\n### Class 3: Data Reading and Cleaning\n* Git and GitHub assorted tips ([slides](slides\u002F02_git_github.pdf))\n* Review command line homework ([solution](homework\u002F02_command_line_chipotle.md))\n* Python:\n    * Spyder interface\n    * Looping exercise\n    * Lesson on file reading with airline safety data ([code](code\u002F03_file_reading.py), [data](data\u002Fairlines.csv), [article](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fshould-travelers-avoid-flying-airlines-that-have-had-crashes-in-the-past\u002F))\n    * Data cleaning exercise\n    * Walkthrough of Python homework with Chipotle data ([code](code\u002F03_python_homework_chipotle.py), [data](data\u002Fchipotle.tsv), [article](http:\u002F\u002Fwww.nytimes.com\u002Finteractive\u002F2015\u002F02\u002F17\u002Fupshot\u002Fwhat-do-people-actually-order-at-chipotle.html))\n\n**Homework:**\n* Complete the [Python homework assignment](code\u002F03_python_homework_chipotle.py) with the Chipotle data, add a commented Python script to your GitHub repo, and submit a link using the homework submission form. You have until Tuesday (9\u002F1) to complete this assignment. 
(**Note:** Pandas, which is covered in class 4, should not be used for this assignment.)\n\n**Resources:**\n* [Want to understand Python's comprehensions? Think in Excel or SQL](http:\u002F\u002Fblog.lerner.co.il\u002Fwant-to-understand-pythons-comprehensions-think-like-an-accountant\u002F) may be helpful if you are still confused by list comprehensions.\n* [My code isn't working](http:\u002F\u002Fwww.tecoed.co.uk\u002Fuploads\u002F1\u002F4\u002F2\u002F4\u002F14249012\u002F624506_orig.png) is a great flowchart explaining how to debug Python errors.\n* [PEP 8](https:\u002F\u002Fwww.python.org\u002Fdev\u002Fpeps\u002Fpep-0008\u002F) is Python's \"classic\" style guide, and is worth a read if you want to write readable code that is consistent with the rest of the Python community.\n* If you want to understand Python at a deeper level, Ned Batchelder's [Loop Like A Native](http:\u002F\u002Fnedbatchelder.com\u002Ftext\u002Fiter.html) and [Python Names and Values](http:\u002F\u002Fnedbatchelder.com\u002Ftext\u002Fnames1.html) are excellent presentations.\n\n-----\n\n### Class 4: Exploratory Data Analysis\n* Pandas ([code](code\u002F04_pandas.py)):\n    * MovieLens 100k movie ratings ([data](data\u002Fu.user), [data dictionary](http:\u002F\u002Ffiles.grouplens.org\u002Fdatasets\u002Fmovielens\u002Fml-100k-README.txt), [website](http:\u002F\u002Fgrouplens.org\u002Fdatasets\u002Fmovielens\u002F))\n    * Alcohol consumption by country ([data](data\u002Fdrinks.csv), [article](http:\u002F\u002Ffivethirtyeight.com\u002Fdatalab\u002Fdear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits\u002F))\n    * Reports of UFO sightings ([data](data\u002Fufo.csv), [website](http:\u002F\u002Fwww.nuforc.org\u002Fwebreports.html))\n* Project question exercise\n\n**Homework:**\n* The deadline for discussing your project ideas with an instructor is Tuesday (9\u002F1), and your project question write-up is due Thursday (9\u002F3).\n* Read [How Software in Half of NYC Cabs 
Generates $5.2 Million a Year in Extra Tips](http:\u002F\u002Fiquantny.tumblr.com\u002Fpost\u002F107245431809\u002Fhow-software-in-half-of-nyc-cabs-generates-5-2) for an excellent example of exploratory data analysis.\n* Read [Anscombe's Quartet, and Why Summary Statistics Don't Tell the Whole Story](http:\u002F\u002Fdata.heapanalytics.com\u002Fanscombes-quartet-and-why-summary-statistics-dont-tell-the-whole-story\u002F) for a classic example of why visualization is useful.\n\n**Resources:**\n* Browsing or searching the Pandas [API Reference](http:\u002F\u002Fpandas.pydata.org\u002Fpandas-docs\u002Fstable\u002Fapi.html) is an excellent way to locate a function even if you don't know its exact name.\n* [What I do when I get a new data set as told through tweets](http:\u002F\u002Fsimplystatistics.org\u002F2014\u002F06\u002F13\u002Fwhat-i-do-when-i-get-a-new-data-set-as-told-through-tweets\u002F) is a fun (yet enlightening) look at the process of exploratory data analysis.\n\n-----\n\n### Class 5: Visualization\n* Python homework with the Chipotle data due ([solution](code\u002F03_python_homework_chipotle.py), [detailed explanation](notebooks\u002F03_python_homework_chipotle_explained.ipynb))\n* Part 2 of Exploratory Data Analysis with Pandas ([code](code\u002F04_pandas.py))\n* Visualization with Pandas and Matplotlib ([notebook](notebooks\u002F05_pandas_visualization.ipynb))\n\n**Homework:**\n* Your project question write-up is due on Thursday.\n* Complete the [Pandas homework assignment](code\u002F05_pandas_homework_imdb.py) with the [IMDb data](data\u002Fimdb_1000.csv). You have until Tuesday (9\u002F8) to complete this assignment.\n* If you're not using Anaconda, install the [Jupyter Notebook](http:\u002F\u002Fjupyter.readthedocs.org\u002Fen\u002Flatest\u002Finstall.html) (formerly known as the IPython Notebook) using `pip`. 
(The Jupyter or IPython Notebook is included with Anaconda.)\n\n**Pandas Resources:**\n* To learn more Pandas, read this [three-part tutorial](http:\u002F\u002Fwww.gregreda.com\u002F2013\u002F10\u002F26\u002Fintro-to-pandas-data-structures\u002F), or review these two excellent (but extremely long) notebooks on Pandas: [introduction](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_5-Introduction-to-Pandas.ipynb) and [data wrangling](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_6-Data-Wrangling-with-Pandas.ipynb).\n* If you want to go really deep into Pandas (and NumPy), read the book [Python for Data Analysis](http:\u002F\u002Fshop.oreilly.com\u002Fproduct\u002F0636920023784.do), written by the creator of Pandas.\n* This notebook demonstrates the different types of [joins in Pandas](notebooks\u002F05_pandas_merge.ipynb), for when you need to figure out how to merge two DataFrames.\n* This is a nice, short tutorial on [pivot tables](https:\u002F\u002Fbeta.oreilly.com\u002Flearning\u002Fpivot-tables) in Pandas.\n* For working with geospatial data in Python, [GeoPandas](http:\u002F\u002Fgeopandas.org\u002Findex.html) looks promising. 
This [tutorial](http:\u002F\u002Fmichelleful.github.io\u002Fcode-blog\u002F2015\u002F04\u002F24\u002Fsgmap\u002F) uses GeoPandas (and scikit-learn) to build a \"linguistic street map\" of Singapore.\n\n**Visualization Resources:**\n* Watch [Look at Your Data](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=coNDCIMH8bk) (18 minutes) for an excellent example of why visualization is useful for understanding your data.\n* For more on Pandas plotting, read this [notebook](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_7-Plotting-with-Pandas.ipynb) or the [visualization page](http:\u002F\u002Fpandas.pydata.org\u002Fpandas-docs\u002Fstable\u002Fvisualization.html) from the official Pandas documentation.\n* To learn how to customize your plots further, browse through this [notebook on matplotlib](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_4-Matplotlib.ipynb) or this [similar notebook](https:\u002F\u002Fgithub.com\u002Fjrjohansson\u002Fscientific-python-lectures\u002Fblob\u002Fmaster\u002FLecture-4-Matplotlib.ipynb).\n* Read [Overview of Python Visualization Tools](http:\u002F\u002Fpbpython.com\u002Fvisualization-tools-1.html) for a useful comparison of Matplotlib, Pandas, Seaborn, ggplot, Bokeh, Pygal, and Plotly.\n* To explore different types of visualizations and when to use them, [Choosing a Good Chart](http:\u002F\u002Fextremepresentation.typepad.com\u002Ffiles\u002Fchoosing-a-good-chart-09.pdf) and [The Graphic Continuum](http:\u002F\u002Fwww.coolinfographics.com\u002Fstorage\u002Fpost-images\u002FThe-Graphic-Continuum-POSTER.jpg) are nice one-page references, and the interactive [R Graph Catalog](http:\u002F\u002Fshiny.stat.ubc.ca\u002Fr-graph-catalog\u002F) has handy filtering capabilities.\n* This [PowerPoint 
presentation](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002FDataMining\u002FColumbia2011\u002FSlides\u002FTopic2-EDAViz.ppt) from Columbia's Data Mining class contains lots of good advice for properly using different types of visualizations.\n* [Harvard's Data Science course](http:\u002F\u002Fcs109.github.io\u002F2014\u002F) includes an excellent lecture on [Visualization Goals, Data Types, and Statistical Graphs](http:\u002F\u002Fcm.dce.harvard.edu\u002F2015\u002F01\u002F14328\u002FL03\u002Fscreen_H264LargeTalkingHead-16x9.shtml) (83 minutes), for which the [slides](https:\u002F\u002Fdocs.google.com\u002Ffile\u002Fd\u002F0B7IVstmtIvlHLTdTbXdEVENoRzQ\u002Fedit) are also available.\n\n-----\n\n### Class 6: Machine Learning\n* Part 2 of Visualization with Pandas and Matplotlib ([notebook](notebooks\u002F05_pandas_visualization.ipynb))\n* Brief introduction to the Jupyter\u002FIPython Notebook\n* \"Human learning\" exercise:\n    * [Iris dataset](http:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FIris) hosted by the UCI Machine Learning Repository\n    * [Iris photo](http:\u002F\u002Fsebastianraschka.com\u002FImages\u002F2014_python_lda\u002Firis_petal_sepal.png)\n    * [Notebook](notebooks\u002F06_human_learning_iris.ipynb)\n* Introduction to machine learning ([slides](slides\u002F06_machine_learning.pdf))\n\n**Homework:**\n* **Optional:** Complete the bonus exercise listed in the [human learning notebook](notebooks\u002F06_human_learning_iris.ipynb). It will take the place of any one homework you miss, past or future! This is due on Tuesday (9\u002F8).\n* If you're not using Anaconda, install [requests](http:\u002F\u002Fwww.python-requests.org\u002Fen\u002Flatest\u002Fuser\u002Finstall\u002F) and [Beautiful Soup 4](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F#installing-beautiful-soup) using `pip`. 
(Both of these packages are included with Anaconda.)\n\n**Machine Learning Resources:**\n* For a very quick summary of the key points about machine learning, watch [What is machine learning, and how does it work?](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=elojMnjn4kk) (10 minutes) or read the [associated notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F01_machine_learning_intro.ipynb).\n* For a more in-depth introduction to machine learning, read section 2.1 (14 pages) of Hastie and Tibshirani's excellent book, [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F). (It's a free PDF download!)\n* The [Learning Paradigms](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F014.html) video (13 minutes) from [Caltech's Learning From Data course](http:\u002F\u002Fwork.caltech.edu\u002Ftelecourse.html) provides a nice comparison of supervised versus unsupervised learning, as well as an introduction to \"reinforcement learning\".\n* [Real-World Active Learning](https:\u002F\u002Fbeta.oreilly.com\u002Fideas\u002Freal-world-active-learning) is a readable and thorough introduction to \"active learning\", a variation of machine learning in which humans label only the most \"important\" observations.\n* For a preview of some of the machine learning content we will cover during the course, read Sebastian Raschka's [overview of the supervised learning process](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification\u002Fblob\u002Fmaster\u002Fmachine_learning\u002Fsupervised_intro\u002Fintroduction_to_supervised_machine_learning.md).\n* [Data Science, Machine Learning, and Statistics: What is in a Name?](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2013\u002F04\u002Fdata-science-machine-learning-and-statistics-what-is-in-a-name\u002F) discusses the differences between these (and other) terms.\n* [The Emoji Translation 
Project](https:\u002F\u002Fwww.kickstarter.com\u002Fprojects\u002Ffred\u002Fthe-emoji-translation-project) is a really fun application of machine learning.\n* Look up the [characteristics of your zip code](http:\u002F\u002Fwww.esri.com\u002Flanding-pages\u002Ftapestry\u002F), and then read about the [67 distinct segments](http:\u002F\u002Fdoc.arcgis.com\u002Fen\u002Fesri-demographics\u002Fdata\u002Ftapestry-segmentation.htm) in detail.\n\n**IPython Notebook Resources:**\n* For a recap of the IPython Notebook introduction (and a preview of scikit-learn), watch [scikit-learn and the IPython Notebook](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=IsXXlYVBt1M) (15 minutes) or read the [associated notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F02_machine_learning_setup.ipynb).\n* If you would like to learn the IPython Notebook, the official [Notebook tutorials](https:\u002F\u002Fgithub.com\u002Fjupyter\u002Fnotebook\u002Fblob\u002Fmaster\u002Fdocs\u002Fsource\u002Fexamples\u002FNotebook\u002FExamples%20and%20Tutorials%20Index.ipynb) are useful.\n* This [Reddit discussion](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FPython\u002Fcomments\u002F3be5z2\u002Fdo_you_prefer_ipython_notebook_over_ipython\u002F) compares the relative strengths of the IPython Notebook and Spyder.\n\n-----\n\n### Class 7: Getting Data\n* Pandas homework with the IMDb data due ([solution](code\u002F05_pandas_homework_imdb.py))\n* Optional \"human learning\" exercise with the iris data due ([solution](notebooks\u002F06_human_learning_iris.ipynb))\n* APIs ([code](code\u002F07_api.py))\n    * [OMDb API](http:\u002F\u002Fwww.omdbapi.com\u002F)\n* Web scraping ([code](code\u002F07_web_scraping.py))\n    * [IMDb: robots.txt](http:\u002F\u002Fwww.imdb.com\u002Frobots.txt)\n    * [Example web page](data\u002Fexample.html)\n    * [IMDb: The Shawshank Redemption](http:\u002F\u002Fwww.imdb.com\u002Ftitle\u002Ftt0111161\u002F)\n\n**Homework:**\n* 
**Optional:** Complete the homework exercise listed in the [web scraping code](code\u002F07_web_scraping.py). It will take the place of any one homework you miss, past or future! This is due on Tuesday (9\u002F15).\n* **Optional:** If you're not using Anaconda, [install Seaborn](http:\u002F\u002Fstanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Finstalling.html) using `pip`. If you're using Anaconda, install Seaborn by running `conda install seaborn` at the command line. (Note that some students in past courses have had problems with Anaconda after installing Seaborn.)\n\n**API Resources:**\n* This Python script to [query the U.S. Census API](https:\u002F\u002Fgithub.com\u002Flaurakurup\u002Fcensus-api) was created by a former DAT student. It's a bit more complicated than the example we used in class, it's very well commented, and it may provide a useful framework for writing your own code to query APIs.\n* [Mashape](https:\u002F\u002Fwww.mashape.com\u002Fexplore) and [Apigee](https:\u002F\u002Fapigee.com\u002Fproviders) allow you to explore tons of different APIs. 
Alternatively, a [Python API wrapper](http:\u002F\u002Fwww.pythonforbeginners.com\u002Fapi\u002Flist-of-python-apis) is available for many popular APIs.\n* The [Data Science Toolkit](http:\u002F\u002Fwww.datasciencetoolkit.org\u002F) is a collection of location-based and text-related APIs.\n* [API Integration in Python](https:\u002F\u002Frealpython.com\u002Fblog\u002Fpython\u002Fapi-integration-in-python\u002F) provides a very readable introduction to REST APIs.\n* Microsoft's [Face Detection API](https:\u002F\u002Fwww.projectoxford.ai\u002Fdemo\u002Fface#detection), which powers [How-Old.net](http:\u002F\u002Fhow-old.net\u002F), is a great example of how a machine learning API can be leveraged to produce a compelling web application.\n\n**Web Scraping Resources:**\n* The [Beautiful Soup documentation](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F) is incredibly thorough, but is hard to use as a reference guide. However, the section on [specifying a parser](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F#specifying-the-parser-to-use) may be helpful if Beautiful Soup appears to be parsing a page incorrectly.\n* For more Beautiful Soup examples and tutorials, see [Web Scraping 101 with Python](http:\u002F\u002Fwww.gregreda.com\u002F2013\u002F03\u002F03\u002Fweb-scraping-101-with-python\u002F), a former DAT student's well-commented notebook on [scraping Craigslist](https:\u002F\u002Fgithub.com\u002FAlexjmsherman\u002FDataScience_GeneralAssembly\u002Fblob\u002Fmaster\u002FFinal_Project\u002F1.%20Final_Project_Data%20Scraping.ipynb), this [notebook](http:\u002F\u002Fweb.stanford.edu\u002F~zlotnick\u002FTextAsData\u002FWeb_Scraping_with_Beautiful_Soup.html) from Stanford's Text As Data course, and this [notebook](https:\u002F\u002Fgithub.com\u002Fcs109\u002F2014\u002Fblob\u002Fmaster\u002Flectures\u002F2014_09_23-lecture\u002Fdata_scraping_transcript.ipynb) and associated 
[video](http:\u002F\u002Fcm.dce.harvard.edu\u002F2015\u002F01\u002F14328\u002FL07\u002Fscreen_H264LargeTalkingHead-16x9.shtml) from Harvard's Data Science course.\n* For a much longer web scraping tutorial covering Beautiful Soup, lxml, XPath, and Selenium, watch [Web Scraping with Python](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=p1iX0uxM1w8) (3 hours 23 minutes) from PyCon 2014. The [slides](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1uHM_esB13VuSf7O1ScGueisnrtu-6usGFD3fs4z5YCE\u002Fedit#slide=id.p) and [code](https:\u002F\u002Fgithub.com\u002Fkjam\u002Fpython-web-scraping-tutorial) are also available.\n* For more complex web scraping projects, [Scrapy](http:\u002F\u002Fscrapy.org\u002F) is a popular application framework that works with Python. It has excellent [documentation](http:\u002F\u002Fdoc.scrapy.org\u002Fen\u002F1.0\u002Findex.html), and here's a [tutorial](https:\u002F\u002Fgithub.com\u002Frdempsey\u002Fddl-data-wrangling) with detailed slides and code.\n* [robotstxt.org](http:\u002F\u002Fwww.robotstxt.org\u002Frobotstxt.html) has a concise explanation of how to write (and read) the `robots.txt` file.\n* [import.io](https:\u002F\u002Fimport.io\u002F) and [Kimono](https:\u002F\u002Fwww.kimonolabs.com\u002F) claim to allow you to scrape websites without writing any code.\n* [How a Math Genius Hacked OkCupid to Find True Love](http:\u002F\u002Fwww.wired.com\u002F2014\u002F01\u002Fhow-to-hack-okcupid\u002Fall\u002F) and [How Netflix Reverse Engineered Hollywood](http:\u002F\u002Fwww.theatlantic.com\u002Ftechnology\u002Farchive\u002F2014\u002F01\u002Fhow-netflix-reverse-engineered-hollywood\u002F282679\u002F?single_page=true) are two fun examples of how web scraping has been used to build interesting datasets.\n\n-----\n\n### Class 8: K-Nearest Neighbors\n* Brief review of Pandas ([notebook](notebooks\u002F08_pandas_review.ipynb))\n* K-nearest neighbors and scikit-learn ([notebook](notebooks\u002F08_knn_sklearn.ipynb))\n* Exercise 
with NBA player data ([notebook](notebooks\u002F08_nba_knn.ipynb), [data](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4-students\u002Fblob\u002Fmaster\u002Fkerry\u002FFinal\u002FNBA_players_2015.csv), [data dictionary](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples\u002Fblob\u002Fmaster\u002Fpdf\u002Fnba_paper.pdf))\n* Exploring the bias-variance tradeoff ([notebook](notebooks\u002F08_bias_variance.ipynb))\n\n**Homework:**\n* Reading assignment on the [bias-variance tradeoff](homework\u002F09_bias_variance.md)\n* Read Kevin's [introduction to reproducibility](http:\u002F\u002Fwww.dataschool.io\u002Freproducibility-is-not-just-for-researchers\u002F), read Jeff Leek's [guide to creating a reproducible analysis](https:\u002F\u002Fgithub.com\u002Fjtleek\u002Fdatasharing), and watch this related [Colbert Report video](http:\u002F\u002Fthecolbertreport.cc.com\u002Fvideos\u002Fdcyvro\u002Fausterity-s-spreadsheet-error) (8 minutes).\n* Work on your project... your first project presentation is in less than two weeks!\n\n**KNN Resources:**\n* For a recap of the key points about KNN and scikit-learn, watch [Getting started in scikit-learn with the famous iris dataset](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=hd1W4CyPX58) (15 minutes) and [Training a machine learning model with scikit-learn](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=RlQuVL6-qe8) (20 minutes).\n* KNN supports [distance metrics](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.neighbors.DistanceMetric.html) other than Euclidean distance, such as [Mahalanobis distance](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F62092\u002Fbottom-to-top-explanation-of-the-mahalanobis-distance), which [takes the scale of the data into account](http:\u002F\u002Fblogs.sas.com\u002Fcontent\u002Fiml\u002F2012\u002F02\u002F15\u002Fwhat-is-mahalanobis-distance.html).\n* [A Detailed Introduction to 
KNN](https:\u002F\u002Fsaravananthirumuruganathan.wordpress.com\u002F2010\u002F05\u002F17\u002Fa-detailed-introduction-to-k-nearest-neighbor-knn-algorithm\u002F) is a bit dense, but provides a more thorough introduction to KNN and its applications.\n* This lecture on [Image Classification](http:\u002F\u002Fcs231n.github.io\u002Fclassification\u002F) shows how KNN could be used for detecting similar images, and also touches on topics we will cover in future classes (hyperparameter tuning and cross-validation).\n* Some applications for which KNN is well-suited are [object recognition](http:\u002F\u002Fvlm1.uta.edu\u002F~athitsos\u002Fnearest_neighbors\u002F), [satellite image enhancement](http:\u002F\u002Fland.umn.edu\u002Fdocuments\u002FFS6.pdf), [document categorization](http:\u002F\u002Fwww.ceng.metu.edu.tr\u002F~e120321\u002Fpaper.pdf), and [gene expression analysis](http:\u002F\u002Fciteseerx.ist.psu.edu\u002Fviewdoc\u002Fsummary?doi=10.1.1.208.993).\n\n**Seaborn Resources:**\n* To get started with Seaborn for visualization, the official website has a series of [detailed tutorials](http:\u002F\u002Fweb.stanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Ftutorial.html) and an [example gallery](http:\u002F\u002Fweb.stanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Fexamples\u002Findex.html).\n* [Data visualization with Seaborn](https:\u002F\u002Fbeta.oreilly.com\u002Flearning\u002Fdata-visualization-with-seaborn) is a quick tour of some of the popular types of Seaborn plots.\n* [Visualizing Google Forms Data with Seaborn](http:\u002F\u002Fpbpython.com\u002Fpandas-google-forms-part2.html) and [How to Create NBA Shot Charts in Python](http:\u002F\u002Fsavvastjortjoglou.com\u002Fnba-shot-sharts.html) are both good examples of Seaborn usage on real-world data.\n\n-----\n\n### Class 9: Basic Model Evaluation\n* Optional web scraping homework due ([solution](code\u002F07_web_scraping.py#L136))\n* Reproducibility\n    * Discuss assigned readings: 
[introduction](http:\u002F\u002Fwww.dataschool.io\u002Freproducibility-is-not-just-for-researchers\u002F), [Colbert Report video](http:\u002F\u002Fthecolbertreport.cc.com\u002Fvideos\u002Fdcyvro\u002Fausterity-s-spreadsheet-error), [cabs article](http:\u002F\u002Fiquantny.tumblr.com\u002Fpost\u002F107245431809\u002Fhow-software-in-half-of-nyc-cabs-generates-5-2), [Tweet](https:\u002F\u002Ftwitter.com\u002Fjakevdp\u002Fstatus\u002F519563939177197571), [creating a reproducible analysis](https:\u002F\u002Fgithub.com\u002Fjtleek\u002Fdatasharing)\n    * Examples: [Classic rock](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Ftree\u002Fmaster\u002Fclassic-rock), [student project 1](https:\u002F\u002Fgithub.com\u002Fjwknobloch\u002FDAT4_final_project), [student project 2](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4-students\u002Ftree\u002Fmaster\u002FJonathan_Bryan\u002FProject_Files)\n* Discuss the reading assignment on the [bias-variance tradeoff](homework\u002F09_bias_variance.md)\n* Model evaluation using train\u002Ftest split ([notebook](notebooks\u002F09_model_evaluation.ipynb))\n* Exploring the scikit-learn documentation: [module reference](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclasses.html), [user guide](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fuser_guide.html), class and function documentation\n\n**Homework:**\n* Watch [Data science in Python](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=3ZWuPVWq7p4) (35 minutes) for an introduction to linear regression (and a review of other course content), or at the very least, read through the [associated notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F06_linear_regression.ipynb).\n* **Optional:** For another introduction to linear regression, watch [The Easiest Introduction to Regression Analysis](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=k_OB1tWX9PM) (14 minutes).\n\n**Model Evaluation 
Resources:**\n* For a recap of some of the key points from today's lesson, watch [Comparing machine learning models in scikit-learn](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=0pP4EwWJgIU) (27 minutes).\n* For another explanation of training error versus testing error, the bias-variance tradeoff, and train\u002Ftest split (also known as the \"validation set approach\"), watch Hastie and Tibshirani's video on [estimating prediction error](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=_2ij6eaaSl0&t=2m34s) (12 minutes, starting at 2:34).\n* Caltech's Learning From Data course includes a fantastic video on [visualizing bias and variance](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F081.html) (15 minutes).\n* [Random Test\u002FTrain Split is Not Always Enough](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2015\u002F01\u002Frandom-testtrain-split-is-not-always-enough\u002F) explains why random train\u002Ftest split may not be a suitable model evaluation procedure if your data has a significant time element.\n\n**Reproducibility Resources:**\n* [What We've Learned About Sharing Our Data Analysis](https:\u002F\u002Fsource.opennews.org\u002Fen-US\u002Farticles\u002Fwhat-weve-learned-about-sharing-our-data-analysis\u002F) includes tips from BuzzFeed News about how to publish a reproducible analysis.\n* [Software development skills for data scientists](http:\u002F\u002Ftreycausey.com\u002Fsoftware_dev_skills.html) discusses the importance of writing functions and proper code comments (among other skills), which are highly useful for creating a reproducible analysis.\n* [Data science done well looks easy - and that is a big problem for data scientists](http:\u002F\u002Fsimplystatistics.org\u002F2015\u002F03\u002F17\u002Fdata-science-done-well-looks-easy-and-that-is-a-big-problem-for-data-scientists\u002F) explains how a reproducible analysis demonstrates all of the work that goes into proper data science.\n\n-----\n\n### Class 10: Linear Regression\n* Machine 
learning exercise ([article](http:\u002F\u002Fblog.dominodatalab.com\u002F10-interesting-uses-of-data-science\u002F))\n* Linear regression ([notebook](notebooks\u002F10_linear_regression.ipynb))\n    * [Capital Bikeshare dataset](data\u002Fbikeshare.csv) used in a Kaggle competition\n    * [Data dictionary](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fdata)\n* Feature engineering example: [Predicting User Engagement in Corporate Collaboration Network](https:\u002F\u002Fgithub.com\u002Fmikeyea\u002FDAT7_project\u002Fblob\u002Fmaster\u002Ffinal%20project\u002FClass_Presention_MYea.ipynb)\n\n**Homework:**\n* Your first project presentation is on Tuesday (9\u002F22)! Please submit a link to your project repository (with slides, code, data, and visualizations) by 6pm on Tuesday.\n* Complete the [homework assignment](homework\u002F10_yelp_votes.md) with the [Yelp data](data\u002Fyelp.csv). This is due on Thursday (9\u002F24).\n\n**Linear Regression Resources:**\n* To go much more in-depth on linear regression, read Chapter 3 of [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F). 
Alternatively, watch the [related videos](http:\u002F\u002Fwww.dataschool.io\u002F15-hours-of-expert-machine-learning-videos\u002F) or read my [quick reference guide](http:\u002F\u002Fwww.dataschool.io\u002Fapplying-and-interpreting-linear-regression\u002F) to the key points in that chapter.\n* This [introduction to linear regression](http:\u002F\u002Fpeople.duke.edu\u002F~rnau\u002Fregintro.htm) is more detailed and mathematically thorough, and includes lots of good advice.\n* This is a relatively quick post on the [assumptions of linear regression](http:\u002F\u002Fpareonline.net\u002Fgetvn.asp?n=2&v=8).\n* Setosa has an [interactive visualization](http:\u002F\u002Fsetosa.io\u002Fev\u002Fordinary-least-squares-regression\u002F) of linear regression.\n* For a brief introduction to confidence intervals, hypothesis testing, p-values, and R-squared, as well as a comparison between scikit-learn code and [Statsmodels](http:\u002F\u002Fstatsmodels.sourceforge.net\u002F) code, read my [DAT7 lesson on linear regression](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT7\u002Fblob\u002Fmaster\u002Fnotebooks\u002F10_linear_regression.ipynb).\n* Here is a useful explanation of [confidence intervals](http:\u002F\u002Fwww.quora.com\u002FWhat-is-a-confidence-interval-in-laymans-terms\u002Fanswer\u002FMichael-Hochster) from Quora.\n* [Hypothesis Testing: The Basics](http:\u002F\u002F20bits.com\u002Farticle\u002Fhypothesis-testing-the-basics) provides a nice overview of the topic, and John Rauser's talk on [Statistics Without the Agonizing Pain](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=5Dnw46eC-0o) (12 minutes) gives a great explanation of how the null hypothesis is rejected.\n* Earlier this year, a major scientific journal banned the use of p-values:\n    * Scientific American has a nice [summary](http:\u002F\u002Fwww.scientificamerican.com\u002Farticle\u002Fscientists-perturbed-by-loss-of-stat-tools-to-sift-research-fudge-from-fact\u002F) of the ban.\n    * This 
[response](http:\u002F\u002Fwww.nature.com\u002Fnews\u002Fstatistics-p-values-are-just-the-tip-of-the-iceberg-1.17412) to the ban in Nature argues that \"decisions that are made earlier in data analysis have a much greater impact on results\".\n    * Andrew Gelman has a readable [paper](http:\u002F\u002Fwww.stat.columbia.edu\u002F~gelman\u002Fresearch\u002Funpublished\u002Fp_hacking.pdf) in which he argues that \"it's easy to find a p \u003C .05 comparison even if nothing is going on, if you look hard enough\".\n    * [Science Isn't Broken](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fscience-isnt-broken\u002F) includes a neat tool that allows you to \"p-hack\" your way to \"statistically significant\" results.\n* [Accurately Measuring Model Prediction Error](http:\u002F\u002Fscott.fortmann-roe.com\u002Fdocs\u002FMeasuringError.html) compares adjusted R-squared, AIC and BIC, train\u002Ftest split, and cross-validation.\n\n**Other Resources:**\n* Section 3.3.1 of [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F) (4 pages) has a great explanation of dummy encoding for categorical features.\n* Kaggle has some nice [visualizations of the bikeshare data](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fscripts?outputType=Visualization) we used today.\n\n-----\n\n### Class 11: First Project Presentation\n* Project presentations!\n\n**Homework:**\n* Watch Rahul Patwari's videos on [probability](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=o4QmoNfW3bI) (5 minutes) and [odds](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=GxbXQjX7fC0) (8 minutes) if you're not comfortable with either of those terms.\n* Read these excellent articles from BetterExplained: [An Intuitive Guide To Exponential Functions & e](http:\u002F\u002Fbetterexplained.com\u002Farticles\u002Fan-intuitive-guide-to-exponential-functions-e\u002F) and [Demystifying the Natural Logarithm 
(ln)](http:\u002F\u002Fbetterexplained.com\u002Farticles\u002Fdemystifying-the-natural-logarithm-ln\u002F). Then, review this [brief summary](notebooks\u002F12_e_log_examples.ipynb) of exponential functions and logarithms.\n\n-----\n\n### Class 12: Logistic Regression\n* Yelp votes homework due ([solution](notebooks\u002F10_yelp_votes_homework.ipynb))\n* Logistic regression ([notebook](notebooks\u002F12_logistic_regression.ipynb))\n    * [Glass identification dataset](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FGlass+Identification)\n* Exercise with Titanic data ([notebook](notebooks\u002F12_titanic_confusion.ipynb), [data](data\u002Ftitanic.csv), [data dictionary](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Ftitanic\u002Fdata))\n* Confusion matrix ([slides](slides\u002F12_confusion_matrix.pdf), [notebook](notebooks\u002F12_titanic_confusion.ipynb))\n\n**Homework:**\n* If you aren't yet comfortable with all of the confusion matrix terminology, watch Rahul Patwari's videos on [Intuitive sensitivity and specificity](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=U4_3fditnWg) (9 minutes) and [The tradeoff between sensitivity and specificity](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vtYDyGGeQyo) (13 minutes).\n* Video\u002Freading assignment on [ROC curves and AUC](homework\u002F13_roc_auc.md)\n* Video\u002Freading assignment on [cross-validation](homework\u002F13_cross_validation.md)\n\n**Logistic Regression Resources:**\n* To go deeper into logistic regression, read the first three sections of Chapter 4 of [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F), or watch the [first three videos](http:\u002F\u002Fwww.dataschool.io\u002F15-hours-of-expert-machine-learning-videos\u002F) (30 minutes) from that chapter.\n* For a math-ier explanation of logistic regression, watch the first seven videos (71 minutes) from week 3 of Andrew Ng's [machine learning 
course](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fmachine-learning\u002Fhome\u002Finfo), or read the [related lecture notes](http:\u002F\u002Fwww.holehouse.org\u002Fmlclass\u002F06_Logistic_Regression.html) compiled by a student.\n* For more on interpreting logistic regression coefficients, read this excellent [guide](http:\u002F\u002Fwww.ats.ucla.edu\u002Fstat\u002Fmult_pkg\u002Ffaq\u002Fgeneral\u002Fodds_ratio.htm) by UCLA's IDRE and these [lecture notes](http:\u002F\u002Fwww.unm.edu\u002F~schrader\u002Fbiostat\u002Fbio2\u002FSpr06\u002Flec11.pdf) from the University of New Mexico.\n* The scikit-learn documentation has a nice [explanation](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fcalibration.html) of what it means for a predicted probability to be calibrated.\n* [Supervised learning superstitions cheat sheet](http:\u002F\u002Fryancompton.net\u002Fassets\u002Fml_cheat_sheet\u002Fsupervised_learning.html) is a very nice comparison of four classifiers we cover in the course (logistic regression, decision trees, KNN, Naive Bayes) and one classifier we do not cover (Support Vector Machines).\n\n**Confusion Matrix Resources:**\n* My [simple guide to confusion matrix terminology](http:\u002F\u002Fwww.dataschool.io\u002Fsimple-guide-to-confusion-matrix-terminology\u002F) may be useful to you as a reference.\n* This blog post about [Amazon Machine Learning](https:\u002F\u002Faws.amazon.com\u002Fblogs\u002Faws\u002Famazon-machine-learning-make-data-driven-decisions-at-scale\u002F) contains a neat [graphic](https:\u002F\u002Fmedia.amazonwebservices.com\u002Fblog\u002F2015\u002Fml_adjust_model_1.png) showing how classification threshold affects different evaluation metrics.\n* This notebook (from another DAT course) explains [how to calculate \"expected value\"](https:\u002F\u002Fgithub.com\u002Fpodopie\u002FDAT18NYC\u002Fblob\u002Fmaster\u002Fclasses\u002F13-expected_value_cost_benefit_analysis.ipynb) from a confusion matrix by treating it as 
a cost-benefit matrix.\n\n-----\n\n### Class 13: Advanced Model Evaluation\n* Data preparation ([notebook](notebooks\u002F13_advanced_model_evaluation.ipynb))\n    * Handling missing values\n    * Handling categorical features (review)\n* ROC curves and AUC\n    * Discuss the [video\u002Freading assignment](homework\u002F13_roc_auc.md)\n    * Exercise: drawing an ROC curve ([slides](slides\u002F13_drawing_roc.pdf))\n    * Return to the main notebook\n* Cross-validation\n    * Discuss the [video\u002Freading assignment](homework\u002F13_cross_validation.md) and associated [notebook](notebooks\u002F13_cross_validation.ipynb)\n    * Return to the main notebook\n* Exercise with bank marketing data ([notebook](notebooks\u002F13_bank_exercise.ipynb), [data](data\u002Fbank-additional.csv), [data dictionary](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FBank+Marketing))\n\n**Homework:**\n* Reading assignment on [spam filtering](homework\u002F14_spam_filtering.md)\n* Read these [Introduction to Probability](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1cM2dVbJgTWMkHoVNmYlB9df6P2H8BrjaqAcZTaLe9dA\u002Fedit#slide=id.gfc3caad2_00) slides, or skim section 2.1 of the [OpenIntro Statistics textbook](https:\u002F\u002Fwww.openintro.org\u002Fstat\u002Ftextbook.php?stat_book=os) (12 pages). 
Pay specific attention to the following terms: probability, mutually exclusive, sample space, independent.\n* **Optional:** Try to gain an understanding of conditional probability from this [visualization](http:\u002F\u002Fsetosa.io\u002Fconditional\u002F).\n* **Optional:** For an intuitive introduction to Bayes' theorem, read these posts on [wealth and happiness](http:\u002F\u002Fwww.quora.com\u002FWhat-is-an-intuitive-explanation-of-Bayes-Rule\u002Fanswer\u002FMichael-Hochster), [ducks](https:\u002F\u002Fplanspacedotorg.wordpress.com\u002F2014\u002F02\u002F23\u002Fbayes-rule-for-ducks\u002F), or [legos](http:\u002F\u002Fwww.countbayesie.com\u002Fblog\u002F2015\u002F2\u002F18\u002Fbayes-theorem-with-lego).\n\n**ROC Resources:**\n* Rahul Patwari has a great video on [ROC Curves](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=21Igj5Pr6u4) (12 minutes).\n* [An introduction to ROC analysis](http:\u002F\u002Fpeople.inf.elte.hu\u002Fkiss\u002F13dwhdm\u002Froc.pdf) is a very readable paper on the topic.\n* ROC curves can be used across a wide variety of applications, such as [comparing different feature sets](http:\u002F\u002Fresearch.microsoft.com\u002Fpubs\u002F205472\u002Faisec10-leontjeva.pdf) for detecting fraudulent Skype users, and [comparing different classifiers](http:\u002F\u002Fwww.cse.ust.hk\u002FnevinZhangGroup\u002Freadings\u002Fyi\u002FBradley_PR97.pdf) on a number of popular datasets.\n\n**Cross-Validation Resources:**\n* For more on cross-validation, read section 5.1 of [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F) (11 pages) or watch the related videos: [K-fold and leave-one-out cross-validation](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=nZAM5OXrktY) (14 minutes), [cross-validation the right and wrong ways](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=S06JpVoNaA0) (10 minutes).\n* If you want to understand the different variations of cross-validation, this 
[paper](http:\u002F\u002Fwww.jcheminf.com\u002Fcontent\u002Fpdf\u002F1758-2946-6-10.pdf) examines and compares them in detail.\n* To learn how to use [GridSearchCV and RandomizedSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html) for parameter tuning, watch [How to find the best model parameters in scikit-learn](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Gol_qOgRqfA) (28 minutes) or read the [associated notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F08_grid_search.ipynb).\n\n**Other Resources:**\n* scikit-learn has extensive documentation on [model evaluation](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fmodel_evaluation.html).\n* [Counterfactual evaluation of machine learning models](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=QWCSxAKR-h0) (45 minutes) is an excellent talk about the sophisticated way in which Stripe evaluates its fraud detection model. (These are the associated [slides](http:\u002F\u002Fwww.slideshare.net\u002FMichaelManapat\u002Fcounterfactual-evaluation-of-machine-learning-models).)\n* [Visualizing Machine Learning Thresholds to Make Better Business Decisions](http:\u002F\u002Fblog.insightdatalabs.com\u002Fvisualizing-classifier-thresholds\u002F) demonstrates how visualizing precision, recall, and \"queue rate\" at different thresholds can help you to maximize the business value of your classifier.\n\n-----\n\n### Class 14: Naive Bayes and Text Data\n* Conditional probability and Bayes' theorem\n    * [Slides](slides\u002F14_bayes_theorem.pdf) (adapted from [Visualizing Bayes' theorem](http:\u002F\u002Foscarbonilla.com\u002F2009\u002F05\u002Fvisualizing-bayes-theorem\u002F))\n    * Applying Bayes' theorem to iris classification ([notebook](notebooks\u002F14_bayes_theorem_iris.ipynb))\n* Naive Bayes classification\n    * [Slides](slides\u002F14_naive_bayes.pdf)\n    * Spam filtering example 
([notebook](notebooks\u002F14_naive_bayes_spam.ipynb))\n* Applying Naive Bayes to text data in scikit-learn ([notebook](notebooks\u002F14_text_data_sklearn.ipynb))\n    * [CountVectorizer](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.feature_extraction.text.CountVectorizer.html) documentation\n    * SMS messages: [data](data\u002Fsms.tsv), [data dictionary](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FSMS+Spam+Collection)\n\n**Homework:**\n* Complete another [homework assignment](homework\u002F14_yelp_review_text.md) with the [Yelp data](data\u002Fyelp.csv). This is due on Tuesday (10\u002F6).\n* Confirm that you have [TextBlob](https:\u002F\u002Ftextblob.readthedocs.org\u002F) installed by running `import textblob` from within your preferred Python environment. If it's not installed, run `pip install textblob` at the command line (not from within Python).\n\n**Resources:**\n* Sebastian Raschka's article on [Naive Bayes and Text Classification](http:\u002F\u002Fsebastianraschka.com\u002FArticles\u002F2014_naive_bayes_1.html) covers the conceptual material from today's class in much more detail.\n* For more on conditional probability, read these [slides](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1psUIyig6OxHQngGEHr3TMkCvhdLInnKnclQoNUr4G4U\u002Fedit#slide=id.gfc69f484_00), or read section 2.2 of the [OpenIntro Statistics textbook](https:\u002F\u002Fwww.openintro.org\u002Fstat\u002Ftextbook.php?stat_book=os) (15 pages).\n* For an intuitive explanation of Naive Bayes classification, read this post on [airport security](http:\u002F\u002Fwww.quora.com\u002FIn-laymans-terms-how-does-Naive-Bayes-work\u002Fanswer\u002FKonstantin-Tt).\n* For more details on Naive Bayes classification, Wikipedia has two excellent articles ([Naive Bayes classifier](http:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier) and [Naive Bayes spam 
filtering](http:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_spam_filtering)), and Cross Validated has a good [Q&A](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F21822\u002Funderstanding-naive-bayes).\n* When applying Naive Bayes classification to a dataset with continuous features, it is better to use [GaussianNB](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.naive_bayes.GaussianNB.html) rather than [MultinomialNB](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.naive_bayes.MultinomialNB.html). This [notebook](notebooks\u002F14_types_of_naive_bayes.ipynb) compares their performances on such a dataset. Wikipedia has a short [description](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier#Gaussian_naive_Bayes) of Gaussian Naive Bayes, as well as an excellent [example](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier#Sex_classification) of its usage.\n* These [slides](http:\u002F\u002Fwww.umiacs.umd.edu\u002F~jbg\u002Fteaching\u002FDATA_DIGGING\u002Flecture_05.pdf) from the University of Maryland provide more mathematical details on both logistic regression and Naive Bayes, and also explain how Naive Bayes is actually a \"special case\" of logistic regression.\n* Andrew Ng has a [paper](http:\u002F\u002Fai.stanford.edu\u002F~ang\u002Fpapers\u002Fnips01-discriminativegenerative.pdf) comparing the performance of logistic regression and Naive Bayes across a variety of datasets.\n* If you enjoyed Paul Graham's article, you can read [his follow-up article](http:\u002F\u002Fwww.paulgraham.com\u002Fbetter.html) on how he improved his spam filter and this [related paper](http:\u002F\u002Fwww.merl.com\u002Fpublications\u002Fdocs\u002FTR2004-091.pdf) about state-of-the-art spam filtering in 2004.\n* Yelp has found that Naive Bayes is more effective than Mechanical Turks at [categorizing 
businesses](http:\u002F\u002Fengineeringblog.yelp.com\u002F2011\u002F02\u002Ftowards-building-a-high-quality-workforce-with-mechanical-turk.html).\n\n-----\n\n### Class 15: Natural Language Processing\n* Yelp review text homework due ([solution](notebooks\u002F14_yelp_review_text_homework.ipynb))\n* Natural language processing ([notebook](notebooks\u002F15_natural_language_processing.ipynb))\n* Introduction to our [Kaggle competition](https:\u002F\u002Finclass.kaggle.com\u002Fc\u002Fdat8-stack-overflow)\n    * Create a Kaggle account, join the competition using the invitation link, download the sample submission, and then submit the sample submission (which will require SMS account verification).\n\n**Homework:**\n* Your draft paper is due on Thursday (10\u002F8)! Please submit a link to your project repository (with paper, code, data, and visualizations) before class.\n* Watch [Kaggle: How it Works](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=PoD84TVdD-4) (4 minutes) for a brief overview of the Kaggle platform.\n* Download the competition files, move them to the `DAT8\u002Fdata` directory, and make sure you can open the CSV files using Pandas. If you have any problems opening the files, you probably need to turn off real-time virus scanning (especially Microsoft Security Essentials).\n* **Optional:** Come up with some theories about which features might be relevant to predicting the response, and then explore the data to see if those theories appear to be true.\n* **Optional:** Watch my [project presentation video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=HGr1yQV3Um0) (16 minutes) for a tour of the end-to-end machine learning process for a Kaggle competition, including feature engineering. 
(Or, just read through the [slides](https:\u002F\u002Fspeakerdeck.com\u002Fjustmarkham\u002Fallstate-purchase-prediction-challenge-on-kaggle).)\n\n**NLP Resources:**\n* If you want to learn a lot more NLP, check out the excellent [video lectures](https:\u002F\u002Fclass.coursera.org\u002Fnlp\u002Flecture) and [slides](http:\u002F\u002Fweb.stanford.edu\u002F~jurafsky\u002FNLPCourseraSlides.html) from this [Coursera course](https:\u002F\u002Fwww.coursera.org\u002Fcourse\u002Fnlp) (which is no longer being offered).\n* This slide deck defines many of the [key NLP terms](https:\u002F\u002Fgithub.com\u002Fga-students\u002FDAT_SF_9\u002Fblob\u002Fmaster\u002F16_Text_Mining\u002FDAT9_lec16_Text_Mining.pdf).\n* [Natural Language Processing with Python](http:\u002F\u002Fwww.nltk.org\u002Fbook\u002F) is the most popular book for going in-depth with the [Natural Language Toolkit](http:\u002F\u002Fwww.nltk.org\u002F) (NLTK).\n* [A Smattering of NLP in Python](https:\u002F\u002Fgithub.com\u002Fcharlieg\u002FA-Smattering-of-NLP-in-Python\u002Fblob\u002Fmaster\u002FA%20Smattering%20of%20NLP%20in%20Python.ipynb) provides a nice overview of NLTK, as does this [notebook from DAT5](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fnotebooks\u002F14_nlp.ipynb).\n* [spaCy](http:\u002F\u002Fspacy.io\u002F) is a newer Python library for text processing that is focused on performance (unlike NLTK).\n* If you want to get serious about NLP, [Stanford CoreNLP](http:\u002F\u002Fnlp.stanford.edu\u002Fsoftware\u002Fcorenlp.shtml) is a suite of tools (written in Java) that is highly regarded.\n* When working with a large text corpus in scikit-learn, [HashingVectorizer](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Ffeature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick) is a useful alternative to CountVectorizer.\n* [Automatically Categorizing Yelp 
Businesses](http:\u002F\u002Fengineeringblog.yelp.com\u002F2015\u002F09\u002Fautomatically-categorizing-yelp-businesses.html) discusses how Yelp uses NLP and scikit-learn to solve the problem of uncategorized businesses.\n* [Modern Methods for Sentiment Analysis](http:\u002F\u002Fdistrictdatalabs.silvrback.com\u002Fmodern-methods-for-sentiment-analysis) shows how \"word vectors\" can be used for more accurate sentiment analysis.\n* [Identifying Humorous Cartoon Captions](http:\u002F\u002Fwww.cs.huji.ac.il\u002F~dshahaf\u002FpHumor.pdf) is a readable paper about identifying funny captions submitted to the New Yorker Caption Contest.\n* [DC Natural Language Processing](http:\u002F\u002Fwww.meetup.com\u002FDC-NLP\u002F) is an active Meetup group in our local area.\n\n-----\n\n### Class 16: Kaggle Competition\n* Overview of how Kaggle works ([slides](slides\u002F16_kaggle.pdf))\n* Kaggle In-Class competition: [Predict whether a Stack Overflow question will be closed](https:\u002F\u002Finclass.kaggle.com\u002Fc\u002Fdat8-stack-overflow)\n    * [Complete code file](code\u002F16_kaggle.py)\n    * [Minimal code file](code\u002F16_kaggle_minimal.py): excludes all exploratory code\n    * [Explanations of log loss](http:\u002F\u002Fwww.quora.com\u002FWhat-is-an-intuitive-explanation-for-the-log-loss-function)\n\n**Homework:**\n* You will be assigned to review the project drafts of two of your peers. 
You have until Tuesday 10\u002F20 to provide them with feedback, according to the [peer review guidelines](project\u002Fpeer_review.md).\n* Read [A Visual Introduction to Machine Learning](http:\u002F\u002Fwww.r2d3.us\u002Fvisual-intro-to-machine-learning-part-1\u002F) for a brief overview of decision trees.\n* Download and install [Graphviz](http:\u002F\u002Fwww.graphviz.org\u002F), which will allow you to visualize decision trees in scikit-learn.\n    * Windows users should also add Graphviz to your path: Go to Control Panel, System, Advanced System Settings, Environment Variables. Under system variables, edit \"Path\" to include the path to the \"bin\" folder, such as: `C:\\Program Files (x86)\\Graphviz2.38\\bin`\n* **Optional:** Keep working on our Kaggle competition! You can make up to 5 submissions per day, and the competition doesn't close until 6:30pm ET on Tuesday 10\u002F27 (class 21).\n\n**Resources:**\n* [Specialist Knowledge Is Useless and Unhelpful](http:\u002F\u002Fwww.slate.com\u002Farticles\u002Fhealth_and_science\u002Fnew_scientist\u002F2012\u002F12\u002Fkaggle_president_jeremy_howard_amateurs_beat_specialists_in_data_prediction.html) is a brief interview with Jeremy Howard (past president of Kaggle) in which he argues that data science skills are much more important than domain expertise for creating effective predictive models.\n* [Getting in Shape for the Sport of Data Science](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kwt6XEh7U3g) (74 minutes), also by Jeremy Howard, contains a lot of tips for competitive machine learning.\n* [Learning from the best](http:\u002F\u002Fblog.kaggle.com\u002F2014\u002F08\u002F01\u002Flearning-from-the-best\u002F) is an excellent blog post covering top tips from Kaggle Masters on how to do well on Kaggle.\n* [Feature Engineering Without Domain Expertise](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=bL4b1sGnILU) (17 minutes), a talk by Kaggle Master Nick Kridler, provides some simple advice about how to 
iterate quickly and where to spend your time during a Kaggle competition.\n* These examples may help you to better understand the process of feature engineering: predicting the number of [passengers at a train station](https:\u002F\u002Fmedium.com\u002F@chris_bour\u002Ffrench-largest-data-science-challenge-ever-organized-shows-the-unreasonable-effectiveness-of-open-8399705a20ef), identifying [fraudulent users of an online store](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1UdI5NY-mlHyseiRVbpTLyvbrHxY8RciHp5Vc-ZLrwmU\u002Fedit#slide=id.p), identifying [bots in an online auction](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Ffacebook-recruiting-iv-human-or-bot\u002Fforums\u002Ft\u002F14628\u002Fshare-your-secret-sauce), predicting who will [subscribe to the next season of an orchestra](http:\u002F\u002Fblog.kaggle.com\u002F2015\u002F01\u002F05\u002Fkaggle-inclass-stanfords-getting-a-handel-on-data-science-winners-report\u002F), and evaluating the [quality of e-commerce search engine results](http:\u002F\u002Fblog.kaggle.com\u002F2015\u002F07\u002F22\u002Fcrowdflower-winners-interview-3rd-place-team-quartet\u002F).\n* [Our perfect submission](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Frestaurant-revenue-prediction\u002Fforums\u002Ft\u002F13950\u002Four-perfect-submission) is a fun read about how great performance on the [public leaderboard](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Frestaurant-revenue-prediction\u002Fleaderboard\u002Fpublic) does not guarantee that a model will generalize to new data.\n\n-----\n\n### Class 17: Decision Trees\n* Decision trees ([notebook](notebooks\u002F17_decision_trees.ipynb))\n* Exercise with Capital Bikeshare data ([notebook](notebooks\u002F17_bikeshare_exercise.ipynb), [data](data\u002Fbikeshare.csv), [data dictionary](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fdata))\n\n**Homework:**\n* Read the \"Wisdom of the crowds\" section from MLWave's post on [Human Ensemble 
Learning](http:\u002F\u002Fmlwave.com\u002Fhuman-ensemble-learning\u002F).\n* **Optional:** Read the abstract from [Do We Need Hundreds of Classifiers to Solve Real World Classification Problems?](http:\u002F\u002Fjmlr.csail.mit.edu\u002Fpapers\u002Fvolume15\u002Fdelgado14a\u002Fdelgado14a.pdf), as well as Kaggle CTO Ben Hamner's [comment](https:\u002F\u002Fnews.ycombinator.com\u002Fitem?id=8719723) about the paper, paying attention to the mentions of \"Random Forests\".\n\n**Resources:**\n* scikit-learn's documentation on [decision trees](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Ftree.html) includes a nice overview of trees as well as tips for proper usage.\n* For a more thorough introduction to decision trees, read section 4.3 (23 pages) of [Introduction to Data Mining](http:\u002F\u002Fwww-users.cs.umn.edu\u002F~kumar\u002Fdmbook\u002Findex.php). (Chapter 4 is available as a free download.)\n* If you want to go deep into the different decision tree algorithms, this slide deck contains [A Brief History of Classification and Regression Trees](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F0B-BKohKl-jUYQ3RpMEF0OGRUU3RHVGpHY203NFd3Z19Nc1ZF\u002Fview).\n* [The Science of Singing Along](http:\u002F\u002Fwww.doc.gold.ac.uk\u002F~mas03dm\u002Fpapers\u002FPawleyMullensiefen_Singalong_2012.pdf) contains a neat regression tree (page 136) for predicting the percentage of an audience at a music venue that will sing along to a pop song.\n* Decision trees are common in the medical field for differential diagnosis, such as this classification tree for [identifying psychosis](http:\u002F\u002Fwww.psychcongress.com\u002Fsites\u002Fnaccme.com\u002Ffiles\u002Fimages\u002Fpcn\u002Fsaundras\u002Fpsychosis_decision_tree.pdf).\n\n-----\n\n### Class 18: Ensembling\n* Finish decision trees lesson ([notebook](notebooks\u002F17_decision_trees.ipynb))\n* Ensembling ([notebook](notebooks\u002F18_ensembling.ipynb))\n    * [Major League Baseball player 
data](data\u002Fhitters.csv) from 1986-87\n    * [Data dictionary](https:\u002F\u002Fcran.r-project.org\u002Fweb\u002Fpackages\u002FISLR\u002FISLR.pdf) (page 7)\n\n**Resources:**\n* scikit-learn's documentation on [ensemble methods](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fensemble.html) covers both \"averaging methods\" (such as bagging and Random Forests) as well as \"boosting methods\" (such as AdaBoost and Gradient Tree Boosting).\n* MLWave's [Kaggle Ensembling Guide](http:\u002F\u002Fmlwave.com\u002Fkaggle-ensembling-guide\u002F) is very thorough and shows the many different ways that ensembling can take place.\n* Browse the excellent [solution paper](https:\u002F\u002Fdocs.google.com\u002Fviewer?url=https:\u002F\u002Fraw.githubusercontent.com\u002FChenglongChen\u002FKaggle_CrowdFlower\u002Fmaster\u002FDoc\u002FKaggle_CrowdFlower_ChenglongChen.pdf) from the winner of Kaggle's [CrowdFlower competition](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fcrowdflower-search-relevance) for an example of the work and insight required to win a Kaggle competition.\n* [Interpretable vs Powerful Predictive Models: Why We Need Them Both](https:\u002F\u002Fmedium.com\u002F@chris_bour\u002Finterpretable-vs-powerful-predictive-models-why-we-need-them-both-990340074979) is a short post on how the tactics useful in a Kaggle competition are not always useful in the real world.\n* [Not Even the People Who Write Algorithms Really Know How They Work](http:\u002F\u002Fwww.theatlantic.com\u002Ftechnology\u002Farchive\u002F2015\u002F09\u002Fnot-even-the-people-who-write-algorithms-really-know-how-they-work\u002F406099\u002F) argues that the decreased interpretability of state-of-the-art machine learning models has a negative impact on society.\n* For an intuitive explanation of Random Forests, read Edwin Chen's answer to [How do random forests work in layman's 
terms?](http:\u002F\u002Fwww.quora.com\u002FRandom-Forests\u002FHow-do-random-forests-work-in-laymans-terms\u002Fanswer\u002FEdwin-Chen-1)\n* [Large Scale Decision Forests: Lessons Learned](http:\u002F\u002Fblog.siftscience.com\u002Fblog\u002F2015\u002Flarge-scale-decision-forests-lessons-learned) is an excellent post from Sift Science about their custom implementation of Random Forests.\n* [Unboxing the Random Forest Classifier](http:\u002F\u002Fnerds.airbnb.com\u002Funboxing-the-random-forest-classifier\u002F) describes a way to interpret the inner workings of Random Forests beyond just feature importances.\n* [Understanding Random Forests: From Theory to Practice](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1407.7502v3.pdf) is an in-depth academic analysis of Random Forests, including details of its implementation in scikit-learn.\n\n-----\n\n### Class 19: Advanced scikit-learn and Clustering\n* Advanced scikit-learn ([notebook](notebooks\u002F19_advanced_sklearn.ipynb))\n    * [StandardScaler](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.preprocessing.StandardScaler.html): standardizing features\n    * [Pipeline](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fpipeline.html): chaining steps\n* Clustering ([slides](slides\u002F19_clustering.pdf), [notebook](notebooks\u002F19_clustering.ipynb))\n    * K-means: [documentation](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.cluster.KMeans.html), [visualization 1](http:\u002F\u002Ftech.nitoyon.com\u002Fen\u002Fblog\u002F2013\u002F11\u002F07\u002Fk-means\u002F), [visualization 2](http:\u002F\u002Fwww.naftaliharris.com\u002Fblog\u002Fvisualizing-k-means-clustering\u002F)\n    * DBSCAN: [documentation](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.cluster.DBSCAN.html), 
[visualization](http:\u002F\u002Fwww.naftaliharris.com\u002Fblog\u002Fvisualizing-dbscan-clustering\u002F)\n\n**Homework:**\n* Reread [Understanding the Bias-Variance Tradeoff](http:\u002F\u002Fscott.fortmann-roe.com\u002Fdocs\u002FBiasVariance.html). (The \"answers\" to the [guiding questions](homework\u002F09_bias_variance.md) have been posted and may be helpful to you.)\n* **Optional:** Watch these two excellent (and related) videos from Caltech's Learning From Data course: [bias-variance tradeoff](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F081.html) (15 minutes) and [regularization](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F121.html) (8 minutes).\n\n**scikit-learn Resources:**\n* This is a longer example of [feature scaling](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification\u002Fblob\u002Fmaster\u002Fpreprocessing\u002Fabout_standardization_normalization.ipynb) in scikit-learn, with additional discussion of the types of scaling you can use.\n* [Practical Data Science in Python](http:\u002F\u002Fradimrehurek.com\u002Fdata_science_python\u002F) is a long and well-written notebook that uses a few advanced scikit-learn features: pipelining, plotting a learning curve, and pickling a model.\n* To learn how to use [GridSearchCV and RandomizedSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html) for parameter tuning, watch [How to find the best model parameters in scikit-learn](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Gol_qOgRqfA) (28 minutes) or read the [associated notebook](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F08_grid_search.ipynb).\n* Sebastian Raschka has a number of excellent resources for scikit-learn users, including a repository of [tutorials and examples](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification), a library of machine learning [tools and 
extensions](http:\u002F\u002Frasbt.github.io\u002Fmlxtend\u002F), a new [book](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpython-machine-learning-book), and a semi-active [blog](http:\u002F\u002Fsebastianraschka.com\u002Fblog\u002F).\n* scikit-learn has an incredibly active [mailing list](https:\u002F\u002Fwww.mail-archive.com\u002Fscikit-learn-general@lists.sourceforge.net\u002Findex.html) that is often much more useful than Stack Overflow for researching functions and asking questions.\n* If you forget how to use a particular scikit-learn function that we have used in class, don't forget that this repository is fully searchable!\n\n**Clustering Resources:**\n* For a very thorough introduction to clustering, read chapter 8 (69 pages) of [Introduction to Data Mining](http:\u002F\u002Fwww-users.cs.umn.edu\u002F~kumar\u002Fdmbook\u002Findex.php) (available as a free download), or browse through the chapter 8 slides.\n* scikit-learn's user guide compares many different [types of clustering](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html).\n* This [PowerPoint presentation](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002FDataMining\u002FColumbia2011\u002FSlides\u002FTopic6-Clustering.ppt) from Columbia's Data Mining class provides a good introduction to clustering, including hierarchical clustering and alternative distance metrics.\n* An Introduction to Statistical Learning has useful videos on [K-means clustering](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=aIybuNt9ps4&list=PL5-da3qGB5IBC-MneTc9oBZz0C6kNJ-f2) (17 minutes) and [hierarchical clustering](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Tuuc9Y06tAc&list=PL5-da3qGB5IBC-MneTc9oBZz0C6kNJ-f2) (15 minutes).\n* This is an excellent interactive visualization of [hierarchical clustering](https:\u002F\u002Fjoyofdata.shinyapps.io\u002Fhclust-shiny\u002F).\n* This is a nice animated explanation of [mean shift 
clustering](http:\u002F\u002Fspin.atomicobject.com\u002F2015\u002F05\u002F26\u002Fmean-shift-clustering\u002F).\n* The [K-modes algorithm](http:\u002F\u002Fwww.cs.ust.hk\u002F~qyang\u002FTeaching\u002F537\u002FPapers\u002Fhuang98extensions.pdf) can be used for clustering datasets of categorical features without converting them to numerical values. Here is a [Python implementation](https:\u002F\u002Fgithub.com\u002Fnicodv\u002Fkmodes).\n* Here are some fun examples of clustering: [A Statistical Analysis of the Work of Bob Ross](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fa-statistical-analysis-of-the-work-of-bob-ross\u002F) (with [data and Python code](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Ftree\u002Fmaster\u002Fbob-ross)), [How a Math Genius Hacked OkCupid to Find True Love](http:\u002F\u002Fwww.wired.com\u002F2014\u002F01\u002Fhow-to-hack-okcupid\u002Fall\u002F), and [characteristics of your zip code](http:\u002F\u002Fwww.esri.com\u002Flanding-pages\u002Ftapestry\u002F).\n\n-----\n\n### Class 20: Regularization and Regular Expressions\n* Regularization ([notebook](notebooks\u002F20_regularization.ipynb))\n    * Regression: [Ridge](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.Ridge.html), [RidgeCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.RidgeCV.html), [Lasso](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.Lasso.html), [LassoCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.LassoCV.html)\n    * Classification: [LogisticRegression](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.LogisticRegression.html)\n    * Helper functions: [Pipeline](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fpipeline.html), 
[GridSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html)\n* Regular expressions\n    * [Baltimore homicide data](data\u002Fhomicides.txt)\n    * [Regular expressions 101](https:\u002F\u002Fregex101.com\u002F#python): real-time testing of regular expressions\n    * [Reference guide](code\u002F20_regex_reference.py)\n    * [Exercise](code\u002F20_regex_exercise.py)\n\n**Homework:**\n* Your final project is due next week!\n* **Optional:** Make your final submissions to our Kaggle competition! It closes at 6:30pm ET on Tuesday 10\u002F27.\n* **Optional:** Read this classic paper, which may help you to connect many of the topics we have studied throughout the course: [A Few Useful Things to Know about Machine Learning](http:\u002F\u002Fhomes.cs.washington.edu\u002F~pedrod\u002Fpapers\u002Fcacm12.pdf).\n\n**Regularization Resources:**\n* The scikit-learn user guide for [Generalized Linear Models](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Flinear_model.html) explains different variations of regularization.\n* Section 6.2 of [An Introduction to Statistical Learning](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F) (14 pages) introduces both lasso and ridge regression. 
Or, watch the related videos on [ridge regression](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=cSKzqb0EKS0&list=PL5-da3qGB5IB-Xdpj_uXJpLGiRfv9UVXI&index=6) (13 minutes) and [lasso regression](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=A5I1G1MfUmA&index=7&list=PL5-da3qGB5IB-Xdpj_uXJpLGiRfv9UVXI) (15 minutes).\n* For more details on lasso regression, read Tibshirani's [original paper](http:\u002F\u002Fstatweb.stanford.edu\u002F~tibs\u002Flasso\u002Flasso.pdf).\n* For a math-ier explanation of regularization, watch the last four videos (30 minutes) from week 3 of Andrew Ng's [machine learning course](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fmachine-learning\u002F), or read the [related lecture notes](http:\u002F\u002Fwww.holehouse.org\u002Fmlclass\u002F07_Regularization.html) compiled by a student.\n* This [notebook](https:\u002F\u002Fgithub.com\u002Fluispedro\u002FPenalizedRegression\u002Fblob\u002Fmaster\u002FPenalizedRegression.ipynb) from chapter 7 of [Building Machine Learning Systems with Python](https:\u002F\u002Fwww.packtpub.com\u002Fbig-data-and-business-intelligence\u002Fbuilding-machine-learning-systems-python) has a nice long example of regularized linear regression.\n* There are some special considerations when using dummy encoding for categorical features with a regularized model. 
This [Cross Validated Q&A](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F69568\u002Fwhether-to-rescale-indicator-binary-dummy-predictors-for-lasso) debates whether the dummy variables should be standardized (along with the rest of the features), and a comment on this [blog post](http:\u002F\u002Fappliedpredictivemodeling.com\u002Fblog\u002F2013\u002F10\u002F23\u002Fthe-basics-of-encoding-categorical-data-for-predictive-models) recommends that the baseline level should not be dropped.\n\n**Regular Expressions Resources:**\n* Google's Python Class includes an excellent [introductory lesson](https:\u002F\u002Fdevelopers.google.com\u002Fedu\u002Fpython\u002Fregular-expressions) on regular expressions (which also has an associated [video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kWyoYtvJpe4&index=4&list=PL5-da3qGB5IA5NwDxcEJ5dvt8F9OQP7q5)).\n* Python for Informatics has a nice [chapter](http:\u002F\u002Fwww.pythonlearn.com\u002Fhtml-270\u002Fbook012.html) on regular expressions. 
(If you want to run the examples, you'll need to download [mbox.txt](http:\u002F\u002Fwww.py4inf.com\u002Fcode\u002Fmbox.txt) and [mbox-short.txt](http:\u002F\u002Fwww.py4inf.com\u002Fcode\u002Fmbox-short.txt).)\n* [Breaking the Ice with Regular Expressions](https:\u002F\u002Fwww.codeschool.com\u002Fcourses\u002Fbreaking-the-ice-with-regular-expressions\u002F) is an interactive Code School course, though only the first \"level\" is free.\n* If you want to go really deep with regular expressions, [RexEgg](http:\u002F\u002Fwww.rexegg.com\u002F) includes endless articles and tutorials.\n* [5 Tools You Didn't Know That Use Regular Expressions](http:\u002F\u002Fblog.codeschool.io\u002F2015\u002F07\u002F30\u002F5-tools-you-didnt-know-that-use-regular-expressions\u002F) demonstrates how regular expressions can be used with Excel, Word, Google Spreadsheets, Google Forms, text editors, and other tools.\n* [Exploring Expressions of Emotions in GitHub Commit Messages](http:\u002F\u002Fgeeksta.net\u002Fgeeklog\u002Fexploring-expressions-emotions-github-commit-messages\u002F) is a fun example of how regular expressions can be used for data analysis, and [Emojineering](http:\u002F\u002Finstagram-engineering.tumblr.com\u002Fpost\u002F118304328152\u002Femojineering-part-2-implementing-hashtag-emoji) explains how Instagram uses regular expressions to detect emoji in hashtags.\n\n-----\n\n### Class 21: Course Review and Final Project Presentation\n* Project presentations!\n* [Data science review](https:\u002F\u002Fdocs.google.com\u002Fdocument\u002Fd\u002F19gBCkmrbMpFFLPX8wa5daMnyl7J5BXhMV8JNJwgp1pk\u002Fedit?usp=sharing)\n\n**Resources:**\n* scikit-learn's [machine learning map](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Ftutorial\u002Fmachine_learning_map\u002F) may help you to choose the \"best\" model for your task.\n* [Choosing a Machine Learning Classifier](http:\u002F\u002Fblog.echen.me\u002F2011\u002F04\u002F27\u002Fchoosing-a-machine-learning-classifier\u002F) is a 
short and highly readable comparison of several classification models, [Classifier comparison](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fauto_examples\u002Fclassification\u002Fplot_classifier_comparison.html) is scikit-learn's visualization of classifier decision boundaries, [Comparing supervised learning algorithms](http:\u002F\u002Fwww.dataschool.io\u002Fcomparing-supervised-learning-algorithms\u002F) is a model comparison table that I created, and [Supervised learning superstitions cheat sheet](http:\u002F\u002Fryancompton.net\u002Fassets\u002Fml_cheat_sheet\u002Fsupervised_learning.html) is a more thorough comparison (with links to lots of useful resources).\n* [Machine Learning Done Wrong](http:\u002F\u002Fml.posthaven.com\u002Fmachine-learning-done-wrong), [Machine Learning Gremlins](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=tleeC-KlsKA) (31 minutes), [Clever Methods of Overfitting](http:\u002F\u002Fhunch.net\u002F?p=22), and [Common Pitfalls in Machine Learning](http:\u002F\u002Fdanielnee.com\u002F?p=155) all offer thoughtful advice on how to avoid common mistakes in machine learning.\n* [Practical machine learning tricks from the KDD 2011 best industry paper](http:\u002F\u002Fblog.david-andrzejewski.com\u002Fmachine-learning\u002Fpractical-machine-learning-tricks-from-the-kdd-2011-best-industry-paper\u002F) and Andrew Ng's [Advice for applying machine learning](http:\u002F\u002Fcs229.stanford.edu\u002Fmaterials\u002FML-advice.pdf) include slightly more advanced advice than the resources above.\n* [An Empirical Comparison of Supervised Learning Algorithms](http:\u002F\u002Fwww.cs.cornell.edu\u002F~caruana\u002Fctp\u002Fct.papers\u002Fcaruana.icml06.pdf) is a readable research paper from 2006, which was also presented as a [talk](http:\u002F\u002Fvideolectures.net\u002Fsolomon_caruana_wslmw\u002F) (77 minutes).\n\n-----\n\n### Class 22: Final Project Presentation\n* Project presentations!\n* [What's next?](other\u002Fadvice.md)\n\n-----\n\n## 
Additional Resources\n\n### Tidy Data\n* [Good Data Management Practices for Data Analysis](https:\u002F\u002Fwww.prometheusresearch.com\u002Fgood-data-management-practices-for-data-analysis-tidy-data-part-2\u002F) briefly summarizes the principles of \"tidy data\".\n* [Hadley Wickham's paper](http:\u002F\u002Fwww.jstatsoft.org\u002Farticle\u002Fview\u002Fv059i10) explains tidy data in detail and includes lots of good examples.\n* Example of a tidy dataset: [Bob Ross](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fbob-ross\u002Felements-by-episode.csv)\n* Examples of untidy datasets: [NFL ticket prices](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fnfl-ticket-prices\u002F2014-average-ticket-price.csv), [airline safety](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fairline-safety\u002Fairline-safety.csv), [Jets ticket prices](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fnfl-ticket-prices\u002Fjets-buyer.csv), [Chipotle orders](https:\u002F\u002Fgithub.com\u002FTheUpshot\u002Fchipotle\u002Fblob\u002Fmaster\u002Forders.tsv)\n* If your co-workers tend to create spreadsheets that are [unreadable by computers](https:\u002F\u002Fbosker.wordpress.com\u002F2014\u002F12\u002F05\u002Fthe-government-statistical-services-terrible-spreadsheet-advice\u002F), they may benefit from reading these [tips for releasing data in spreadsheets](http:\u002F\u002Fwww.clean-sheet.org\u002F). (There are some additional suggestions in this [answer](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F83614\u002Fbest-practices-for-creating-tidy-data\u002F83711#83711) from Cross Validated.)\n\n### Databases and SQL\n* This [GA slide deck](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fslides\u002F20_sql.pdf) provides a brief introduction to databases and SQL. 
The [Python script](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fcode\u002F20_sql.py) from that lesson demonstrates basic SQL queries, as well as how to connect to a SQLite database from Python and how to query it using Pandas.\n* The repository for this [SQL Bootcamp](https:\u002F\u002Fgithub.com\u002Fbrandonmburroughs\u002Fsql_bootcamp) contains an extremely well-commented SQL script that is suitable for walking through on your own.\n* This [GA notebook](https:\u002F\u002Fgithub.com\u002Fpodopie\u002FDAT18NYC\u002Fblob\u002Fmaster\u002Fclasses\u002F17-relational_databases.ipynb) provides a shorter introduction to databases and SQL that helpfully contrasts SQL queries with Pandas syntax.\n* [SQLZOO](http:\u002F\u002Fsqlzoo.net\u002Fwiki\u002FSQL_Tutorial), [Mode Analytics](http:\u002F\u002Fsqlschool.modeanalytics.com\u002F), [Khan Academy](https:\u002F\u002Fwww.khanacademy.org\u002Fcomputing\u002Fcomputer-programming\u002Fsql), [Codecademy](https:\u002F\u002Fwww.codecademy.com\u002Fcourses\u002Flearn-sql), [Datamonkey](http:\u002F\u002Fdatamonkey.pro\u002Fguess_sql\u002Flessons\u002F), and [Code School](http:\u002F\u002Fcampus.codeschool.com\u002Fcourses\u002Ftry-sql\u002Fcontents) all have online beginner SQL tutorials that look promising. Code School also offers an [advanced tutorial](https:\u002F\u002Fwww.codeschool.com\u002Fcourses\u002Fthe-sequel-to-sql\u002F), though it's not free.\n* [w3schools](http:\u002F\u002Fwww.w3schools.com\u002Fsql\u002Ftrysql.asp?filename=trysql_select_all) has a sample database that allows you to practice SQL from your browser. 
Similarly, Kaggle allows you to query a large SQLite database of [Reddit Comments](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Freddit-comments-may-2015\u002Fdata) using their online \"Scripts\" application.\n* [What Every Data Scientist Needs to Know about SQL](http:\u002F\u002Fjoshualande.com\u002Fdata-science-sql\u002F) is a brief series of posts about SQL basics, and [Introduction to SQL for Data Scientists](http:\u002F\u002Fbensresearch.com\u002Fdownloads\u002FSQL.pdf) is a paper with similar goals.\n* [10 Easy Steps to a Complete Understanding of SQL](https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20150402234726\u002Fhttp:\u002F\u002Ftech.pro\u002Ftutorial\u002F1555\u002F10-easy-steps-to-a-complete-understanding-of-sql) is a good article for those who have some SQL experience and want to understand it at a deeper level.\n* SQLite's article on [Query Planning](http:\u002F\u002Fwww.sqlite.org\u002Fqueryplanner.html) explains how SQL queries \"work\".\n* [A Comparison Of Relational Database Management Systems](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fsqlite-vs-mysql-vs-postgresql-a-comparison-of-relational-database-management-systems) gives the pros and cons of SQLite, MySQL, and PostgreSQL.\n* If you want to go deeper into databases and SQL, Stanford has a well-respected series of [14 mini-courses](https:\u002F\u002Flagunita.stanford.edu\u002Fcourses\u002FDB\u002F2014\u002FSelfPaced\u002Fabout).\n* [Blaze](http:\u002F\u002Fblaze.pydata.org) is a Python package enabling you to use Pandas-like syntax to query data living in a variety of data storage systems.\n\n### Recommendation Systems\n* This [GA slide deck](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4\u002Fblob\u002Fmaster\u002Fslides\u002F18_recommendation_engines.pdf) provides a brief introduction to recommendation systems, and the [Python 
script](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4\u002Fblob\u002Fmaster\u002Fcode\u002F18_recommenders_soutions.py) from that lesson demonstrates how to build a simple recommender.\n* Chapter 9 of [Mining of Massive Datasets](http:\u002F\u002Finfolab.stanford.edu\u002F~ullman\u002Fmmds\u002FbookL.pdf) (36 pages) is a more thorough introduction to recommendation systems.\n* Chapters 2 through 4 of [A Programmer's Guide to Data Mining](http:\u002F\u002Fguidetodatamining.com\u002F) (165 pages) provide a friendlier introduction, with lots of Python code and exercises.\n* The Netflix Prize was the famous competition for improving Netflix's recommendation system by 10%. Here are some useful articles about the Netflix Prize:\n    * [Netflix Recommendations: Beyond the 5 stars](http:\u002F\u002Ftechblog.netflix.com\u002F2012\u002F04\u002Fnetflix-recommendations-beyond-5-stars.html): Two posts from the Netflix blog summarizing the competition and their recommendation system\n    * [Winning the Netflix Prize: A Summary](http:\u002F\u002Fblog.echen.me\u002F2011\u002F10\u002F24\u002Fwinning-the-netflix-prize-a-summary\u002F): Overview of the models and techniques that went into the winning solution\n    * [A Perspective on the Netflix Prize](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002Fpapers\u002Fchance.pdf): A summary of the competition by the winning team\n* This [paper](http:\u002F\u002Fwww.cs.umd.edu\u002F~samir\u002F498\u002FAmazon-Recommendations.pdf) summarizes how Amazon.com's recommendation system works, and this [Stack Overflow Q&A](http:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F2323768\u002Fhow-does-the-amazon-recommendation-feature-work) has some additional thoughts.\n* [Facebook](https:\u002F\u002Fcode.facebook.com\u002Fposts\u002F861999383875667\u002Frecommending-items-to-more-than-a-billion-people\u002F) and [Etsy](https:\u002F\u002Fcodeascraft.com\u002F2014\u002F11\u002F17\u002Fpersonalized-recommendations-at-etsy\u002F) 
have blog posts about how their recommendation systems work.\n* [The Global Network of Discovery](http:\u002F\u002Fwww.gnod.com\u002F) provides some neat recommenders for music, authors, and movies.\n* [The People Inside Your Machine](http:\u002F\u002Fwww.npr.org\u002Fblogs\u002Fmoney\u002F2015\u002F01\u002F30\u002F382657657\u002Fepisode-600-the-people-inside-your-machine) (23 minutes) is a Planet Money podcast episode about how Amazon Mechanical Turks can assist with recommendation engines (and machine learning in general).\n* Coursera has a [course](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Frecommender-systems) on recommendation systems, if you want to go even deeper into the material.\n","## DAT8 课程仓库\n\n华盛顿特区 [General Assembly 数据科学课程](https:\u002F\u002Fgeneralassemb.ly\u002Feducation\u002Fdata-science\u002Fwashington-dc\u002F) 的课程资料（2015年8月18日至10月29日）。\n\n**讲师:** Kevin Markham ([数据学校博客](http:\u002F\u002Fwww.dataschool.io\u002F)、[邮件通讯](http:\u002F\u002Fwww.dataschool.io\u002Fsubscribe\u002F)、[YouTube 频道](https:\u002F\u002Fwww.youtube.com\u002Fuser\u002Fdataschool))\n\n[![Binder](http:\u002F\u002Fmybinder.org\u002Fbadge.svg)](http:\u002F\u002Fmybinder.org\u002Frepo\u002Fjustmarkham\u002FDAT8)\n\n星期二 | 星期四\n--- | ---\n8月18日：[数据科学导论](#class-1-introduction-to-data-science) | 8月20日：[命令行与版本控制](#class-2-command-line-and-version-control)\n8月25日：[数据读取与清洗](#class-3-data-reading-and-cleaning) | 8月27日：[探索性数据分析](#class-4-exploratory-data-analysis)\n9月1日：[可视化](#class-5-visualization) | 9月3日：[机器学习](#class-6-machine-learning)\n9月8日：[获取数据](#class-7-getting-data) | 9月10日：[K近邻算法](#class-8-k-nearest-neighbors)\n9月15日：[基础模型评估](#class-9-basic-model-evaluation) | 9月17日：[线性回归](#class-10-linear-regression)\n9月22日：[第一次项目展示](#class-11-first-project-presentation) | 9月24日：[逻辑回归](#class-12-logistic-regression)\n9月29日：[高级模型评估](#class-13-advanced-model-evaluation) | 10月1日：[朴素贝叶斯与文本数据](#class-14-naive-bayes-and-text-data)\n10月6日：[自然语言处理](#class-15-natural-language-processing) | 
10月8日：[Kaggle 竞赛](#class-16-kaggle-competition)\n10月13日：[决策树](#class-17-decision-trees) | 10月15日：[集成学习](#class-18-ensembling)\n10月20日：[高级 scikit-learn 与聚类](#class-19-advanced-scikit-learn-and-clustering) | 10月22日：[正则化与正则表达式](#class-20-regularization-and-regular-expressions)\n10月27日：[课程回顾](#class-21-course-review-and-final-project-presentation) | 10月29日：[最终项目展示](#class-22-final-project-presentation)\n\n\u003C!--\n### 课程开始前\n* 安装 [Git](http:\u002F\u002Fgit-scm.com\u002Fdownloads)。\n* 在 [GitHub](https:\u002F\u002Fgithub.com\u002F) 网站上创建一个账户。\n    * 无需下载“GitHub for Windows”或“GitHub for Mac”\n* 安装 Python 2.7x 的 [Anaconda 发行版](http:\u002F\u002Fcontinuum.io\u002Fdownloads)。\n    * 如果您选择不使用 Anaconda，这里列出了课程期间需要安装的 [Python 包](other\u002Fpython_packages.md)。\n* 我们希望在课程开始前检查您的笔记本电脑设置：\n    * 您可以在8月11日星期二下午5:30至6:30的中级 Python 工作坊之前、8月15日星期六下午1点至3点在 [15th & K Starbucks](http:\u002F\u002Fwww.yelp.com\u002Fbiz\u002Fstarbucks-washington-15) 处，或者8月18日星期二下午5:30至6:30上课前进行检查。\n    * 或者，您可以自行按照 [设置检查清单](other\u002Fsetup_checklist.md) 进行操作。\n* 收到 Slack 的邀请邮件后，请加入我们的“DAT8 团队”并上传您的照片。\n* 使用以下资源练习 Python。\n-->\n\n### Python 资源\n* [Codecademy 的 Python 课程](http:\u002F\u002Fwww.codecademy.com\u002Fen\u002Ftracks\u002Fpython)：适合初学者，包含大量浏览器内练习。\n* [Dataquest](https:\u002F\u002Fwww.dataquest.io)：通过交互式练习，在数据科学背景下教授 Python。\n* [Google 的 Python 课程](https:\u002F\u002Fdevelopers.google.com\u002Fedu\u002Fpython\u002F)：稍显进阶，包含数小时有用的讲座视频和可下载的练习题（附解答）。\n* [Python 入门](http:\u002F\u002Fintrotopython.org\u002F)：一系列 IPython 笔记本，很好地解释了 Python 核心概念和数据结构。\n* [Python for Informatics](http:\u002F\u002Fwww.pythonlearn.com\u002Fbook.php)：一本非常面向初学者的书籍，配有相关 [幻灯片](https:\u002F\u002Fdrive.google.com\u002Ffolderview?id=0B7X1ycQalUnyal9yeUx3VW81VDg&usp=sharing) 和 [视频](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PLlRFEj9H3Oj4JXIwMwN1_ss1Tk8wZShEJ)。\n* [科学家的 Python 快速入门](http:\u002F\u002Fnbviewer.ipython.org\u002Fgist\u002Frpmuller\u002F5920182)：阅读概述部分，即可快速了解 Python。\n* [Python 2.7 
快速参考](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fpython-reference\u002Fblob\u002Fmaster\u002Freference.py)：我的初学者指南，通过简短且注释清晰的示例演示 Python 概念。\n* [初级](code\u002F00_python_beginner_workshop.py) 和 [中级](code\u002F00_python_intermediate_workshop.py) 工作坊代码：可用于复习和参考。\n* [Python Tutor](http:\u002F\u002Fpythontutor.com\u002F)：允许您可视化 Python 代码的执行过程。\n\n\u003C!--\n### 提交表格\n* [反馈表](http:\u002F\u002Fbit.ly\u002Fdat8feedback)\n* [作业和项目提交](http:\u002F\u002Fbit.ly\u002Fdat8homework)\n-->\n\n### [课程项目](project\u002FREADME.md)\n\n### [机器学习模型比较](other\u002Fmodel_comparison.md)\n\n### [模型评估方法与指标比较](other\u002Fmodel_evaluation_comparison.md)\n\n### [提升数据科学能力的建议](other\u002Fadvice.md)\n\n### [其他资源](#additional-resources-1)\n\n-----\n\n### 第1课：数据科学导论\n* 课程概述（[幻灯片](slides\u002F01_course_overview.pdf)）\n* 数据科学简介（[幻灯片](slides\u002F01_intro_to_data_science.pdf)）\n* 讨论课程项目：[要求](project\u002FREADME.md)和[示例项目](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples)\n* 数据类型（[幻灯片](slides\u002F01_types_of_data.pdf)）及[公开数据源](project\u002Fpublic_data.md)\n* 来自General Assembly工作人员的欢迎致辞\n\n**作业：**\n* 使用终端（Linux\u002FMac）或Git Bash（Windows），完成GA友好的[命令行教程](http:\u002F\u002Fgeneralassembly.github.io\u002Fprework\u002Fcommand-line\u002F#\u002F)。\n* 阅读这份[命令行参考](code\u002F02_command_line.md)，并完成文末的课前练习。（完成后无需提交任何内容。）\n* 观看[Git与GitHub入门](https:\u002F\u002Fwww.youtube.com\u002Fplaylist?list=PL5-da3qGB5IBLMp7LtN8Nc3Efd4hJq0kD)中的视频1至8（共21分钟），或阅读[Pro Git](http:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2)的第1.1至2.2节。\n* 如果你的笔记本电脑有任何设置问题，请在周四之前与我们一起解决。如果你的电脑尚未经过检查，建议你周四提前到场，或者自行对照[设置检查清单](other\u002Fsetup_checklist.md)进行检查，并告知我们已完成。\n\n**资源：**\n* 要了解不同类型的数据科学家，可以阅读[分析分析师](http:\u002F\u002Fcdn.oreillystatic.com\u002Foreilly\u002Fradarreport\u002F0636920029014\u002FAnalyzing_the_Analyzers.pdf)（32页）。\n* 想了解数据科学家的工作状态，可以阅读来自[Win-Vector](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2012\u002F09\u002Fon-being-a-data-scientist\u002F)和[Datascope 
Analytics](http:\u002F\u002Fdatascopeanalytics.com\u002Fwhat-we-think\u002F2014\u002F07\u002F31\u002Fsix-qualities-of-a-great-data-scientist)的短文。\n* Quora上有一个[数据科学主题FAQ](https:\u002F\u002Fwww.quora.com\u002FData-Science)，包含大量有趣的问答。\n* 可以通过Data Community DC的[活动日历](http:\u002F\u002Fwww.datacommunitydc.org\u002Fcalendar)或[每周简报](http:\u002F\u002Fwww.datacommunitydc.org\u002Fnewsletter)了解本地数据相关的活动。\n\n-----\n\n### 第2课：命令行与版本控制\n* Slack使用指南\n* 复习命令行课前练习（[代码](code\u002F02_command_line.md)）\n* Git与GitHub（[幻灯片](slides\u002F02_git_github.pdf)）\n* 中级命令行操作\n\n**作业：**\n* 使用Chipotle数据完成[命令行作业](homework\u002F02_command_line_chipotle.md)。\n* 复习[初级](code\u002F00_python_beginner_workshop.py)和[中级](code\u002F00_python_intermediate_workshop.py)Python工作坊的代码。如果你对其中任何内容感到不熟悉（“requests”和“APIs”部分除外），本周末应花时间练习Python：\n    * [Python入门](http:\u002F\u002Fintrotopython.org\u002F)很好地讲解了Python的基础知识，并提供了大量示例代码。\n    * 如果你喜欢从书本学习，[Python for Informatics](http:\u002F\u002Fwww.pythonlearn.com\u002Fhtml-270\u002F)中有关于字符串、列表和字典的实用章节。\n    * 如果你更喜欢互动式练习，可以尝试[Codecademy](http:\u002F\u002Fwww.codecademy.com\u002Fen\u002Ftracks\u002Fpython)的以下课程：“Python列表与字典”和“超市的一天”。\n    * 如果你有更多时间，可以尝试[DataQuest的Learning Python](https:\u002F\u002Fwww.dataquest.io\u002Fcourse\u002Flearning-python)课程中的任务2和任务3。\n    * 如果你已经掌握了这些内容并希望接受更大挑战，可以尝试解决[Python Challenge](http:\u002F\u002Fwww.pythonchallenge.com\u002F)的第一题（解密信息），并将你的代码发送到Slack。\n* 为了帮助你构建项目思路，观看[什么是机器学习？它是如何工作的？](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=elojMnjn4kk)（10分钟）。（这是视频中展示的[IPython笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F01_machine_learning_intro.ipynb)。）或者阅读[A Visual Introduction to Machine Learning](http:\u002F\u002Fwww.r2d3.us\u002Fvisual-intro-to-machine-learning-part-1\u002F)，该文重点介绍了一种名为决策树的特定机器学习模型。\n* **可选：** 浏览更多[学生项目示例](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples)，或许能为你的项目提供灵感！\n\n**Git与Markdown资源：**\n* [Pro 
Git](http:\u002F\u002Fgit-scm.com\u002Fbook\u002Fen\u002Fv2)是一本学习Git的优秀书籍。请阅读前两章，以更深入地理解版本控制和基本命令。\n* 如果你想大量练习Git（并学习更多命令），[Git Immersion](http:\u002F\u002Fgitimmersion.com\u002F)看起来很有前景。\n* 要理解如何在GitHub上贡献代码，首先需要掌握[Fork（复刻）与拉取请求](http:\u002F\u002Fwww.dataschool.io\u002Fsimple-guide-to-forks-in-github-and-git\u002F)的概念。\n* [GitRef](http:\u002F\u002Fgitref.org\u002F)是我最喜欢的Git命令参考指南，而[面向初学者的Git快速参考](http:\u002F\u002Fwww.dataschool.io\u002Fgit-quick-reference-for-beginners\u002F)则是一份更简洁的指南，按工作流程分类列出了常用命令。\n* [破解GitHub增长密码](https:\u002F\u002Fgrowthhackers.com\u002Fgrowth-studies\u002Fgithub)解释了为什么GitHub在开发者中如此受欢迎。\n* [Markdown速查表](https:\u002F\u002Fgithub.com\u002Fadam-p\u002Fmarkdown-here\u002Fwiki\u002FMarkdown-Cheatsheet)提供了详尽的Markdown示例及简要说明。GitHub的[掌握Markdown](https:\u002F\u002Fguides.github.com\u002Ffeatures\u002Fmastering-markdown\u002F)指南则更为简单直观，但内容相对较少。\n\n**命令行资源：**\n* 如果想更深入地学习命令行，[数据科学与命令行](http:\u002F\u002Fshop.oreilly.com\u002Fproduct\u002F0636920032823.do)是一本好书。其[配套网站](http:\u002F\u002Fdatascienceatthecommandline.com\u002F)提供了“数据科学工具箱”（一个预装了大量命令行工具的虚拟机）的安装说明，以及一份关于常用命令行工具的详细参考指南。\n* 如果你想用命令行对CSV文件做更多处理，可以试试[csvkit](http:\u002F\u002Fcsvkit.readthedocs.org\u002F)，它可以通过`pip`安装。 \n\n-----\n\n### 第3课：数据读取与清洗\n* Git和GitHub各类技巧（[幻灯片](slides\u002F02_git_github.pdf)）\n* 复习命令行作业（[解答](homework\u002F02_command_line_chipotle.md)）\n* Python：\n    * Spyder界面\n    * 循环练习\n    * 使用航空安全数据进行文件读取的课程（[代码](code\u002F03_file_reading.py)，[数据](data\u002Fairlines.csv)，[文章](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fshould-travelers-avoid-flying-airlines-that-have-had-crashes-in-the-past\u002F)）\n    * 数据清洗练习\n    * 带着Chipotle数据讲解Python作业（[代码](code\u002F03_python_homework_chipotle.py)，[数据](data\u002Fchipotle.tsv)，[文章](http:\u002F\u002Fwww.nytimes.com\u002Finteractive\u002F2015\u002F02\u002F17\u002Fupshot\u002Fwhat-do-people-actually-order-at-chipotle.html)）\n\n**作业：**\n* 
使用Chipotle数据完成[Python作业](code\u002F03_python_homework_chipotle.py)，将带有注释的Python脚本添加到你的GitHub仓库中，并通过作业提交表单提交链接。你需在周二（9月1日）前完成此作业。（**注意**：本作业不得使用第4课中讲到的Pandas库。）\n\n**资源：**\n* 如果你对列表推导式仍感到困惑，[想理解Python的推导式吗？用Excel或SQL来思考](http:\u002F\u002Fblog.lerner.co.il\u002Fwant-to-understand-pythons-comprehensions-think-like-an-accountant\u002F)可能会有所帮助。\n* [我的代码不工作](http:\u002F\u002Fwww.tecoed.co.uk\u002Fuploads\u002F1\u002F4\u002F2\u002F4\u002F14249012\u002F624506_orig.png)是一张很棒的流程图，解释了如何调试Python错误。\n* [PEP 8](https:\u002F\u002Fwww.python.org\u002Fdev\u002Fpeps\u002Fpep-0008\u002F)是Python的“经典”代码风格指南，如果你想编写可读性强且与Python社区其他成员一致的代码，值得一读。\n* 如果你想更深入地理解Python，Ned Batchelder的[像本地人一样循环](http:\u002F\u002Fnedbatchelder.com\u002Ftext\u002Fiter.html)和[Python中的名称与值](http:\u002F\u002Fnedbatchelder.com\u002Ftext\u002Fnames1.html)是非常优秀的演讲。\n\n-----\n\n### 第4课：探索性数据分析\n* Pandas（[代码](code\u002F04_pandas.py)）：\n    * MovieLens 10万条电影评分（[数据](data\u002Fu.user)，[数据字典](http:\u002F\u002Ffiles.grouplens.org\u002Fdatasets\u002Fmovielens\u002Fml-100k-README.txt)，[网站](http:\u002F\u002Fgrouplens.org\u002Fdatasets\u002Fmovielens\u002F)）\n    * 各国酒精消费量（[数据](data\u002Fdrinks.csv)，[文章](http:\u002F\u002Ffivethirtyeight.com\u002Fdatalab\u002Fdear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits\u002F)）\n    * UFO目击报告（[数据](data\u002Fufo.csv)，[网站](http:\u002F\u002Fwww.nuforc.org\u002Fwebreports.html)）\n* 项目问题练习\n\n**作业：**\n* 与导师讨论项目想法的截止日期为周二（9月1日），而项目问题的书面说明则需在周四（9月3日）前提交。\n* 阅读[纽约市一半出租车中的软件如何每年带来520万美元的额外小费](http:\u002F\u002Fiquantny.tumblr.com\u002Fpost\u002F107245431809\u002Fhow-software-in-half-of-nyc-cabs-generates-5-2)，这是一篇关于探索性数据分析的绝佳案例。\n* 阅读[安斯康姆四重奏及为何汇总统计无法揭示全部真相](http:\u002F\u002Fdata.heapanalytics.com\u002Fanscombes-quartet-and-why-summary-statistics-dont-tell-the-whole-story\u002F)，这是一个经典的例子，说明可视化的重要性。\n\n**资源：**\n* 浏览或搜索Pandas的[API参考文档](http:\u002F\u002Fpandas.pydata.org\u002Fpandas-docs\u002Fstable\u002Fapi.html)是寻找函数的绝佳方式，即使你并不知道它的准确名称。\n* 
[通过推文讲述我拿到新数据集时会做什么](http:\u002F\u002Fsimplystatistics.org\u002F2014\u002F06\u002F13\u002Fwhat-i-do-when-i-get-a-new-data-set-as-told-through-tweets\u002F)以一种有趣（但富有启发性）的方式展示了探索性数据分析的过程。\n\n-----\n\n### 第5课：可视化\n* 带有 Chipotle 数据的 Python 作业截止日期（[解决方案](code\u002F03_python_homework_chipotle.py)，[详细解释](notebooks\u002F03_python_homework_chipotle_explained.ipynb)）\n* 使用 Pandas 进行探索性数据分析第 2 部分（[代码](code\u002F04_pandas.py)）\n* 使用 Pandas 和 Matplotlib 进行可视化（[笔记本](notebooks\u002F05_pandas_visualization.ipynb)）\n\n**作业：**\n* 您的项目问题撰写将于周四截止。\n* 完成带有 [IMDb 数据](data\u002Fimdb_1000.csv)的[Pandas 作业](code\u002F05_pandas_homework_imdb.py)。您需要在周二（9月8日）之前完成此作业。\n* 如果您未使用 Anaconda，请使用 `pip` 安装 Jupyter Notebook（原名 IPython Notebook）。（Jupyter 或 IPython Notebook 已包含在 Anaconda 中。）\n\n**Pandas 资源：**\n* 若要深入了解 Pandas，可阅读这篇[三部分教程](http:\u002F\u002Fwww.gregreda.com\u002F2013\u002F10\u002F26\u002Fintro-to-pandas-data-structures\u002F)，或查看以下两份优秀的（但非常长的）Pandas 笔记本：[入门](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_5-Introduction-to-Pandas.ipynb)和[数据清洗](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_6-Data-Wrangling-with-Pandas.ipynb)。\n* 如果您想深入研究 Pandas（以及 NumPy），可以阅读由 Pandas 的创建者编写的书籍[Python 数据分析](http:\u002F\u002Fshop.oreilly.com\u002Fproduct\u002F0636920023784.do)。\n* 此笔记本演示了 Pandas 中不同类型的[连接操作](notebooks\u002F05_pandas_merge.ipynb)，适用于需要合并两个 DataFrame 的情况。\n* 这是一份关于 Pandas 中[数据透视表](https:\u002F\u002Fbeta.oreilly.com\u002Flearning\u002Fpivot-tables)的简洁教程。\n* 在 Python 中处理地理空间数据时，[GeoPandas](http:\u002F\u002Fgeopandas.org\u002Findex.html)看起来很有前景。这篇[教程](http:\u002F\u002Fmichelleful.github.io\u002Fcode-blog\u002F2015\u002F04\u002F24\u002Fsgmap\u002F)使用 GeoPandas（以及 scikit-learn）构建了新加坡的“语言街道地图”。\n\n**可视化资源：**\n* 观看[看看你的数据](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=coNDCIMH8bk)（18 分钟），这是一个很好的例子，说明为什么可视化对于理解数据非常有用。\n* 如需了解更多关于 Pandas 
绘图的内容，可阅读此[笔记本](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_7-Plotting-with-Pandas.ipynb)或 Pandas 官方文档中的[可视化页面](http:\u002F\u002Fpandas.pydata.org\u002Fpandas-docs\u002Fstable\u002Fvisualization.html)。\n* 若要进一步自定义您的图表，可以浏览此[关于 matplotlib 的笔记本](https:\u002F\u002Fgithub.com\u002Ffonnesbeck\u002FBios8366\u002Fblob\u002Fmaster\u002Fnotebooks\u002FSection2_4-Matplotlib.ipynb)或另一份类似的[笔记本](https:\u002F\u002Fgithub.com\u002Fjrjohansson\u002Fscientific-python-lectures\u002Fblob\u002Fmaster\u002FLecture-4-Matplotlib.ipynb)。\n* 阅读[Python 可视化工具概述](http:\u002F\u002Fpbpython.com\u002Fvisualization-tools-1.html)，其中对 Matplotlib、Pandas、Seaborn、ggplot、Bokeh、Pygal 和 Plotly 进行了有用的比较。\n* 若要了解不同类型的可视化及其适用场景，[选择合适的图表](http:\u002F\u002Fextremepresentation.typepad.com\u002Ffiles\u002Fchoosing-a-good-chart-09.pdf)和[图形连续体](http:\u002F\u002Fwww.coolinfographics.com\u002Fstorage\u002Fpost-images\u002FThe-Graphic-Continuum-POSTER.jpg)是不错的单页参考资料；而交互式的[R 图形目录](http:\u002F\u002Fshiny.stat.ubc.ca\u002Fr-graph-catalog\u002F)则提供了便捷的筛选功能。\n* 哥伦比亚大学数据挖掘课程中的一份[PowerPoint 演示文稿](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002FDataMining\u002FColumbia2011\u002FSlides\u002FTopic2-EDAViz.ppt)包含了大量关于如何正确使用不同类型可视化的好建议。\n* 哈佛大学的数据科学课程（[链接](http:\u002F\u002Fcs109.github.io\u002F2014\u002F)）包含了一堂关于[可视化目标、数据类型和统计图表]的精彩讲座（83 分钟），[讲义](https:\u002F\u002Fdocs.google.com\u002Ffile\u002Fd\u002F0B7IVstmtIvlHLTdTbXdEVENoRzQ\u002Fedit)也可供下载。\n\n-----\n\n### 第6课：机器学习\n* 使用Pandas和Matplotlib进行可视化（第2部分）([笔记本](notebooks\u002F05_pandas_visualization.ipynb))\n* Jupyter\u002FIPython Notebook简要介绍\n* “人类学习”练习：\n    * UCI机器学习库提供的[鸢尾花数据集](http:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FIris)\n    * [鸢尾花图片](http:\u002F\u002Fsebastianraschka.com\u002FImages\u002F2014_python_lda\u002Firis_petal_sepal.png)\n    * [笔记本](notebooks\u002F06_human_learning_iris.ipynb)\n* 机器学习简介([幻灯片](slides\u002F06_machine_learning.pdf))\n\n**作业：**\n* **可选：** 
完成[人类学习笔记本](notebooks\u002F06_human_learning_iris.ipynb)中列出的附加练习。它可以替代你过去或未来错过的一次作业！此作业需在周二（9月8日）前提交。\n* 如果你未使用Anaconda，请使用`pip`安装[requests](http:\u002F\u002Fwww.python-requests.org\u002Fen\u002Flatest\u002Fuser\u002Finstall\u002F)和[Beautiful Soup 4](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F#installing-beautiful-soup)。（这两个包都包含在Anaconda中。）\n\n**机器学习资源：**\n* 如需快速了解机器学习的关键要点，可观看[什么是机器学习？它是如何工作的？](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=elojMnjn4kk)（10分钟），或阅读[配套笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F01_machine_learning_intro.ipynb)。\n* 若要深入了解机器学习，可阅读Hastie与Tibshirani所著优秀书籍[《统计学习导论》](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F)中的第2.1节（共14页）（可免费下载PDF版！）。\n* 来自[加州理工学院“从数据中学习”课程](http:\u002F\u002Fwork.caltech.edu\u002Ftelecourse.html)的视频[学习范式](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F014.html)（13分钟），对监督学习与无监督学习进行了精彩对比，并介绍了“强化学习”。\n* [现实世界中的主动学习](https:\u002F\u002Fbeta.oreilly.com\u002Fideas\u002Freal-world-active-learning)是一篇通俗易懂且全面的介绍，讲解了“主动学习”——一种机器学习的变体，其中人类仅标注最“重要”的样本。\n* 为提前了解本课程将涵盖的机器学习内容，可阅读Sebastian Raschka撰写的[监督学习流程概述](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification\u002Fblob\u002Fmaster\u002Fmachine_learning\u002Fsupervised_intro\u002Fintroduction_to_supervised_machine_learning.md)。\n* [数据科学、机器学习与统计学：名称背后有何含义？](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2013\u002F04\u002Fdata-science-machine-learning-and-statistics-what-is-in-a-name\u002F)探讨了这些术语（以及其他术语）之间的区别。\n* [表情符号翻译项目](https:\u002F\u002Fwww.kickstarter.com\u002Fprojects\u002Ffred\u002Fthe-emoji-translation-project)是机器学习的一个非常有趣的实际应用。\n* 查阅[您所在邮编的特征](http:\u002F\u002Fwww.esri.com\u002Flanding-pages\u002Ftapestry\u002F)，然后详细阅读[67个不同细分群体](http:\u002F\u002Fdoc.arcgis.com\u002Fen\u002Fesri-demographics\u002Fdata\u002Ftapestry-segmentation.htm)的相关信息。\n\n**IPython Notebook资源：**\n* 如需回顾IPython Notebook的介绍（并预览scikit-learn），可观看[scikit-learn与IPython 
Notebook](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=IsXXlYVBt1M)（15分钟），或阅读[配套笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F02_machine_learning_setup.ipynb)。\n* 若想学习IPython Notebook，官方的[Notebook教程](https:\u002F\u002Fgithub.com\u002Fjupyter\u002Fnotebook\u002Fblob\u002Fmaster\u002Fdocs\u002Fsource\u002Fexamples\u002FNotebook\u002FExamples%20and%20Tutorials%20Index.ipynb)非常有帮助。\n* 这篇[Reddit讨论](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FPython\u002Fcomments\u002F3be5z2\u002Fdo_you_prefer_ipython_notebook_over_ipython\u002F)比较了IPython Notebook与Spyder各自的优缺点。\n\n-----\n\n### 第7课：获取数据\n* 带有IMDb数据的Pandas作业截止日期（[解决方案](code\u002F05_pandas_homework_imdb.py)）\n* 带有iris数据的可选“人类学习”练习截止日期（[解决方案](notebooks\u002F06_human_learning_iris.ipynb)）\n* API（[代码](code\u002F07_api.py)）\n    * [OMDb API](http:\u002F\u002Fwww.omdbapi.com\u002F)\n* 网页抓取（[代码](code\u002F07_web_scraping.py)）\n    * [IMDb: robots.txt](http:\u002F\u002Fwww.imdb.com\u002Frobots.txt)\n    * [示例网页](data\u002Fexample.html)\n    * [IMDb: 《肖申克的救赎》](http:\u002F\u002Fwww.imdb.com\u002Ftitle\u002Ftt0111161\u002F)\n\n**家庭作业：**\n* **可选：** 完成[网页抓取代码](code\u002F07_web_scraping.py)中列出的作业练习。它可以替代你过去或将来错过的一次作业！此作业需在周二（9月15日）前提交。\n* **可选：** 如果你未使用Anaconda，请使用`pip`安装Seaborn（[安装说明](http:\u002F\u002Fstanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Finstalling.html)）。如果你使用Anaconda，则可在命令行中运行`conda install seaborn`来安装Seaborn。（请注意，以往课程中有些学生在安装Seaborn后遇到了Anaconda相关问题。）\n\n**API资源：**\n* 这个用于[查询美国人口普查API](https:\u002F\u002Fgithub.com\u002Flaurakurup\u002Fcensus-api)的Python脚本由一位前DAT学员编写。它比我们在课堂上使用的示例稍显复杂，但注释非常详尽，可能为你编写自己的API查询代码提供一个有用的框架。\n* [Mashape](https:\u002F\u002Fwww.mashape.com\u002Fexplore)和[Apigee](https:\u002F\u002Fapigee.com\u002Fproviders)允许你探索大量不同的API。此外，针对许多热门API，也有[Python API封装库](http:\u002F\u002Fwww.pythonforbeginners.com\u002Fapi\u002Flist-of-python-apis)可供使用。\n* [数据科学工具包](http:\u002F\u002Fwww.datasciencetoolkit.org\u002F)收录了基于位置和文本相关的API。\n* 
[Python中的API集成](https:\u002F\u002Frealpython.com\u002Fblog\u002Fpython\u002Fapi-integration-in-python\u002F)提供了关于REST API的非常易读的入门介绍。\n* 微软的[人脸检测API](https:\u002F\u002Fwww.projectoxford.ai\u002Fdemo\u002Fface#detection)，该API驱动着[How-Old.net](http:\u002F\u002Fhow-old.net\u002F)，是一个很好的例子，展示了如何利用机器学习API构建引人入胜的Web应用。\n\n**网页抓取资源：**\n* [Beautiful Soup文档](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F)内容极其详尽，但不太适合作为参考指南。不过，其中关于[指定解析器](http:\u002F\u002Fwww.crummy.com\u002Fsoftware\u002FBeautifulSoup\u002Fbs4\u002Fdoc\u002F#specifying-the-parser-to-use)的部分可能会有所帮助，尤其是在Beautiful Soup似乎错误地解析页面时。\n* 更多Beautiful Soup示例和教程可参见：[Python网页抓取入门](http:\u002F\u002Fwww.gregreda.com\u002F2013\u002F03\u002F03\u002Fweb-scraping-101-with-python\u002F)、一位前DAT学员编写的关于[抓取Craigslist](https:\u002F\u002Fgithub.com\u002FAlexjmsherman\u002FDataScience_GeneralAssembly\u002Fblob\u002Fmaster\u002FFinal_Project\u002F1.%20Final_Project_Data%20Scraping.ipynb)的注释丰富的笔记本、斯坦福大学“文本即数据”课程中的这个[笔记本](http:\u002F\u002Fweb.stanford.edu\u002F~zlotnick\u002FTextAsData\u002FWeb_Scraping_with_Beautiful_Soup.html)，以及哈佛大学数据科学课程中的这个[笔记本](https:\u002F\u002Fgithub.com\u002Fcs109\u002F2014\u002Fblob\u002Fmaster\u002Flectures\u002F2014_09_23-lecture\u002Fdata_scraping_transcript.ipynb)和配套的[视频](http:\u002F\u002Fcm.dce.harvard.edu\u002F2015\u002F01\u002F14328\u002FL07\u002Fscreen_H264LargeTalkingHead-16x9.shtml)。\n* 如需更长的网页抓取教程，涵盖Beautiful Soup、lxml、XPath和Selenium等内容，可观看来自PyCon 2014的[Python网页抓取](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=p1iX0uxM1w8)（3小时23分钟）。其[幻灯片](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1uHM_esB13VuSf7O1ScGueisnrtu-6usGFD3fs4z5YCE\u002Fedit#slide=id.p)和[代码](https:\u002F\u002Fgithub.com\u002Fkjam\u002Fpython-web-scraping-tutorial)也可供下载。\n* 
对于更复杂的网页抓取项目，[Scrapy](http:\u002F\u002Fscrapy.org\u002F)是一个流行的Python应用框架。它拥有出色的[文档](http:\u002F\u002Fdoc.scrapy.org\u002Fen\u002F1.0\u002Findex.html)，这里还有一份带有详细幻灯片和代码的[教程](https:\u002F\u002Fgithub.com\u002Frdempsey\u002Fddl-data-wrangling)。\n* [robotstxt.org](http:\u002F\u002Fwww.robotstxt.org\u002Frobotstxt.html)对如何编写（及阅读）`robots.txt`文件进行了简明扼要的解释。\n* [import.io](https:\u002F\u002Fimport.io\u002F)和[Kimono](https:\u002F\u002Fwww.kimonolabs.com\u002F)声称可以让你无需编写任何代码即可抓取网站。\n* [数学天才如何破解OkCupid找到真爱](http:\u002F\u002Fwww.wired.com\u002F2014\u002F01\u002Fhow-to-hack-okcupid\u002Fall\u002F)和[Netflix如何逆向工程好莱坞](http:\u002F\u002Fwww.theatlantic.com\u002Ftechnology\u002Farchive\u002F2014\u002F01\u002Fhow-netflix-reverse-engineered-hollywood\u002F282679\u002F?single_page=true)是两个有趣的例子，说明了网页抓取技术如何被用来构建有趣的数据集。\n\n-----\n\n### 第8课：K近邻算法\n* Pandas 简要回顾（[笔记本](notebooks\u002F08_pandas_review.ipynb)）\n* K近邻算法与 scikit-learn（[笔记本](notebooks\u002F08_knn_sklearn.ipynb)）\n* NBA球员数据练习（[笔记本](notebooks\u002F08_nba_knn.ipynb)，[数据](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4-students\u002Fblob\u002Fmaster\u002Fkerry\u002FFinal\u002FNBA_players_2015.csv)，[数据字典](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT-project-examples\u002Fblob\u002Fmaster\u002Fpdf\u002Fnba_paper.pdf)）\n* 探索偏差-方差权衡（[笔记本](notebooks\u002F08_bias_variance.ipynb)）\n\n**作业：**\n* 阅读关于 [偏差-方差权衡](homework\u002F09_bias_variance.md) 的材料\n* 阅读 Kevin 的 [可重复性简介](http:\u002F\u002Fwww.dataschool.io\u002Freproducibility-is-not-just-for-researchers\u002F)，阅读 Jeff Leek 的 [创建可重复分析指南](https:\u002F\u002Fgithub.com\u002Fjtleek\u002Fdatasharing)，并观看相关的 [科尔伯特报告视频](http:\u002F\u002Fthecolbertreport.cc.com\u002Fvideos\u002Fdcyvro\u002Fausterity-s-spreadsheet-error)（8分钟）。\n* 继续推进你的项目……你们的第一次项目展示将在不到两周后举行！\n\n**KNN 资源：**\n* 有关 KNN 和 scikit-learn 的关键点回顾，可以观看 [使用著名的鸢尾花数据集入门 scikit-learn](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=hd1W4CyPX58)（15分钟）和 [使用 scikit-learn 
训练机器学习模型](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=RlQuVL6-qe8)（20分钟）。\n* KNN 支持除欧几里得距离之外的 [距离度量](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.neighbors.DistanceMetric.html)，例如 [马氏距离](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F62092\u002Fbottom-to-top-explanation-of-the-mahalanobis-distance)，它 [会考虑数据的尺度](http:\u002F\u002Fblogs.sas.com\u002Fcontent\u002Fiml\u002F2012\u002F02\u002F15\u002Fwhat-is-mahalanobis-distance.html)。\n* [KNN 详细介绍](https:\u002F\u002Fsaravananthirumuruganathan.wordpress.com\u002F2010\u002F05\u002F17\u002Fa-detailed-introduction-to-k-nearest-neighbor-knn-algorithm\u002F) 内容稍显密集，但提供了对 KNN 及其应用更为全面的介绍。\n* 这门关于 [图像分类](http:\u002F\u002Fcs231n.github.io\u002Fclassification\u002F) 的课程展示了如何使用 KNN 检测相似图像，同时也涉及我们未来课程中将要学习的主题（超参数调优和交叉验证）。\n* KNN 适合的一些应用场景包括 [目标识别](http:\u002F\u002Fvlm1.uta.edu\u002F~athitsos\u002Fnearest_neighbors\u002F)、[卫星图像增强](http:\u002F\u002Fland.umn.edu\u002Fdocuments\u002FFS6.pdf)、[文档分类](http:\u002F\u002Fwww.ceng.metu.edu.tr\u002F~e120321\u002Fpaper.pdf)以及 [基因表达分析](http:\u002F\u002Fciteseerx.ist.psu.edu\u002Fviewdoc\u002Fsummary?doi=10.1.1.208.993)。\n\n**Seaborn 资源：**\n* 要开始使用 Seaborn 进行可视化，官方网站提供了一系列 [详细教程](http:\u002F\u002Fweb.stanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Ftutorial.html) 和一个 [示例图库](http:\u002F\u002Fweb.stanford.edu\u002F~mwaskom\u002Fsoftware\u002Fseaborn\u002Fexamples\u002Findex.html)。\n* [使用 Seaborn 进行数据可视化](https:\u002F\u002Fbeta.oreilly.com\u002Flearning\u002Fdata-visualization-with-seaborn) 是对一些流行的 Seaborn 图表类型的快速介绍。\n* [使用 Seaborn 可视化 Google 表单数据](http:\u002F\u002Fpbpython.com\u002Fpandas-google-forms-part2.html) 和 [如何用 Python 制作 NBA 投篮热图](http:\u002F\u002Fsavvastjortjoglou.com\u002Fnba-shot-sharts.html) 都是 Seaborn 在真实世界数据上应用的良好示例。\n\n-----\n\n### 第9课：基本模型评估\n* 可选的网络爬虫作业截止日期（[解决方案](code\u002F07_web_scraping.py#L136)）\n* 可重复性\n    * 
讨论指定阅读材料：[简介](http:\u002F\u002Fwww.dataschool.io\u002Freproducibility-is-not-just-for-researchers\u002F)、[科尔伯特报告视频](http:\u002F\u002Fthecolbertreport.cc.com\u002Fvideos\u002Fdcyvro\u002Fausterity-s-spreadsheet-error)、[出租车文章](http:\u002F\u002Fiquantny.tumblr.com\u002Fpost\u002F107245431809\u002Fhow-software-in-half-of-nyc-cabs-generates-5-2)、[推文](https:\u002F\u002Ftwitter.com\u002Fjakevdp\u002Fstatus\u002F519563939177197571)、[创建可重复分析](https:\u002F\u002Fgithub.com\u002Fjtleek\u002Fdatasharing)\n    * 示例：[经典摇滚](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Ftree\u002Fmaster\u002Fclassic-rock)、[学生项目 1](https:\u002F\u002Fgithub.com\u002Fjwknobloch\u002FDAT4_final_project)、[学生项目 2](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4-students\u002Ftree\u002Fmaster\u002FJonathan_Bryan\u002FProject_Files)\n* 讨论关于 [偏差-方差权衡](homework\u002F09_bias_variance.md) 的阅读作业\n* 使用训练\u002F测试集划分进行模型评估（[笔记本](notebooks\u002F09_model_evaluation.ipynb)）\n* 浏览 scikit-learn 文档：[模块参考](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclasses.html)、[用户指南](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fuser_guide.html)、类和函数文档\n\n**作业：**\n* 观看 [Python 中的数据科学](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=3ZWuPVWq7p4)（35分钟），了解线性回归的入门知识（并复习其他课程内容），或者至少阅读一下 [相关笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F06_linear_regression.ipynb)。\n* **可选：** 如果想进一步了解线性回归，可以观看 [最简单的回归分析入门](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=k_OB1tWX9PM)（14分钟）。\n\n**模型评估资源：**\n* 要回顾今天课程的一些要点，可以观看 [在 scikit-learn 中比较机器学习模型](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=0pP4EwWJgIU)（27分钟）。\n* 如果需要进一步解释训练误差与测试误差、偏差-方差权衡以及训练\u002F测试集划分（也称为“验证集方法”），可以观看 Hastie 和 Tibshirani 关于 [估计预测误差](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=_2ij6eaaSl0&t=2m34s) 的视频（12分钟，从 2 分 34 秒开始）。\n* 加州理工学院的《从数据中学习》课程包含一段精彩的视频，讲解 [可视化偏差与方差](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F081.html)（15分钟）。\n* 
[随机的训练\u002F测试集划分并不总是足够](http:\u002F\u002Fwww.win-vector.com\u002Fblog\u002F2015\u002F01\u002Frandom-testtrain-split-is-not-always-enough\u002F) 解释了为什么如果您的数据具有显著的时间特征，随机的训练\u002F测试集划分可能不是合适的模型评估方法。\n\n**可重复性资源：**\n* [我们关于分享数据分析的经验教训](https:\u002F\u002Fsource.opennews.org\u002Fen-US\u002Farticles\u002Fwhat-weve-learned-about-sharing-our-data-analysis\u002F) 包含 BuzzFeed News 提供的关于如何发布可重复分析的技巧。\n* [数据科学家的软件开发技能](http:\u002F\u002Ftreycausey.com\u002Fsoftware_dev_skills.html) 讨论了编写函数和恰当注释代码的重要性（以及其他技能），这些对于创建可重复分析非常有用。\n* [做得好的数据科学看起来很简单——这对数据科学家来说是个大问题](http:\u002F\u002Fsimplystatistics.org\u002F2015\u002F03\u002F17\u002Fdata-science-done-well-looks-easy-and-that-is-a-big-problem-for-data-scientists\u002F) 解释了可重复分析如何展现背后所有精心准备的工作。\n\n### 第10课：线性回归\n* 机器学习练习（[文章](http:\u002F\u002Fblog.dominodatalab.com\u002F10-interesting-uses-of-data-science\u002F)）\n* 线性回归（[笔记本](notebooks\u002F10_linear_regression.ipynb)）\n    * 使用了Kaggle竞赛中的[Capital Bikeshare数据集](data\u002Fbikeshare.csv)\n    * [数据字典](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fdata)\n* 特征工程示例：[预测企业协作网络中的用户参与度](https:\u002F\u002Fgithub.com\u002Fmikeyea\u002FDAT7_project\u002Fblob\u002Fmaster\u002Ffinal%20project\u002FClass_Presention_MYea.ipynb)\n\n**作业：**\n* 您的第一次项目展示将在周二（9月22日）进行！请在周二下午6点前提交您的项目仓库链接（包含幻灯片、代码、数据和可视化内容）。\n* 完成[作业](homework\u002F10_yelp_votes.md)，使用[Yelp数据](data\u002Fyelp.csv)。截止日期为周四（9月24日）。\n\n**线性回归相关资源：**\n* 如果想更深入地了解线性回归，可以阅读[《统计学习导论》](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F)第3章。或者观看[相关视频](http:\u002F\u002Fwww.dataschool.io\u002F15-hours-of-expert-machine-learning-videos\u002F)，或阅读我的[快速参考指南](http:\u002F\u002Fwww.dataschool.io\u002Fapplying-and-interpreting-linear-regression\u002F)，其中总结了该章节的关键要点。\n* 这篇[线性回归简介](http:\u002F\u002Fpeople.duke.edu\u002F~rnau\u002Fregintro.htm)更加详细且数学上更为严谨，并提供了许多实用建议。\n* 这是一篇关于[线性回归假设](http:\u002F\u002Fpareonline.net\u002Fgetvn.asp?n=2&v=8)的简短文章。\n* 
Setosa网站提供了一个[线性回归的交互式可视化](http:\u002F\u002Fsetosa.io\u002Fev\u002Fordinary-least-squares-regression\u002F)。\n* 关于置信区间、假设检验、p值和R²的简要介绍，以及scikit-learn代码与[Statsmodels](http:\u002F\u002Fstatsmodels.sourceforge.net\u002F)代码的对比，请参阅我的[DAT7线性回归课程笔记](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT7\u002Fblob\u002Fmaster\u002Fnotebooks\u002F10_linear_regression.ipynb)。\n* Quora上有一篇关于[置信区间](http:\u002F\u002Fwww.quora.com\u002FWhat-is-a-confidence-interval-in-laymans-terms\u002Fanswer\u002FMichael-Hochster)的实用解释。\n* [假设检验基础](http:\u002F\u002F20bits.com\u002Farticle\u002Fhypothesis-testing-the-basics)对这一主题进行了很好的概述；John Rauser的演讲[没有痛苦的统计学](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=5Dnw46eC-0o)（12分钟）则很好地解释了如何拒绝零假设。\n* 今年早些时候，一家主要科学期刊禁止使用p值：\n    * 《科学美国人》对此禁令做了不错的[总结](http:\u002F\u002Fwww.scientificamerican.com\u002Farticle\u002Fscientists-perturbed-by-loss-of-stat-tools-to-sift-research-fudge-from-fact\u002F)。\n    * 《自然》杂志对这一禁令的[回应](http:\u002F\u002Fwww.nature.com\u002Fnews\u002Fstatistics-p-values-are-just-the-tip-of-the-iceberg-1.17412)指出：“数据分析早期做出的决策对结果的影响更大。”\n    * Andrew Gelman发表了一篇易读的[论文](http:\u002F\u002Fwww.stat.columbia.edu\u002F~gelman\u002Fresearch\u002Funpublished\u002Fp_hacking.pdf)，他认为：“只要足够努力，即使没有任何实际效应，也很容易找到p\u003C0.05的比较结果。”\n    * [科学并未崩溃](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fscience-isnt-broken\u002F)中包含一个有趣的工具，可以让您通过“p值黑客”手段获得“统计显著”的结果。\n* [准确衡量模型预测误差](http:\u002F\u002Fscott.fortmann-roe.com\u002Fdocs\u002FMeasuringError.html)比较了调整后的R²、AIC和BIC、训练\u002F测试集划分以及交叉验证。\n\n**其他资源：**\n* 《统计学习导论》（[http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F)）第3.3.1节（共4页）对分类特征的哑变量编码有非常清晰的解释。\n* Kaggle上有一些关于我们今天使用的[共享单车数据的优秀可视化](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fscripts?outputType=Visualization)。\n\n-----\n\n### 第11课：第一次项目展示\n* 项目展示！\n\n**作业：**\n* 
如果您对[概率](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=o4QmoNfW3bI)（5分钟）和[赔率](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=GxbXQjX7fC0)（8分钟）这两个概念不太熟悉，请观看Rahul Patwari的相关视频。\n* 阅读BetterExplained上的两篇精彩文章：[指数函数与e的直观指南](http:\u002F\u002Fbetterexplained.com\u002Farticles\u002Fan-intuitive-guide-to-exponential-functions-e\u002F)和[自然对数(ln)揭秘](http:\u002F\u002Fbetterexplained.com\u002Farticles\u002Fdemystifying-the-natural-logarithm-ln\u002F)。然后，复习这份[简要总结](notebooks\u002F12_e_log_examples.ipynb)的指数函数和对数知识。\n\n-----\n\n### 第12课：逻辑回归\n* Yelp投票作业截止（[解决方案](notebooks\u002F10_yelp_votes_homework.ipynb)）\n* 逻辑回归（[笔记本](notebooks\u002F12_logistic_regression.ipynb)）\n    * [玻璃识别数据集](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FGlass+Identification)\n* 泰坦尼克号数据练习（[笔记本](notebooks\u002F12_titanic_confusion.ipynb)，[数据](data\u002Ftitanic.csv)，[数据字典](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Ftitanic\u002Fdata)）\n* 混淆矩阵（[幻灯片](slides\u002F12_confusion_matrix.pdf)，[笔记本](notebooks\u002F12_titanic_confusion.ipynb)）\n\n**家庭作业：**\n* 如果你对混淆矩阵的相关术语还不太熟悉，请观看Rahul Patwari关于[直观的理解敏感性和特异性](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=U4_3fditnWg)（9分钟）和[敏感性与特异性之间的权衡](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vtYDyGGeQyo)（13分钟）的视频。\n* 关于[ROC曲线和AUC](homework\u002F13_roc_auc.md)的视频\u002F阅读作业\n* 关于[交叉验证](homework\u002F13_cross_validation.md)的视频\u002F阅读作业\n\n**逻辑回归相关资源：**\n* 如果想深入学习逻辑回归，可以阅读《统计学习导论》第4章的前三节（[链接](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F)），或者观看该章节的前三个视频（30分钟）（[链接](http:\u002F\u002Fwww.dataschool.io\u002F15-hours-of-expert-machine-learning-videos\u002F)）。\n* 如果需要更数学化的解释，可以观看吴恩达机器学习课程第3周的前七集视频（71分钟）（[链接](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fmachine-learning\u002Fhome\u002Finfo)），或者阅读一位学生整理的[相关讲义](http:\u002F\u002Fwww.holehouse.org\u002Fmlclass\u002F06_Logistic_Regression.html)。\n* 关于如何解释逻辑回归系数，可以参考UCLA 
IDRE提供的优秀[指南](http:\u002F\u002Fwww.ats.ucla.edu\u002Fstat\u002Fmult_pkg\u002Ffaq\u002Fgeneral\u002Fodds_ratio.htm)，以及新墨西哥大学的[讲义](http:\u002F\u002Fwww.unm.edu\u002F~schrader\u002Fbiostat\u002Fbio2\u002FSpr06\u002Flec11.pdf)。\n* scikit-learn文档中有一段关于预测概率校准的精彩[说明](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fcalibration.html)。\n* [监督学习迷信备忘录](http:\u002F\u002Fryancompton.net\u002Fassets\u002Fml_cheat_sheet\u002Fsupervised_learning.html)对本课程中涉及的四种分类器（逻辑回归、决策树、KNN、朴素贝叶斯）以及一种未涉及的分类器（支持向量机）进行了非常清晰的对比。\n\n**混淆矩阵相关资源：**\n* 我的[混淆矩阵术语简易指南](http:\u002F\u002Fwww.dataschool.io\u002Fsimple-guide-to-confusion-matrix-terminology\u002F)可以作为你的参考。\n* 这篇关于[亚马逊机器学习](https:\u002F\u002Faws.amazon.com\u002Fblogs\u002Faws\u002Famazon-machine-learning-make-data-driven-decisions-at-scale\u002F)的博客文章中包含一张精美的[图表](https:\u002F\u002Fmedia.amazonwebservices.com\u002Fblog\u002F2015\u002Fml_adjust_model_1.png)，展示了分类阈值如何影响不同的评估指标。\n* 这份来自另一门DAT课程的笔记本解释了如何将混淆矩阵视为成本效益矩阵，从而计算出“期望值”（[链接](https:\u002F\u002Fgithub.com\u002Fpodopie\u002FDAT18NYC\u002Fblob\u002Fmaster\u002Fclasses\u002F13-expected_value_cost_benefit_analysis.ipynb)）。\n\n-----\n\n### 第13课：高级模型评估\n* 数据准备（[笔记本](notebooks\u002F13_advanced_model_evaluation.ipynb)）\n    * 处理缺失值\n    * 处理分类特征（复习）\n* ROC曲线与AUC\n    * 讨论[视频\u002F阅读作业](homework\u002F13_roc_auc.md)\n    * 练习：绘制ROC曲线（[幻灯片](slides\u002F13_drawing_roc.pdf)）\n    * 返回主笔记本\n* 交叉验证\n    * 讨论[视频\u002F阅读作业](homework\u002F13_cross_validation.md)及配套的[笔记本](notebooks\u002F13_cross_validation.ipynb)\n    * 返回主笔记本\n* 使用银行营销数据的练习（[笔记本](notebooks\u002F13_bank_exercise.ipynb)，[数据](data\u002Fbank-additional.csv)，[数据字典](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FBank+Marketing))\n\n**家庭作业：**\n* 阅读关于[垃圾邮件过滤](homework\u002F14_spam_filtering.md)的材料\n* 
阅读这些[概率论导论](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1cM2dVbJgTWMkHoVNmYlB9df6P2H8BrjaqAcZTaLe9dA\u002Fedit#slide=id.gfc3caad2_00)幻灯片，或者略读[OpenIntro统计教材](https:\u002F\u002Fwww.openintro.org\u002Fstat\u002Ftextbook.php?stat_book=os)第2.1节（12页）。特别注意以下术语：概率、互斥事件、样本空间、独立性。\n* **可选：** 尝试通过这个[可视化工具](http:\u002F\u002Fsetosa.io\u002Fconditional\u002F)理解条件概率。\n* **可选：** 若要直观了解贝叶斯定理，可以阅读关于[财富与幸福](http:\u002F\u002Fwww.quora.com\u002FWhat-is-an-intuitive-explanation-of-Bayes-Rule\u002Fanswer\u002FMichael-Hochster)、[鸭子](https:\u002F\u002Fplanspacedotorg.wordpress.com\u002F2014\u002F02\u002F23\u002Fbayes-rule-for-ducks\u002F)或[乐高](http:\u002F\u002Fwww.countbayesie.com\u002Fblog\u002F2015\u002F2\u002F18\u002Fbayes-theorem-with-lego)的文章。\n\n**ROC相关资源：**\n* Rahul Patwari有一段关于[ROC曲线](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=21Igj5Pr6u4)的精彩视频（12分钟）。\n* [ROC分析简介](http:\u002F\u002Fpeople.inf.elte.hu\u002Fkiss\u002F13dwhdm\u002Froc.pdf)是一篇非常易读的论文。\n* ROC曲线可用于多种应用场景，例如[比较不同特征集](http:\u002F\u002Fresearch.microsoft.com\u002Fpubs\u002F205472\u002Faisec10-leontjeva.pdf)以检测Skype上的欺诈用户，以及[比较不同分类器](http:\u002F\u002Fwww.cse.ust.hk\u002FnevinZhangGroup\u002Freadings\u002Fyi\u002FBradley_PR97.pdf)在多个流行数据集上的表现。\n\n**交叉验证相关资源：**\n* 如需深入了解交叉验证，可阅读[统计学习导论](http:\u002F\u002Fwww-bcf.usc.edu\u002F~gareth\u002FISL\u002F)第5.1节（11页），或观看相关视频：[K折交叉验证与留一法交叉验证](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=nZAM5OXrktY)（14分钟）、[正确与错误的交叉验证方法](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=S06JpVoNaA0)（10分钟）。\n* 若想理解交叉验证的不同变体，这篇[论文](http:\u002F\u002Fwww.jcheminf.com\u002Fcontent\u002Fpdf\u002F1758-2946-6-10.pdf)对其进行了详细探讨和比较。\n* 
要学习如何使用[GridSearchCV和RandomizedSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html)进行参数调优，可观看[如何在scikit-learn中找到最佳模型参数](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Gol_qOgRqfA)（28分钟），或阅读相关的[笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F08_grid_search.ipynb)。\n\n**其他资源：**\n* scikit-learn提供了关于[模型评估](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fmodel_evaluation.html)的详尽文档。\n* [机器学习模型的反事实评估](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=QWCSxAKR-h0)（45分钟）是一场精彩的演讲，介绍了Stripe公司评估其欺诈检测模型的复杂方式。（这是相关的[幻灯片](http:\u002F\u002Fwww.slideshare.net\u002FMichaelManapat\u002Fcounterfactual-evaluation-of-machine-learning-models)。）\n* [可视化机器学习阈值以做出更好的商业决策](http:\u002F\u002Fblog.insightdatalabs.com\u002Fvisualizing-classifier-thresholds\u002F)展示了如何通过可视化不同阈值下的精确率、召回率和“队列率”，帮助您最大化分类器的商业价值。\n\n-----\n\n### 第14课：朴素贝叶斯与文本数据\n* 条件概率与贝叶斯定理\n    * [幻灯片](slides\u002F14_bayes_theorem.pdf)（改编自[可视化贝叶斯定理](http:\u002F\u002Foscarbonilla.com\u002F2009\u002F05\u002Fvisualizing-bayes-theorem\u002F)）\n    * 将贝叶斯定理应用于鸢尾花分类（[笔记本](notebooks\u002F14_bayes_theorem_iris.ipynb)）\n* 朴素贝叶斯分类\n    * [幻灯片](slides\u002F14_naive_bayes.pdf)\n    * 垃圾邮件过滤示例（[笔记本](notebooks\u002F14_naive_bayes_spam.ipynb)）\n* 在scikit-learn中将朴素贝叶斯应用于文本数据（[笔记本](notebooks\u002F14_text_data_sklearn.ipynb)）\n    * [CountVectorizer](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.feature_extraction.text.CountVectorizer.html) 文档\n    * SMS短信：[数据](data\u002Fsms.tsv)，[数据字典](https:\u002F\u002Farchive.ics.uci.edu\u002Fml\u002Fdatasets\u002FSMS+Spam+Collection)\n\n**作业：**\n* 完成另一份[作业](homework\u002F14_yelp_review_text.md)，使用[Yelp数据](data\u002Fyelp.csv)。截止日期为周二（10月6日）。\n* 确认已在您偏好的Python环境中安装了[TextBlob](https:\u002F\u002Ftextblob.readthedocs.org\u002F)，方法是运行`import textblob`。若未安装，请在命令行中运行`pip install textblob`（而非在Python内部）。\n\n**资源：**\n* Sebastian 
Raschka关于[朴素贝叶斯与文本分类](http:\u002F\u002Fsebastianraschka.com\u002FArticles\u002F2014_naive_bayes_1.html)的文章，更详细地涵盖了今天课程中的概念性内容。\n* 如需了解更多条件概率知识，请阅读这些[幻灯片](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1psUIyig6OxHQngGEHr3TMkCvhdLInnKnclQoNUr4G4U\u002Fedit#slide=id.gfc69f484_00)，或参阅[OpenIntro统计教材](https:\u002F\u002Fwww.openintro.org\u002Fstat\u002Ftextbook.php?stat_book=os)第2.2节（共15页）。\n* 如需对朴素贝叶斯分类进行直观解释，请阅读这篇关于[机场安检](http:\u002F\u002Fwww.quora.com\u002FIn-laymans-terms-how-does-Naive-Bayes-work\u002Fanswer\u002FKonstantin-Tt)的文章。\n* 欲了解更多关于朴素贝叶斯分类的细节，维基百科有两篇优秀的文章（[朴素贝叶斯分类器](http:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier)和[朴素贝叶斯垃圾邮件过滤](http:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_spam_filtering)），而Cross Validated上也有一个很好的[问答](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F21822\u002Funderstanding-naive-bayes)。\n* 当将朴素贝叶斯分类应用于包含连续特征的数据集时，建议使用[GaussianNB](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.naive_bayes.GaussianNB.html)，而非[MultinomialNB](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.naive_bayes.MultinomialNB.html)。此[笔记本](notebooks\u002F14_types_of_naive_bayes.ipynb)比较了它们在该类数据集上的表现。维基百科对高斯朴素贝叶斯有简短的[描述](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier#Gaussian_naive_Bayes)，并提供了一个极佳的[示例](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FNaive_Bayes_classifier#Sex_classification)说明其应用。\n* 马里兰大学提供的这些[幻灯片](http:\u002F\u002Fwww.umiacs.umd.edu\u002F~jbg\u002Fteaching\u002FDATA_DIGGING\u002Flecture_05.pdf)进一步阐述了逻辑回归和朴素贝叶斯的数学细节，并解释了朴素贝叶斯实际上是逻辑回归的“特例”。\n* Andrew Ng有一篇[论文](http:\u002F\u002Fai.stanford.edu\u002F~ang\u002Fpapers\u002Fnips01-discriminativegenerative.pdf)，比较了逻辑回归和朴素贝叶斯在多种数据集上的表现。\n* 如果您喜欢Paul 
Graham的文章，还可以阅读他关于如何改进垃圾邮件过滤器的[后续文章](http:\u002F\u002Fwww.paulgraham.com\u002Fbetter.html)，以及这篇关于2004年最先进的垃圾邮件过滤技术的[相关论文](http:\u002F\u002Fwww.merl.com\u002Fpublications\u002Fdocs\u002FTR2004-091.pdf)。\n* Yelp发现，与Mechanical Turk相比，朴素贝叶斯在[企业分类](http:\u002F\u002Fengineeringblog.yelp.com\u002F2011\u002F02\u002Ftowards-building-a-high-quality-workforce-with-mechanical-turk.html)方面更为有效。\n\n-----\n\n### 第15课：自然语言处理\n* Yelp评论文本作业截止（[解决方案](notebooks\u002F14_yelp_review_text_homework.ipynb)）\n* 自然语言处理（[笔记本](notebooks\u002F15_natural_language_processing.ipynb)）\n* 我们的[Kaggle竞赛](https:\u002F\u002Finclass.kaggle.com\u002Fc\u002Fdat8-stack-overflow)简介\n    * 创建一个Kaggle账号，使用邀请链接加入竞赛，下载示例提交文件，然后提交该示例提交（这将需要短信账户验证）。\n\n**作业：**\n* 您的论文初稿将于周四（10月8日）截止！请在课前提交您的项目仓库链接（包含论文、代码、数据和可视化内容）。\n* 观看[Kaggle：运作方式](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=PoD84TVdD-4)（4分钟），了解Kaggle平台的简要概述。\n* 下载竞赛文件，将其移动到`DAT8\u002Fdata`目录，并确保可以使用Pandas打开这些CSV文件。如果遇到无法打开文件的问题，您可能需要关闭实时病毒扫描功能（尤其是Microsoft Security Essentials）。\n* **可选：** 思考哪些特征可能与预测回复相关，然后探索数据以验证这些假设是否成立。\n* **可选：** 观看我的[项目展示视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=HGr1yQV3Um0)（16分钟），了解Kaggle竞赛中端到端机器学习流程的完整演示，包括特征工程。（或者直接阅读[幻灯片](https:\u002F\u002Fspeakerdeck.com\u002Fjustmarkham\u002Fallstate-purchase-prediction-challenge-on-kaggle)。）\n\n**NLP资源：**\n* 如果您想深入学习自然语言处理，可以查看这套优秀的[视频讲座](https:\u002F\u002Fclass.coursera.org\u002Fnlp\u002Flecture)和[幻灯片](http:\u002F\u002Fweb.stanford.edu\u002F~jurafsky\u002FNLPCourseraSlides.html)，它们来自这门[Coursera课程](https:\u002F\u002Fwww.coursera.org\u002Fcourse\u002Fnlp)（目前已不再提供）。\n* 这份幻灯片定义了许多[NLP关键术语](https:\u002F\u002Fgithub.com\u002Fga-students\u002FDAT_SF_9\u002Fblob\u002Fmaster\u002F16_Text_Mining\u002FDAT9_lec16_Text_Mining.pdf)。\n* 《用Python进行自然语言处理》（[Natural Language Processing with Python](http:\u002F\u002Fwww.nltk.org\u002Fbook\u002F)）是深入学习[Natural Language Toolkit](http:\u002F\u002Fwww.nltk.org\u002F)（NLTK）最受欢迎的书籍。\n* 
[Python中的NLP点滴](https:\u002F\u002Fgithub.com\u002Fcharlieg\u002FA-Smattering-of-NLP-in-Python\u002Fblob\u002Fmaster\u002FA%20Smattering%20of%20NLP%20in%20Python.ipynb)提供了NLTK的良好概述，DAT5的这个[笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fnotebooks\u002F14_nlp.ipynb)也是如此。\n* [spaCy](http:\u002F\u002Fspacy.io\u002F) 是一种较新的Python文本处理库，专注于性能（与NLTK不同）。\n* 如果您想认真研究NLP，[Stanford CoreNLP](http:\u002F\u002Fnlp.stanford.edu\u002Fsoftware\u002Fcorenlp.shtml)是一套备受推崇的工具集（用Java编写）。\n* 在scikit-learn中处理大型文本语料库时，[HashingVectorizer](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Ffeature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick)是CountVectorizer的一个有用替代方案。\n* [自动分类Yelp商家](http:\u002F\u002Fengineeringblog.yelp.com\u002F2015\u002F09\u002Fautomatically-categorizing-yelp-businesses.html)介绍了Yelp如何利用NLP和scikit-learn解决未分类商家的问题。\n* [情感分析的现代方法](http:\u002F\u002Fdistrictdatalabs.silvrback.com\u002Fmodern-methods-for-sentiment-analysis)展示了“词向量”如何用于更准确的情感分析。\n* [识别幽默漫画字幕](http:\u002F\u002Fwww.cs.huji.ac.il\u002F~dshahaf\u002FpHumor.pdf)是一篇易读的论文，探讨如何识别提交给《纽约客》字幕大赛的有趣字幕。\n* [DC自然语言处理](http:\u002F\u002Fwww.meetup.com\u002FDC-NLP\u002F)是我们本地一个活跃的Meetup小组。\n\n-----\n\n### 第16课：Kaggle竞赛\n* Kaggle平台运作概览（[幻灯片](slides\u002F16_kaggle.pdf)）\n* Kaggle课堂内竞赛：[预测Stack Overflow问题是否会关闭](https:\u002F\u002Finclass.kaggle.com\u002Fc\u002Fdat8-stack-overflow)\n    * [完整代码文件](code\u002F16_kaggle.py)\n    * [精简代码文件](code\u002F16_kaggle_minimal.py)：不包含任何探索性代码\n    * [关于对数损失的解释](http:\u002F\u002Fwww.quora.com\u002FWhat-is-an-intuitive-explanation-for-the-log-loss-function)\n\n**作业：**\n* 你们将被分配评审两位同学的项目初稿。请在10月20日（周二）前按照[同行评审指南](project\u002Fpeer_review.md)向他们提供反馈。\n* 阅读[机器学习可视化入门](http:\u002F\u002Fwww.r2d3.us\u002Fvisual-intro-to-machine-learning-part-1\u002F)，以了解决策树的简要概述。\n* 下载并安装[Graphviz](http:\u002F\u002Fwww.graphviz.org\u002F)，它可以帮助你在scikit-learn中可视化决策树。\n    * 
Windows用户还需将Graphviz添加到系统路径中：进入控制面板、系统、高级系统设置、环境变量。在系统变量中，编辑“Path”，加入“bin”文件夹的路径，例如：`C:\\Program Files (x86)\\Graphviz2.38\\bin`\n* **可选：** 继续参与我们的Kaggle竞赛！每天最多可以提交5次，比赛将持续到10月27日（周二）东部时间下午6:30（第21节课）。\n\n**资源：**\n* [专业知识毫无用处且无益](http:\u002F\u002Fwww.slate.com\u002Farticles\u002Fhealth_and_science\u002Fnew_scientist\u002F2012\u002F12\u002Fkaggle_president_jeremy_howard_amateurs_beat_specialists_in_data_prediction.html)是一篇对Jeremy Howard（Kaggle前主席）的简短采访，他在文中指出，在构建有效的预测模型时，数据科学技能远比领域专业知识更为重要。\n* [为数据科学这项运动做好准备](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kwt6XEh7U3g)（74分钟），同样由Jeremy Howard主讲，提供了许多关于竞争性机器学习的实用技巧。\n* [向顶尖选手学习](http:\u002F\u002Fblog.kaggle.com\u002F2014\u002F08\u002F01\u002Flearning-from-the-best\u002F)是一篇优秀的博客文章，总结了Kaggle大师们在Kaggle竞赛中取得好成绩的顶级技巧。\n* [无需领域专业知识的特征工程](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=bL4b1sGnILU)（17分钟），由Kaggle大师Nick Kridler主讲，提供了一些关于如何快速迭代以及在Kaggle竞赛中应将精力集中在何处的简单建议。\n* 以下案例可以帮助你更好地理解特征工程的过程：预测火车站的[乘客数量](https:\u002F\u002Fmedium.com\u002F@chris_bour\u002Ffrench-largest-data-science-challenge-ever-organized-shows-the-unreasonable-effectiveness-of-open-8399705a20ef)、识别在线商店的[欺诈用户](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1UdI5NY-mlHyseiRVbpTLyvbrHxY8RciHp5Vc-ZLrwmU\u002Fedit#slide=id.p)、检测在线拍卖中的[机器人](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Ffacebook-recruiting-iv-human-or-bot\u002Fforums\u002Ft\u002F14628\u002Fshare-your-secret-sauce)、预测谁会[订阅下个乐季的演出](http:\u002F\u002Fblog.kaggle.com\u002F2015\u002F01\u002F05\u002Fkaggle-inclass-stanfords-getting-a-handel-on-data-science-winners-report\u002F)，以及评估[电商搜索引擎结果的质量](http:\u002F\u002Fblog.kaggle.com\u002F2015\u002F07\u002F22\u002Fcrowdflower-winners-interview-3rd-place-team-quartet\u002F)。\n* 
[我们的完美提交](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Frestaurant-revenue-prediction\u002Fforums\u002Ft\u002F13950\u002Four-perfect-submission)是一篇有趣的分享，讲述了在[公开排行榜](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Frestaurant-revenue-prediction\u002Fleaderboard\u002Fpublic)上表现优异并不意味着模型能够泛化到新数据。\n\n-----\n\n### 第17课：决策树\n* 决策树（[笔记本](notebooks\u002F17_decision_trees.ipynb)）\n* 使用Capital Bikeshare数据的练习（[笔记本](notebooks\u002F17_bikeshare_exercise.ipynb)，[数据](data\u002Fbikeshare.csv)，[数据字典](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fbike-sharing-demand\u002Fdata))\n\n**作业：**\n* 阅读MLWave关于[人类集成学习](http:\u002F\u002Fmlwave.com\u002Fhuman-ensemble-learning\u002F)的文章中“群体智慧”部分。\n* **可选：** 阅读论文[我们真的需要数百种分类器来解决现实世界的分类问题吗？](http:\u002F\u002Fjmlr.csail.mit.edu\u002Fpapers\u002Fvolume15\u002Fdelgado14a\u002Fdelgado14a.pdf)的摘要，以及Kaggle首席技术官Ben Hamner对该论文的[评论](https:\u002F\u002Fnews.ycombinator.com\u002Fitem?id=8719723)，重点关注其中提到的“随机森林”。\n\n**资源：**\n* scikit-learn关于[决策树](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Ftree.html)的文档，不仅提供了树结构的良好概述，还给出了正确的使用建议。\n* 如果想更深入地了解决策树，可以阅读[数据挖掘导论](http:\u002F\u002Fwww-users.cs.umn.edu\u002F~kumar\u002Fdmbook\u002Findex.php)第4章第3节（共23页）。（第4章可免费下载。）\n* 如果希望深入了解不同的决策树算法，这份幻灯片展示了[分类与回归树的简史](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F0B-BKohKl-jUYQ3RpMEF0OGRUU3RHVGpHY203NFd3Z19Nc1ZF\u002Fview)。\n* [合唱科学](http:\u002F\u002Fwww.doc.gold.ac.uk\u002F~mas03dm\u002Fpapers\u002FPawleyMullensiefen_Singalong_2012.pdf)中包含一个简洁的回归树（第136页），用于预测音乐现场观众中会跟着流行歌曲一起合唱的比例。\n* 决策树在医学领域常用于鉴别诊断，例如这张用于[识别精神病](http:\u002F\u002Fwww.psychcongress.com\u002Fsites\u002Fnaccme.com\u002Ffiles\u002Fimages\u002Fpcn\u002Fsaundras\u002Fpsychosis_decision_tree.pdf)的分类树。\n\n-----\n\n### 第18课：集成学习\n* 完成决策树课程（[笔记本](notebooks\u002F17_decision_trees.ipynb)）\n* 集成学习（[笔记本](notebooks\u002F18_ensembling.ipynb)）\n    * 1986–1987年美国职业棒球大联盟球员数据（[hitters.csv](data\u002Fhitters.csv)）\n    * 
[数据字典](https:\u002F\u002Fcran.r-project.org\u002Fweb\u002Fpackages\u002FISLR\u002FISLR.pdf)（第7页）\n\n**资源：**\n* scikit-learn关于[集成方法](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fensemble.html)的文档，涵盖了“平均法”（如自助聚合和随机森林）以及“提升法”（如AdaBoost和梯度提升树）。\n* MLWave的[Kaggle集成指南](http:\u002F\u002Fmlwave.com\u002Fkaggle-ensembling-guide\u002F)非常详尽，展示了集成可以采用的多种不同方式。\n* 浏览Kaggle[CrowdFlower竞赛](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Fcrowdflower-search-relevance)冠军的优秀[解决方案论文](https:\u002F\u002Fdocs.google.com\u002Fviewer?url=https:\u002F\u002Fraw.githubusercontent.com\u002FChenglongChen\u002FKaggle_CrowdFlower\u002Fmaster\u002FDoc\u002FKaggle_CrowdFlower_ChenglongChen.pdf)，以了解赢得Kaggle竞赛所需的工作量和洞察力。\n* [可解释性与强大预测模型：为何两者都不可或缺](https:\u002F\u002Fmedium.com\u002F@chris_bour\u002Finterpretable-vs-powerful-predictive-models-why-we-need-them-both-990340074979)是一篇简短的文章，探讨了在Kaggle竞赛中有效的策略并不总是适用于现实世界。\n* [连编写算法的人都未必真正了解其工作原理](http:\u002F\u002Fwww.theatlantic.com\u002Ftechnology\u002Farchive\u002F2015\u002F09\u002Fnot-even-the-people-who-write-algorithms-really-know-how-they-work\u002F406099\u002F)一文指出，当前最先进机器学习模型的可解释性降低对社会产生了负面影响。\n* 如需对随机森林进行直观解释，请阅读Edwin Chen对[用通俗语言解释随机森林是如何工作的？](http:\u002F\u002Fwww.quora.com\u002FRandom-Forests\u002FHow-do-random-forests-work-in-laymans-terms\u002Fanswer\u002FEdwin-Chen-1)的回答。\n* [大规模决策森林：经验教训](http:\u002F\u002Fblog.siftscience.com\u002Fblog\u002F2015\u002Flarge-scale-decision-forests-lessons-learned)是Sift Science发表的一篇优秀文章，介绍了他们自定义实现的随机森林。\n* [拆解随机森林分类器](http:\u002F\u002Fnerds.airbnb.com\u002Funboxing-the-random-forest-classifier\u002F)描述了一种解读随机森林内部机制的方法，而不仅仅是关注特征重要性。\n* [理解随机森林：从理论到实践](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1407.7502v3.pdf)是一份深入的学术分析报告，详细探讨了随机森林，并包含了其在scikit-learn中的具体实现细节。\n\n-----\n\n### 第19课：高级scikit-learn与聚类\n* 高级scikit-learn（[笔记本](notebooks\u002F19_advanced_sklearn.ipynb)）\n    * 
[StandardScaler](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.preprocessing.StandardScaler.html)：特征标准化\n    * [Pipeline](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fpipeline.html)：步骤串联\n* 聚类（[幻灯片](slides\u002F19_clustering.pdf)、[笔记本](notebooks\u002F19_clustering.ipynb)）\n    * K均值：[文档](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.cluster.KMeans.html)、[可视化1](http:\u002F\u002Ftech.nitoyon.com\u002Fen\u002Fblog\u002F2013\u002F11\u002F07\u002Fk-means\u002F)、[可视化2](http:\u002F\u002Fwww.naftaliharris.com\u002Fblog\u002Fvisualizing-k-means-clustering\u002F)\n    * DBSCAN：[文档](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.cluster.DBSCAN.html)、[可视化](http:\u002F\u002Fwww.naftaliharris.com\u002Fblog\u002Fvisualizing-dbscan-clustering\u002F)\n\n**作业：**\n* 重新阅读[理解偏差-方差权衡](http:\u002F\u002Fscott.fortmann-roe.com\u002Fdocs\u002FBiasVariance.html)。（关于[引导问题](homework\u002F09_bias_variance.md)的“答案”已发布，可能会对您有所帮助。）\n* **可选：** 观看加州理工学院《从数据中学习》课程中的两段精彩且相关的视频：[偏差-方差权衡](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F081.html)（15分钟）和[正则化](http:\u002F\u002Fwork.caltech.edu\u002Flibrary\u002F121.html)（8分钟）。\n\n**scikit-learn资源：**\n* 这是一个较长的[特征缩放](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification\u002Fblob\u002Fmaster\u002Fpreprocessing\u002Fabout_standardization_normalization.ipynb)示例，使用了scikit-learn，并进一步讨论了可以使用的各种缩放方法。\n* [Python中的实用数据科学](http:\u002F\u002Fradimrehurek.com\u002Fdata_science_python\u002F)是一份内容详尽、条理清晰的笔记本，其中用到了一些高级的scikit-learn功能：流水线、绘制学习曲线以及模型的序列化保存。\n* 要学习如何使用[GridSearchCV和RandomizedSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html)进行参数调优，请观看[如何在scikit-learn中找到最佳模型参数](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Gol_qOgRqfA)（28分钟），或阅读[相关笔记本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002Fscikit-learn-videos\u002Fblob\u002Fmaster\u002F08_grid_search.ipynb)。\n* 
Sebastian Raschka为scikit-learn用户提供了许多优质资源，包括一个包含[教程和示例](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpattern_classification)的仓库、一个机器学习[工具与扩展库](http:\u002F\u002Frasbt.github.io\u002Fmlxtend\u002F)、一本新书[《Python机器学习》](https:\u002F\u002Fgithub.com\u002Frasbt\u002Fpython-machine-learning-book)，以及一个半活跃的[博客](http:\u002F\u002Fsebastianraschka.com\u002Fblog\u002F)。\n* scikit-learn拥有一个非常活跃的[邮件列表](https:\u002F\u002Fwww.mail-archive.com\u002Fscikit-learn-general@lists.sourceforge.net\u002Findex.html)，在查找函数信息和提问时，往往比Stack Overflow更有用。\n* 如果您忘记了课堂上使用过的某个scikit-learn函数的用法，别忘了这个仓库是完全可搜索的！\n\n**聚类资源：**\n* 要深入了解聚类，可以阅读[数据挖掘导论](http:\u002F\u002Fwww-users.cs.umn.edu\u002F~kumar\u002Fdmbook\u002Findex.php)第8章（69页，可免费下载），或浏览该章节的幻灯片。\n* scikit-learn的用户指南比较了多种不同的[聚类方法](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fclustering.html)。\n* 哥伦比亚大学数据挖掘课程提供的这份[PowerPoint演示文稿](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002FDataMining\u002FColumbia2011\u002FSlides\u002FTopic6-Clustering.ppt)很好地介绍了聚类，包括层次聚类和替代的距离度量方法。\n* 《统计学习导论》中有关于[K均值聚类](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=aIybuNt9ps4&list=PL5-da3qGB5IBC-MneTc9oBZz0C6kNJ-f2)（17分钟）和[层次聚类](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Tuuc9Y06tAc&list=PL5-da3qGB5IBC-MneTc9oBZz0C6kNJ-f2)（15分钟）的实用视频。\n* 这是一个优秀的层次聚类交互式可视化工具（[链接](https:\u002F\u002Fjoyofdata.shinyapps.io\u002Fhclust-shiny\u002F)）。\n* 这里有一段关于均值漂移聚类的精美动画解释（[链接](http:\u002F\u002Fspin.atomicobject.com\u002F2015\u002F05\u002F26\u002Fmean-shift-clustering\u002F)）。\n* [K模式算法](http:\u002F\u002Fwww.cs.ust.hk\u002F~qyang\u002FTeaching\u002F537\u002FPapers\u002Fhuang98extensions.pdf)可用于对类别型特征的数据集进行聚类，而无需将其转换为数值型数据。这里有一个[Python实现](https:\u002F\u002Fgithub.com\u002Fnicodv\u002Fkmodes)。\n* 
以下是一些有趣的聚类案例：[鲍勃·罗斯作品的统计分析](http:\u002F\u002Ffivethirtyeight.com\u002Ffeatures\u002Fa-statistical-analysis-of-the-work-of-bob-ross\u002F)（附[数据和Python代码](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Ftree\u002Fmaster\u002Fbob-ross)）、[一位数学天才如何破解OkCupid找到真爱](http:\u002F\u002Fwww.wired.com\u002F2014\u002F01\u002Fhow-to-hack-okcupid\u002Fall\u002F)，以及[你所在邮编区域的特征](http:\u002F\u002Fwww.esri.com\u002Flanding-pages\u002Ftapestry\u002F)。\n\n-----\n\n### 第20课：正则化与正则表达式\n* 正则化（[笔记本](notebooks\u002F20_regularization.ipynb)）\n    * 回归：[Ridge](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.Ridge.html)、[RidgeCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.RidgeCV.html)、[Lasso](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.Lasso.html)、[LassoCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.LassoCV.html)\n    * 分类：[LogisticRegression](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgenerated\u002Fsklearn.linear_model.LogisticRegression.html)\n    * 辅助函数：[Pipeline](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fpipeline.html)、[GridSearchCV](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Fgrid_search.html)\n* 正则表达式\n    * [巴尔的摩凶杀案数据](data\u002Fhomicides.txt)\n    * [Regular expressions 101](https:\u002F\u002Fregex101.com\u002F#python)：实时测试正则表达式\n    * [参考指南](code\u002F20_regex_reference.py)\n    * [练习](code\u002F20_regex_exercise.py)\n\n**作业：**\n* 你的期末项目下周就要提交了！\n* **可选：** 将你的最终作品提交到我们的Kaggle竞赛中吧！截止时间为美国东部时间10月27日星期二下午6:30。\n* **可选：** 阅读这篇经典论文，它可能会帮助你将我们整个课程中学到的许多主题联系起来：[关于机器学习的一些有用知识](http:\u002F\u002Fhomes.cs.washington.edu\u002F~pedrod\u002Fpapers\u002Fcacm12.pdf)。\n\n**正则化资源：**\n* 
scikit-learn用户指南中的[广义线性模型](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fmodules\u002Flinear_model.html)部分解释了不同形式的正则化。\n* 《统计学习导论》第6.2节（14页）介绍了套索回归和岭回归。或者观看相关的视频：[岭回归](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=cSKzqb0EKS0&list=PL5-da3qGB5IB-Xdpj_uXJpLGiRfv9UVXI&index=6)（13分钟）和[套索回归](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=A5I1G1MfUmA&index=7&list=PL5-da3qGB5IB-Xdpj_uXJpLGiRfv9UVXI)（15分钟）。\n* 如果想了解更多关于套索回归的细节，可以阅读Tibshirani的[原始论文](http:\u002F\u002Fstatweb.stanford.edu\u002F~tibs\u002Flasso\u002Flasso.pdf)。\n* 如果需要更数学化的正则化解释，可以观看Andrew Ng的[机器学习课程](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Fmachine-learning\u002F)第3周的最后四节课（30分钟），或者阅读一位学生整理的[相关讲义](http:\u002F\u002Fwww.holehouse.org\u002Fmlclass\u002F07_Regularization.html)。\n* 这个来自《用Python构建机器学习系统》第7章的[笔记本](https:\u002F\u002Fgithub.com\u002Fluispedro\u002FPenalizedRegression\u002Fblob\u002Fmaster\u002FPenalizedRegression.ipynb)提供了一个关于正则化线性回归的详细示例。\n* 在使用正则化模型时，对分类特征进行哑变量编码有一些特殊考虑。Cross Validated上的一个问答[讨论](https:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F69568\u002Fwhether-to-rescale-indicator-binary-dummy-predictors-for-lasso)是否应该将哑变量与其他特征一起标准化；而一篇博客文章[评论](http:\u002F\u002Fappliedpredictivemodeling.com\u002Fblog\u002F2013\u002F10\u002F23\u002Fthe-basics-of-encoding-categorical-data-for-predictive-models)建议不要删除基准水平。\n\n**正则表达式资源：**\n* Google的Python课程包含一个优秀的[入门教程](https:\u002F\u002Fdevelopers.google.com\u002Fedu\u002Fpython\u002Fregular-expressions)（还配有[视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=kWyoYtvJpe4&index=4&list=PL5-da3qGB5IA5NwDxcEJ5dvt8F9OQP7q5)）。\n* Python for Informatics有一章很好的[内容](http:\u002F\u002Fwww.pythonlearn.com\u002Fhtml-270\u002Fbook012.html)介绍正则表达式。（如果你想运行示例，需要下载[mbox.txt](http:\u002F\u002Fwww.py4inf.com\u002Fcode\u002Fmbox.txt)和[mbox-short.txt](http:\u002F\u002Fwww.py4inf.com\u002Fcode\u002Fmbox-short.txt)。）\n* [用正则表达式破冰](https:\u002F\u002Fwww.codeschool.com\u002Fcourses\u002Fbreaking-the-ice-with-regular-expressions\u002F)是Code 
School的一门互动课程，不过只有第一“关”是免费的。\n* 如果你想深入研究正则表达式，[RexEgg](http:\u002F\u002Fwww.rexegg.com\u002F)提供了大量的文章和教程。\n* [你不知道的5种使用正则表达式的工具](http:\u002F\u002Fblog.codeschool.io\u002F2015\u002F07\u002F30\u002F5-tools-you-didnt-know-that-use-regular-expressions\u002F)展示了如何在Excel、Word、Google表格、Google表单、文本编辑器等工具中使用正则表达式。\n* [探索GitHub提交信息中的情感表达](http:\u002F\u002Fgeeksta.net\u002Fgeeklog\u002Fexploring-expressions-emotions-github-commit-messages\u002F)是一个有趣的例子，说明如何利用正则表达式进行数据分析；而[Emojineering](http:\u002F\u002Finstagram-engineering.tumblr.com\u002Fpost\u002F118304328152\u002Femojineering-part-2-implementing-hashtag-emoji)则解释了Instagram如何使用正则表达式来检测标签中的表情符号。\n\n-----\n\n### 第21课：课程回顾与期末项目展示\n* 项目展示！\n* [数据科学回顾](https:\u002F\u002Fdocs.google.com\u002Fdocument\u002Fd\u002F19gBCkmrbMpFFLPX8wa5daMnyl7J5BXhMV8JNJwgp1pk\u002Fedit?usp=sharing)\n\n**资源：**\n* scikit-learn的[机器学习地图](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Ftutorial\u002Fmachine_learning_map\u002F)可以帮助你为任务选择“最佳”模型。\n* [如何选择机器学习分类器](http:\u002F\u002Fblog.echen.me\u002F2011\u002F04\u002F27\u002Fchoosing-a-machine-learning-classifier\u002F)是一篇简短且易读的文章，比较了多种分类模型；[分类器比较](http:\u002F\u002Fscikit-learn.org\u002Fstable\u002Fauto_examples\u002Fclassification\u002Fplot_classifier_comparison.html)则是scikit-learn对分类器决策边界所做的可视化展示；[监督学习算法比较](http:\u002F\u002Fwww.dataschool.io\u002Fcomparing-supervised-learning-algorithms\u002F)是我制作的一张模型比较表格；而[监督学习误区速查表](http:\u002F\u002Fryancompton.net\u002Fassets\u002Fml_cheat_sheet\u002Fsupervised_learning.html)则提供了更为详尽的比较，并附有大量实用资源链接。\n* [机器学习那些坑](http:\u002F\u002Fml.posthaven.com\u002Fmachine-learning-done-wrong)、[机器学习中的幽灵](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=tleeC-KlsKA)（31分钟）、[巧妙的过拟合方法](http:\u002F\u002Fhunch.net\u002F?p=22)以及[机器学习常见陷阱](http:\u002F\u002Fdanielnee.com\u002F?p=155)都提供了关于如何避免机器学习中常见错误的深刻建议。\n* [KDD 
2011最佳工业论文中的实用机器学习技巧](http:\u002F\u002Fblog.david-andrzejewski.com\u002Fmachine-learning\u002Fpractical-machine-learning-tricks-from-the-kdd-2011-best-industry-paper\u002F)和Andrew Ng的[应用机器学习的建议](http:\u002F\u002Fcs229.stanford.edu\u002Fmaterials\u002FML-advice.pdf)则包含了比上述资源更高级的指导。\n* [监督学习算法的实证比较](http:\u002F\u002Fwww.cs.cornell.edu\u002F~caruana\u002Fctp\u002Fct.papers\u002Fcaruana.icml06.pdf)是一篇2006年的易读研究论文，同时也以[演讲](http:\u002F\u002Fvideolectures.net\u002Fsolomon_caruana_wslmw\u002F)形式呈现（77分钟）。\n\n-----\n\n### 第22课：期末项目展示\n* 项目展示！\n* [接下来怎么办？](other\u002Fadvice.md)\n\n-----\n\n## 补充资源\n\n### 整洁数据\n* [数据分析中的良好数据管理实践](https:\u002F\u002Fwww.prometheusresearch.com\u002Fgood-data-management-practices-for-data-analysis-tidy-data-part-2\u002F)简要总结了“整洁数据”的原则。\n* Hadley Wickham的[论文](http:\u002F\u002Fwww.jstatsoft.org\u002Farticle\u002Fview\u002Fv059i10)详细解释了整洁数据，并提供了许多优秀的示例。\n* 整洁数据集示例：[鲍勃·罗斯](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fbob-ross\u002Felements-by-episode.csv)\n* 不整洁数据集示例：[NFL门票价格](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fnfl-ticket-prices\u002F2014-average-ticket-price.csv)、[航空安全](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fairline-safety\u002Fairline-safety.csv)、[喷气机队门票价格](https:\u002F\u002Fgithub.com\u002Ffivethirtyeight\u002Fdata\u002Fblob\u002Fmaster\u002Fnfl-ticket-prices\u002Fjets-buyer.csv)、[Chipotle订单](https:\u002F\u002Fgithub.com\u002FTheUpshot\u002Fchipotle\u002Fblob\u002Fmaster\u002Forders.tsv)\n* 如果你的同事倾向于创建计算机无法读取的电子表格（参见[Bosker博客文章](https:\u002F\u002Fbosker.wordpress.com\u002F2014\u002F12\u002F05\u002Fthe-government-statistical-services-terrible-spreadsheet-advice\u002F)），他们可能会从阅读这些[发布电子表格数据的建议](http:\u002F\u002Fwww.clean-sheet.org\u002F)中受益。（Cross Validated上也有其他一些建议，详见[此回答](http:\u002F\u002Fstats.stackexchange.com\u002Fquestions\u002F83614\u002Fbest-practices-for-creating-tidy-data\u002F83711#83711)。）\n\n### 
数据库与SQL\n* 这份[GA幻灯片](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fslides\u002F20_sql.pdf)提供了数据库和SQL的简要介绍。该课程的[Python脚本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT5\u002Fblob\u002Fmaster\u002Fcode\u002F20_sql.py)演示了基本的SQL查询，以及如何从Python连接到SQLite数据库并使用Pandas进行查询。\n* 这个[SQL训练营](https:\u002F\u002Fgithub.com\u002Fbrandonmburroughs\u002Fsql_bootcamp)的仓库包含一个注释极其详细的SQL脚本，非常适合自行逐步学习。\n* 这份[GA笔记本](https:\u002F\u002Fgithub.com\u002Fpodopie\u002FDAT18NYC\u002Fblob\u002Fmaster\u002Fclasses\u002F17-relational_databases.ipynb)提供了更简短的数据库和SQL入门，巧妙地将SQL查询与Pandas语法进行了对比。\n* [SQLZOO](http:\u002F\u002Fsqlzoo.net\u002Fwiki\u002FSQL_Tutorial)、[Mode Analytics](http:\u002F\u002Fsqlschool.modeanalytics.com\u002F)、[可汗学院](https:\u002F\u002Fwww.khanacademy.org\u002Fcomputing\u002Fcomputer-programming\u002Fsql)、[Codecademy](https:\u002F\u002Fwww.codecademy.com\u002Fcourses\u002Flearn-sql)、[Datamonkey](http:\u002F\u002Fdatamonkey.pro\u002Fguess_sql\u002Flessons\u002F)以及[Code School](http:\u002F\u002Fcampus.codeschool.com\u002Fcourses\u002Ftry-sql\u002Fcontents)都提供在线初学者SQL教程，看起来颇具吸引力。Code School还提供了一门[进阶教程](https:\u002F\u002Fwww.codeschool.com\u002Fcourses\u002Fthe-sequel-to-sql\u002F)，不过需要付费。\n* [w3schools](http:\u002F\u002Fwww.w3schools.com\u002Fsql\u002Ftrysql.asp?filename=trysql_select_all)有一个示例数据库，允许你直接在浏览器中练习SQL。同样，Kaggle也允许你使用其在线“Scripts”应用程序查询一个大型SQLite数据库——[Reddit评论](https:\u002F\u002Fwww.kaggle.com\u002Fc\u002Freddit-comments-may-2015\u002Fdata)。\n* [每个数据科学家都需要了解的SQL知识](http:\u002F\u002Fjoshualande.com\u002Fdata-science-sql\u002F)是一系列关于SQL基础的简短文章；而[面向数据科学家的SQL入门](http:\u002F\u002Fbensresearch.com\u002Fdownloads\u002FSQL.pdf)则是一篇具有类似目标的论文。\n* [彻底理解SQL的10个简单步骤](https:\u002F\u002Fweb.archive.org\u002Fweb\u002F20150402234726\u002Fhttp:\u002F\u002Ftech.pro\u002Ftutorial\u002F1555\u002F10-easy-steps-to-a-complete-understanding-of-sql)是一篇适合有一定SQL经验、希望深入理解SQL的读者的好文章。\n* 
SQLite关于[查询计划](http:\u002F\u002Fwww.sqlite.org\u002Fqueryplanner.html)的文章解释了SQL查询是如何“运作”的。\n* [关系型数据库管理系统比较](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fsqlite-vs-mysql-vs-postgresql-a-comparison-of-relational-database-management-systems)列出了SQLite、MySQL和PostgreSQL各自的优缺点。\n* 如果你想更深入地学习数据库和SQL，斯坦福大学有一套备受推崇的[14节迷你课程](https:\u002F\u002Flagunita.stanford.edu\u002Fcourses\u002FDB\u002F2014\u002FSelfPaced\u002Fabout)。\n* [Blaze](http:\u002F\u002Fblaze.pydata.org)是一个Python包，它使你可以使用类似于Pandas的语法来查询存储在各种数据存储系统中的数据。\n\n### 推荐系统\n* 这份 [GA 幻灯片](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4\u002Fblob\u002Fmaster\u002Fslides\u002F18_recommendation_engines.pdf) 简要介绍了推荐系统，而该课程中的 [Python 脚本](https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT4\u002Fblob\u002Fmaster\u002Fcode\u002F18_recommenders_soutions.py) 则演示了如何构建一个简单的推荐系统。\n* 《大规模数据集挖掘》 第9章（36页）对推荐系统进行了更为深入的介绍。\n* 《程序员数据挖掘指南》 第2至4章（165页）以更友好的方式介绍了推荐系统，提供了大量 Python 代码和练习。\n* Netflix 奖是一项著名的竞赛，旨在将 Netflix 的推荐系统准确率提高 10%。以下是一些关于 Netflix 奖的有用文章：\n    * [Netflix 推荐：超越五星评分](http:\u002F\u002Ftechblog.netflix.com\u002F2012\u002F04\u002Fnetflix-recommendations-beyond-5-stars.html)：Netflix 官方博客的两篇文章，总结了此次竞赛及其推荐系统。\n    * [赢得 Netflix 奖：总结](http:\u002F\u002Fblog.echen.me\u002F2011\u002F10\u002F24\u002Fwinning-the-netflix-prize-a-summary\u002F)：概述了获胜方案中所采用的模型和技术。\n    * [对 Netflix 奖的回顾](http:\u002F\u002Fwww2.research.att.com\u002F~volinsky\u002Fpapers\u002Fchance.pdf)：由获奖团队撰写的竞赛总结。\n* 这篇 [论文](http:\u002F\u002Fwww.cs.umd.edu\u002F~samir\u002F498\u002FAmazon-Recommendations.pdf) 总结了 Amazon.com 推荐系统的工作原理，而这个 [Stack Overflow 问答](http:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F2323768\u002Fhow-does-the-amazon-recommendation-feature-work) 则提供了一些补充见解。\n* Facebook 和 Etsy 分别在其官方博客上发布了关于其推荐系统工作原理的文章。\n* The Global Network of Discovery (GNOD) 提供了一些针对音乐、作者和电影的有趣推荐工具。\n* NPR 的 Planet Money 播客节目《你机器里的那些人》（23分钟）探讨了 Amazon Mechanical Turk 如何协助构建推荐引擎（以及更广泛意义上的机器学习）。\n* 如果你想更深入地学习推荐系统，Coursera 上有一门相关的 
[课程](https:\u002F\u002Fwww.coursera.org\u002Flearn\u002Frecommender-systems)。","# DAT8 数据科学课程快速上手指南\n\nDAT8 是 General Assembly 华盛顿校区的数据科学课程开源仓库，由知名数据科学教育家 Kevin Markham 主讲。本指南旨在帮助中国开发者快速搭建环境并开始学习。\n\n## 环境准备\n\n在开始之前，请确保你的系统满足以下要求：\n\n*   **操作系统**：Windows、macOS 或 Linux。\n*   **版本控制**：已安装 [Git](http:\u002F\u002Fgit-scm.com\u002Fdownloads)。\n*   **代码托管**：拥有 [GitHub](https:\u002F\u002Fgithub.com\u002F) 账号（无需安装 GitHub Desktop 客户端）。\n*   **Python 环境**：推荐安装 **Anaconda** (Python 2.7x 版本，依据原课程要求；若需适配现代环境，建议使用 Python 3.x 并手动安装所需包)。\n    *   *国内加速建议*：由于 Anaconda 官方源下载较慢，建议安装后配置清华或中科大镜像源。\n*   **命令行工具**：\n    *   macOS\u002FLinux: 使用终端 (Terminal)。\n    *   Windows: 使用 Git Bash。\n\n## 安装步骤\n\n### 1. 克隆项目仓库\n打开终端或 Git Bash，运行以下命令将课程材料下载到本地：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fjustmarkham\u002FDAT8.git\ncd DAT8\n```\n\n### 2. 配置 Python 环境 (推荐 Anaconda)\n\n如果你选择使用 Anaconda，请先下载并安装对应版本的 Anaconda。安装完成后，建议更换为国内镜像源以加速包管理：\n\n```bash\n# 配置清华镜像源 (Conda)\nconda config --add channels https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fanaconda\u002Fpkgs\u002Fmain\u002F\nconda config --add channels https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fanaconda\u002Fpkgs\u002Ffree\u002F\nconda config --set show_channel_urls yes\n```\n\n如果不使用 Anaconda，你需要根据 `other\u002Fpython_packages.md` 列表手动安装所需的 Python 包（如 `pandas`, `scikit-learn`, `matplotlib` 等）：\n\n```bash\npip install pandas scikit-learn matplotlib seaborn statsmodels\n# 如需加速 pip 下载\npip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple pandas scikit-learn matplotlib\n```\n\n### 3. 在线免安装体验 (Binder)\n如果不想在本地配置环境，可以直接点击仓库首页的 Binder 徽章，或在浏览器中访问以下链接，直接在云端运行 Jupyter Notebook：\n\n```text\nhttp:\u002F\u002Fmybinder.org\u002Frepo\u002Fjustmarkham\u002FDAT8\n```\n\n## 基本使用\n\n本课程包含 22 节课的材料，涵盖从命令行基础到机器学习进阶的内容。\n\n### 1. 浏览课程大纲\n进入项目根目录，查看 `README.md` 了解完整的课程日程表。课程按周二\u002F周四排列，例如：\n*   **Class 1**: 数据科学导论\n*   **Class 3**: 数据读取与清洗\n*   **Class 6**: 机器学习基础\n*   **Class 10**: 线性回归\n\n### 2. 
运行示例代码\n以第三课（数据读取与清洗）为例，进入对应目录并运行 Python 脚本：\n\n```bash\ncd code\n# 查看文件读取示例 (依赖 data\u002Fairlines.csv)\npython 03_file_reading.py\n\n# 查看 Chipotle 数据清洗练习\npython 03_python_homework_chipotle.py\n```\n\n### 3. 使用 Jupyter Notebook\n大部分课程材料以 IPython Notebook (`.ipynb`) 形式提供。启动 Jupyter 服务器进行交互式学习：\n\n```bash\njupyter notebook\n```\n浏览器会自动打开界面，你可以导航至 `slides` (课件)、`code` (代码) 或 `homework` (作业) 文件夹开始学习。\n\n### 4. 课前练习示例\n课程强调命令行基础。尝试完成第一课布置的命令行参考阅读，并在终端执行基础操作（参考 `code\u002F02_command_line.md`）：\n\n```bash\n# 示例：查看当前目录结构\nls -l\n\n# 示例：查看 Git 状态\ngit status\n```\n\n> **提示**：建议按照 `Class 1` 到 `Class 22` 的顺序逐步学习，并完成每节课后的 `Homework` 部分以巩固技能。项目实战要求请参考 `project\u002FREADME.md`。","一位刚转行数据科学的产品经理，急需在两周内掌握从数据清洗到机器学习建模的全套实战技能以应对新岗位的挑战。\n\n### 没有 DAT8 时\n- 学习路径支离破碎，需要在 Stack Overflow、各类博客和文档间反复跳转，难以构建系统的知识框架。\n- 缺乏统一的实战环境配置指南，常在安装 Python 包、Git 版本控制等前置步骤上耗费数天，甚至因环境报错而放弃。\n- 只有零散的代码片段，缺少从“数据读取”到“模型评估”的完整项目流演示，无法理解各环节如何衔接。\n- 面对抽象的算法理论（如 KNN、逻辑回归），没有配套的交互式 Notebook 进行即时演练，导致“看懂了但写不出”。\n- 遇到瓶颈时找不到经过验证的练习数据集和标准答案，自我评估困难，难以确认学习成果是否达标。\n\n### 使用 DAT8 后\n- 直接沿用 General Assembly 验证过的 22 节系统课表，按部就班地从命令行基础进阶到集成学习，知识体系清晰完整。\n- 利用详细的课前检查清单和 Anaconda 配置指引，快速搭建好包含所有依赖包的开发环境，第一天即可开始写代码。\n- 通过 22 个完整的 Jupyter Notebook 案例，亲手复现从数据清洗、探索性分析到最终 Kaggle 竞赛的全流程，打通任督二脉。\n- 结合 Kevin Markham 的视频讲解与可运行的代码单元，边学边改参数观察结果，将复杂的机器学习算法转化为直观的操作经验。\n- 依托课程提供的专属数据集和项目作业，能够对照标准解法进行自我测试，并在结业项目中产出可展示的作品集。\n\nDAT8 将原本混乱的自学过程转化为一条结构严谨、即开即用的数据科学高速成长通道。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fjustmarkham_DAT8_0e7d44b6.png","justmarkham","Kevin Markham","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fjustmarkham_104a3a00.png","Founder of Data School","Data School","Asheville, NC, USA",null,"https:\u002F\u002Fcourses.dataschool.io","https:\u002F\u002Fgithub.com\u002Fjustmarkham",[85,89,93],{"name":86,"color":87,"percentage":88},"Jupyter 
Notebook","#DA5B0B",92.5,{"name":90,"color":91,"percentage":92},"Python","#3572A5",7.5,{"name":94,"color":95,"percentage":96},"HTML","#e34c26",0,1623,1057,"2026-04-03T20:55:13","Linux, macOS, Windows","未说明",{"notes":103,"python":104,"dependencies":105},"这是一个 2015 年的数据科学课程资料库，主要基于 Python 2.7 和 Anaconda 发行版。推荐使用 Git 进行版本控制，可通过 Binder 在线运行部分 Notebook。课程涵盖命令行、数据清洗、可视化及传统机器学习算法（如线性回归、决策树等），不涉及深度学习或 GPU 加速需求。","2.7",[106,107,108,109,110,111,112],"Anaconda (Python 2.7 distribution)","Git","scikit-learn","pandas","matplotlib","numpy","csvkit",[53,13,51,26,54],[115,116,108,117,109,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132],"data-science","machine-learning","data-analysis","jupyter-notebook","python","course","linear-regression","logistic-regression","model-evaluation","naive-bayes","natural-language-processing","decision-trees","ensemble-learning","clustering","regular-expressions","web-scraping","data-visualization","data-cleaning","2026-03-27T02:49:30.150509","2026-04-06T05:17:25.293438",[],[]]