[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-kjw0612--awesome-deep-vision":3,"tool-kjw0612--awesome-deep-vision":62},[4,18,26,35,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,2,"2026-04-10T11:39:34",[14,15,13],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":32,"last_commit_at":41,"category_tags":42,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[43,13,15,14],"插件",{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 
API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[52,15,13,14],"语言模型",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[14,15,13,61],"视频",{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":78,"owner_email":78,"owner_twitter":78,"owner_website":78,"owner_url":79,"languages":78,"stars":80,"forks":81,"last_commit_at":82,"license":78,"difficulty_score":83,"env_os":84,"env_gpu":85,"env_ram":85,"env_deps":86,"category_tags":89,"github_topics":78,"view_count":32,"oss_zip_url":78,"oss_zip_packed_at":78,"status":17,"created_at":91,"updated_at":92,"faqs":93,"releases":129},5009,"kjw0612\u002Fawesome-deep-vision","awesome-deep-vision","A curated list of deep learning resources for computer vision ","awesome-deep-vision 是一份专为计算机视觉领域打造的深度学习资源精选清单。它系统地整理了该方向的核心学术成果与实用资料，涵盖了从基础的图像分类、目标检测、跟踪，到高级的语义分割、人体姿态估计、图像生成以及图文跨模态理解等广泛主题。\n\n在深度学习技术快速迭代的背景下，研究人员和开发者往往面临海量论文与信息过载的挑战，难以高效定位高质量文献。awesome-deep-vision 正是为了解决这一痛点而生，它将分散的资源按应用场景和技术细分领域进行了结构化梳理，帮助用户快速建立知识体系或追踪前沿进展。除了收录经典的学术论文（如 ResNet、GoogLeNet 
等），清单还包含了相关的课程、书籍、视频教程、软件框架及博客文章，形成了完整的学习闭环。\n\n这份资源特别适合计算机视觉领域的研究人员、算法工程师以及希望深入该专业的学生使用。虽然项目目前不再活跃维护，但其沉淀的经典文献目录依然具有极高的参考价值，是入门深造或开展科研工作的理想“导航图”。通过 awesome-deep-vision，用户可以更专注于技术本身，而非在信息海洋中盲目摸索。","# Awesome Deep Vision [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome)\n\nA curated list of deep learning resources for computer vision, inspired by [awesome-php](https:\u002F\u002Fgithub.com\u002Fziadoz\u002Fawesome-php) and [awesome-computer-vision](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002Fawesome-computer-vision).\n\nMaintainers - [Jiwon Kim](https:\u002F\u002Fgithub.com\u002Fkjw0612), [Heesoo Myeong](https:\u002F\u002Fgithub.com\u002Fhmyeong), [Myungsub Choi](https:\u002F\u002Fgithub.com\u002Fmyungsub), [Jung Kwon Lee](https:\u002F\u002Fgithub.com\u002Fderuci), [Taeksoo Kim](https:\u002F\u002Fgithub.com\u002Fjazzsaxmafia)\n\nThe project is not actively maintained. 
\n\n## Contributing\nPlease feel free to [pull requests](https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fpulls) to add papers.\n\n[![Join the chat at https:\u002F\u002Fgitter.im\u002Fkjw0612\u002Fawesome-deep-vision](https:\u002F\u002Fbadges.gitter.im\u002FJoin%20Chat.svg)](https:\u002F\u002Fgitter.im\u002Fkjw0612\u002Fawesome-deep-vision?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)\n\n## Sharing\n+ [Share on Twitter](http:\u002F\u002Ftwitter.com\u002Fhome?status=http:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision%0ADeep Learning Resources for Computer Vision)\n+ [Share on Facebook](http:\u002F\u002Fwww.facebook.com\u002Fsharer\u002Fsharer.php?u=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision)\n+ [Share on Google Plus](http:\u002F\u002Fplus.google.com\u002Fshare?url=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision)\n+ [Share on LinkedIn](http:\u002F\u002Fwww.linkedin.com\u002FshareArticle?mini=true&url=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision&title=Awesome%20Deep%20Vision&summary=&source=)\n\n## Table of Contents\n- [Papers](#papers)\n  - [ImageNet Classification](#imagenet-classification)\n  - [Object Detection](#object-detection)\n  - [Object Tracking](#object-tracking)\n  - [Low-Level Vision](#low-level-vision)\n    - [Super-Resolution](#super-resolution)\n    - [Other Applications](#other-applications)\n  - [Edge Detection](#edge-detection)\n  - [Semantic Segmentation](#semantic-segmentation)\n  - [Visual Attention and Saliency](#visual-attention-and-saliency)\n  - [Object Recognition](#object-recognition)\n  - [Human Pose Estimation](#human-pose-estimation)\n  - [Understanding CNN](#understanding-cnn)\n  - [Image and Language](#image-and-language)\n    - [Image Captioning](#image-captioning)\n    - [Video Captioning](#video-captioning)\n    - [Question Answering](#question-answering)\n  - [Image Generation](#image-generation)\n  - [Other Topics](#other-topics)\n- 
[Courses](#courses)\n- [Books](#books)\n- [Videos](#videos)\n- [Software](#software)\n  - [Framework](#framework)\n  - [Applications](#applications)\n- [Tutorials](#tutorials)\n- [Blogs](#blogs)\n\n## Papers\n\n### ImageNet Classification\n![classification](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_e709818fabd8.png)\n(from Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton, ImageNet Classification with Deep Convolutional Neural Networks, NIPS, 2012.)\n* Microsoft (Deep Residual Learning) [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.03385v1.pdf)][[Slide](http:\u002F\u002Fimage-net.org\u002Fchallenges\u002Ftalks\u002Filsvrc2015_deep_residual_learning_kaiminghe.pdf)]\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun, Deep Residual Learning for Image Recognition, arXiv:1512.03385.\n* Microsoft (PReLu\u002FWeight Initialization) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.01852)\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun, Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification, arXiv:1502.01852.\n* Batch Normalization [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03167)\n  * Sergey Ioffe, Christian Szegedy, Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, arXiv:1502.03167.\n* GoogLeNet [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.4842)\n  * Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich, CVPR, 2015.\n* VGG-Net [[Web]](http:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fresearch\u002Fvery_deep\u002F) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.1556)\n  * Karen Simonyan and Andrew Zisserman, Very Deep Convolutional Networks for Large-Scale Visual Recognition, ICLR, 2015.\n* AlexNet 
[[Paper]](http:\u002F\u002Fpapers.nips.cc\u002Fbook\u002Fadvances-in-neural-information-processing-systems-25-2012)\n  * Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton, ImageNet Classification with Deep Convolutional Neural Networks, NIPS, 2012.\n\n### Object Detection\n![object_detection](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_9e6e1ced5d88.png)\n(from Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun, Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks, arXiv:1506.01497.)\n\n* PVANET [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.08021) [[Code]](https:\u002F\u002Fgithub.com\u002Fsanghoon\u002Fpva-faster-rcnn)\n  * Kye-Hyeon Kim, Sanghoon Hong, Byungseok Roh, Yeongjae Cheon, Minje Park, PVANET: Deep but Lightweight Neural Networks for Real-time Object Detection, arXiv:1608.08021\n* OverFeat, NYU [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1312.6229.pdf)\n  * OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks, ICLR, 2014.\n* R-CNN, UC Berkeley [[Paper-CVPR14]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2014\u002Fpapers\u002FGirshick_Rich_Feature_Hierarchies_2014_CVPR_paper.pdf) [[Paper-arXiv14]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1311.2524)\n  * Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik, Rich feature hierarchies for accurate object detection and semantic segmentation, CVPR, 2014.\n* SPP, Microsoft Research [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.4729)\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun, Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition, ECCV, 2014.\n* Fast R-CNN, Microsoft Research [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.08083)\n  * Ross Girshick, Fast R-CNN, arXiv:1504.08083.\n* Faster R-CNN, Microsoft Research [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01497)\n  * Shaoqing Ren, 
Kaiming He, Ross Girshick, Jian Sun, Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks, arXiv:1506.01497.\n* R-CNN minus R, Oxford [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.06981)\n  * Karel Lenc, Andrea Vedaldi, R-CNN minus R, arXiv:1506.06981.\n* End-to-end people detection in crowded scenes [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.04878)\n  * Russell Stewart, Mykhaylo Andriluka, End-to-end people detection in crowded scenes, arXiv:1506.04878.\n* You Only Look Once: Unified, Real-Time Object Detection [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.02640), [[Paper Version 2]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.08242), [[C Code]](https:\u002F\u002Fgithub.com\u002Fpjreddie\u002Fdarknet), [[Tensorflow Code]](https:\u002F\u002Fgithub.com\u002Fthtrieu\u002Fdarkflow)\n  * Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi, You Only Look Once: Unified, Real-Time Object Detection, arXiv:1506.02640\n  * Joseph Redmon, Ali Farhadi (Version 2)\n* Inside-Outside Net [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1512.04143)\n  * Sean Bell, C. 
Lawrence Zitnick, Kavita Bala, Ross Girshick, Inside-Outside Net: Detecting Objects in Context with Skip Pooling and Recurrent Neural Networks\n* Deep Residual Network (Current State-of-the-Art) [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1512.03385)\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun, Deep Residual Learning for Image Recognition\n* Weakly Supervised Object Localization with Multi-fold Multiple Instance Learning [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.00949.pdf)]\n* R-FCN [[Paper]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.06409) [[Code]](https:\u002F\u002Fgithub.com\u002Fdaijifeng001\u002FR-FCN)\n  * Jifeng Dai, Yi Li, Kaiming He, Jian Sun, R-FCN: Object Detection via Region-based Fully Convolutional Networks\n* SSD [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.02325v2.pdf) [[Code]](https:\u002F\u002Fgithub.com\u002Fweiliu89\u002Fcaffe\u002Ftree\u002Fssd)\n  * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg, SSD: Single Shot MultiBox Detector, arXiv:1512.02325\n* Speed\u002Faccuracy trade-offs for modern convolutional object detectors [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.10012v1.pdf)\n  * Jonathan Huang, Vivek Rathod, Chen Sun, Menglong Zhu, Anoop Korattikara, Alireza Fathi, Ian Fischer, Zbigniew Wojna, Yang Song, Sergio Guadarrama, Kevin Murphy, Google Research, arXiv:1611.10012\n\n### Video Classification\n* Nicolas Ballas, Li Yao, Chris Pal, Aaron Courville, \"Delving Deeper into Convolutional Networks for Learning Video Representations\", ICLR 2016. [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06432v4.pdf)]\n* Michael Mathieu, Camille Couprie, Yann LeCun, \"Deep Multi Scale Video Prediction Beyond Mean Square Error\", ICLR 2016. 
[[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05440v6.pdf)]\n\n### Object Tracking\n* Seunghoon Hong, Tackgeun You, Suha Kwak, Bohyung Han, Online Tracking by Learning Discriminative Saliency Map with Convolutional Neural Network, arXiv:1502.06796. [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.06796)\n* Hanxi Li, Yi Li and Fatih Porikli, DeepTrack: Learning Discriminative Feature Representations by Convolutional Neural Networks for Visual Tracking, BMVC, 2014. [[Paper]](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2014\u002Ffiles\u002Fpaper028.pdf)\n* N Wang, DY Yeung, Learning a Deep Compact Image Representation for Visual Tracking, NIPS, 2013. [[Paper]](http:\u002F\u002Fwinsty.net\u002Fpapers\u002Fdlt.pdf)\n* Chao Ma, Jia-Bin Huang, Xiaokang Yang and Ming-Hsuan Yang, Hierarchical Convolutional Features for Visual Tracking, ICCV 2015 [[Paper](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FMa_Hierarchical_Convolutional_Features_ICCV_2015_paper.pdf)] [[Code](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002FCF2)]\n* Lijun Wang, Wanli Ouyang, Xiaogang Wang, and Huchuan Lu, Visual Tracking with Fully Convolutional Networks, ICCV 2015  [[Paper](http:\u002F\u002F202.118.75.4\u002Flu\u002FPaper\u002FICCV2015\u002Ficcv15_lijun.pdf)] [[Code](https:\u002F\u002Fgithub.com\u002Fscott89\u002FFCNT)]\n* Hyeonseob Nam and Bohyung Han, Learning Multi-Domain Convolutional Neural Networks for Visual Tracking, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1510.07945.pdf)] [[Code](https:\u002F\u002Fgithub.com\u002FHyeonseobNam\u002FMDNet)] [[Project Page](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Fmdnet\u002F)]\n\n### Low-Level Vision\n\n#### Super-Resolution\n* Iterative Image Reconstruction\n  * Sven Behnke: Learning Iterative Image Reconstruction. IJCAI, 2001. 
[[Paper]](http:\u002F\u002Fwww.ais.uni-bonn.de\u002Fbehnke\u002Fpapers\u002Fijcai01.pdf)\n  * Sven Behnke: Learning Iterative Image Reconstruction in the Neural Abstraction Pyramid. International Journal of Computational Intelligence and Applications, vol. 1, no. 4, pp. 427-438, 2001. [[Paper]](http:\u002F\u002Fwww.ais.uni-bonn.de\u002Fbehnke\u002Fpapers\u002Fijcia01.pdf)\n* Super-Resolution (SRCNN) [[Web]](http:\u002F\u002Fmmlab.ie.cuhk.edu.hk\u002Fprojects\u002FSRCNN.html) [[Paper-ECCV14]](http:\u002F\u002Fpersonal.ie.cuhk.edu.hk\u002F~ccloy\u002Ffiles\u002Feccv_2014_deepresolution.pdf) [[Paper-arXiv15]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1501.00092.pdf)\n  * Chao Dong, Chen Change Loy, Kaiming He, Xiaoou Tang, Learning a Deep Convolutional Network for Image Super-Resolution, ECCV, 2014.\n  * Chao Dong, Chen Change Loy, Kaiming He, Xiaoou Tang, Image Super-Resolution Using Deep Convolutional Networks, arXiv:1501.00092.\n* Very Deep Super-Resolution\n  * Jiwon Kim, Jung Kwon Lee, Kyoung Mu Lee, Accurate Image Super-Resolution Using Very Deep Convolutional Networks, arXiv:1511.04587, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04587)\n* Deeply-Recursive Convolutional Network\n  * Jiwon Kim, Jung Kwon Lee, Kyoung Mu Lee, Deeply-Recursive Convolutional Network for Image Super-Resolution, arXiv:1511.04491, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04491)\n* Cascade-Sparse-Coding-Network\n  * Zhaowen Wang, Ding Liu, Wei Han, Jianchao Yang and Thomas S. Huang, Deep Networks for Image Super-Resolution with Sparse Prior. ICCV, 2015. [[Paper]](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~dingliu2\u002Ficcv15\u002Ficcv15.pdf) [[Code]](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~dingliu2\u002Ficcv15\u002F)\n* Perceptual Losses for Super-Resolution\n  * Justin Johnson, Alexandre Alahi, Li Fei-Fei, Perceptual Losses for Real-Time Style Transfer and Super-Resolution, arXiv:1603.08155, 2016. 
[[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1603.08155) [[Supplementary]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fjcjohns\u002Fpapers\u002Ffast-style\u002Ffast-style-supp.pdf)\n* SRGAN\n  * Christian Ledig, Lucas Theis, Ferenc Huszar, Jose Caballero, Andrew Cunningham, Alejandro Acosta, Andrew Aitken, Alykhan Tejani, Johannes Totz, Zehan Wang, Wenzhe Shi, Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network, arXiv:1609.04802v3, 2016. [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.04802v3.pdf)\n* Others\n  * Osendorfer, Christian, Hubert Soyer, and Patrick van der Smagt, Image Super-Resolution with Fast Approximate Convolutional Sparse Coding, ICONIP, 2014. [[Paper ICONIP-2014]](http:\u002F\u002Fbrml.org\u002Fuploads\u002Ftx_sibibtex\u002F281.pdf)\n\n#### Other Applications\n* Optical Flow (FlowNet) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06852)\n  * Philipp Fischer, Alexey Dosovitskiy, Eddy Ilg, Philip Häusser, Caner Hazırbaş, Vladimir Golkov, Patrick van der Smagt, Daniel Cremers, Thomas Brox, FlowNet: Learning Optical Flow with Convolutional Networks, arXiv:1504.06852.\n* Compression Artifacts Reduction [[Paper-arXiv15]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06993)\n  * Chao Dong, Yubin Deng, Chen Change Loy, Xiaoou Tang, Compression Artifacts Reduction by a Deep Convolutional Network, arXiv:1504.06993.\n* Blur Removal\n  * Christian J. Schuler, Michael Hirsch, Stefan Harmeling, Bernhard Schölkopf, Learning to Deblur, arXiv:1406.7444 [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.7444.pdf)\n  * Jian Sun, Wenfei Cao, Zongben Xu, Jean Ponce, Learning a Convolutional Neural Network for Non-uniform Motion Blur Removal, CVPR, 2015 [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.00593)\n* Image Deconvolution [[Web]](http:\u002F\u002Flxu.me\u002Fprojects\u002Fdcnn\u002F) [[Paper]](http:\u002F\u002Flxu.me\u002Fmypapers\u002Fdcnn_nips14.pdf)\n  * Li Xu, Jimmy SJ. 
Ren, Ce Liu, Jiaya Jia, Deep Convolutional Neural Network for Image Deconvolution, NIPS, 2014.\n* Deep Edge-Aware Filter [[Paper]](http:\u002F\u002Fjmlr.org\u002Fproceedings\u002Fpapers\u002Fv37\u002Fxub15.pdf)\n  * Li Xu, Jimmy SJ. Ren, Qiong Yan, Renjie Liao, Jiaya Jia, Deep Edge-Aware Filters, ICML, 2015.\n* Computing the Stereo Matching Cost with a Convolutional Neural Network [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZbontar_Computing_the_Stereo_2015_CVPR_paper.pdf)\n  * Jure Žbontar, Yann LeCun, Computing the Stereo Matching Cost with a Convolutional Neural Network, CVPR, 2015.\n* Colorful Image Colorization Richard Zhang, Phillip Isola, Alexei A. Efros, ECCV, 2016 [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1603.08511.pdf), [[Code]](https:\u002F\u002Fgithub.com\u002Frichzhang\u002Fcolorization)\n* Ryan Dahl, [[Blog]](http:\u002F\u002Ftinyclouds.org\u002Fcolorize\u002F)\n* Feature Learning by Inpainting[[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.07379v1.pdf)[[Code]](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fcontext-encoder)\n  * Deepak Pathak, Philipp Krahenbuhl, Jeff Donahue, Trevor Darrell, Alexei A. 
Efros, Context Encoders: Feature Learning by Inpainting, CVPR, 2016\n\n### Edge Detection\n![edge_detection](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_f81ee8a378c2.png)\n(from Gedas Bertasius, Jianbo Shi, Lorenzo Torresani, DeepEdge: A Multi-Scale Bifurcated Deep Network for Top-Down Contour Detection, CVPR, 2015.)\n\n* Holistically-Nested Edge Detection [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06375) [[Code]](https:\u002F\u002Fgithub.com\u002Fs9xie\u002Fhed)\n  * Saining Xie, Zhuowen Tu, Holistically-Nested Edge Detection, arXiv:1504.06375.\n* DeepEdge [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.1123)\n  * Gedas Bertasius, Jianbo Shi, Lorenzo Torresani, DeepEdge: A Multi-Scale Bifurcated Deep Network for Top-Down Contour Detection, CVPR, 2015.\n* DeepContour [[Paper]](http:\u002F\u002Fmc.eistar.net\u002FUpLoadFiles\u002FPapers\u002FDeepContour_cvpr15.pdf)\n  * Wei Shen, Xinggang Wang, Yan Wang, Xiang Bai, Zhijiang Zhang, DeepContour: A Deep Convolutional Feature Learned by Positive-Sharing Loss for Contour Detection, CVPR, 2015.\n\n### Semantic Segmentation\n![semantic_segmantation](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_66d8b80b26b2.png)\n(from Jifeng Dai, Kaiming He, Jian Sun, BoxSup: Exploiting Bounding Boxes to Supervise Convolutional Networks for Semantic Segmentation, arXiv:1503.01640.)\n* PASCAL VOC2012 Challenge Leaderboard (01 Sep. 2016)\n  ![VOC2012_top_rankings](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_01f4b3c8aed0.png)\n  (from PASCAL VOC2012 [leaderboards](http:\u002F\u002Fhost.robots.ox.ac.uk:8080\u002Fleaderboard\u002Fdisplaylb.php?challengeid=11&compid=6))\n* SEC: Seed, Expand and Constrain\n  *  Alexander Kolesnikov, Christoph Lampert, Seed, Expand and Constrain: Three Principles for Weakly-Supervised Image Segmentation, ECCV, 2016. 
[[Paper]](http:\u002F\u002Fpub.ist.ac.at\u002F~akolesnikov\u002Ffiles\u002FECCV2016\u002Fmain.pdf) [[Code]](https:\u002F\u002Fgithub.com\u002Fkolesman\u002FSEC)\n* Adelaide\n  * Guosheng Lin, Chunhua Shen, Ian Reid, Anton van den Hengel, Efficient piecewise training of deep structured models for semantic segmentation, arXiv:1504.01013. [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.01013) (1st ranked in VOC2012)\n  * Guosheng Lin, Chunhua Shen, Ian Reid, Anton van den Hengel, Deeply Learning the Messages in Message Passing Inference, arXiv:1506.02108. [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.02108) (4th ranked in VOC2012)\n* Deep Parsing Network (DPN)\n  * Ziwei Liu, Xiaoxiao Li, Ping Luo, Chen Change Loy, Xiaoou Tang, Semantic Image Segmentation via Deep Parsing Network, arXiv:1509.02634 \u002F ICCV 2015 [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.02634.pdf) (2nd ranked in VOC 2012)\n* CentraleSuperBoundaries, INRIA [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07386)\n  * Iasonas Kokkinos, Surpassing Humans in Boundary Detection using Deep Learning, arXiv:1511.07386 (4th ranked in VOC 2012)\n* BoxSup [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.01640)\n  * Jifeng Dai, Kaiming He, Jian Sun, BoxSup: Exploiting Bounding Boxes to Supervise Convolutional Networks for Semantic Segmentation, arXiv:1503.01640. (6th ranked in VOC2012)\n* POSTECH\n  * Hyeonwoo Noh, Seunghoon Hong, Bohyung Han, Learning Deconvolution Network for Semantic Segmentation, arXiv:1505.04366. [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.04366) (7th ranked in VOC2012)\n  * Seunghoon Hong, Hyeonwoo Noh, Bohyung Han, Decoupled Deep Neural Network for Semi-supervised Semantic Segmentation, arXiv:1506.04924. 
[[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.04924)\n  * Seunghoon Hong,Junhyuk Oh,\tBohyung Han, and\tHonglak Lee, Learning Transferrable Knowledge for Semantic Segmentation with Deep Convolutional Neural Network, arXiv:1512.07928 [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.07928.pdf)] [[Project Page](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Ftransfernet\u002F)]\n* Conditional Random Fields as Recurrent Neural Networks [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03240)\n  * Shuai Zheng, Sadeep Jayasumana, Bernardino Romera-Paredes, Vibhav Vineet, Zhizhong Su, Dalong Du, Chang Huang, Philip H. S. Torr, Conditional Random Fields as Recurrent Neural Networks, arXiv:1502.03240. (8th ranked in VOC2012)\n* DeepLab\n  * Liang-Chieh Chen, George Papandreou, Kevin Murphy, Alan L. Yuille, Weakly-and semi-supervised learning of a DCNN for semantic image segmentation, arXiv:1502.02734. [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.02734) (9th ranked in VOC2012)\n* Zoom-out [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FMostajabi_Feedforward_Semantic_Segmentation_2015_CVPR_paper.pdf)\n  * Mohammadreza Mostajabi, Payman Yadollahpour, Gregory Shakhnarovich, Feedforward Semantic Segmentation With Zoom-Out Features, CVPR, 2015\n* Joint Calibration [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01581)\n  * Holger Caesar, Jasper Uijlings, Vittorio Ferrari, Joint Calibration for Semantic Segmentation, arXiv:1507.01581.\n* Fully Convolutional Networks for Semantic Segmentation [[Paper-CVPR15]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLong_Fully_Convolutional_Networks_2015_CVPR_paper.pdf) [[Paper-arXiv15]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4038)\n  * Jonathan Long, Evan Shelhamer, Trevor Darrell, Fully Convolutional Networks for Semantic Segmentation, CVPR, 2015.\n* Hypercolumn 
[[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FHariharan_Hypercolumns_for_Object_2015_CVPR_paper.pdf)\n  * Bharath Hariharan, Pablo Arbelaez, Ross Girshick, Jitendra Malik, Hypercolumns for Object Segmentation and Fine-Grained Localization, CVPR, 2015.\n* Deep Hierarchical Parsing\n  * Abhishek Sharma, Oncel Tuzel, David W. Jacobs, Deep Hierarchical Parsing for Semantic Segmentation, CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FSharma_Deep_Hierarchical_Parsing_2015_CVPR_paper.pdf)\n* Learning Hierarchical Features for Scene Labeling [[Paper-ICML12]](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fpublis\u002Fpdf\u002Ffarabet-icml-12.pdf) [[Paper-PAMI13]](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fpublis\u002Fpdf\u002Ffarabet-pami-13.pdf)\n  * Clement Farabet, Camille Couprie, Laurent Najman, Yann LeCun, Scene Parsing with Multiscale Feature Learning, Purity Trees, and Optimal Covers, ICML, 2012.\n  * Clement Farabet, Camille Couprie, Laurent Najman, Yann LeCun, Learning Hierarchical Features for Scene Labeling, PAMI, 2013.\n* University of Cambridge [[Web]](http:\u002F\u002Fmi.eng.cam.ac.uk\u002Fprojects\u002Fsegnet\u002F)\n  * Vijay Badrinarayanan, Alex Kendall and Roberto Cipolla \"SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation.\" arXiv preprint arXiv:1511.00561, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.00561)\n* Alex Kendall, Vijay Badrinarayanan and Roberto Cipolla \"Bayesian SegNet: Model Uncertainty in Deep Convolutional Encoder-Decoder Architectures for Scene Understanding.\" arXiv preprint arXiv:1511.02680, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.02680)\n* Princeton\n  * Fisher Yu, Vladlen Koltun, \"Multi-Scale Context Aggregation by Dilated Convolutions\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07122v2.pdf)]\n* Univ. 
of Washington, Allen AI\n  * Hamid Izadinia, Fereshteh Sadeghi, Santosh Kumar Divvala, Yejin Choi, Ali Farhadi, \"Segment-Phrase Table for Semantic Segmentation, Visual Entailment and Paraphrasing\", ICCV, 2015, [[Paper](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FIzadinia_Segment-Phrase_Table_for_ICCV_2015_paper.pdf)]\n* INRIA\n  * Iasonas Kokkinos, \"Pushing the Boundaries of Boundary Detection Using Deep Learning\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07386v2.pdf)]\n* UCSB\n  * Niloufar Pourian, S. Karthikeyan, and B.S. Manjunath, \"Weakly supervised graph based semantic segmentation by learning communities of image-parts\", ICCV, 2015, [[Paper](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FPourian_Weakly_Supervised_Graph_ICCV_2015_paper.pdf)]\n\n### Visual Attention and Saliency\n![saliency](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_25f843d73f14.png)\n(from Nian Liu, Junwei Han, Dingwen Zhang, Shifeng Wen, Tianming Liu, Predicting Eye Fixations using Convolutional Neural Networks, CVPR, 2015.)\n\n* Mr-CNN [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLiu_Predicting_Eye_Fixations_2015_CVPR_paper.pdf)\n  * Nian Liu, Junwei Han, Dingwen Zhang, Shifeng Wen, Tianming Liu, Predicting Eye Fixations using Convolutional Neural Networks, CVPR, 2015.\n* Learning a Sequential Search for Landmarks [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FSingh_Learning_a_Sequential_2015_CVPR_paper.pdf)\n  * Saurabh Singh, Derek Hoiem, David Forsyth, Learning a Sequential Search for Landmarks, CVPR, 2015.\n* Multiple Object Recognition with Visual Attention [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.7755.pdf)\n  * Jimmy Lei Ba, Volodymyr Mnih, Koray Kavukcuoglu, Multiple Object 
Recognition with Visual Attention, ICLR, 2015.\n* Recurrent Models of Visual Attention [[Paper]](http:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F5542-recurrent-models-of-visual-attention.pdf)\n  * Volodymyr Mnih, Nicolas Heess, Alex Graves, Koray Kavukcuoglu, Recurrent Models of Visual Attention, NIPS, 2014.\n\n### Object Recognition\n* Weakly-supervised learning with convolutional neural networks [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FOquab_Is_Object_Localization_2015_CVPR_paper.pdf)\n  * Maxime Oquab, Leon Bottou, Ivan Laptev, Josef Sivic, Is object localization for free? – Weakly-supervised learning with convolutional neural networks, CVPR, 2015.\n* FV-CNN [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FCimpoi_Deep_Filter_Banks_2015_CVPR_paper.pdf)\n  * Mircea Cimpoi, Subhransu Maji, Andrea Vedaldi, Deep Filter Banks for Texture Recognition and Segmentation, CVPR, 2015.\n\n### Human Pose Estimation\n* Zhe Cao, Tomas Simon, Shih-En Wei, and Yaser Sheikh, Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields, CVPR, 2017.\n* Leonid Pishchulin, Eldar Insafutdinov, Siyu Tang, Bjoern Andres, Mykhaylo Andriluka, Peter Gehler, and Bernt Schiele, Deepcut: Joint subset partition and labeling for multi person pose estimation, CVPR, 2016.\n* Shih-En Wei, Varun Ramakrishna, Takeo Kanade, and Yaser Sheikh, Convolutional pose machines, CVPR, 2016.\n* Alejandro Newell, Kaiyu Yang, and Jia Deng, Stacked hourglass networks for human pose estimation, ECCV, 2016.\n* Tomas Pfister, James Charles, and Andrew Zisserman, Flowing convnets for human pose estimation in videos, ICCV, 2015.\n* Jonathan J. 
Tompson, Arjun Jain, Yann LeCun, Christoph Bregler, Joint training of a convolutional network and a graphical model for human pose estimation, NIPS, 2014.\n\n### Understanding CNN\n![understanding](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_566ae8b37494.png)\n(from Aravindh Mahendran, Andrea Vedaldi, Understanding Deep Image Representations by Inverting Them, CVPR, 2015.)\n\n* Karel Lenc, Andrea Vedaldi, Understanding image representations by measuring their equivariance and equivalence, CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLenc_Understanding_Image_Representations_2015_CVPR_paper.pdf)\n* Anh Nguyen, Jason Yosinski, Jeff Clune, Deep Neural Networks are Easily Fooled: High Confidence Predictions for Unrecognizable Images, CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FNguyen_Deep_Neural_Networks_2015_CVPR_paper.pdf)\n* Aravindh Mahendran, Andrea Vedaldi, Understanding Deep Image Representations by Inverting Them, CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FMahendran_Understanding_Deep_Image_2015_CVPR_paper.pdf)\n* Bolei Zhou, Aditya Khosla, Agata Lapedriza, Aude Oliva, Antonio Torralba, Object Detectors Emerge in Deep Scene CNNs, ICLR, 2015. [[arXiv Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1412.6856)\n* Alexey Dosovitskiy, Thomas Brox, Inverting Visual Representations with Convolutional Networks, arXiv, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.02753)\n* Matthew Zeiler, Rob Fergus, Visualizing and Understanding Convolutional Networks, ECCV, 2014. 
[[Paper]](https:\u002F\u002Fwww.cs.nyu.edu\u002F~fergus\u002Fpapers\u002FzeilerECCV2014.pdf)\n\n\n### Image and Language\n\n#### Image Captioning\n![image_captioning](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_45abeceea151.png)\n(from Andrej Karpathy, Li Fei-Fei, Deep Visual-Semantic Alignments for Generating Image Description, CVPR, 2015.)\n\n* UCLA \u002F Baidu [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1410.1090)\n  * Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Alan L. Yuille, Explain Images with Multimodal Recurrent Neural Networks, arXiv:1410.1090.\n* Toronto [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.2539)\n  * Ryan Kiros, Ruslan Salakhutdinov, Richard S. Zemel, Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models, arXiv:1411.2539.\n* Berkeley [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4389)\n  * Jeff Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, Trevor Darrell, Long-term Recurrent Convolutional Networks for Visual Recognition and Description, arXiv:1411.4389.\n* Google [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4555)\n  * Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan, Show and Tell: A Neural Image Caption Generator, arXiv:1411.4555.\n* Stanford [[Web]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fkarpathy\u002Fdeepimagesent\u002F) [[Paper]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fkarpathy\u002Fcvpr2015.pdf)\n  * Andrej Karpathy, Li Fei-Fei, Deep Visual-Semantic Alignments for Generating Image Description, CVPR, 2015.\n* UML \u002F UT [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.4729)\n  * Subhashini Venugopalan, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond Mooney, Kate Saenko, Translating Videos to Natural Language Using Deep Recurrent Neural Networks, NAACL-HLT, 2015.\n* CMU \u002F Microsoft 
[[Paper-arXiv]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.5654) [[Paper-CVPR]](http:\u002F\u002Fwww.cs.cmu.edu\u002F~xinleic\u002Fpapers\u002Fcvpr15_rnn.pdf)\n  * Xinlei Chen, C. Lawrence Zitnick, Learning a Recurrent Visual Representation for Image Caption Generation, arXiv:1411.5654.\n  * Xinlei Chen, C. Lawrence Zitnick, Mind’s Eye: A Recurrent Visual Representation for Image Caption Generation, CVPR 2015\n* Microsoft [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4952)\n  * Hao Fang, Saurabh Gupta, Forrest Iandola, Rupesh Srivastava, Li Deng, Piotr Dollár, Jianfeng Gao, Xiaodong He, Margaret Mitchell, John C. Platt, C. Lawrence Zitnick, Geoffrey Zweig, From Captions to Visual Concepts and Back, CVPR, 2015.\n* Univ. Montreal \u002F Univ. Toronto [[Web](http:\u002F\u002Fkelvinxu.github.io\u002Fprojects\u002Fcapgen.html)] [[Paper](http:\u002F\u002Fwww.cs.toronto.edu\u002F~zemel\u002Fdocuments\u002FcaptionAttn.pdf)]\n  * Kelvin Xu, Jimmy Lei Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhutdinov, Richard S. Zemel, Yoshua Bengio, Show, Attend, and Tell: Neural Image Caption Generation with Visual Attention, arXiv:1502.03044 \u002F ICML 2015\n* Idiap \u002F EPFL \u002F Facebook [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03671)]\n  * Remi Lebret, Pedro O. Pinheiro, Ronan Collobert, Phrase-based Image Captioning, arXiv:1502.03671 \u002F ICML 2015\n* UCLA \u002F Baidu [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06692)]\n  * Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Zhiheng Huang, Alan L. Yuille, Learning like a Child: Fast Novel Visual Concept Learning from Sentence Descriptions of Images, arXiv:1504.06692\n* MS + Berkeley\n  * Jacob Devlin, Saurabh Gupta, Ross Girshick, Margaret Mitchell, C. 
Lawrence Zitnick, Exploring Nearest Neighbor Approaches for Image Captioning, arXiv:1505.04467 [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.04467.pdf)]\n  * Jacob Devlin, Hao Cheng, Hao Fang, Saurabh Gupta, Li Deng, Xiaodong He, Geoffrey Zweig, Margaret Mitchell, Language Models for Image Captioning: The Quirks and What Works, arXiv:1505.01809 [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01809.pdf)]\n* Adelaide [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01144.pdf)]\n  * Qi Wu, Chunhua Shen, Anton van den Hengel, Lingqiao Liu, Anthony Dick, Image Captioning with an Intermediate Attributes Layer, arXiv:1506.01144\n* Tilburg [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.03694.pdf)]\n  * Grzegorz Chrupala, Akos Kadar, Afra Alishahi, Learning language through pictures, arXiv:1506.03694\n* Univ. Montreal [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01053.pdf)]\n  * Kyunghyun Cho, Aaron Courville, Yoshua Bengio, Describing Multimedia Content using Attention-based Encoder-Decoder Networks, arXiv:1507.01053\n* Cornell [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1508.02091.pdf)]\n  * Jack Hessel, Nicolas Savva, Michael J. Wilber, Image Representations and New Domains in Neural Image Captioning, arXiv:1508.02091\n* MS + City Univ. 
of HongKong [[Paper](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FYao_Learning_Query_and_ICCV_2015_paper.pdf)]\n  * Ting Yao, Tao Mei, and Chong-Wah Ngo, \"Learning Query and Image Similarities\n    with Ranking Canonical Correlation Analysis\", ICCV, 2015\n\n#### Video Captioning\n* Berkeley [[Web]](http:\u002F\u002Fjeffdonahue.com\u002Flrcn\u002F) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4389.pdf)\n  * Jeff Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, Trevor Darrell, Long-term Recurrent Convolutional Networks for Visual Recognition and Description, CVPR, 2015.\n* UT \u002F UML \u002F Berkeley [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.4729)\n  * Subhashini Venugopalan, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond Mooney, Kate Saenko, Translating Videos to Natural Language Using Deep Recurrent Neural Networks, arXiv:1412.4729.\n* Microsoft [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01861)\n  * Yingwei Pan, Tao Mei, Ting Yao, Houqiang Li, Yong Rui, Joint Modeling Embedding and Translation to Bridge Video and Language, arXiv:1505.01861.\n* UT \u002F Berkeley \u002F UML [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.00487)\n  * Subhashini Venugopalan, Marcus Rohrbach, Jeff Donahue, Raymond Mooney, Trevor Darrell, Kate Saenko, Sequence to Sequence--Video to Text, arXiv:1505.00487.\n* Univ. Montreal \u002F Univ. Sherbrooke [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.08029.pdf)]\n  * Li Yao, Atousa Torabi, Kyunghyun Cho, Nicolas Ballas, Christopher Pal, Hugo Larochelle, Aaron Courville, Describing Videos by Exploiting Temporal Structure, arXiv:1502.08029\n* MPI \u002F Berkeley [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01698.pdf)]\n  * Anna Rohrbach, Marcus Rohrbach, Bernt Schiele, The Long-Short Story of Movie Description, arXiv:1506.01698\n* Univ. 
Toronto \u002F MIT [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.06724.pdf)]\n  * Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler, Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books, arXiv:1506.06724\n* Univ. Montreal [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01053.pdf)]\n  * Kyunghyun Cho, Aaron Courville, Yoshua Bengio, Describing Multimedia Content using Attention-based Encoder-Decoder Networks, arXiv:1507.01053\n* TAU \u002F USC [[paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.06950.pdf)]\n  * Dotan Kaufman, Gil Levi, Tal Hassner, Lior Wolf, Temporal Tessellation for Video Annotation and Summarization, arXiv:1612.06950.\n\n#### Question Answering\n![question_answering](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_453a81d086f4.png)\n(from Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C. Lawrence Zitnick, Devi Parikh, VQA: Visual Question Answering, CVPR, 2015 SUNw:Scene Understanding workshop)\n\n* Virginia Tech \u002F MSR [[Web]](http:\u002F\u002Fwww.visualqa.org\u002F) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.00468)\n  * Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C. 
Lawrence Zitnick, Devi Parikh, VQA: Visual Question Answering, CVPR, 2015 SUNw:Scene Understanding workshop.\n* MPI \u002F Berkeley [[Web]](https:\u002F\u002Fwww.mpi-inf.mpg.de\u002Fdepartments\u002Fcomputer-vision-and-multimodal-computing\u002Fresearch\u002Fvision-and-language\u002Fvisual-turing-challenge\u002F) [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01121)\n  * Mateusz Malinowski, Marcus Rohrbach, Mario Fritz, Ask Your Neurons: A Neural-based Approach to Answering Questions about Images, arXiv:1505.01121.\n* Toronto [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.02074) [[Dataset]](http:\u002F\u002Fwww.cs.toronto.edu\u002F~mren\u002Fimageqa\u002Fdata\u002Fcocoqa\u002F)\n  * Mengye Ren, Ryan Kiros, Richard Zemel, Image Question Answering: A Visual Semantic Embedding Model and a New Dataset, arXiv:1505.02074 \u002F ICML 2015 deep learning workshop.\n* Baidu \u002F UCLA [[Paper]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.05612) [[Dataset]]()\n  * Haoyuan Gao, Junhua Mao, Jie Zhou, Zhiheng Huang, Lei Wang, Wei Xu, Are You Talking to a Machine? Dataset and Methods for Multilingual Image Question Answering, arXiv:1505.05612.\n* POSTECH [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05756.pdf)] [[Project Page](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Fdppnet\u002F)]\n  * Hyeonwoo Noh, Paul Hongsuck Seo, and Bohyung Han, Image Question Answering using Convolutional Neural Network with Dynamic Parameter Prediction, arXiv:1511.05756\n* CMU \u002F Microsoft Research [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02274v2.pdf)]\n  * Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. (2015). Stacked Attention Networks for Image Question Answering. arXiv:1511.02274.\n* MetaMind [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1603.01417v1.pdf)]\n  * Xiong, Caiming, Stephen Merity, and Richard Socher. 
\"Dynamic Memory Networks for Visual and Textual Question Answering.\" arXiv:1603.01417 (2016).\n* SNU + NAVER [[Paper](http:\u002F\u002Farxiv.org\u002Fabs\u002F1606.01455)]\n  * Jin-Hwa Kim, Sang-Woo Lee, Dong-Hyun Kwak, Min-Oh Heo, Jeonghee Kim, Jung-Woo Ha, Byoung-Tak Zhang, *Multimodal Residual Learning for Visual QA*, arXiv:1606.01455\n* UC Berkeley + Sony [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.01847)]\n  * Akira Fukui, Dong Huk Park, Daylen Yang, Anna Rohrbach, Trevor Darrell, and Marcus Rohrbach, *Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding*, arXiv:1606.01847\n* Postech [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.03647.pdf)]\n  * Hyeonwoo Noh and Bohyung Han, *Training Recurrent Answering Units with Joint Loss Minimization for VQA*, arXiv:1606.03647\n* SNU + NAVER [[Paper](http:\u002F\u002Farxiv.org\u002Fabs\u002F1610.04325)]\n  * Jin-Hwa Kim, Kyoung Woon On, Jeonghee Kim, Jung-Woo Ha, Byoung-Tak Zhang, *Hadamard Product for Low-rank Bilinear Pooling*, arXiv:1610.04325.\n\n### Image Generation\n* Convolutional \u002F Recurrent Networks\n  * Aäron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex Graves, Koray Kavukcuoglu. \"Conditional Image Generation with PixelCNN Decoders\"[[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.05328v2.pdf)[[Code]](https:\u002F\u002Fgithub.com\u002Fkundan2510\u002FpixelCNN)\n  * Alexey Dosovitskiy, Jost Tobias Springenberg, Thomas Brox, \"Learning to Generate Chairs with Convolutional Neural Networks\", CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FDosovitskiy_Learning_to_Generate_2015_CVPR_paper.pdf)\n  * Karol Gregor, Ivo Danihelka, Alex Graves, Danilo Jimenez Rezende, Daan Wierstra, \"DRAW: A Recurrent Neural Network For Image Generation\", ICML, 2015. [[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.04623v2.pdf)] \n* Adversarial Networks\n  * Ian J. 
Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, Yoshua Bengio, Generative Adversarial Networks, NIPS, 2014. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1406.2661)\n  * Emily Denton, Soumith Chintala, Arthur Szlam, Rob Fergus, Deep Generative Image Models using a Laplacian Pyramid of Adversarial Networks, NIPS, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.05751)\n  * Lucas Theis, Aäron van den Oord, Matthias Bethge, \"A note on the evaluation of generative models\", ICLR 2016. [[Paper](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.01844)]\n  * Zhenwen Dai, Andreas Damianou, Javier Gonzalez, Neil Lawrence, \"Variationally Auto-Encoded Deep Gaussian Processes\", ICLR 2016. [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06455v2.pdf)]\n  * Elman Mansimov, Emilio Parisotto, Jimmy Ba, Ruslan Salakhutdinov, \"Generating Images from Captions with Attention\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02793v2.pdf)]\n  * Jost Tobias Springenberg, \"Unsupervised and Semi-supervised Learning with Categorical Generative Adversarial Networks\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06390v1.pdf)]\n  * Harrison Edwards, Amos Storkey, \"Censoring Representations with an Adversary\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05897v3.pdf)]\n  * Takeru Miyato, Shin-ichi Maeda, Masanori Koyama, Ken Nakae, Shin Ishii, \"Distributional Smoothing with Virtual Adversarial Training\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.00677v8.pdf)]\n  * Jun-Yan Zhu, Philipp Krahenbuhl, Eli Shechtman, and Alexei A. Efros, \"Generative Visual Manipulation on the Natural Image Manifold\", ECCV 2016. 
[[Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.03552v2.pdf)] [[Code](https:\u002F\u002Fgithub.com\u002Fjunyanz\u002FiGAN)] [[Video](https:\u002F\u002Fyoutu.be\u002F9c4z6YsBGQ0)]\n* Mixing Convolutional and Adversarial Networks\n  * Alec Radford, Luke Metz, Soumith Chintala, \"Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks\", ICLR 2016. [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06434.pdf)]\n\n### Other Topics\n* Visual Analogy [[Paper](https:\u002F\u002Fweb.eecs.umich.edu\u002F~honglak\u002Fnips2015-analogy.pdf)]\n  * Scott Reed, Yi Zhang, Yuting Zhang, Honglak Lee, Deep Visual Analogy Making, NIPS, 2015\n* Surface Normal Estimation [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FWang_Designing_Deep_Networks_2015_CVPR_paper.pdf)\n  * Xiaolong Wang, David F. Fouhey, Abhinav Gupta, Designing Deep Networks for Surface Normal Estimation, CVPR, 2015.\n* Action Detection [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FGkioxari_Finding_Action_Tubes_2015_CVPR_paper.pdf)\n  * Georgia Gkioxari, Jitendra Malik, Finding Action Tubes, CVPR, 2015.\n* Crowd Counting [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZhang_Cross-Scene_Crowd_Counting_2015_CVPR_paper.pdf)\n  * Cong Zhang, Hongsheng Li, Xiaogang Wang, Xiaokang Yang, Cross-scene Crowd Counting via Deep Convolutional Neural Networks, CVPR, 2015.\n* 3D Shape Retrieval [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FWang_Sketch-Based_3D_Shape_2015_CVPR_paper.pdf)\n  * Fang Wang, Le Kang, Yi Li, Sketch-based 3D Shape Retrieval using Convolutional Neural Networks, CVPR, 2015.\n* Weakly-supervised Classification\n  * Samaneh Azadi, Jiashi Feng, Stefanie Jegelka, Trevor Darrell, \"Auxiliary Image Regularization for Deep CNNs with Noisy 
Labels\", ICLR 2016, [[Paper](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07069v2.pdf)]\n* Artistic Style [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1508.06576) [[Code]](https:\u002F\u002Fgithub.com\u002Fjcjohnson\u002Fneural-style)\n  * Leon A. Gatys, Alexander S. Ecker, Matthias Bethge, A Neural Algorithm of Artistic Style.\n* Human Gaze Estimation\n  * Xucong Zhang, Yusuke Sugano, Mario Fritz, Andreas Bulling, Appearance-Based Gaze Estimation in the Wild, CVPR, 2015. [[Paper]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZhang_Appearance-Based_Gaze_Estimation_2015_CVPR_paper.pdf) [[Website]](https:\u002F\u002Fwww.mpi-inf.mpg.de\u002Fdepartments\u002Fcomputer-vision-and-multimodal-computing\u002Fresearch\u002Fgaze-based-human-computer-interaction\u002Fappearance-based-gaze-estimation-in-the-wild-mpiigaze\u002F)\n* Face Recognition\n  * Yaniv Taigman, Ming Yang, Marc'Aurelio Ranzato, Lior Wolf, DeepFace: Closing the Gap to Human-Level Performance in Face Verification, CVPR, 2014. [[Paper]](https:\u002F\u002Fwww.cs.toronto.edu\u002F~ranzato\u002Fpublications\u002Ftaigman_cvpr14.pdf)\n  * Yi Sun, Ding Liang, Xiaogang Wang, Xiaoou Tang, DeepID3: Face Recognition with Very Deep Neural Networks, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1502.00873)\n  * Florian Schroff, Dmitry Kalenichenko, James Philbin, FaceNet: A Unified Embedding for Face Recognition and Clustering, CVPR, 2015. [[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1503.03832)\n* Facial Landmark Detection\n  * Yue Wu, Tal Hassner, KangGeon Kim, Gerard Medioni, Prem Natarajan, Facial Landmark Detection with Tweaked Convolutional Neural Networks, 2015. 
[[Paper]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04031) [[Project]](http:\u002F\u002Fwww.openu.ac.il\u002Fhome\u002Fhassner\u002Fprojects\u002Ftcnn_landmarks\u002F)\n\n## Courses\n* Deep Vision\n  * [Stanford] [CS231n: Convolutional Neural Networks for Visual Recognition](http:\u002F\u002Fcs231n.stanford.edu\u002F)\n  * [CUHK] [ELEG 5040: Advanced Topics in Signal Processing(Introduction to Deep Learning)](https:\u002F\u002Fpiazza.com\u002Fcuhk.edu.hk\u002Fspring2015\u002Feleg5040\u002Fhome)\n* More Deep Learning\n  * [Stanford] [CS224d: Deep Learning for Natural Language Processing](http:\u002F\u002Fcs224d.stanford.edu\u002F)\n  * [Oxford] [Deep Learning by Prof. Nando de Freitas](https:\u002F\u002Fwww.cs.ox.ac.uk\u002Fpeople\u002Fnando.defreitas\u002Fmachinelearning\u002F)\n  * [NYU] [Deep Learning by Prof. Yann LeCun](http:\u002F\u002Fcilvr.cs.nyu.edu\u002Fdoku.php?id=courses:deeplearning2014:start)\n\n## Books\n* Free Online Books\n  * [Deep Learning by Ian Goodfellow, Yoshua Bengio, and Aaron Courville](http:\u002F\u002Fwww.iro.umontreal.ca\u002F~bengioy\u002Fdlbook\u002F)\n  * [Neural Networks and Deep Learning by Michael Nielsen](http:\u002F\u002Fneuralnetworksanddeeplearning.com\u002F)\n  * [Deep Learning Tutorial by LISA lab, University of Montreal](http:\u002F\u002Fdeeplearning.net\u002Ftutorial\u002Fdeeplearning.pdf)\n\n## Videos\n* Talks\n  * [Deep Learning, Self-Taught Learning and Unsupervised Feature Learning By Andrew Ng](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=n1ViNeWhC24)\n  * [Recent Developments in Deep Learning By Geoff Hinton](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vShMxxqtDDs)\n  * [The Unreasonable Effectiveness of Deep Learning by Yann LeCun](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=sc-KbuZqGkI)\n  * [Deep Learning of Representations by Yoshua Bengio](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=4xsVFLnHC_0)\n\n\n## Software\n### Framework\n* Tensorflow: An open source software library for numerical computation 
using data flow graph by Google [[Web](https:\u002F\u002Fwww.tensorflow.org\u002F)]\n* Torch7: Deep learning library in Lua, used by Facebook and Google Deepmind [[Web](http:\u002F\u002Ftorch.ch\u002F)]\n  * Torch-based deep learning libraries: [[torchnet](https:\u002F\u002Fgithub.com\u002Ftorchnet\u002Ftorchnet)],\n* Caffe: Deep learning framework by the BVLC [[Web](http:\u002F\u002Fcaffe.berkeleyvision.org\u002F)]\n* Theano: Mathematical library in Python, maintained by LISA lab [[Web](http:\u002F\u002Fdeeplearning.net\u002Fsoftware\u002Ftheano\u002F)]\n  * Theano-based deep learning libraries: [[Pylearn2](http:\u002F\u002Fdeeplearning.net\u002Fsoftware\u002Fpylearn2\u002F)], [[Blocks](https:\u002F\u002Fgithub.com\u002Fmila-udem\u002Fblocks)], [[Keras](http:\u002F\u002Fkeras.io\u002F)], [[Lasagne](https:\u002F\u002Fgithub.com\u002FLasagne\u002FLasagne)]\n* MatConvNet: CNNs for MATLAB [[Web](http:\u002F\u002Fwww.vlfeat.org\u002Fmatconvnet\u002F)]\n* MXNet: A flexible and efficient deep learning library for heterogeneous distributed systems with multi-language support [[Web](http:\u002F\u002Fmxnet.io\u002F)]\n* Deepgaze: A computer vision library for human-computer interaction based on CNNs [[Web](https:\u002F\u002Fgithub.com\u002Fmpatacchiola\u002Fdeepgaze)]\n\n### Applications\n* Adversarial Training\n  * Code and hyperparameters for the paper \"Generative Adversarial Networks\" [[Web]](https:\u002F\u002Fgithub.com\u002Fgoodfeli\u002Fadversarial)\n* Understanding and Visualizing\n  * Source code for \"Understanding Deep Image Representations by Inverting Them,\" CVPR, 2015. [[Web]](https:\u002F\u002Fgithub.com\u002Faravindhm\u002Fdeep-goggle)\n* Semantic Segmentation\n  * Source code for the paper \"Rich feature hierarchies for accurate object detection and semantic segmentation,\" CVPR, 2014. [[Web]](https:\u002F\u002Fgithub.com\u002Frbgirshick\u002Frcnn)\n  * Source code for the paper \"Fully Convolutional Networks for Semantic Segmentation,\" CVPR, 2015. 
[[Web]](https:\u002F\u002Fgithub.com\u002Flongjon\u002Fcaffe\u002Ftree\u002Ffuture)\n* Super-Resolution\n  * Image Super-Resolution for Anime-Style-Art [[Web]](https:\u002F\u002Fgithub.com\u002Fnagadomi\u002Fwaifu2x)\n* Edge Detection\n  * Source code for the paper \"DeepContour: A Deep Convolutional Feature Learned by Positive-Sharing Loss for Contour Detection,\" CVPR, 2015. [[Web]](https:\u002F\u002Fgithub.com\u002Fshenwei1231\u002FDeepContour)\n  * Source code for the paper \"Holistically-Nested Edge Detection\", ICCV 2015. [[Web]](https:\u002F\u002Fgithub.com\u002Fs9xie\u002Fhed)\n\n## Tutorials\n* [CVPR 2014] [Tutorial on Deep Learning in Computer Vision](https:\u002F\u002Fsites.google.com\u002Fsite\u002Fdeeplearningcvpr2014\u002F)\n* [CVPR 2015] [Applied Deep Learning for Computer Vision with Torch](https:\u002F\u002Fgithub.com\u002Fsoumith\u002Fcvpr2015)\n\n## Blogs\n* [Deep down the rabbit hole: CVPR 2015 and beyond@Tombone's Computer Vision Blog](http:\u002F\u002Fwww.computervisionblog.com\u002F2015\u002F06\u002Fdeep-down-rabbit-hole-cvpr-2015-and.html)\n* [CVPR recap and where we're going@Zoya Bylinskii (MIT PhD Student)'s Blog](http:\u002F\u002Fzoyathinks.blogspot.kr\u002F2015\u002F06\u002Fcvpr-recap-and-where-were-going.html)\n* [Facebook's AI Painting@Wired](http:\u002F\u002Fwww.wired.com\u002F2015\u002F06\u002Ffacebook-googles-fake-brains-spawn-new-visual-reality\u002F)\n* [Inceptionism: Going Deeper into Neural Networks@Google Research](http:\u002F\u002Fgoogleresearch.blogspot.kr\u002F2015\u002F06\u002Finceptionism-going-deeper-into-neural.html)\n* [Implementing Neural networks](http:\u002F\u002Fpeterroelants.github.io\u002F) \n","# 令人惊叹的深度视觉 [![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002Fsindresorhus\u002Fawesome)\n\n一份精心整理的计算机视觉深度学习资源列表，灵感来源于 
[awesome-php](https:\u002F\u002Fgithub.com\u002Fziadoz\u002Fawesome-php) 和 [awesome-computer-vision](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002Fawesome-computer-vision)。\n\n维护者 - [Jiwon Kim](https:\u002F\u002Fgithub.com\u002Fkjw0612), [Heesoo Myeong](https:\u002F\u002Fgithub.com\u002Fhmyeong), [Myungsub Choi](https:\u002F\u002Fgithub.com\u002Fmyungsub), [Jung Kwon Lee](https:\u002F\u002Fgithub.com\u002Fderuci), [Taeksoo Kim](https:\u002F\u002Fgithub.com\u002Fjazzsaxmafia)\n\n该项目目前未处于积极维护状态。\n\n## 贡献\n欢迎通过 [pull requests](https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fpulls) 添加论文。\n\n[![加入聊天 https:\u002F\u002Fgitter.im\u002Fkjw0612\u002Fawesome-deep-vision](https:\u002F\u002Fbadges.gitter.im\u002FJoin%20Chat.svg)](https:\u002F\u002Fgitter.im\u002Fkjw0612\u002Fawesome-deep-vision?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)\n\n## 分享\n+ [在 Twitter 上分享](http:\u002F\u002Ftwitter.com\u002Fhome?status=http:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision%0ADeep Learning Resources for Computer Vision)\n+ [在 Facebook 上分享](http:\u002F\u002Fwww.facebook.com\u002Fsharer\u002Fsharer.php?u=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision)\n+ [在 Google Plus 上分享](http:\u002F\u002Fplus.google.com\u002Fshare?url=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision)\n+ [在 LinkedIn 上分享](http:\u002F\u002Fwww.linkedin.com\u002FshareArticle?mini=true&url=https:\u002F\u002Fjiwonkim.org\u002Fawesome-deep-vision&title=Awesome%20Deep%20Vision&summary=&source=)\n\n## 目录\n- [论文](#papers)\n  - [ImageNet 分类](#imagenet-classification)\n  - [目标检测](#object-detection)\n  - [目标跟踪](#object-tracking)\n  - [低层视觉](#low-level-vision)\n    - [超分辨率](#super-resolution)\n    - [其他应用](#other-applications)\n  - [边缘检测](#edge-detection)\n  - [语义分割](#semantic-segmentation)\n  - [视觉注意力与显著性](#visual-attention-and-saliency)\n  - [物体识别](#object-recognition)\n  - [人体姿态估计](#human-pose-estimation)\n  - [理解 CNN](#understanding-cnn)\n  - 
[图像与语言](#image-and-language)\n    - [图像描述生成](#image-captioning)\n    - [视频描述生成](#video-captioning)\n    - [问答](#question-answering)\n  - [图像生成](#image-generation)\n  - [其他主题](#other-topics)\n- [课程](#courses)\n- [书籍](#books)\n- [视频](#videos)\n- [软件](#software)\n  - [框架](#framework)\n  - [应用](#applications)\n- [教程](#tutorials)\n- [博客](#blogs)\n\n## 论文\n\n### ImageNet 分类\n![classification](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_e709818fabd8.png)\n（摘自 Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton，《使用深度卷积神经网络进行 ImageNet 分类》，NIPS，2012年。）\n* 微软（深度残差学习）[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.03385v1.pdf)][[幻灯片](http:\u002F\u002Fimage-net.org\u002Fchallenges\u002Ftalks\u002Filsvrc2015_deep_residual_learning_kaiminghe.pdf)]\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun，《用于图像识别的深度残差学习》，arXiv:1512.03385。\n* 微软（PReLu\u002F权重初始化）[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.01852)\n  * Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun，《深入研究修正线性单元：超越人类水平的 ImageNet 分类性能》，arXiv:1502.01852。\n* 批量归一化 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03167)\n  * Sergey Ioffe, Christian Szegedy，《批量归一化：通过减少内部协变量偏移加速深度网络训练》，arXiv:1502.03167。\n* GoogLeNet [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.4842)\n  * Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich，CVPR，2015年。\n* VGG-Net [[网页]](http:\u002F\u002Fwww.robots.ox.ac.uk\u002F~vgg\u002Fresearch\u002Fvery_deep\u002F) [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.1556)\n  * Karen Simonyan 和 Andrew Zisserman，《用于大规模视觉识别的非常深的卷积网络》，ICLR，2015年。\n* AlexNet [[论文]](http:\u002F\u002Fpapers.nips.cc\u002Fbook\u002Fadvances-in-neural-information-processing-systems-25-2012)\n  * Alex Krizhevsky, Ilya Sutskever, Geoffrey E. 
Hinton，《使用深度卷积神经网络进行 ImageNet 分类》，NIPS，2012年。\n\n### 目标检测\n![object_detection](https:\u002F\u002Fcloud.githubusercontent.com\u002Fassets\u002F5226447\u002F8452063\u002Ff76ba5000-2022-11e5-8db1-2cd5d490e3b3.PNG)\n（摘自邵庆仁、何凯明、罗斯·吉尔希克、孙剑：《Faster R-CNN：基于区域建议网络的实时目标检测》，arXiv:1506.01497。）\n\n* PVANET [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.08021) [[代码]](https:\u002F\u002Fgithub.com\u002Fsanghoon\u002Fpva-faster-rcnn)\n  * 奎贤金、桑勋洪、炳锡卢、英在千、民哲朴：《PVANET：用于实时目标检测的深层但轻量级神经网络》，arXiv:1608.08021\n* OverFeat，纽约大学 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1312.6229.pdf)\n  * OverFeat：利用卷积网络实现集成式识别、定位与检测，ICLR，2014年。\n* R-CNN，加州大学伯克利分校 [[CVPR14论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2014\u002Fpapers\u002FGirshick_Rich_Feature_Hierarchies_2014_CVPR_paper.pdf) [[arXiv14论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1311.2524)\n  * 罗斯·吉尔希克、杰夫·多纳休、特雷弗·达雷尔、吉滕德拉·马利克：《用于精确目标检测和语义分割的丰富特征层次结构》，CVPR，2014年。\n* SPP，微软研究院 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.4729)\n  * 何凯明、张祥宇、邵庆仁、孙剑：《视觉识别中深度卷积网络的空间金字塔池化》，ECCV，2014年。\n* Fast R-CNN，微软研究院 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.08083)\n  * 罗斯·吉尔希克：《Fast R-CNN》，arXiv:1504.08083。\n* Faster R-CNN，微软研究院 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01497)\n  * 邵庆仁、何凯明、罗斯·吉尔希克、孙剑：《Faster R-CNN：基于区域建议网络的实时目标检测》，arXiv:1506.01497。\n* R-CNN minus R，牛津大学 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.06981)\n  * 卡雷尔·伦茨、安德烈亚·韦达尔迪：《R-CNN minus R》，arXiv:1506.06981。\n* 拥挤场景中的端到端人群检测 [[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.04878)\n  * 罗素·斯图尔特、米哈伊洛·安德里卢卡：《拥挤场景中的端到端人群检测》，arXiv:1506.04878。\n* You Only Look Once：统一的实时目标检测 [[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.02640)，[[论文第2版]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.08242)，[[C语言代码]](https:\u002F\u002Fgithub.com\u002Fpjreddie\u002Fdarknet)，[[TensorFlow代码]](https:\u002F\u002Fgithub.com\u002Fthtrieu\u002Fdarkflow)\n  * 约瑟夫·雷德蒙、桑托什·迪瓦拉、罗斯·吉尔希克、阿里·法尔哈迪：《You Only Look 
Once：统一的实时目标检测》，arXiv:1506.02640\n  * 约瑟夫·雷德蒙、阿里·法尔哈迪（第2版）\n* Inside-Outside Net [[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1512.04143)\n  * 肖恩·贝尔、C·劳伦斯·齐特尼克、卡维塔·巴拉、罗斯·吉尔希克：《Inside-Outside Net：通过跳跃池化和循环神经网络在上下文中检测目标》\n* 深度残差网络（当前最先进水平）[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1512.03385)\n  * 何凯明、张祥宇、邵庆仁、孙剑：《用于图像识别的深度残差学习》\n* 弱监督目标定位与多折叠多实例学习 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.00949.pdf)\n* R-FCN [[论文]](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.06409) [[代码]](https:\u002F\u002Fgithub.com\u002Fdaijifeng001\u002FR-FCN)\n  * 戴继峰、李毅、何凯明、孙剑：《R-FCN：基于区域的全卷积网络目标检测》\n* SSD [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.02325v2.pdf) [[代码]](https:\u002F\u002Fgithub.com\u002Fweiliu89\u002Fcaffe\u002Ftree\u002Fssd)\n  * 刘伟、德拉戈米尔·安格洛夫、杜米特鲁·埃尔汉、克里斯蒂安·塞格迪、斯科特·里德、程阳·傅、亚历山大·C·伯格：《SSD：单次多框检测器》，arXiv:1512.02325\n* 现代卷积目标检测器的速度\u002F精度权衡 [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.10012v1.pdf)\n  * 乔纳森·黄、维韦克·拉托德、陈孙、孟龙朱、阿努普·科拉蒂卡拉、阿里雷扎·法西、伊恩·费舍尔、兹比格涅夫·沃伊纳、杨松、塞尔吉奥·瓜达拉马、凯文·墨菲：谷歌研究院，《现代卷积目标检测器的速度\u002F精度权衡》，arXiv:1611.10012\n\n### 视频分类\n* 尼古拉斯·巴拉斯、姚立、帕尔·克里斯、阿隆·库维尔：“深入卷积网络以学习视频表示”，ICLR 2016。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06432v4.pdf)\n* 迈克尔·马修、卡米耶·库普里、扬·勒丘恩：“超越均方误差的深度多尺度视频预测”，ICLR 2016。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05440v6.pdf)\n\n### 目标跟踪\n* 晟勋洪、宅根尤、秀河郭、宝亨韩：《基于卷积神经网络学习判别性显著图的在线跟踪》，arXiv:1502.06796。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.06796)\n* 李翰熙、李毅和法提赫·波里克利：《DeepTrack：利用卷积神经网络学习判别性特征表示进行视觉跟踪》，BMVC，2014年。[[论文]](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2014\u002Ffiles\u002Fpaper028.pdf)\n* N Wang、DY Yeung：《为视觉跟踪学习深度紧凑的图像表示》，NIPS，2013年。[[论文]](http:\u002F\u002Fwinsty.net\u002Fpapers\u002Fdlt.pdf)\n* 马超、黄家斌、杨晓康和杨明轩：《用于视觉跟踪的层次化卷积特征》，ICCV 2015 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FMa_Hierarchical_Convolutional_Features_ICCV_2015_paper.pdf) [[代码]](https:\u002F\u002Fgithub.com\u002Fjbhuang0604\u002FCF2)\n* 
王利军、欧阳万力、王小刚和陆虎川：《使用全卷积网络进行视觉跟踪》，ICCV 2015 [[论文]](http:\u002F\u002F202.118.75.4\u002Flu\u002FPaper\u002FICCV2015\u002Ficcv15_lijun.pdf) [[代码]](https:\u002F\u002Fgithub.com\u002Fscott89\u002FFCNT)\n* 贤燮南和宝亨韩：《为视觉跟踪学习多领域卷积神经网络》，[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1510.07945.pdf) [[代码]](https:\u002F\u002Fgithub.com\u002FHyeonseobNam\u002FMDNet) [[项目页面]](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Fmdnet\u002F)\n\n### 低层视觉\n\n#### 超分辨率\n* 迭代图像重建\n  * 斯文·贝恩克：学习迭代图像重建。IJCAI，2001年。[[论文]](http:\u002F\u002Fwww.ais.uni-bonn.de\u002Fbehnke\u002Fpapers\u002Fijcai01.pdf)\n  * 斯文·贝恩克：在神经抽象金字塔中学习迭代图像重建。国际计算智能与应用期刊，第1卷，第4期，第427–438页，2001年。[[论文]](http:\u002F\u002Fwww.ais.uni-bonn.de\u002Fbehnke\u002Fpapers\u002Fijcia01.pdf)\n* 超分辨率（SRCNN）[[网页]](http:\u002F\u002Fmmlab.ie.cuhk.edu.hk\u002Fprojects\u002FSRCNN.html) [[论文-ECCV14]](http:\u002F\u002Fpersonal.ie.cuhk.edu.hk\u002F~ccloy\u002Ffiles\u002Feccv_2014_deepresolution.pdf) [[论文-arXiv15]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1501.00092.pdf)\n  * 董超、陈昌立、何凯明、汤晓鸥：学习用于图像超分辨率的深度卷积网络，ECCV，2014年。\n  * 董超、陈昌立、何凯明、汤晓鸥：使用深度卷积网络进行图像超分辨率，arXiv:1501.00092。\n* 极深超分辨率\n  * 金智源、李正权、李京穆：使用极深卷积网络实现精确的图像超分辨率，arXiv:1511.04587，2015年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04587)\n* 深度递归卷积网络\n  * 金智源、李正权、李京穆：用于图像超分辨率的深度递归卷积网络，arXiv:1511.04491，2015年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04491)\n* 分层稀疏编码网络\n  * 王兆文、刘丁、韩伟、杨建超和托马斯·S·黄：具有稀疏先验的深度网络用于图像超分辨率。ICCV，2015年。[[论文]](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~dingliu2\u002Ficcv15\u002Ficcv15.pdf) [[代码]](http:\u002F\u002Fwww.ifp.illinois.edu\u002F~dingliu2\u002Ficcv15\u002F)\n* 用于超分辨率的感知损失\n  * 贾斯汀·约翰逊、亚历山大·阿拉希、李飞飞：用于实时风格迁移和超分辨率的感知损失，arXiv:1603.08155，2016年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1603.08155) [[补充材料]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fjcjohns\u002Fpapers\u002Ffast-style\u002Ffast-style-supp.pdf)\n* SRGAN\n  * 
克里斯蒂安·莱迪格、卢卡斯·泰斯、费伦茨·胡萨尔、何塞·卡巴列罗、安德鲁·坎宁安、亚历杭德罗·阿科斯塔、安德鲁·艾特肯、阿利汗·特贾尼、约翰内斯·托茨、泽汉·王、温哲·史：使用生成对抗网络实现照片级真实感单幅图像超分辨率，arXiv:1609.04802v3，2016年。[[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.04802v3.pdf)\n* 其他\n  * 奥森多夫、克里斯蒂安、于贝尔·索耶和帕特里克·范德·斯马赫特：使用快速近似卷积稀疏编码进行图像超分辨率，ICONIP，2014年。[[ICONIP-2014论文]](http:\u002F\u002Fbrml.org\u002Fuploads\u002Ftx_sibibtex\u002F281.pdf)\n\n#### 其他应用\n* 光流（FlowNet）[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06852)\n  * 菲利普·费舍尔、阿列克谢·多索维茨基、埃迪·伊尔格、菲利普·豪瑟、卡内尔·哈兹尔巴斯、弗拉基米尔·戈尔科夫、帕特里克·范德·斯马赫特、丹尼尔·克雷默斯、托马斯·布罗克斯：FlowNet：用卷积网络学习光流，arXiv:1504.06852。\n* 压缩伪影去除[[论文-arXiv15]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06993)\n  * 董超、邓宇斌、陈昌立、汤晓鸥：通过深度卷积网络去除压缩伪影，arXiv:1504.06993。\n* 模糊去除\n  * 克里斯蒂安·J·舒勒、迈克尔·希尔施、斯特凡·哈梅林、伯恩哈德·舍尔科普夫：学习去模糊，arXiv:1406.7444 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.7444.pdf)\n  * 孙坚、曹文飞、许宗本、让·庞斯：学习用于非均匀运动模糊去除的卷积神经网络，CVPR，2015年 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.00593)\n* 图像反卷积 [[网页]](http:\u002F\u002Flxu.me\u002Fprojects\u002Fdcnn\u002F) [[论文]](http:\u002F\u002Flxu.me\u002Fmypapers\u002Fdcnn_nips14.pdf)\n  * 徐立、任志军、刘策、贾佳亚：用于图像反卷积的深度卷积神经网络，NIPS，2014年。\n* 深度边缘感知滤波器 [[论文]](http:\u002F\u002Fjmlr.org\u002Fproceedings\u002Fpapers\u002Fv37\u002Fxub15.pdf)\n  * 徐立、任志军、严琼、廖仁杰、贾佳亚：深度边缘感知滤波器，ICML，2015年。\n* 使用卷积神经网络计算立体匹配代价 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZbontar_Computing_the_Stereo_2015_CVPR_paper.pdf)\n  * 尤雷·日邦塔尔、扬·勒丘恩：使用卷积神经网络计算立体匹配代价，CVPR，2015年。\n* 彩色图像着色 理查德·张、菲利普·伊索拉、阿列克谢·A·叶夫罗斯：ECCV，2016年 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1603.08511.pdf)，[[代码]](https:\u002F\u002Fgithub.com\u002Frichzhang\u002Fcolorization)\n  * 瑞安·达尔，[[博客]](http:\u002F\u002Ftinyclouds.org\u002Fcolorize\u002F)\n* 通过修复学习特征[[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.07379v1.pdf)[[代码]](https:\u002F\u002Fgithub.com\u002Fpathak22\u002Fcontext-encoder)\n  * 迪帕克·帕塔克、菲利普·克拉亨布尔、杰夫·多纳休、特雷弗·达雷尔、阿列克谢·A·叶夫罗斯：上下文编码器：通过修复学习特征，CVPR，2016年。\n\n### 
边缘检测\n![edge_detection](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_f81ee8a378c2.png)\n（来自盖达斯·贝尔塔修斯、施健博、洛伦佐·托雷萨尼：DeepEdge：一种用于自顶向下轮廓检测的多尺度分支深度网络，CVPR，2015年。）\n\n* 整体嵌套式边缘检测 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06375) [[代码]](https:\u002F\u002Fgithub.com\u002Fs9xie\u002Fhed)\n  * 谢赛宁、涂卓文：整体嵌套式边缘检测，arXiv:1504.06375。\n* DeepEdge [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.1123)\n  * 盖达斯·贝尔塔修斯、施健博、洛伦佐·托雷萨尼：DeepEdge：一种用于自顶向下轮廓检测的多尺度分支深度网络，CVPR，2015年。\n* DeepContour [[论文]](http:\u002F\u002Fmc.eistar.net\u002FUpLoadFiles\u002FPapers\u002FDeepContour_cvpr15.pdf)\n  * 沈伟、王兴刚、王燕、白翔、张志江：DeepContour：一种通过正向共享损失学习的深度卷积特征，用于轮廓检测，CVPR，2015年。\n\n### 语义分割\n![semantic_segmantation](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_66d8b80b26b2.png)\n（摘自戴继峰、何恺明、孙剑：BoxSup：利用边界框监督卷积网络进行语义分割，arXiv:1503.01640。）\n* PASCAL VOC2012 挑战赛排行榜（2016年9月1日）\n  ![VOC2012_top_rankings](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_01f4b3c8aed0.png)\n  （摘自 PASCAL VOC2012 [排行榜](http:\u002F\u002Fhost.robots.ox.ac.uk:8080\u002Fleaderboard\u002Fdisplaylb.php?challengeid=11&compid=6)）\n* SEC：种子、扩展与约束\n  * 亚历山大·科列斯尼科夫、克里斯托夫·兰佩特：种子、扩展与约束——弱监督图像分割的三项原则，ECCV 2016。[[论文]](http:\u002F\u002Fpub.ist.ac.at\u002F~akolesnikov\u002Ffiles\u002FECCV2016\u002Fmain.pdf) [[代码]](https:\u002F\u002Fgithub.com\u002Fkolesman\u002FSEC)\n* 阿德莱德大学\n  * 林国生、沈春华、伊恩·里德、安东·范登亨格尔：用于语义分割的深度结构化模型的高效分段训练，arXiv:1504.01013。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.01013)（在 VOC2012 中排名第一）\n  * 林国生、沈春华、伊恩·里德、安东·范登亨格尔：在消息传递推理中深度学习消息，arXiv:1506.02108。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.02108)（在 VOC2012 中排名第四）\n* 深度解析网络（DPN）\n  * 刘子威、李晓晓、罗平、陈昌礼、唐晓鸥：基于深度解析网络的语义图像分割，arXiv:1509.02634 \u002F ICCV 2015 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.02634.pdf)（在 VOC 2012 中排名第二）\n* CentraleSuperBoundaries，INRIA [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07386)\n  
* 伊阿索纳斯·科基诺斯：利用深度学习超越人类的边界检测能力，arXiv:1511.07386（在 VOC 2012 中排名第四）\n* BoxSup [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.01640)\n  * 戴继峰、何恺明、孙剑：BoxSup——利用边界框监督卷积网络进行语义分割，arXiv:1503.01640。（在 VOC2012 中排名第六）\n* 浦项工科大学\n  * 诺贤宇、洪承勋、韩宝炯：用于语义分割的反卷积网络学习，arXiv:1505.04366。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.04366)（在 VOC2012 中排名第七）\n  * 洪承勋、诺贤宇、韩宝炯：用于半监督语义分割的解耦深度神经网络，arXiv:1506.04924。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.04924)\n  * 洪承勋、吴俊赫、韩宝炯以及李洪洛克：利用深度卷积神经网络学习可迁移知识以进行语义分割，arXiv:1512.07928 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1512.07928.pdf)] [[项目页面](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Ftransfernet\u002F)]\n* 条件随机场作为循环神经网络 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03240)\n  * 郑帅、萨迪普·贾亚苏马纳、贝尔纳迪诺·罗梅拉-帕雷德斯、维布哈夫·维尼特、朱志忠、杜达龙、黄昌、菲利普·H·S·托尔：条件随机场作为循环神经网络，arXiv:1502.03240。（在 VOC2012 中排名第八）\n* DeepLab\n  * 陈亮杰、乔治·帕潘德里欧、凯文·墨菲、艾伦·L·尤伊尔：用于语义图像分割的弱监督与半监督 DCNN 学习，arXiv:1502.02734。[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.02734)（在 VOC2012 中排名第九）\n* 放大视野 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FMostajabi_Feedforward_Semantic_Segmentation_2015_CVPR_paper.pdf)\n  * 穆罕默德·雷扎·莫斯塔贾比、派曼·亚多拉胡尔、格雷戈里·沙赫纳罗维奇：具有放大视野特征的前馈语义分割，CVPR 2015\n* 联合校准 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01581)\n  * 霍尔格·凯撒、贾斯珀·乌伊林斯、维托里奥·费拉里：用于语义分割的联合校准，arXiv:1507.01581。\n* 用于语义分割的全卷积网络 [[CVPR15 论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLong_Fully_Convolutional_Networks_2015_CVPR_paper.pdf) [[arXiv15 论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4038)\n  * 乔纳森·朗、埃文·谢尔哈默、特雷弗·达雷尔：用于语义分割的全卷积网络，CVPR 2015。\n* 超柱 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FHariharan_Hypercolumns_for_Object_2015_CVPR_paper.pdf)\n  * 巴拉特·哈里哈兰、巴勃罗·阿尔贝莱斯、罗斯·吉尔希克、吉滕德拉·马利克：用于目标分割和细粒度定位的超柱，CVPR 2015。\n* 深度层次解析\n  * 阿比舍克·夏尔马、昂塞尔·图泽尔、大卫·W·雅各布斯：用于语义分割的深度层次解析，CVPR 
2015。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FSharma_Deep_Hierarchical_Parsing_2015_CVPR_paper.pdf)\n* 用于场景标注的层次特征学习 [[ICML12 论文]](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fpublis\u002Fpdf\u002Ffarabet-icml-12.pdf) [[PAMI13 论文]](http:\u002F\u002Fyann.lecun.com\u002Fexdb\u002Fpublis\u002Fpdf\u002Ffarabet-pami-13.pdf)\n  * 克莱芒·法拉贝特、卡米耶·库普里、洛朗·纳伊曼、扬·勒丘恩：通过多尺度特征学习、纯度树和最优覆盖进行场景解析，ICML 2012。\n  * 克莱芒·法拉贝特、卡米耶·库普里、洛朗·纳伊曼、扬·勒丘恩：用于场景标注的层次特征学习，PAMI 2013。\n* 剑桥大学 [[网站]](http:\u002F\u002Fmi.eng.cam.ac.uk\u002Fprojects\u002Fsegnet\u002F)\n  * 维杰·巴德里纳拉扬、亚历克斯·肯德尔和罗伯托·西波拉：“SegNet：一种用于图像分割的深度卷积编码器-解码器架构。” arXiv 预印本 arXiv:1511.00561，2015。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.00561)\n* 亚历克斯·肯德尔、维杰·巴德里纳拉扬和罗伯托·西波拉：“贝叶斯 SegNet：场景理解中深度卷积编码器-解码器架构中的模型不确定性。” arXiv 预印本 arXiv:1511.02680，2015。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.02680)\n* 普林斯顿大学\n  * 费舍尔·余、弗拉德伦·科尔顿：“通过空洞卷积进行多尺度上下文聚合”，ICLR 2016，[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07122v2.pdf)\n* 华盛顿大学、艾伦人工智能研究所\n  * 哈米德·伊扎迪尼亚、费雷什特·萨德吉、桑托什·库马尔·迪瓦拉、叶津·崔、阿里·法拉迪：“用于语义分割、视觉蕴含和释义的片段-短语表”，ICCV 2015，[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FIzadinia_Segment-Phrase_Table_for_ICCV_2015_paper.pdf)\n* INRIA\n  * 伊阿索纳斯·科基诺斯：“利用深度学习突破边界检测的极限”，ICLR 2016，[[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07386v2.pdf)\n* 加州大学圣芭芭拉分校\n  * 尼鲁法尔·普里安、S·卡尔蒂凯扬和 B·S·曼朱纳特：“通过学习图像部件社区实现弱监督的基于图的语义分割”，ICCV 2015，[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FPourian_Weakly_Supervised_Graph_ICCV_2015_paper.pdf)\n\n### 视觉注意力与显著性\n![saliency](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_25f843d73f14.png)\n（摘自刘念、韩俊伟、张丁文、温世峰、刘天明，《利用卷积神经网络预测眼动注视点》，CVPR 2015。）\n\n* Mr-CNN 
[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLiu_Predicting_Eye_Fixations_2015_CVPR_paper.pdf)\n  * 刘念、韩俊伟、张丁文、温世峰、刘天明，《利用卷积神经网络预测眼动注视点》，CVPR 2015。\n* 学习用于地标检测的序列搜索 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FSingh_Learning_a_Sequential_2015_CVPR_paper.pdf)\n  * Saurabh Singh、Derek Hoiem、David Forsyth，《学习用于地标检测的序列搜索》，CVPR 2015。\n* 基于视觉注意力的多目标识别 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.7755.pdf)\n  * Jimmy Lei Ba、Volodymyr Mnih、Koray Kavukcuoglu，《基于视觉注意力的多目标识别》，ICLR 2015。\n* 视觉注意力的循环模型 [[论文]](http:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F5542-recurrent-models-of-visual-attention.pdf)\n  * Volodymyr Mnih、Nicolas Heess、Alex Graves、Koray Kavukcuoglu，《视觉注意力的循环模型》，NIPS 2014。\n\n### 目标识别\n* 基于卷积神经网络的弱监督学习 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FOquab_Is_Object_Localization_2015_CVPR_paper.pdf)\n  * Maxime Oquab、Leon Bottou、Ivan Laptev、Josef Sivic，《目标定位是免费的吗？——基于卷积神经网络的弱监督学习》，CVPR 2015。\n* FV-CNN [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FCimpoi_Deep_Filter_Banks_2015_CVPR_paper.pdf)\n  * Mircea Cimpoi、Subhransu Maji、Andrea Vedaldi，《用于纹理识别和分割的深度滤波器组》，CVPR 2015。\n\n### 人体姿态估计\n* Zhe Cao、Tomas Simon、Shih-En Wei 和 Yaser Sheikh，《基于部位亲和场的实时多人二维姿态估计》，CVPR 2017。\n* Leonid Pishchulin、Eldar Insafutdinov、Siyu Tang、Bjoern Andres、Mykhaylo Andriluka、Peter Gehler 和 Bernt Schiele，《Deepcut：多人姿态估计中的子集划分与标注联合方法》，CVPR 2016。\n* Shih-En Wei、Varun Ramakrishna、Takeo Kanade 和 Yaser Sheikh，《卷积姿态机器》，CVPR 2016。\n* Alejandro Newell、Kaiyu Yang 和 Jia Deng，《用于人体姿态估计的堆叠沙漏网络》，ECCV 2016。\n* Tomas Pfister、James Charles 和 Andrew Zisserman，《用于视频中人体姿态估计的流式卷积网络》，ICCV 2015。\n* Jonathan J. 
Tompson、Arjun Jain、Yann LeCun、Christoph Bregler，《卷积网络与图模型联合训练用于人体姿态估计》，NIPS 2014。\n\n### 理解卷积神经网络\n![understanding](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_566ae8b37494.png)\n（摘自 Aravindh Mahendran、Andrea Vedaldi，《通过反演理解深层图像表示》，CVPR 2015。）\n\n* Karel Lenc、Andrea Vedaldi，《通过测量图像表示的等变性和等价性来理解图像表示》，CVPR 2015。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FLenc_Understanding_Image_Representations_2015_CVPR_paper.pdf)\n* Anh Nguyen、Jason Yosinski、Jeff Clune，《深度神经网络极易被欺骗：对无法识别图像的高置信度预测》，CVPR 2015。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FNguyen_Deep_Neural_Networks_2015_CVPR_paper.pdf)\n* Aravindh Mahendran、Andrea Vedaldi，《通过反演理解深层图像表示》，CVPR 2015。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FMahendran_Understanding_Deep_Image_2015_CVPR_paper.pdf)\n* Bolei Zhou、Aditya Khosla、Agata Lapedriza、Aude Oliva、Antonio Torralba，《深度场景卷积神经网络中涌现的目标检测器》，ICLR 2015。[[arXiv论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1412.6856)\n* Alexey Dosovitskiy、Thomas Brox，《用卷积网络反演视觉表示》，arXiv 2015。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.02753)\n* Matthew Zeiler、Rob Fergus，《可视化与理解卷积网络》，ECCV 2014。[[论文]](https:\u002F\u002Fwww.cs.nyu.edu\u002F~fergus\u002Fpapers\u002FzeilerECCV2014.pdf)\n\n\n### 图像与语言\n\n#### 图像描述生成\n![image_captioning](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_45abeceea151.png)\n（摘自 Andrej Karpathy、李飞飞，《用于生成图像描述的深度视觉-语义对齐》，CVPR 2015。）\n\n* UCLA \u002F 百度 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1410.1090)\n  * Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Alan L. Yuille, 使用多模态循环神经网络解释图像，arXiv:1410.1090。\n* 多伦多 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.2539)\n  * Ryan Kiros, Ruslan Salakhutdinov, Richard S. 
Zemel, 通过多模态神经语言模型统一视觉-语义嵌入，arXiv:1411.2539。\n* 伯克利 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4389)\n  * Jeff Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, Trevor Darrell，用于视觉识别与描述的长期循环卷积网络，arXiv:1411.4389。\n* 谷歌 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4555)\n  * Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan，展示与讲述：一种神经图像字幕生成器，arXiv:1411.4555。\n* 斯坦福 [[网页]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fkarpathy\u002Fdeepimagesent\u002F) [[论文]](http:\u002F\u002Fcs.stanford.edu\u002Fpeople\u002Fkarpathy\u002Fcvpr2015.pdf)\n  * Andrej Karpathy, Li Fei-Fei，用于生成图像描述的深度视觉-语义对齐，CVPR 2015。\n* UML \u002F UT [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.4729)\n  * Subhashini Venugopalan, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond Mooney, Kate Saenko，使用深度循环神经网络将视频翻译成自然语言，NAACL-HLT 2015。\n* CMU \u002F 微软 [[论文-arXiv]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.5654) [[论文-CVPR]](http:\u002F\u002Fwww.cs.cmu.edu\u002F~xinleic\u002Fpapers\u002Fcvpr15_rnn.pdf)\n  * Xinlei Chen, C. Lawrence Zitnick，学习用于图像字幕生成的循环视觉表示，arXiv:1411.5654。\n  * Xinlei Chen, C. Lawrence Zitnick，心灵之眼：用于图像字幕生成的循环视觉表示，CVPR 2015。\n* 微软 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4952)\n  * Hao Fang, Saurabh Gupta, Forrest Iandola, Rupesh Srivastava, Li Deng, Piotr Dollár, Jianfeng Gao, Xiaodong He, Margaret Mitchell, John C. Platt, C. Lawrence Zitnick, Geoffrey Zweig，从字幕到视觉概念再返回，CVPR 2015。\n* 蒙特利尔大学 \u002F 多伦多大学 [[网页](http:\u002F\u002Fkelvinxu.github.io\u002Fprojects\u002Fcapgen.html)] [[论文](http:\u002F\u002Fwww.cs.toronto.edu\u002F~zemel\u002Fdocuments\u002FcaptionAttn.pdf)]\n  * Kelvin Xu, Jimmy Lei Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhutdinov, Richard S. Zemel, Yoshua Bengio，展示、注意与讲述：基于视觉注意力的神经图像字幕生成，arXiv:1502.03044 \u002F ICML 2015。\n* Idiap \u002F EPFL \u002F Facebook [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.03671)]\n  * Remi Lebret, Pedro O. 
Pinheiro, Ronan Collobert，基于短语的图像字幕生成，arXiv:1502.03671 \u002F ICML 2015。\n* UCLA \u002F 百度 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1504.06692)]\n  * Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Zhiheng Huang, Alan L. Yuille，像孩子一样学习：从图像的句子描述中快速学习新的视觉概念，arXiv:1504.06692。\n* MS + 伯克利\n  * Jacob Devlin, Saurabh Gupta, Ross Girshick, Margaret Mitchell, C. Lawrence Zitnick，探索最近邻方法用于图像字幕生成，arXiv:1505.04467 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.04467.pdf)]\n  * Jacob Devlin, Hao Cheng, Hao Fang, Saurabh Gupta, Li Deng, Xiaodong He, Geoffrey Zweig，Margaret Mitchell，用于图像字幕生成的语言模型：其特点及有效方法，arXiv:1505.01809 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01809.pdf)]。\n* 阿德莱德 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01144.pdf)]\n  * Qi Wu, Chunhua Shen, Anton van den Hengel, Lingqiao Liu, Anthony Dick，带有中间属性层的图像字幕生成，arXiv:1506.01144。\n* 提尔堡 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.03694.pdf)]\n  * Grzegorz Chrupala, Akos Kadar, Afra Alishahi，通过图片学习语言，arXiv:1506.03694。\n* 蒙特利尔大学 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01053.pdf)]\n  * Kyunghyun Cho, Aaron Courville，Yoshua Bengio，利用基于注意力的编码器-解码器网络描述多媒体内容，arXiv:1507.01053。\n* 康奈尔 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1508.02091.pdf)]\n  * Jack Hessel, Nicolas Savva, Michael J. 
Wilber，神经图像字幕生成中的图像表示与新领域，arXiv:1508.02091。\n* MS + 香港城市大学 [[论文](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FYao_Learning_Query_and_ICCV_2015_paper.pdf)]\n  * Ting Yao, Tao Mei 和 Chong-Wah Ngo，“通过排序典型相关分析学习查询与图像相似性”，ICCV 2015。\n\n#### 视频字幕生成\n* 伯克利 [[网页]](http:\u002F\u002Fjeffdonahue.com\u002Flrcn\u002F) [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1411.4389.pdf)\n  * Jeff Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, Trevor Darrell，用于视觉识别和描述的长期循环卷积网络，CVPR 2015。\n* UT \u002F UML \u002F 伯克利 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.4729)\n  * Subhashini Venugopalan, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond Mooney, Kate Saenko，使用深度循环神经网络将视频翻译成自然语言，arXiv:1412.4729。\n* 微软 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01861)\n  * Yingwei Pan, Tao Mei, Ting Yao, Houqiang Li, Yong Rui，联合建模嵌入与翻译以连接视频与语言，arXiv:1505.01861。\n* UT \u002F 伯克利 \u002F UML [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.00487)\n  * Subhashini Venugopalan, Marcus Rohrbach, Jeff Donahue, Raymond Mooney，Trevor Darrell，Kate Saenko，序列到序列——视频转文字，arXiv:1505.00487。\n* 蒙特利尔大学 \u002F 舍布鲁克大学 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.08029.pdf)]\n  * Li Yao，Atousa Torabi，Kyunghyun Cho，Nicolas Ballas，Christopher Pal，Hugo Larochelle，Aaron Courville，通过利用时间结构描述视频，arXiv:1502.08029。\n* MPI \u002F 伯克利 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.01698.pdf)]\n  * Anna Rohrbach，Marcus Rohrbach，Bernt Schiele，电影描述的长短期故事，arXiv:1506.01698。\n* 多伦多大学 \u002F MIT [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.06724.pdf)]\n  * Yukun Zhu，Ryan Kiros，Richard Zemel，Ruslan Salakhutdinov，Raquel Urtasun，Antonio Torralba，Sanja Fidler，通过对电影的观看和书籍的阅读来对齐书籍与电影：迈向类似故事的视觉解释，arXiv:1506.06724。\n* 蒙特利尔大学 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.01053.pdf)]\n  * Kyunghyun Cho，Aaron Courville，Yoshua Bengio，利用基于注意力的编码器-解码器网络描述多媒体内容，arXiv:1507.01053。\n* TAU \u002F USC 
[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.06950.pdf)]\n  * Dotan Kaufman，Gil Levi，Tal Hassner，Lior Wolf，用于视频标注与摘要的时间镶嵌，arXiv:1612.06950。\n\n#### 问答\n![question_answering](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_readme_453a81d086f4.png)\n（来自 Stanislaw Antol，Aishwarya Agrawal，Jiasen Lu，Margaret Mitchell，Dhruv Batra，C. Lawrence Zitnick，Devi Parikh，VQA：视觉问答，CVPR 2015 SUNw：场景理解研讨会）\n\n* 弗吉尼亚理工大学 \u002F 微软研究院 [[网页]](http:\u002F\u002Fwww.visualqa.org\u002F) [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.00468)\n  * 斯坦尼斯瓦夫·安托尔、艾什瓦里娅·阿格拉瓦尔、贾森·卢、玛格丽特·米切尔、德鲁夫·巴特拉、C·劳伦斯·齐特尼克、黛薇·帕里克，VQA：视觉问答，CVPR 2015 SUNw：场景理解研讨会。\n* 马普所 \u002F 伯克利 [[网页]](https:\u002F\u002Fwww.mpi-inf.mpg.de\u002Fdepartments\u002Fcomputer-vision-and-multimodal-computing\u002Fresearch\u002Fvision-and-language\u002Fvisual-turing-challenge\u002F) [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.01121)\n  * 马特乌什·马利诺夫斯基、马库斯·罗尔巴赫、马里奥·弗里茨，问问你的神经元：一种基于神经网络的图像问答方法，arXiv:1505.01121。\n* 多伦多 [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.02074) [[数据集]](http:\u002F\u002Fwww.cs.toronto.edu\u002F~mren\u002Fimageqa\u002Fdata\u002Fcocoqa\u002F)\n  * 孟也仁、瑞安·基罗斯、理查德·泽梅尔，图像问答：一种视觉语义嵌入模型及新数据集，arXiv:1505.02074 \u002F ICML 2015 深度学习研讨会。\n* 百度 \u002F UCLA [[论文]](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1505.05612) [[数据集]]()\n  * 高怀远、毛俊华、周杰、黄志恒、王磊、徐伟，你在和机器对话吗？多语言图像问答的数据集与方法，arXiv:1505.05612。\n* 浦项工科大学 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05756.pdf)] [[项目页面](http:\u002F\u002Fcvlab.postech.ac.kr\u002Fresearch\u002Fdppnet\u002F)]\n  * 诺贤宇、保罗·洪淑·徐、韩宝亨，利用动态参数预测的卷积神经网络进行图像问答，arXiv:1511.05756\n* 卡内基梅隆大学 \u002F 微软研究院 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02274v2.pdf)]\n  * 杨、何、高、邓、斯莫拉 (2015)。用于图像问答的堆叠注意力网络。arXiv:1511.02274。\n* MetaMind [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1603.01417v1.pdf)]\n  * 蔡明雄、史蒂芬·梅里蒂、理查德·索彻。“用于视觉和文本问答的动态记忆网络”。arXiv:1603.01417（2016）。\n* 首尔大学 + NAVER [[论文](http:\u002F\u002Farxiv.org\u002Fabs\u002F1606.01455)]\n  * 
金镇华、李相佑、郭东贤、许敏旿、金正熙、河正宇、张炳卓，《面向视觉问答的多模态残差学习》，arXiv:1606.01455\n* 加州大学伯克利分校 + 索尼 [[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.01847)]\n  * 阿基拉·福库伊、朴东赫、杨戴伦、安娜·罗尔巴赫、特雷弗·达雷尔、马库斯·罗尔巴赫，《用于视觉问答和视觉定位的多模态紧凑双线性池化》，arXiv:1606.01847\n* 浦项工科大学 [[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.03647.pdf)]\n  * 诺贤宇和韩宝亨，《通过联合损失最小化训练循环式回答单元以用于VQA》，arXiv:1606.03647\n* 首尔大学 + NAVER [[论文](http:\u002F\u002Farxiv.org\u002Fabs\u002F1610.04325)]\n  * 金镇华、权京云、金正熙、河正宇、张炳卓，《低秩双线性池化的哈达玛积》，arXiv:1610.04325。\n\n\n\n### 图像生成\n* 卷积 \u002F 循环网络\n  * 阿龙·范登奥德、纳尔·卡尔希布伦纳、奥里奥尔·维尼亚尔斯、拉斯·埃斯佩霍尔特、亚历克斯·格雷夫斯、科雷·卡武克乔卢。“使用PixelCNN解码器的条件图像生成”[[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.05328v2.pdf)[[代码]](https:\u002F\u002Fgithub.com\u002Fkundan2510\u002FpixelCNN)\n  * 阿列克谢·多索维茨基、约斯特·托比亚斯·施普林根贝格、托马斯·布罗克斯，“使用卷积神经网络学习生成椅子”，CVPR 2015。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FDosovitskiy_Learning_to_Generate_2015_CVPR_paper.pdf)\n  * 卡罗尔·格雷戈尔、伊沃·丹尼赫尔卡、亚历克斯·格雷夫斯、达尼洛·希门内斯·雷森德、达安·维尔斯特拉，“DRAW：用于图像生成的循环神经网络”，ICML 2015。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1502.04623v2.pdf)] \n* 对抗网络\n  * 伊恩·J·古德费洛、让·普热-阿巴迪、梅迪·米尔扎、宾·徐、大卫·瓦尔德-法利、谢尔吉尔·奥扎伊尔、亚伦·库维尔、约书亚·本吉奥，生成对抗网络，NIPS 2014。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1406.2661)\n  * 艾米莉·登顿、苏米思·钦塔拉、阿瑟·兹拉姆、罗布·费格斯，使用对抗网络拉普拉斯金字塔的深度生成图像模型，NIPS 2015。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1506.05751)\n  * 卢卡斯·泰斯、阿龙·范登奥德、马蒂亚斯·贝特格，“关于生成模型评估的一点说明”，ICLR 2016。[[论文](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.01844)]\n  * 甄文戴、安德烈亚斯·达米亚努、哈维尔·冈萨雷斯、尼尔·劳伦斯，“变分自编码的深度高斯过程”，ICLR 2016。[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06455v2.pdf)]\n  * 埃尔曼·曼西莫夫、埃米利奥·帕里索托、吉米·巴、鲁斯兰·萨拉胡丁诺夫，“通过注意力机制从描述生成图像”，ICLR 2016，[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02793v2.pdf)]\n  * 尤斯特·托比亚斯·施普林根贝格，“使用分类生成对抗网络进行无监督和半监督学习”，ICLR 2016，[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06390v1.pdf)]\n  * 哈里森·爱德华兹、阿莫斯·斯托基，“用对手审查表示”，ICLR 
2016，[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.05897v3.pdf)]\n  * 片桐武、前田信一、小山正典、中江健、石井真，“通过虚拟对抗训练进行分布平滑”，ICLR 2016，[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.00677v8.pdf)]\n  * 朱俊彦、菲利普·克拉亨布尔、伊莱·谢赫特曼和阿列克谢·A·叶夫罗斯，“在自然图像流形上进行生成式视觉操控”，ECCV 2016。[[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.03552v2.pdf)] [[代码](https:\u002F\u002Fgithub.com\u002Fjunyanz\u002FiGAN)] [[视频](https:\u002F\u002Fyoutu.be\u002F9c4z6YsBGQ0)]\n* 卷积与对抗网络的结合\n  * 亚历克·拉德福德、卢克·梅茨、苏米思·钦塔拉，“使用深度卷积生成对抗网络进行无监督表征学习”，ICLR 2016。[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.06434.pdf)]\n\n### 其他主题\n* 视觉类比 [[论文](https:\u002F\u002Fweb.eecs.umich.edu\u002F~honglak\u002Fnips2015-analogy.pdf)]\n  * 斯科特·里德、张毅、张宇婷、李洪洛，深度视觉类比生成，NIPS，2015年\n* 表面法线估计 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FWang_Designing_Deep_Networks_2015_CVPR_paper.pdf)\n  * 王晓龙、大卫·F·福黑、阿比纳夫·古普塔，面向表面法线估计的深度网络设计，CVPR，2015年。\n* 动作检测 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FGkioxari_Finding_Action_Tubes_2015_CVPR_paper.pdf)\n  * 乔治娅·吉奥克萨里、吉滕德拉·马利克，寻找动作管，CVPR，2015年。\n* 人群计数 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZhang_Cross-Scene_Crowd_Counting_2015_CVPR_paper.pdf)\n  * 张聪、李宏生、王晓刚、杨晓康，基于深度卷积神经网络的跨场景人群计数，CVPR，2015年。\n* 3D形状检索 [[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FWang_Sketch-Based_3D_Shape_2015_CVPR_paper.pdf)\n  * 王芳、康乐、李毅，使用卷积神经网络的基于草图的3D形状检索，CVPR，2015年。\n* 弱监督分类\n  * 萨曼内·阿扎迪、冯嘉诗、施特法妮·耶格尔卡、特雷弗·达雷尔，“用于带有噪声标签的深度CNN的辅助图像正则化”，ICLR 2016，[[论文](http:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.07069v2.pdf)]\n* 艺术风格 [[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1508.06576) [[代码]](https:\u002F\u002Fgithub.com\u002Fjcjohnson\u002Fneural-style)\n  * 列昂·A·加蒂斯、亚历山大·S·埃克尔、马蒂亚斯·贝特格，艺术风格的神经算法。\n* 人眼视线估计\n  * 
张旭聪、杉野佑介、马里奥·弗里茨、安德烈亚斯·布林，野外基于外观的眼球运动估计，CVPR，2015年。[[论文]](http:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2015\u002Fpapers\u002FZhang_Appearance-Based_Gaze_Estimation_2015_CVPR_paper.pdf) [[网站]](https:\u002F\u002Fwww.mpi-inf.mpg.de\u002Fdepartments\u002Fcomputer-vision-and-multimodal-computing\u002Fresearch\u002Fgaze-based-human-computer-interaction\u002Fappearance-based-gaze-estimation-in-the-wild-mpiigaze\u002F)\n* 人脸识别\n  * 亚尼夫·泰格曼、杨明、马克·奥雷利奥·兰扎托、利奥尔·沃尔夫，DeepFace：在人脸验证任务中逼近人类水平的表现，CVPR，2014年。[[论文]](https:\u002F\u002Fwww.cs.toronto.edu\u002F~ranzato\u002Fpublications\u002Ftaigman_cvpr14.pdf)\n  * 孙怡、丁亮、王晓刚、唐小鸥，DeepID3：使用超深层神经网络的人脸识别，2015年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1502.00873)\n  * 弗洛里安·施罗夫、德米特里·卡列尼琴科、詹姆斯·菲尔宾，FaceNet：用于人脸识别与聚类的统一嵌入，CVPR，2015年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1503.03832)\n* 面部关键点检测\n  * 吴岳、塔尔·哈斯纳、金康根、杰拉德·梅迪奥尼、普雷姆·纳塔拉詹，改进型卷积神经网络用于面部关键点检测，2015年。[[论文]](http:\u002F\u002Farxiv.org\u002Fabs\u002F1511.04031) [[项目]](http:\u002F\u002Fwww.openu.ac.il\u002Fhome\u002Fhassner\u002Fprojects\u002Ftcnn_landmarks\u002F)\n\n## 课程\n* 深度视觉\n  * [斯坦福] [CS231n：用于视觉识别的卷积神经网络](http:\u002F\u002Fcs231n.stanford.edu\u002F)\n  * [香港中文大学] [ELEG 5040：信号处理高级专题（深度学习导论）](https:\u002F\u002Fpiazza.com\u002Fcuhk.edu.hk\u002Fspring2015\u002Feleg5040\u002Fhome)\n* 更多深度学习\n  * [斯坦福] [CS224d：自然语言处理中的深度学习](http:\u002F\u002Fcs224d.stanford.edu\u002F)\n  * [牛津大学] [由南多·德·弗雷塔斯教授讲授的深度学习](https:\u002F\u002Fwww.cs.ox.ac.uk\u002Fpeople\u002Fnando.defreitas\u002Fmachinelearning\u002F)\n  * [纽约大学] [由扬·勒丘恩教授讲授的深度学习](http:\u002F\u002Fcilvr.cs.nyu.edu\u002Fdoku.php?id=courses:deeplearning2014:start)\n\n## 书籍\n* 免费在线书籍\n  * [深度学习，作者：伊恩·古德费洛、约书亚·本吉奥和亚伦·库维尔](http:\u002F\u002Fwww.iro.umontreal.ca\u002F~bengioy\u002Fdlbook\u002F)\n  * [神经网络与深度学习，作者：迈克尔·尼尔森](http:\u002F\u002Fneuralnetworksanddeeplearning.com\u002F)\n  * [蒙特利尔大学LISA实验室编写的深度学习教程](http:\u002F\u002Fdeeplearning.net\u002Ftutorial\u002Fdeeplearning.pdf)\n\n## 视频\n* 讲座\n  * 
[吴恩达关于深度学习、自学习和无监督特征学习的演讲](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=n1ViNeWhC24)\n  * [杰夫·辛顿关于深度学习最新进展的演讲](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=vShMxxqtDDs)\n  * [扬·勒丘恩关于深度学习的不可思议有效性](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=sc-KbuZqGkI)\n  * [约书亚·本吉奥关于表示学习的深度讲解](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=4xsVFLnHC_0)\n\n\n## 软件\n### 框架\n* TensorFlow：由谷歌开发的开源数值计算软件库，采用数据流图技术 [[官网](https:\u002F\u002Fwww.tensorflow.org\u002F)]\n* Torch7：基于Lua的深度学习库，被Facebook和Google DeepMind使用 [[官网](http:\u002F\u002Ftorch.ch\u002F)]\n  * 基于Torch的深度学习库：[[torchnet](https:\u002F\u002Fgithub.com\u002Ftorchnet\u002Ftorchnet)],\n* Caffe：由BVLC开发的深度学习框架 [[官网](http:\u002F\u002Fcaffe.berkeleyvision.org\u002F)]\n* Theano：Python中的数学库，由LISA实验室维护 [[官网](http:\u002F\u002Fdeeplearning.net\u002Fsoftware\u002Ftheano\u002F)]\n  * 基于Theano的深度学习库：[[Pylearn2](http:\u002F\u002Fdeeplearning.net\u002Fsoftware\u002Fpylearn2\u002F)]、[[Blocks](https:\u002F\u002Fgithub.com\u002Fmila-udem\u002Fblocks)]、[[Keras](http:\u002F\u002Fkeras.io\u002F)]、[[Lasagne](https:\u002F\u002Fgithub.com\u002FLasagne\u002FLasagne)]\n* MatConvNet：用于MATLAB的卷积神经网络 [[官网](http:\u002F\u002Fwww.vlfeat.org\u002Fmatconvnet\u002F)]\n* MXNet：一种灵活高效的深度学习库，适用于异构分布式系统，并支持多种语言 [[官网](http:\u002F\u002Fmxnet.io\u002F)]\n* Deepgaze：基于CNN的人机交互计算机视觉库 [[官网](https:\u002F\u002Fgithub.com\u002Fmpatacchiola\u002Fdeepgaze)]\n\n### 应用程序\n* 对抗训练\n  * “生成对抗网络”论文的代码及超参数 [[官网](https:\u002F\u002Fgithub.com\u002Fgoodfeli\u002Fadversarial)]\n* 理解与可视化\n  * CVPR 2015“通过反演理解深度图像表示”的源代码。[[官网](https:\u002F\u002Fgithub.com\u002Faravindhm\u002Fdeep-goggle)]\n* 语义分割\n  * CVPR 2014“用于精确目标检测和语义分割的丰富特征层次结构”论文的源代码。[[官网](https:\u002F\u002Fgithub.com\u002Frbgirshick\u002Frcnn)]\n  * CVPR 2015“用于语义分割的全卷积网络”论文的源代码。[[官网](https:\u002F\u002Fgithub.com\u002Flongjon\u002Fcaffe\u002Ftree\u002Ffuture)]\n* 超分辨率\n  * 动漫风格艺术的图像超分辨率 [[官网](https:\u002F\u002Fgithub.com\u002Fnagadomi\u002Fwaifu2x)]\n* 边缘检测\n  * CVPR 
2015“DeepContour：通过正共享损失学习的深度卷积特征用于边缘检测”论文的源代码。[[官网](https:\u002F\u002Fgithub.com\u002Fshenwei1231\u002FDeepContour)]\n  * ICCV 2015“整体嵌套式边缘检测”论文的源代码。[[官网](https:\u002F\u002Fgithub.com\u002Fs9xie\u002Fhed)]\n\n## 教程\n* [CVPR 2014] [计算机视觉中的深度学习教程](https:\u002F\u002Fsites.google.com\u002Fsite\u002Fdeeplearningcvpr2014\u002F)\n* [CVPR 2015] [使用Torch的计算机视觉深度学习应用教程](https:\u002F\u002Fgithub.com\u002Fsoumith\u002Fcvpr2015)\n\n## 博客\n* [深入兔子洞：CVPR 2015 及其之后@Tombone 的计算机视觉博客](http:\u002F\u002Fwww.computervisionblog.com\u002F2015\u002F06\u002Fdeep-down-rabbit-hole-cvpr-2015-and.html)\n* [CVPR 回顾与未来方向@Zoya Bylinskii（MIT 博士生）的博客](http:\u002F\u002Fzoyathinks.blogspot.kr\u002F2015\u002F06\u002Fcvpr-recap-and-where-were-going.html)\n* [Facebook 的人工智能绘画@Wired](http:\u002F\u002Fwww.wired.com\u002F2015\u002F06\u002Ffacebook-googles-fake-brains-spawn-new-visual-reality\u002F)\n* [Inceptionism：深入神经网络@Google 研究](http:\u002F\u002Fgoogleresearch.blogspot.kr\u002F2015\u002F06\u002Finceptionism-going-deeper-into-neural.html)\n* [神经网络的实现](http:\u002F\u002Fpeterroelants.github.io\u002F)","# Awesome Deep Vision 快速上手指南\n\n**项目说明**：`awesome-deep-vision` 并非一个可直接安装的软件库或框架，而是一个**精选资源列表**（Curated List）。它汇集了计算机视觉领域深度学习相关的经典论文、课程、书籍、开源代码库及教程。\n\n本指南将指导开发者如何利用该列表查找资源，并快速搭建环境运行列表中推荐的经典模型（如 ResNet, YOLO, SRCNN 等）。\n\n## 1. 环境准备\n\n由于列表中涵盖的模型多基于主流深度学习框架，建议优先配置 **Python** 和 **PyTorch** 或 **TensorFlow** 环境。大多数现代复现代码推荐使用 PyTorch。\n\n### 系统要求\n*   **操作系统**: Linux (Ubuntu 18.04+ 推荐), macOS, 或 Windows (WSL2 推荐)\n*   **GPU**: NVIDIA GPU (显存建议 8GB 以上以运行较新模型)，需安装 CUDA Toolkit (通常 11.3+)\n*   **Python**: 3.8 - 3.10\n\n### 前置依赖\n建议使用 `conda` 管理环境，并使用国内镜像源加速下载。\n\n```bash\n# 1. 安装 Miniconda (如果尚未安装)\n# 请访问 https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fhelp\u002Fanaconda\u002F 下载\n\n# 2. 创建虚拟环境\nconda create -n deepvision python=3.9 -y\n\n# 3. 激活环境\nconda activate deepvision\n\n# 4. 
安装 PyTorch (使用清华大学镜像源)\n# 注意：通过 pip 默认安装的构建版本因平台而异；如需特定 CUDA 版本或仅 CPU 版本，请按 pytorch.org 提供的安装命令调整\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n\n# 5. 安装通用依赖\npip install opencv-python matplotlib numpy pandas jupyter -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n## 2. 获取资源与安装示例\n\n由于本项目是资源列表，你需要根据需求选择具体的子项目（论文对应的代码库）进行安装。以下以列表中经典的 **目标检测 (Object Detection)** 和 **超分辨率 (Super-Resolution)** 为例。\n\n### 步骤一：浏览与选择\n访问项目主页或 GitHub 仓库，在 `Papers` 章节查找你需要的任务类别（如 `Object Detection` -> `YOLO` 或 `Faster R-CNN`）。\n\n### 步骤二：克隆具体实现代码\n列表中的每个条目通常都链接到了原始的论文和第三方实现的 GitHub 仓库。以 **SRCNN (图像超分辨率)** 为例：\n\n```bash\n# 克隆一个典型的 SRCNN 实现仓库 (示例地址，实际请以列表中链接为准)\ngit clone https:\u002F\u002Fgithub.com\u002Ftwtygqyy\u002Fpytorch-SRCNN.git\ncd pytorch-SRCNN\n```\n\n### 步骤三：安装特定项目依赖\n进入具体项目目录后，安装其专属依赖：\n\n```bash\n# 安装该项目要求的依赖\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n## 3. 基本使用\n\n以下演示如何运行一个典型的深度学习视觉任务（以超分辨率 SRCNN 为例，逻辑适用于列表中大多数分类、检测模型）。\n\n### 数据准备\n确保你拥有测试图片。大多数仓库会提供 `data` 文件夹或下载脚本。\n\n```bash\n# 示例：创建测试目录并放入图片\nmkdir input_images\n# 将你的低分辨率图片放入 input_images 文件夹\n```\n\n### 运行推理\u002F训练\n根据具体仓库的 `README` 执行命令。通常分为“预训练模型推理”和“从头训练”。\n\n**场景 A：使用预训练模型进行推理 (最快上手)**\n\n```bash\n# 运行测试脚本，输入图片路径和输出路径\npython test.py --input_dir .\u002Finput_images --output_dir .\u002Fresults --model_path pretrained\u002Fsrcnn.pth\n```\n\n**场景 B：开始训练 (以 ImageNet 分类为例)**\n\n```bash\n# 许多列表中的项目支持直接指定数据集路径开始训练\npython train.py --data_root \u002Fpath\u002Fto\u002Fimagenet --batch_size 64 --epochs 90\n```\n\n### 查看结果\n*   **图像任务**：检查输出目录生成的图片文件。\n*   **检测\u002F分割任务**：运行脚本通常会弹出窗口显示带有边界框或掩膜的图像，或保存为视频文件。\n*   **日志**：观察终端输出的 Loss 值和 Accuracy 指标。\n\n---\n**提示**：`awesome-deep-vision` 的核心价值在于其分类索引。当你需要研究“语义分割”或“人体姿态估计”时，请直接查阅该列表的对应章节，找到高星标的 GitHub 仓库链接，然后按照上述“克隆 -> 安装依赖 -> 运行”的流程操作即可。","某初创公司的算法工程师小李正负责开发一款智能零售货架系统，需要快速复现最新的商品检测与超分辨率重建算法以优化监控画面。\n\n### 没有 awesome-deep-vision 时\n- 
**文献检索如大海捞针**：在 arXiv 和 Google Scholar 上盲目搜索关键词，耗费数天筛选，却常遗漏像 ResNet 或 Batch Normalization 这类奠基性论文。\n- **技术选型缺乏依据**：面对物体检测、语义分割等细分领域，难以快速对比不同模型的优劣，导致选用了过时或不适配的架构。\n- **学习资源分散零碎**：寻找配套的代码框架、教程博客和公开课程需要在多个网站间跳转，知识体系难以构建，新人上手极慢。\n- **重复造轮子风险高**：因不了解已有的开源实现（Software 章节），团队花费大量时间重新编写本可复用的基础模块。\n\n### 使用 awesome-deep-vision 后\n- **核心资源一键直达**：直接通过分类目录（如 Object Detection、Super-Resolution）定位到微软的 Deep Residual Learning 等关键论文及幻灯片，调研效率提升十倍。\n- **技术路线清晰明确**：依托整理好的细分领域列表，迅速锁定当前 SOTA（最先进）模型，为货架商品识别选择了最优的 CNN 架构。\n- **全栈学习路径完整**：从 Papers 到 Courses、Books 再到 Tutorials，团队成员能按图索骥建立系统的深度学习视觉知识树，缩短培训周期。\n- **开源复用加速落地**：利用 Software 章节提供的成熟框架和应用案例，直接集成现有工具，将原本两个月的原型开发期压缩至两周。\n\nawesome-deep-vision 将散落的计算机视觉珍珠串成项链，让开发者从繁琐的搜集工作中解放，专注于核心算法的创新与落地。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fkjw0612_awesome-deep-vision_9e6e1ced.png","kjw0612","Jiwon Kim","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fkjw0612_a0217bea.png","AI Researcher","SK Telecom",null,"https:\u002F\u002Fgithub.com\u002Fkjw0612",11150,2780,"2026-04-05T14:33:04",1,"","未说明",{"notes":87,"python":85,"dependencies":88},"该项目不是一个可执行的软件工具或代码库，而是一个计算机视觉深度学习资源的精选列表（Awesome List），主要包含论文、课程、书籍和博客文章的链接。因此，它本身没有特定的操作系统、GPU、内存、Python 版本或依赖库要求。用户需根据列表中具体引用的论文所对应的独立代码仓库来确定相应的运行环境需求。此外，README 明确指出该项目目前不再积极维护。",[],[15,90],"其他","2026-03-27T02:49:30.150509","2026-04-11T18:33:33.155149",[94,99,104,109,114,119,124],{"id":95,"question_zh":96,"answer_zh":97,"source_url":98},22761,"在哪里可以找到关于人体姿态估计（Human Pose Estimation）的论文列表和资源？","可以参考专门整理的资源库：https:\u002F\u002Fgithub.com\u002Fcbsudux\u002Fawesome-human-pose-estimation。此外，该领域的重要论文包括：1. Cao et al. (2017) 'Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields' (OpenPose); 2. Pishchulin et al. (2016) 'Deepcut'; 3. Wei et al. (2016) 'Convolutional pose machines'; 4. Newell et al. 
(2016) 'Stacked hourglass networks'。这些论文涵盖了从深度学习早期到实时多人姿态估计的关键进展。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F71",{"id":100,"question_zh":101,"answer_zh":102,"source_url":103},22762,"图像着色（Colorization）相关的研究应该归类到哪里？","图像着色工作通常归类于“底层视觉”（low-level vision）或“其他应用”（other applications）部分。相关的重要工作包括 Richard Zhang 等人的 'Colorful Image Colorization' (ECCV 2016) 以及 Ryan Dahl 的项目。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F44",{"id":105,"question_zh":106,"answer_zh":107,"source_url":108},22763,"文档中的教程链接失效或页面无法访问怎么办？","如果原始教程页面已损坏（例如 Torch 教程），维护者通常会将链接更新为对应的 GitHub 代码仓库地址。遇到此类问题时，建议检查该项目是否已迁移至 GitHub，或直接搜索该教程名称的官方仓库。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F39",{"id":110,"question_zh":111,"answer_zh":112,"source_url":113},22764,"微软何恺明等人的“深度残差学习”（Deep Residual Learning）幻灯片链接显示“页面丢失”怎么办？","原微软研究服务器上的链接可能已失效。建议直接在学术搜索引擎（如 Google Scholar）中搜索论文标题 \"Deep Residual Learning for Image Recognition\"，或在作者的个人主页、GitHub 仓库以及会议（如 ILSVRC）的归档页面中寻找最新的幻灯片或论文 PDF 版本。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F63",{"id":115,"question_zh":116,"answer_zh":117,"source_url":118},22765,"如何为这个 Awesome 列表贡献新的资源或修复错误？","用户可以通过提交 Pull Request (PR) 来添加新资源或修复链接。在提交前，可以在 Issue 中先与维护者确认新资源的分类位置（例如确认某项技术属于哪个章节），待确认后即可提交 PR 进行合并。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F40",{"id":120,"question_zh":121,"answer_zh":122,"source_url":123},22766,"有没有关于立体匹配（Stereo Matching）和视觉跟踪（Visual Tracking）的推荐论文？","对于目标检测相关的基础，推荐阅读 Shaoqing Ren 的论文（即 Faster R-CNN）。针对立体匹配和视觉跟踪，建议关注 CVPR 2015 和 ICML 2015 等顶级会议上的最新发表成果，这些会议通常收录了该年度最具影响力的相关工作。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F2",{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},22767,"项目中的外部链接出现 404 错误该如何处理？","当发现特定研究机构（如 MPI-INF）的项目页面出现 404 
错误时，通常是因为网站重构或项目归档。解决方法是访问该机构的主页，导航至相应的计算机视觉或多模态计算部门，在“研究”或“出版物”栏目中重新定位该项目，或直接搜索论文标题获取最新资源地址。","https:\u002F\u002Fgithub.com\u002Fkjw0612\u002Fawesome-deep-vision\u002Fissues\u002F49",[]]