**Similar projects**

- **stable-diffusion-webui** (AUTOMATIC1111/stable-diffusion-webui, 162,132 stars): A Gradio-based web UI for running the Stable Diffusion image-generation model locally. It replaces the original command-line workflow, which was fragmented and hard to approach, with a single intuitive graphical platform, serving casual creators, designers who need fine control over image details, and developers or researchers probing the model's potential. Beyond text-to-image, image-to-image, inpainting, and outpainting, it adds attention weighting, prompt matrices, negative prompts, and a "highres fix", bundles GFPGAN and CodeFormer face restoration plus several neural upscalers, supports open-ended extension through plugins, and offers optimizations for GPUs with limited VRAM.
- **everything-claude-code** (affaan-m/everything-claude-code, 138,956 stars): A performance-oriented optimization system for AI coding assistants such as Claude Code, Codex, and Cursor. More than a set of config files, it is a framework refined through long real-world use that targets the core weaknesses of coding agents: inefficiency, lost memory, security risks, and lack of continuous learning. Modular skills, intuition boosting, persistent memory, and built-in security scanning improve performance on complex tasks, while a research-first workflow and token-usage optimizations cut latency and cost and guard against common attack vectors. It suits developers, AI researchers, and teams customizing AI workflows in depth, whether building large codebases or running AI-assisted security audits and automated tests; the open-source project won an Anthropic hackathon award and combines multi-language support with a rich set of practical hooks.
- **ComfyUI** (Comfy-Org/ComfyUI, 107,662 stars): A powerful, highly modular visual AI engine for designing and running complex Stable Diffusion pipelines. Instead of writing code, users wire functional nodes into a flowchart-style graph to build custom generation pipelines, removing the complexity and rigidity of configuring advanced workflows: models, parameters, and live previews can be combined freely without a programming background, from basic text-to-image up to multi-step high-resolution refinement. It runs on Windows, macOS, and Linux, supports NVIDIA, AMD, Intel, and Apple Silicon hardware, and was an early adopter of models such as SDXL, Flux, and SD3; its modular architecture lets the community keep extending it, making it one of the most flexible and best-supported open-source diffusion tools for researchers, developers, designers, and serious AI-art enthusiasts.
- **NextChat** (ChatGPTNextWeb/NextChat, 87,618 stars): A light and fast AI assistant offering a smooth, cross-platform chat experience on web, iOS, Android, Windows, macOS, and Linux, solving the twin problems of keeping conversations continuous across devices and managing many AI models in one place. It suits everyday users, students, and office workers as well as teams that need private deployment; developers get a simple self-hosting path with one-click deployment to Vercel or Zeabur. It natively supports mainstream models such as Claude, DeepSeek, GPT-4, and Gemini Pro, adopted the MCP (Model Context Protocol) early for stronger context handling, and a professional edition adds branding, fine-grained permissions, internal knowledge-base integration, and security auditing for enterprise privacy and governance requirements.
- **ML-For-Beginners** (microsoft/ML-For-Beginners, 84,991 stars): Microsoft's structured introductory curriculum on classic machine learning for complete beginners: a 12-week path of 26 lessons and 52 quizzes running from basic concepts to practical applications, giving newcomers the structure they usually lack when facing the field's sprawling body of knowledge. It serves developers switching fields, researchers filling in algorithmic background, and curious hobbyists, pairing clear theory with hands-on practice so skills build step by step. An automated translation pipeline provides versions in more than 50 languages, including Simplified Chinese, and the open, community-maintained curriculum stays current.
- **ragflow** (infiniflow/ragflow, 77,062 stars): A leading open-source retrieval-augmented generation (RAG) engine that builds a more accurate and reliable context layer for large language models, combining RAG with agent capabilities so a system can both extract knowledge from documents and plan multi-step tasks on top of it. Hallucination and stale knowledge are the usual pain points of LLM applications; by deeply parsing complex document structure (tables, charts, mixed layouts), RAGFlow sharpens retrieval accuracy, which keeps answers grounded and current, and its built-in agent mechanism lets the system plan its own steps for harder problems (a minimal sketch of this retrieve-then-generate pattern follows this list). It offers a visual workflow editor and flexible APIs, lowering the barrier for non-algorithm users while leaving room for deep customization, suits developers, enterprise teams, and AI researchers building private knowledge-base Q&A or vertical-domain applications, and is open source under Apache 2.0.
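The ragflow entry above leans on the core RAG idea of grounding a model's answer in retrieved document chunks. The sketch below is a minimal, generic illustration of that retrieve-then-generate pattern, not RAGFlow's actual API: `embed`, `search`, and `generate` are hypothetical placeholders standing in for whatever embedding model, vector index, and LLM a real deployment would wire in.

```python
# Minimal sketch of the retrieve-then-generate (RAG) pattern described above.
# `embed`, `search`, and `generate` are hypothetical placeholders for an
# embedding model, a vector index, and an LLM call; this is not RAGFlow's API.
from typing import Callable


def rag_answer(
    question: str,
    embed: Callable[[str], list[float]],              # text -> embedding vector
    search: Callable[[list[float], int], list[str]],  # (vector, k) -> top-k chunks
    generate: Callable[[str], str],                   # prompt -> completion
    k: int = 5,
) -> str:
    # 1. Retrieve the document chunks most relevant to the question.
    chunks = search(embed(question), k)
    # 2. Ground the model: the prompt carries the retrieved evidence, which is
    #    what curbs hallucination and keeps answers tied to current documents.
    context = "\n\n".join(chunks)
    prompt = (
        "Answer using only the context below. "
        "If the context is insufficient, say so.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}"
    )
    return generate(prompt)
```

Keeping the three stages as injected callables mirrors how RAG engines separate document parsing and indexing from generation, so the grounding step stays swappable; per the description above, RAGFlow layers deep document-structure parsing and agent-style planning on top of this basic loop.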
**Learning-Deep-Learning** (patrick-llgc/Learning-Deep-Learning): Paper reading notes on Deep Learning and Machine Learning.

Learning-Deep-Learning is a repository of paper reading notes on deep learning and machine learning, maintained by Patrick Langechuan Liu, Director of AI for Nvidia's autonomous driving program. It aims to help practitioners digest the field's flood of academic literature in a systematic way.

Against the pain points of papers appearing faster than anyone can read them, newcomers not knowing where to start, and researchers paying a high cost to track the state of the art, it offers a structured path through the material: a must-read list running from foundational convolutional networks to current Transformer architectures, plus deep dives into hot computer-vision and autonomous-driving topics such as bird's-eye-view (BEV) perception, semantic occupancy prediction, and 3D lane-line detection.

The notes suit AI developers, algorithm researchers, and students interested in autonomous driving. Beginners get an explicit first-month reading plan; experienced practitioners get a perspective that combines mass-production industry experience with the latest academic results.

Its distinctive strength is the dual "theory + practice" angle: alongside the standard paper notes, the author's "The Thinking Car" column of technical blog posts ties abstract algorithms to the practical challenges of shipping mass-production autonomous driving in China, offering industry insight that is hard to find elsewhere.

# Paper notes
This repository contains my paper reading notes on deep learning and machine learning. It is inspired by [Denny Britz](https://github.com/dennybritz/deeplearning-papernotes) and [Daniel Takeshi](https://github.com/DanielTakeshi/Paper_Notes). A minimalistic webpage generated with GitHub Pages can be found [here](https://patrick-llgc.github.io/Learning-Deep-Learning/).

## About me
My name is [Patrick Langechuan Liu](https://www.linkedin.com/in/patrick-llgc/).
After about a decade of training and research in physics, I found my passion in deep learning and autonomous driving.

I am currently Director of AI at Nvidia, leading the ML modeling effort for Nvidia's end-to-end autonomous driving project, Alpamayo.

## What to read
If you are new to deep learning in computer vision and don't know where to start, I suggest you spend your first month or so diving deep into [this list of papers](start/first_cnn_papers.md). I did so ([see my notes](start/first_cnn_papers_notes.md)) and it served me well.

Here is [a list of trustworthy sources of papers](trusty.md) in case I run out of papers to read.

## My review posts by topics
I regularly update my blog column [the Thinking Car](https://medium.com/the-thinking-car).

- [A Crash Course of Planning for Perception Engineers in Autonomous Driving](https://medium.com/the-thinking-car/a-crash-course-of-planning-for-perception-engineers-in-autonomous-driving-ede324d78717)
- [BEV Perception in Mass Production Autonomous Driving](https://medium.com/the-thinking-car/bev-perception-in-mass-production-autonomous-driving-c6e3f1e46ae0)
- [Challenges of Mass Production Autonomous Driving in China](https://medium.com/the-thinking-car/challenges-of-mass-production-autonomous-driving-in-china-407c7e2dc5d8)
- [Vision-centric Semantic Occupancy Prediction for Autonomous Driving](https://medium.com/the-thinking-car/vision-centric-semantic-occupancy-prediction-for-autonomous-driving-16a46dbd6f65) ([related paper notes](topics/topic_occupancy_network.md))
- [Drivable Space in Autonomous Driving — The Industry](https://medium.com/the-thinking-car/drivable-space-in-autonomous-driving-the-industry-7a4624b94d41)
- [Drivable Space in Autonomous Driving — The Academia](https://medium.com/the-thinking-car/drivable-space-in-autonomous-driving-a-review-of-academia-ef1a6aa4dc15)
- [Drivable Space in Autonomous Driving — The Concept](https://medium.com/the-thinking-car/drivable-space-in-autonomous-driving-the-concept-df699bb8682f)
- [Monocular BEV Perception with Transformers in Autonomous Driving](https://medium.com/the-thinking-car/monocular-bev-perception-with-transformers-in-autonomous-driving-c41e4a893944) ([related paper notes](topics/topic_transformers_bev.md))
- [Illustrated Differences between MLP and Transformers for Tensor Reshaping in Deep Learning](https://medium.com/the-thinking-car/illustrated-difference-between-mlp-and-transformers-for-tensor-reshaping-52569edaf89)
- [Monocular 3D Lane Line Detection in Autonomous Driving](https://medium.com/the-thinking-car/monocular-3d-lane-line-detection-in-autonomous-driving-4d7cdfabf3b6) ([related paper notes](topics/topic_3d_lld.md))
- [Deep-Learning based Object detection in Crowded Scenes](https://medium.com/the-thinking-car/deep-learning-based-object-detection-in-crowded-scenes-1c9fddbd7bc4) ([related paper notes](topics/topic_crowd_detection.md))
- [Monocular Bird's-Eye-View Semantic Segmentation for Autonomous Driving](https://medium.com/the-thinking-car/monocular-birds-eye-view-semantic-segmentation-for-autonomous-driving-ee2f771afb59) ([related paper notes](topics/topic_bev_segmentation.md))
- [Deep Learning in Mapping for Autonomous Driving](https://medium.com/the-thinking-car/deep-learning-in-mapping-for-autonomous-driving-9e33ee951a44)
- [Monocular Dynamic Object SLAM in Autonomous Driving](https://medium.com/the-thinking-car/monocular-dynamic-object-slam-in-autonomous-driving-f12249052bf1)
- [Monocular 3D Object Detection in Autonomous Driving — A Review](https://medium.com/the-thinking-car/monocular-3d-object-detection-in-autonomous-driving-2476a3c7f57e)
- [Self-supervised Keypoint Learning — A Review](https://medium.com/the-thinking-car/self-supervised-keypoint-learning-aade18081fc3)
- [Single Stage Instance Segmentation — A Review](https://medium.com/the-thinking-car/single-stage-instance-segmentation-a-review-1eeb66e0cc49)
- [Self-paced Multitask Learning — A Review](https://medium.com/the-thinking-car/self-paced-multitask-learning-76c26e9532d0)
- [Convolutional Neural Networks with Heterogeneous Metadata](https://medium.com/the-thinking-car/convolutional-neural-networks-with-heterogeneous-metadata-2af9241218a9)
- [Lifting 2D object detection to 3D in autonomous driving](https://medium.com/the-thinking-car/geometric-reasoning-based-cuboid-generation-in-monocular-3d-object-detection-5ee2996270d1)
- [Multimodal Regression](https://medium.com/the-thinking-car/anchors-and-multi-bin-loss-for-multi-modal-target-regression-647ea1974617)

## Notes of AI Podcasts
- [Interview of Jiali Weng at OpenAI by WhynotTV](https://www.bilibili.com/video/BV1darmBcE4A/) [[Notes](podcast/jialiweng.md)] [Interview, Jiali Weng/翁家笠, OpenAI, WhynotTV]

## Scratchpad by Topics
This section contains quick notes (like git gists) to my future self.

- [Compute Hardware](gist/compute_hardware.md)
- [Attention masks](gist/attention_mask.md)

## 2026-02 (1)
- [DFlash: Block Diffusion for Flash Speculative Decoding](https://arxiv.org/abs/2602.06036)
- [VLM4VLA: Revisiting Vision-Language-Models in Vision-Language-Action Models](https://arxiv.org/abs/2601.03309) [[Notes](paper_notes/vlm4vla.md)] [Qwen team]

## 2026-01 (10)
- [Stepwise Internalization: From Explicit CoT to Implicit CoT: Learning to Internalize CoT Step by Step](https://arxiv.org/abs/2405.14838) [[Notes](paper_notes/stepwise_internalization.md)] [Yejin Choi, latent CoT]
- [Coconut: Training Large Language Models to Reason in a Continuous Latent Space](https://arxiv.org/abs/2412.06769) [[Notes](paper_notes/coconut.md)] <kbd>COLM 2025</kbd> [Yuandong Tian, latent CoT]
- [DLCM: Dynamic Large Concept Models: Latent Reasoning in an Adaptive Semantic Space](https://arxiv.org/abs/2512.24617) [[Notes](paper_notes/dlcm.md)] [Xingwei Qu, ByteDance, latent CoT --> concept models with interpretability]
- [A Survey on Latent Reasoning](https://arxiv.org/abs/2507.06203) [[Notes](paper_notes/latent_cot_horizon.md)] [Xingwei Qu, ByteDance]
- [Large Language Diffusion Models](https://arxiv.org/abs/2502.09992)
- [Dream 7B: Diffusion Large Language Models](https://arxiv.org/abs/2508.15487)
- [Seed Diffusion: A Large-Scale Diffusion Language Model with High-Speed Inference](https://arxiv.org/abs/2508.02193) [Bytedance Seed]
- [MMaDA: Multimodal Large Diffusion Language Models](https://arxiv.org/abs/2505.15809) [Bytedance Seed]
- [Fast-dLLM: Training-free Acceleration of Diffusion LLM by Enabling KV Cache and Parallel Decoding](https://arxiv.org/abs/2505.22618) [[Notes](paper_notes/fast_dllm.md)] [Song Han, Enze Xie]
- [Fast-dLLM v2: Efficient Block-Diffusion LLM](https://arxiv.org/abs/2509.26328) [[Notes](paper_notes/fast_dllm_v2.md)] [Song Han, Enze Xie]
- [Efficient-DLM: From Autoregressive to Diffusion Language Models, and Beyond in Speed](https://arxiv.org/abs/2512.14067) [[Notes](paper_notes/efficient_dlm.md)] [Pavlo, Song Han, Nvidia]
- [TiDAR: Think in Diffusion, Talk in Autoregression](https://arxiv.org/abs/2511.08923) [[Notes](paper_notes/tidar.md)] [Nvidia, Pavlo]
- [DFlash: Block Diffusion for Flash Speculative Decoding](https://z-lab.ai/projects/dflash/)
- [Block Diffusion: Interpolating Between Autoregressive and Diffusion Language Models](https://arxiv.org/abs/2503.09573) <kbd>ICLR 2025 Oral</kbd>
- [SD3: Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](https://arxiv.org/pdf/2403.03206) [Patrick Esser, Stable Diffusion]
- [Wan: Open and Advanced Large-Scale Video Generative Models](https://arxiv.org/abs/2503.20314) [通义万象]
- [RHO-1: Not All Tokens Are What You Need](https://arxiv.org/abs/2404.07965) [[Notes](paper_notes/rho1.md)] <kbd>NeurIPS 2024 Oral</kbd>
- [WOD-E2E: Waymo Open Dataset for End-to-End Driving in Challenging Long-tail Scenarios](https://arxiv.org/abs/2510.26125) [[Notes](paper_notes/wod_e2e.md)] [Waymo]
- [LVP: Large Video Planner Enables Generalizable Robot Control](https://arxiv.org/abs/2512.15840) [[Notes](paper_notes/lvp.md)]
- [Robust Finetuning of Vision-Language-Action Robot Policies via Parameter Merging](https://arxiv.org/abs/2512.08333)
- [Epona: Autoregressive Diffusion World Model for Autonomous Driving](https://arxiv.org/abs/2506.24113) <kbd>ICCV 2025</kbd> [Horizon]
- [HaMeR: Reconstructing Hands in 3D with Transformers](https://arxiv.org/abs/2312.05251) [hand mesh recovery, Berkeley, mono recon of MANO parameterization]
- [MegaSaM: Accurate, Fast, and Robust Structure and Motion from Casual Dynamic Videos](https://arxiv.org/abs/2412.04463) <kbd>CVPR 2025 Best Paper Shortlist</kbd> [SaM: Structure and Motion]

## 2025-12 (0)
- [DDPM: Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) [Berkeley, Jonathan Ho, Pieter Abbeel]
- [DDIM: Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) <kbd>ICLR 2021</kbd> [Stanford, Stefano Ermon]
- [score SDE: Score-Based Generative Modeling through Stochastic Differential Equations](https://arxiv.org/abs/2011.13456) <kbd>ICLR 2021 Oral</kbd> [Stanford, Yang Song, Stefano Ermon]
- [Improved DDPM: Improved Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2102.09672) [OpenAI, Prafulla Dhariwal]
- [Classifier Guidance: Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/abs/2105.05233) [OpenAI, Prafulla Dhariwal]
- [LDM: Latent Diffusion Model: High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) <kbd>CVPR 2022</kbd> [Stable Diffusion v1, Patrick Esser]
- [CFG: Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598) <kbd>NeurIPS 2021</kbd> [Jonathan Ho, Google Brain]
- [Rectified flow: Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow](https://arxiv.org/abs/2209.03003) [Xingchao Liu, UT Austin, even earlier than flow matching paper]
- [Flow Matching for Generative Modeling](https://arxiv.org/abs/2210.02747) [FAIR]
- [DiT: Scalable Diffusion Models with Transformers](https://arxiv.org/abs/2212.09748) [Saining Xie]
- [OT-CFM: Improving and generalizing flow-based generative models with minibatch optimal transport](https://arxiv.org/abs/2302.00482) <kbd>TMLR 2023</kbd> [Alex Tong]
- [Mean Flows for One-step Generative Modeling](https://arxiv.org/abs/2505.13447) [Kaiming He]
- [iMF: Improved Mean Flows: On the Challenges of Fastforward Generative Models](https://arxiv.org/abs/2512.02012) [Kaiming He]

## 2025-09 (2)
- [Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning](https://arxiv.org/abs/2503.15558) [[Notes](paper_notes/cosmos_reason1.md)] [Nvidia]
- [CoVLA: Comprehensive Vision-Language-Action Dataset for Autonomous Driving](https://arxiv.org/abs/2408.10845) [[Notes](paper_notes/covla.md)] <kbd>WACV 2025</kbd> [~80 h real-world driving videos with paired language and trajectory annotations; largest VLA-for-driving dataset]
- [SimLingo: Vision-Only Closed-Loop Autonomous Driving with Language-Action Alignment](https://arxiv.org/abs/2503.09594) <kbd>CVPR 2025</kbd> [1st Place @ CARLA Challenge 2024; SOTA on CARLA LB 2.0 and Bench2Drive; cited by AutoVLA]
- [AutoVLA: A Vision-Language-Action Model for End-to-End Autonomous Driving with Adaptive Reasoning and Reinforcement Fine-Tuning](https://arxiv.org/abs/2506.13757) <kbd>arXiv 2025-06</kbd> [Trajectory tokenization; dual-thinking (fast vs slow CoT) modes; GRPO fine-tuning; evaluated on nuPlan, nuScenes, Waymo, CARLA; cites SimLingo]
- [DriveAgent-R1: Advancing VLM-based Autonomous Driving with Hybrid Thinking and Active Perception](https://arxiv.org/abs/2507.20879) <kbd>arXiv 2025-07</kbd> [Hybrid Thinking (text vs tool based CoT) + Active Perception; 3-stage RL training; built on DriveVLM lineage]
- [AgentThink: A Unified Framework for Tool-Augmented Chain-of-Thought Reasoning in Vision-Language Models for Autonomous Driving](https://arxiv.org/abs/2505.15298) [Kangan Qian, Sicong Jiang, Xiaomi] <kbd>EMNLP25</kbd>
- [VERDI: VLM-Embedded Reasoning for Autonomous Driving](https://arxiv.org/abs/2505.15925) <kbd>arXiv 2025-05</kbd> [Distills VLM reasoning into a modular AD stack; aligns perception, prediction, planning; improves nuScenes performance with no VLM inference cost]
- [Poutine: Vision-Language-Trajectory Pre-Training and Reinforcement Post-Training Enable Robust End-to-End Autonomous Driving](https://arxiv.org/abs/2506.11234) <kbd>arXiv 2025-06</kbd> [3B-param VLM trained on 83 h CoVLA + 11 h Waymo long-tail; RL fine-tuned (GRPO); 1st Place in Waymo Vision-Based E2E Driving Challenge (RFS=7.99)]
- [ReasonPlan: Unified Scene Prediction and Decision Reasoning for Closed-Loop Autonomous Driving](https://arxiv.org/abs/2505.20024) <kbd>arXiv 2025-05</kbd> [Chain-of-Thought planning; significant closed-loop improvements on Bench2Drive]
- [DiffVLA: Vision-Language-Guided Diffusion Planning for Autonomous Driving](https://arxiv.org/abs/2505.19381) <kbd>arXiv 2025-05</kbd> [VLM-guided diffusion trajectory planning; top performance in Autonomous Grand Challenge 2025]
- [VLAD: A VLM-Augmented Autonomous Driving Framework](https://arxiv.org/abs/2507.01284) <kbd>ITSC 2025</kbd> [VLM generates high-level commands for E2E controller; enhances interpretability and planning safety]
- [DriveAction (benchmark): DriveAction: A Benchmark for Exploring Human-like Driving Decisions in VLA Models](https://arxiv.org/abs/2506.24044) <kbd>arXiv 2025-06</kbd> [Action-rooted evaluation with QA pairs across driving scenarios; cited by VLA4AD survey; gaining traction]
- [DINOv3](https://arxiv.org/abs/2508.10104) [High res Dino]
- [LiveVLM: Efficient Online Video Understanding via Streaming-Oriented KV Cache and Retrieval](https://arxiv.org/abs/2505.15269)
- [DexUMI: Using Human Hand as the Universal Manipulation Interface for Dexterous Manipulation](https://arxiv.org/abs/2505.21864) [dexterous hand data collection, Shuran Song, Jim Fan]
- [DEXOP: A Device for Robotic Transfer of Dexterous Human Manipulation](https://arxiv.org/abs/2509.04441) [Best Paper Award @ RSS 2025]
- [HAD dataset: Grounding Human-to-Vehicle Advice for Self-driving Vehicles](https://arxiv.org/abs/1911.06978) <kbd>CVPR 2019</kbd> [John Canny, Honda Research Institute, 2019, VLA OG]
- [RAD: Training an End-to-End Driving Policy via Large-Scale 3DGS-based Reinforcement Learning](https://arxiv.org/abs/2502.13144) <kbd>NeurIPS 2025</kbd> [Horizon]
- [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://arxiv.org/abs/2504.13180) [FAIR]
- [Difix3D+: Improving 3D Reconstructions with Single-Step Diffusion Models](https://arxiv.org/abs/2503.01774) <kbd>CVPR 2025 best paper candidates</kbd> [Nvidia, Sanja Fidler]
- [π∗0.6: a VLA That Learns From Experience](https://arxiv.org/abs/2511.14759) [VLA + RL]
- [π𝚁𝙻: Online RL Fine-tuning for Flow-based Vision-Language-Action Models](https://arxiv.org/abs/2510.25889) [VLA + RL]
- [VLA-RL: Towards Masterful and General Robotic Manipulation with Scalable Reinforcement Learning](https://arxiv.org/abs/2505.18719) [VLA + RL]
- [GR-RL: Going Dexterous and Precise for Long-Horizon Robotic Manipulation](https://arxiv.org/abs/2512.01801) [VLA + RL]
- [GR-3 Technical Report](https://arxiv.org/abs/2507.15493) [VLA, Bytedance]
- [Mixture of Horizons in Action Chunking](https://arxiv.org/abs/2511.19433)

## 2025-06 (1)
- [V-JEPA 
2: Self-Supervised Video Models Enable Understanding, Prediction and Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.09985) [LeCun]\n- [V-JEPA: Revisiting Feature Prediction for Learning Visual Representations from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.08471) \u003Ckbd>ICLR 2025\u003C\u002Fkbd>\n- [I-JEPA: Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243) \u003Ckbd>ICCV 2023\u003C\u002Fkbd>\n- [PlaNet: Learning Latent Dynamics for Planning from Pixels](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04551)\n- [DreamerV1: Dream to Control: Learning Behaviors by Latent Imagination](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603)\n- [DreamerV2: Mastering Atari with Discrete World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02193) \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [DreamerV3: Mastering Diverse Domains through World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104) \u003Ckbd>Nature 2025\u003C\u002Fkbd>\n- [DayDreamer: World Models for Physical Robot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [Dynalang: Learning to Model the World with Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01399) \u003Ckbd>ICML 2024\u003C\u002Fkbd>\n- [Tokenize the World into Object-level Knowledge to Address Long-tail Events in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.00959) [[Notes](paper_notes\u002Ftoken_ad.md)] [Marco Pavone, Nvidia]\n- [SparseDrive: End-to-End Autonomous Driving via Sparse Scene Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19620) [[Notes](sparse_drive.md)] \u003Ckbd>ICRA 2025\u003C\u002Fkbd> [Horizon]\n- [HE-Drive: Human-Like End-to-End Driving with Vision Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.05051) \u003Ckbd>IROS 2025\u003C\u002Fkbd> [Horizon]\n- [GPT-Driver: Learning to Drive with GPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01415) [NeurIPS 2023, Hang Zhao]\n- [Driving with LLMs: Fusing Object-Level Vector Modality for Explainable Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01957) [[Notes](paper_notes\u002Fdriving_with_llms.md)] \u003Ckbd>ICRA 2024\u003C\u002Fkbd> [Wayve]\n- [PARA-Drive: Parallelized Architecture for Real-time Autonomous Driving](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fhtml\u002FWeng_PARA-Drive_Parallelized_Architecture_for_Real-time_Autonomous_Driving_CVPR_2024_paper.html) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Marco Pavone, NVidia]\n- [PDM-Closed: Parting with Misconceptions about Learning-based Vehicle Motion Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07962) [[Notes](paper_notes\u002Fpdm_closed.md)] \u003Ckbd>CoRL 2023\u003C\u002Fkbd>\n- [Ego-MLP: Is Ego Status All You Need for Open-Loop End-to-End Autonomous Driving?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.03031) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [AD-MLP: Rethinking the Open-Loop Evaluation of End-to-End Autonomous Driving in nuScenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10430) [Baidu]\n- [GAIA-2: A Controllable Multi-View Generative World Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.20523) [Wayve]\n- [Cameras as Relative Positional Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.10496)\n\n## 2025-04\n- [Scenario Dreamer: Vectorized Latent Diffusion for Generating Driving Simulation 
Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22496) \u003Ckbd>CVPR 2025\u003C\u002Fkbd>\n- [Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.19417) [Physical Intelligence]\n- [Finetuning Generative Trajectory Model with Reinforcement Learning from Human Feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.10434) [Li Auto, RLHF]\n- [TokenFLEX: Unified VLM Training for Flexible Visual Tokens Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.03154) [Li Auto]\n- [Fast-in-Slow: A Dual-System Foundation Model Unifying Fast Manipulation within Slow Reasoning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.01953)\n- [STORM: Spatio-Temporal Reconstruction Model for Large-Scale Outdoor Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.00602)\n\n\n## 2024-12 (0)\n- [VLM-AD: End-to-End Autonomous Driving through Vision-Language Model Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14446) [Cruise]\n- [GPD-1: Generative Pre-training for Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.08643) [PhiGent]\n- [Transformers Inference Optimization Toolset](https:\u002F\u002Fastralord.github.io\u002Fposts\u002Ftransformer-inference-optimization-toolset\u002F)\n- [Thinking in Space: How Multimodal Large Language Models See, Remember, and Recall Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14171) [Fei-fei Li]\n- [Probing the 3D Awareness of Visual Foundation Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.08636) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [iVideoGPT: Interactive VideoGPTs are Scalable World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15223) \u003Ckbd>NeurIPS 2024\u003C\u002Fkbd>\n- [CarLLaVA: Vision language models for camera-only closed-loop driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10165) [Wayve]\n- [Hints of Prompt: Enhancing Visual Representation for Multimodal LLMs in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.13076) [DeepRoute]\n- [LAW: Enhancing End-to-End Autonomous Driving with Latent World Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.08481)\n- [TCP: Trajectory-guided Control Prediction for End-to-end Autonomous Driving: A Simple yet Strong Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08129) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [E2E planning, Hongyang]\n- [When Worse is Better: Navigating the compression-generation tradeoff in visual tokenization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16326)\n- [RoGs: Large Scale Road Surface Reconstruction with Meshgrid Gaussian](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14342)\n- [RoMe: Towards Large Scale Road Surface Reconstruction via Mesh Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11368)\n- [SLEDGE: Synthesizing Driving Environments with Generative Models and Rule-Based Traffic](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.17933) \u003Ckbd>ECCV 2024\u003C\u002Fkbd>\n- [Lookahead: Break the Sequential Dependency of LLM Inference Using Lookahead Decoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02057) [specdec]\n- [EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.15077) [specdec]\n- [EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.16858) [specdec]\n- [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding 
Heads](https://arxiv.org/abs/2401.10774)
- [RealGen: Retrieval Augmented Generation for Controllable Traffic Scenarios](https://arxiv.org/abs/2312.13303) <kbd>ECCV 2024</kbd>
- [MobileVLM V2: Faster and Stronger Baseline for Vision Language Model](https://arxiv.org/abs/2402.03766)
- [Open Sourcing π0](https://www.physicalintelligence.company/blog/openpi) [PI, Industry]
- [Helix: A Vision-Language-Action Model for Generalist Humanoid Control](https://www.figure.ai/news/helix) [Figure, Industry]
- [AM-RADIO: Agglomerative Vision Foundation Model -- Reduce All Domains Into One](https://arxiv.org/abs/2312.06709v5) <kbd>CVPR 2024</kbd>
- [Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model](https://arxiv.org/abs/2408.11039)
- [MetaMorph: Multimodal Understanding and Generation via Instruction Tuning](https://arxiv.org/abs/2412.14164)
- [WORLDMEM: Long-term Consistent World Simulation with Memory](https://arxiv.org/abs/2504.12369) [long term memory]
- [PADriver: Towards Personalized Autonomous Driving](https://arxiv.org/pdf/2505.05240) [megvii, personalized driving]

## 2024-11 (1)
- [On the Opportunities and Risks of Foundation Models](https://arxiv.org/abs/2108.07258) [[Notes](paper_notes/opportunities_foundation_models.md)]
- [π0: A Vision-Language-Action Flow Model for General Robot Control](https://arxiv.org/abs/2410.24164) [Physical Intelligence, VLA]
- [EMMA: End-to-End Multimodal Model for Autonomous Driving](https://arxiv.org/abs/2410.23262) [Waymo, VLA]
- [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) <kbd>CVPR 2024</kbd>
- [Depth Anything V2](https://arxiv.org/abs/2406.09414) <kbd>NeurIPS 2024</kbd>
- [CarLLaVA: Vision language models for camera-only closed-loop driving](https://arxiv.org/abs/2406.10165)
- [LVSM: A Large View Synthesis Model with Minimal 3D Inductive Bias](https://arxiv.org/abs/2410.17242) [Scene tokenization]
- [NAVSIM: Data-Driven Non-Reactive Autonomous Vehicle Simulation and Benchmarking](https://arxiv.org/abs/2406.15349) <kbd>NeurIPS 2024</kbd>
- [Driving Everywhere with Large Language Model Policy Adaptation](https://arxiv.org/abs/2402.05932) <kbd>CVPR 2024</kbd> [Marco Pavone]
- [Consistency Models](https://arxiv.org/abs/2303.01469) [diffusion speedup, OpenAI, Yang Song]
- [VILA: On Pre-training for Visual Language Models](https://arxiv.org/abs/2312.07533) <kbd>CVPR 2024</kbd> [Song Han, Yao Lu]

## 2024-06 (8)
- [LINGO-1: Exploring Natural Language for Autonomous Driving](https://wayve.ai/thinking/lingo-natural-language-autonomous-driving/) [[Notes](paper_notes/lingo_1.md)] [Wayve, open-loop world model]
- [LINGO-2: Driving with Natural Language](https://wayve.ai/thinking/lingo-2-driving-with-language/) 
[[Notes](paper_notes\u002Flingo_2.md)] [Wayve, closed-loop world model]\n- [OpenVLA: An Open-Source Vision-Language-Action Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09246) [open source RT-2]\n- [Parting with Misconceptions about Learning-based Vehicle Motion Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07962) \u003Ckbd>CoRL 2023\u003C\u002Fkbd> [Simple non-learning based baseline]\n- [QuAD: Query-based Interpretable Neural Motion Planning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.01486) [Waabi]\n- [MPDM: Multipolicy decision-making in dynamic, uncertain environments for autonomous driving](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7139412) [[Notes](paper_notes\u002Fmpdm.md)] \u003Ckbd>ICRA 2015\u003C\u002Fkbd> [Behavior planning, UMich, May Autonomy]\n- [MPDM2: Multipolicy Decision-Making for Autonomous Driving via Changepoint-based Behavior Prediction](https:\u002F\u002Fwww.roboticsproceedings.org\u002Frss11\u002Fp43.pdf) [[Notes](paper_notes\u002Fmpdm2.md)] \u003Ckbd>RSS 2015\u003C\u002Fkbd> [Behavior planning]\n- [MPDM3: Multipolicy decision-making for autonomous driving via changepoint-based behavior prediction: Theory and experiment](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs10514-017-9619-z) \u003Ckbd>RSS 2017\u003C\u002Fkbd> [Behavior planning]\n- [EUDM: Efficient Uncertainty-aware Decision-making for Automated Driving Using Guided Branching](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) [[Notes](paper_notes\u002Feudm.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [Wenchao Ding, Shaojie Shen, Behavior planning]\n- [TPP: Tree-structured Policy Planning with Learned Behavior Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.11902) \u003Ckbd>ICRA 2023\u003C\u002Fkbd> [Marco Pavone, Nvidia, Behavior planning]\n- [MARC: Multipolicy and Risk-aware Contingency Planning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12021) [[Notes](paper_notes\u002Fmarc.md)] \u003Ckbd>RAL 2023\u003C\u002Fkbd> [Shaojie Shen, Behavior planning]\n- [EPSILON: An Efficient Planning System for Automated Vehicles in Highly Interactive Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07993) \u003Ckbd>TRO 2021\u003C\u002Fkbd> [Wenchao Ding, encyclopedia of pnc]\n- [trajdata: A Unified Interface to Multiple Human Trajectory Datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.13924) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd> [Marco Pavone, Nvidia]\n- [Optimal Vehicle Trajectory Planning for Static Obstacle Avoidance using Nonlinear Optimization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09466) [Xpeng]\n- [Jointly Learnable Behavior and Trajectory Planning for Self-Driving Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.04586) [[Notes](paper_notes\u002Fjoint_learned_bptp.md)] \u003Ckbd>IROS 2019 Oral\u003C\u002Fkbd> [Uber ATG, behavioral planning, motion planning]\n- [Enhancing End-to-End Autonomous Driving with Latent World Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.08481)\n- [OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.09243) [Jiwen Lu]\n- [RenderOcc: Vision-Centric 3D Occupancy Prediction with 2D Rendering Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.09502) \u003Ckbd>ICRA 2024\u003C\u002Fkbd>\n- [EmerNeRF: Emergent Spatial-Temporal Scene Decomposition via Self-Supervision](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.02077) [Sanja, Marco, NV]\n- [FB-OCC: 3D Occupancy 
Prediction based on Forward-Backward View Transformation](https://opendrivelab.com/e2ead/AD23Challenge/Track_3_NVOCC.pdf?=&linkId=100000205404832)
- [Trajeglish: Traffic Modeling as Next-Token Prediction](https://arxiv.org/abs/2312.04535) <kbd>ICLR 2024</kbd>
- [Autonomous Driving Strategies at Intersections: Scenarios, State-of-the-Art, and Future Outlooks](https://arxiv.org/pdf/2106.13052) <kbd>ITSC 2021</kbd>
- [Learning-Based Approach for Online Lane Change Intention Prediction](https://ieeexplore.ieee.org/document/6629564/) <kbd>IV 2013</kbd> [SVM, LC intention prediction]
- [Traffic Flow-Based Crowdsourced Mapping in Complex Urban Scenario](https://ieeexplore.ieee.org/document/10171417) <kbd>RAL 2023</kbd> [Wenchao Ding, Huawei, crowdsourced map]
- [FlowMap: Path Generation for Automated Vehicles in Open Space Using Traffic Flow](https://arxiv.org/abs/2305.01622) <kbd>ICRA 2023</kbd>
- [Hybrid A-star: Path Planning for Autonomous Vehicles in Unknown Semi-structured Environments](https://www.semanticscholar.org/paper/Path-Planning-for-Autonomous-Vehicles-in-Unknown-Dolgov-Thrun/0e8c927d9c2c46b87816a0f8b7b8b17ed1263e9c) <kbd>IJRR 2010</kbd> [Dolgov, Thrun, Searching]
- [Optimal Trajectory Generation for Dynamic Street Scenarios in a Frenet Frame](https://www.semanticscholar.org/paper/Optimal-trajectory-generation-for-dynamic-street-in-Werling-Ziegler/6bda8fc13bda8cffb3bb426a73ce5c12cc0a1760) <kbd>ICRA 2010</kbd> [Werling, Thrun, Sampling] [MUST READ for planning folks]
- [Autonomous Driving on Curvy Roads Without Reliance on Frenet Frame: A Cartesian-Based Trajectory Planning Method](https://ieeexplore.ieee.org/document/9703250) <kbd>TITS 2022</kbd>
- [Baidu Apollo EM Motion Planner](https://arxiv.org/abs/1807.08048) [[Notes](paper_notes/apollo_em_planner.md)] [Optimization]
- [基于改进混合A*的智能汽车时空联合规划方法 (Spatio-temporal joint planning for intelligent vehicles based on an improved hybrid A*)](https://www.qichegongcheng.com/CN/abstract/abstract1500.shtml) <kbd>Automotive Engineering (汽车工程), Planning & Decision, 2023</kbd> [Joint optimization, search]
- [Enable Faster and Smoother Spatio-temporal Trajectory Planning for Autonomous Vehicles in Constrained Dynamic Environment](https://journals.sagepub.com/doi/abs/10.1177/0954407020906627) <kbd>JAE 2020</kbd> [Joint optimization, search]
- [Focused Trajectory Planning for Autonomous On-Road Driving](https://www.ri.cmu.edu/pub_files/2013/6/IV2013-Tianyu.pdf) <kbd>IV 2013</kbd> [Joint optimization, Iteration]
- [SSC: Safe Trajectory Generation for Complex Urban Environments Using Spatio-Temporal Semantic Corridor](https://arxiv.org/abs/1906.09788) <kbd>RAL 2019</kbd> [Joint optimization, SSC, Wenchao Ding, Motion planning]
- [AlphaGo: Mastering the game of Go with deep neural networks and tree search](https://www.nature.com/articles/nature16961) [[Notes](paper_notes/alphago.md)] <kbd>Nature 2016</kbd> [DeepMind, MCTS]
- [AlphaZero: A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play](https://www.science.org/doi/full/10.1126/science.aar6404) 
\u003Ckbd>Science 2017\u003C\u002Fkbd> [DeepMind]\n- [MuZero: Mastering Atari, Go, chess and shogi by planning with a learned model](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03051-4) \u003Ckbd>Nature 2020\u003C\u002Fkbd> [DeepMind]\n- [Grandmaster-Level Chess Without Search](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04494) [DeepMind]\n- [Safe, Multi-Agent, Reinforcement Learning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.03295) [MobileEye, desire and traj optimization]\n- [Comprehensive Reactive Safety: No Need For A Trajectory If You Have A Strategy](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.00198) \u003Ckbd>IROS 2022\u003C\u002Fkbd> [Da Fang, Qcraft]\n- [BEVGPT: Generative Pre-trained Large Model for Autonomous Driving Prediction, Decision-Making, and Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.10357) \u003Ckbd>AAAI 2024\u003C\u002Fkbd>\n- [LLM-MCTS: Large Language Models as Commonsense Knowledge for Large-Scale Task Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14078) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd>\n- [Hivt: Hierarchical vector transformer for multi-agent motion prediction](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022\u002Fpapers\u002FZhou_HiVT_Hierarchical_Vector_Transformer_for_Multi-Agent_Motion_Prediction_CVPR_2022_paper.pdf) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Zikang Zhou, agent-centric, motion prediction]\n- [QCNet: Query-Centric Trajectory Prediction](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FZhou_Query-Centric_Trajectory_Prediction_CVPR_2023_paper.pdf) [[Notes](paper_notes\u002Fqcnet.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Zikang Zhou, scene-centric, motion prediction]\n\n## 2024-03 (11)\n- [Genie: Generative Interactive Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15391) [[Notes](paper_notes\u002Fgenie.md)] [DeepMind, World Model]\n- [DriveDreamer: Towards Real-world-driven World Models for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.09777) [[Notes](paper_notes\u002Fdrive_dreamer.md)] [Jiwen Lu, World Model]\n- [WorldDreamer: Towards General World Models for Video Generation via Predicting Masked Tokens](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09985) [[Notes](paper_notes\u002Fworld_dreamer.md)] [Jiwen Lu, World Model]\n- [VideoPoet: A Large Language Model for Zero-Shot Video Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14125) [Like sora, but LLM, NOT world model]\n- [Align your Latents: High-Resolution Video Synthesis with Latent Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08818) [[Notes](paper_notes\u002Fvideo_ldm.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Sanja, Nvidia, VideoLDM, Video prediction]\n- [Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.11795) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [[Notes](paper_notes\u002Fvpt.md)] [OpenAI]\n- [MineDojo: Building Open-Ended Embodied Agents with Internet-Scale Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08853) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [NVidia, Outstanding paper award]\n- [Humanoid Locomotion as Next Token Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.19469) [[Notes](paper_notes\u002Flocomotion_next_token_pred.md)] [Berkeley, EAI]\n- [RPT: Robot Learning with Sensorimotor Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.10007) [[Notes](paper_notes\u002Frpt.md)] 
\u003Ckbd>CoRL 2023 Oral\u003C\u002Fkbd> [Berkeley, EAI]\n- [MVP: Real-World Robot Learning with Masked Visual Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03109) [[Notes](paper_notes\u002Fmvp.md)] \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Berkeley, EAI]\n- [BC-Z: Zero-Shot Task Generalization with Robotic Imitation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.02005) [[Notes](paper_notes\u002Fbc_z.md)] \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [Eric Jang, 1X]\n- [GenAD: Generalized Predictive Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09630) [[Notes](paper_notes\u002Fgenad.md)] \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [HG-DAgger: Interactive Imitation Learning with Human Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02890) [DAgger]\n- [DriveGAN: Towards a Controllable High-Quality Neural Simulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.15060) [[Notes](paper_notes\u002Fdrive_gan.md)] \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd> [Nvidia, Sanja]\n- [VideoGPT: Video Generation using VQ-VAE and Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10157) [[Notes](paper_notes\u002Fvideogpt.md)] [Pieter Abbeel]\n- [LLM, Vision Tokenizer and Vision Intelligence, by Lu Jiang](https:\u002F\u002Fmp.weixin.qq.com\u002Fs\u002FHamz5XMT1tSZHKdPaCBTKg) [[Notes](paper_notes\u002Fllm_vision_intel.md)] [Interview Lu Jiang]\n- [AV2.0: Reimagining an autonomous vehicle](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.05805) [[Notes](paper_notes\u002Fav20.md)] [Wayve, Alex Kendall]\n- [Simulation for E2E AD](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8fivoXbT1Ao&ab_channel=Wayve) [Wayve, Tech Sharing, E2E]\n- [E2E lateral planning](https:\u002F\u002Fblog.comma.ai\u002Fend-to-end-lateral-planning\u002F) [Comma.ai, E2E planning]\n- [Learning and Leveraging World Models in Visual Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.00504) [LeCun, JEPA series]\n- [LVM: Sequential Modeling Enables Scalable Learning for Large Vision Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00785) [Large Vision Models, Jitendra Malik]\n- [LWM: World Model on Million-Length Video And Language With RingAttention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08268) [Pieter Abbeel]\n- [OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16038) [Jiwen Lu, World Model]\n- [GenAD: Generative End-to-End Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.11502)\n- [Transfuser: Multi-Modal Fusion Transformer for End-to-End Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09224) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [E2E planning, Geiger]\n- [Driving with LLMs: Fusing Object-Level Vector Modality for Explainable Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01957) [Wayve, LLM + AD]\n- [LingoQA: Video Question Answering for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14115) [Wayve, LLM + AD]\n- [Panacea: Panoramic and Controllable Video Generation for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16813) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Megvii]\n- [PlanT: Explainable Planning Transformers via Object-Level Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.14222) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [Scene as Occupancy](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02851) \u003Ckbd>ICCV 2023\u003C\u002Fkbd>\n- [The Shift from Models to Compound AI 
Systems](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2024\u002F02\u002F18\u002Fcompound-ai-systems\u002F)\n- [Roach: End-to-End Urban Driving by Imitating a Reinforcement Learning Coach](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.08265) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [Learning from All Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11934) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [LBC: Learning by Cheating](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.12294) \u003Ckbd>CoRL 2019\u003C\u002Fkbd>\n- [Learning to drive from a world on rails](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00636) \u003Ckbd>ICCV 2021 oral\u003C\u002Fkbd> [Philipp Krähenbühl]\n- [Learning from All Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11934) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Philipp Krähenbühl]\n- [VADv2: End-to-End Vectorized Autonomous Driving via Probabilistic Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13243) [Horizon]\n- [VQ-VAE: Neural Discrete Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.00937) \u003Ckbd>NeurIPS 2017\u003C\u002Fkbd> [Image Tokenizer]\n- [VQ-GAN: Taming Transformers for High-Resolution Image Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09841) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Image Tokenizer]\n- [ViT-VQGAN: Vector-quantized Image Modeling with Improved VQGAN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04627) \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [Image Tokenizer]\n- [MaskGIT: Masked Generative Image Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.04200) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [LLM, non-autoregressive]\n- [MAGVIT: Masked Generative Video Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.05199) \u003Ckbd>CVPR 2023 highlight\u003C\u002Fkbd> [Video Tokenizer]\n- [MAGVIT-v2: Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.05737) \u003Ckbd>ICLR 2024\u003C\u002Fkbd> [Video Tokenizer]\n- [Sora: A Review on Background, Technology, Limitations, and Opportunities of Large Vision Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17177) [Reverse Engineering of Sora]\n- [GLaM: Efficient Scaling of Language Models with Mixture-of-Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.06905) \u003Ckbd>ICML 2022\u003C\u002Fkbd> [MoE, LLM]\n- [Lifelong Language Pretraining with Distribution-Specialized Experts](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12281) \u003Ckbd>ICML 2023\u003C\u002Fkbd> [MoE, LLM]\n- [DriveLM: Drive on Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150) [Hongyang Li]\n- [MotionLM: Multi-Agent Motion Forecasting as Language Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16534) \u003Ckbd>ICCV 2023\u003C\u002Fkbd> [Waymo, LLM + AD]\n- CubeLLM: align 2D\u002F3D with language\n- EmerNeRF: ICLR 2024\n- A Language Agent for Autonomous Driving\n- [Toward Driving Scene Understanding: A Dataset for Learning Driver Behavior and Causal]\n- [DriveDreamer-2: LLM-Enhanced World Models for Diverse Driving Video Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.06845)\n- [DriveWorld: 4D Pre-trained Scene Understanding via World Models for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04390) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Zheng Zhu]\n- [Is Sora a World Simulator? 
A Comprehensive Survey on General World Models and Beyond](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.03520) [Zheng Zhu]\n\n## 2024-02 (7)\n- [End-to-end Autonomous Driving: Challenges and Frontiers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16927) [[Notes](paper_notes\u002Fe2e_review_hongyang.md)] [Hongyang Li, Shanghai AI labs]\n- [DriveVLM: The convergence of Autonomous Driving and Large Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12289) [[Notes](paper_notes\u002Fdrivevlm.md)] [Hang Zhao]\n- [DriveGPT4: Interpretable End-to-end Autonomous Driving via Large Language Model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01412) [[Notes](paper_notes\u002Fdrivegpt4.md)] [HKU]\n- [GAIA-1: A Generative World Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17080) [[Notes](paper_notes\u002Fgaia_1.md)] [Wayve, vision foundation model]\n- [ADriver-I: A General World Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.13549) [[Notes](paper_notes\u002Fadriver_i.md)] [Megvii, Xiangyu]\n- [Drive-WM: Driving into the Future: Multiview Visual Forecasting and Planning with World Model for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.17918) [[Notes](paper_notes\u002Fdrive_wm.md)]\n- [X]() [[Notes](paper_notes\u002Fx.md)] [E2E planning]\n\n\n## 2023-12 (4)\n- [ChatGPT for Robotics: Design Principles and Model Abilities](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17582) [[Notes](paper_notes\u002Fprompt_craft.md)] [Microsoft, LLM for robotics]\n- [RoboVQA: Multimodal Long-Horizon Reasoning for Robotics](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.00899) [[Notes](paper_notes\u002Frobovqa.md)] [Google DeepMind, LLM for robotics]\n- [ChatGPT Empowered Long-Step Robot Control in Various Environments: A Case Application](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10235949) [Microsoft Robotics]\n- [GPT-4V(ision) for Robotics: Multimodal Task Planning from Human Demonstration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.12015) [[Notes](paper_notes\u002Fgpt4v_robotics.md)] [LLM for robotics, Microsoft Robotics]\n- [LLM-Brain: LLM as A Robotic Brain: Unifying Egocentric Memory and Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09349) [[Notes](paper_notes\u002Fllm_brain.md)]\n- [Voyager: An Open-Ended Embodied Agent with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291) [[Notes](paper_notes\u002Fvoyager.md)] [Reasoning Critique, Linxi Jim Fan]\n\n## 2023-09 (3)\n- [RetNet: Retentive Network: A Successor to Transformer for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08621) [[Notes](paper_notes\u002Fretnet.md)] [MSRA]\n- [Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16236) [[Notes](paper_notes\u002Ftransformers_are_rnns.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [Linear attention]\n- [AFT: An Attention Free Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) [[Notes](paper_notes\u002Faft.md)] [Apple]\n\n\n## 2023-08 (3)\n- [RT-1: Robotics Transformer for Real-World Control at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817) [[Notes](paper_notes\u002Frt1.md)] [DeepMind]\n- [RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control](https:\u002F\u002Frobotics-transformer2.github.io\u002Fassets\u002Frt2.pdf) [[Notes](paper_notes\u002Frt2.md)] [DeepMind, end-to-end visuomotor]\n- [RWKV: Reinventing RNNs 
for the Transformer Era](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048) [[Notes](paper_notes\u002Frwkv.md)]\n\n## 2023-07 (6)\n- [MILE: Model-Based Imitation Learning for Urban Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07729) [[Notes](paper_notes\u002Fmile.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Alex Kendall]\n- [PaLM-E: An embodied multimodal language model](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03378) [[Notes](paper_notes\u002Fpalm_e.md)] [Google Robotics]\n- [VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models](https:\u002F\u002Fvoxposer.github.io\u002Fvoxposer.pdf) [[Notes](paper_notes\u002Fvoxposer.md)] [Feifei Li]\n- [CaP: Code as Policies: Language Model Programs for Embodied Control](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07753) [[Notes](paper_notes\u002Fcap.md)] [[Project](https:\u002F\u002Fcode-as-policies.github.io\u002F)]\n- [ProgPrompt: Generating Situated Robot Task Plans using Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302) \u003Ckbd>ICRA 2023\u003C\u002Fkbd>\n- [TidyBot: Personalized Robot Assistance with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05658) [[Notes](paper_notes\u002Ftidybot.md)] [[Project](https:\u002F\u002Ftidybot.cs.princeton.edu\u002F)]\n- [SayCan: Do As I Can, Not As I Say: Grounding Language in Robotic Affordances](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691) [[Notes](paper_notes\u002Fsaycan.md)] [[Project](https:\u002F\u002Fsay-can.github.io\u002F)]\n\n\n## 2023-06 (5)\n- [End-to-end review by Shanghai AI Labs](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FEnd-to-end-Autonomous-Driving)\n- [Pix2seq v2: A Unified Sequence Interface for Vision Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07669) [[Notes](paper_notes\u002Fpix2seq_v2.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Geoffrey Hinton]\n- 🦩 [Flamingo: a Visual Language Model for Few-Shot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198) [[Notes](paper_notes\u002Fflamingo.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [DeepMind]\n- 😼 [Gato: A Generalist Agent](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.06175) [[Notes](paper_notes\u002Fgato.md)] \u003Ckbd>TMLR 2022\u003C\u002Fkbd> [DeepMind]\n- [BC-SAC: Imitation Is Not Enough: Robustifying Imitation with Reinforcement Learning for Challenging Driving Scenarios](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11419) [[Notes](paper_notes\u002Fbc_sac.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Waymo]\n- [MGAIL-AD: Hierarchical Model-Based Imitation Learning for Planning in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09539) [[Notes](paper_notes\u002Fmgail_ad.md)] \u003Ckbd>IROS 2022\u003C\u002Fkbd> [Waymo]\n\n\n\n## 2023-05 (7)\n- [SurroundOcc: Multi-Camera 3D Occupancy Prediction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09551) [[Notes](paper_notes\u002Fsurroundocc.md)] [Occupancy Network, Wei Yi, Jiwen Lu]\n- [Occ3D: A Large-Scale 3D Occupancy Prediction Benchmark for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14365) [[Notes](paper_notes\u002Focc3d.md)] [Occupancy Network, Zhao Hang]\n- [Occupancy Networks: Learning 3D Reconstruction in Function Space](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03828) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [[Notes](paper_notes\u002Foccupancy_networks.md)] [Andreas Geiger]\n- [OccFormer: Dual-path Transformer for Vision-based 3D Semantic Occupancy 
Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05316) [Occupancy Network, PhiGent]\n- [Pix2seq: A Language Modeling Framework for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.10852) [[Notes](paper_notes\u002Fpix2seq.md)] \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [Geoffrey Hinton]\n- [VisionLLM: Large Language Model is also an Open-Ended Decoder for Vision-Centric Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11175) [[Notes](paper_notes\u002Fvision_llm.md)] [Jifeng Dai]\n- [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580) [[Notes](paper_notes\u002Fhugging_gpt.md)]\n\n\n## 2023-04 (1)\n- [UniAD: Planning-oriented Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10156) [[Notes](paper_notes\u002Funiad.md)] \u003Ckbd>CVPR 2023 best paper\u003C\u002Fkbd> [BEV, e2e, Hongyang Li]\n\n\n\n## 2023-03 (5)\n- [GPT-4 Technical Report](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08774) [[Notes](paper_notes\u002Fgpt4.md)] [OpenAI, GPT]\n- [OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03991) [[Notes](paper_notes\u002Fopenoccupancy.md)] [Occupancy Network, Jiwen Lu]\n- [VoxFormer: Sparse Voxel Transformer for Camera-based 3D Semantic Scene Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12251) [[Note](paper_notes\u002Fvoxformer.md)] \u003Ckbd>CVPR 2023 highlight\u003C\u002Fkbd> [Occupancy Network, Nvidia]\n- [MonoScene: Monocular 3D Semantic Scene Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.00726) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [[Notes](paper_notes\u002Fmonoscene.md)] [Occupancy Network, single cam]\n- [CoReNet: Coherent 3D scene reconstruction from a single RGB image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12989) [[Notes](paper_notes\u002Fcorenet.md)] \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n\n\n## 2023-02 (4)\n- [Will we run out of data? 
An analysis of the limits of scaling datasets in Machine Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.04325) [[Notes](paper_notes\u002Fout_of_data.md)] [Epoch.ai industry report]\n- [Codex: Evaluating Large Language Models Trained on Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) [[Notes](paper_notes\u002Fcodex.md)] [GPT, OpenAI]\n- [InstructGPT: Training language models to follow instructions with human feedback](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155) [[Notes](paper_notes\u002Finstructgpt.md)] [GPT, OpenAI]\n- [TPVFormer: Tri-Perspective View for Vision-Based 3D Semantic Occupancy Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07817) [[Notes](paper_notes\u002Ftpvformer.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Occupancy Network, Jiwen Lu]\n\n\n## 2023-01 (2)\n- [PPGeo: Policy Pre-training for End-to-end Autonomous Driving via Self-supervised Geometric Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.01006) [[Notes](paper_notes\u002Fppgeo.md)] \u003Ckbd>ICLR 2023\u003C\u002Fkbd>\n- [nuPlan: A closed-loop ML-based planning benchmark for autonomous vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11810) [[Notes](paper_notes\u002Fnuplan.md)]\n\n\n\n## 2022-11 (1)\n- [M2I: From Factored Marginal Trajectory Prediction to Interactive Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.11884) [[Notes](paper_notes\u002Fm2i.md)] \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n\n\n## 2022-10 (1)\n- [Delving into the Devils of Bird's-eye-view Perception: A Review, Evaluation and Recipe](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05324) [[Notes](paper_notes\u002Fdelving_bev.md)] [PJLab]\n\n## 2022-09 (3)\n- [ViP3D: End-to-end Visual Trajectory Prediction via 3D Agent Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01582) [[Notes](paper_notes\u002Fvip3d.md)] [BEV, perception + prediction, Hang Zhao]\n- [MapTR: Structured Modeling and Learning for Online Vectorized HD Map Construction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.14437) [[Notes](paper_notes\u002Fmaptr.md)] [Horizon, BEVNet]\n- [StopNet: Scalable Trajectory and Occupancy Prediction for Urban Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.00991) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [MOTR: End-to-End Multiple-Object Tracking with Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.03247) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [Megvii, MOT]\n- [Anchor DETR: Query Design for Transformer-Based Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07107) [[Notes](paper_notes\u002Fanchor_detr.md)] \u003Ckbd>AAAI 2022\u003C\u002Fkbd> [Megvii]\n\n\n## 2022-08 (1)\n- [HOME: Heatmap Output for future Motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.10968) [[Notes](paper_notes\u002Fhome.md)] \u003Ckbd>ITSC 2021\u003C\u002Fkbd> [behavior prediction, Huawei Paris]\n\n## 2022-07 (8)\n- [PersFormer: 3D Lane Detection via Perspective Transformer and the OpenLane Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11089) [[Notes](paper_notes\u002Fpersformer.md)] [BEVNet, lane line]\n- [VectorMapNet: End-to-end Vectorized HD Map Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08920) [[Notes](paper_notes\u002Fvectormapnet.md)] [BEVNet, LLD, Hang Zhao]\n- [PETR: Position Embedding Transformation for Multi-View 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05625) [[Notes](paper_notes\u002Fpetr.md)] \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [BEVNet]\n- [PETRv2: A Unified Framework for 3D Perception 
from Multi-Camera Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01256) [[Notes](paper_notes\u002Fpetrv2.md)] [BEVNet, MegVii]\n- [M^2BEV: Multi-Camera Joint 3D Detection and Segmentation with Unified Birds-Eye View Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05088) [[Notes](paper_notes\u002Fm2bev.md)] [BEVNet, nvidia]\n- [BEVDepth: Acquisition of Reliable Depth for Multi-view 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.10092) [[Notes](paper_notes\u002Fbevdepth.md)] [BEVNet, NuScenes SOTA, Megvii]\n- [CVT: Cross-view Transformers for real-time Map-view Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.02833) [[Notes](paper_notes\u002Fcvt.md)] \u003Ckbd>CVPR 2022 oral\u003C\u002Fkbd> [UTAustin, Philipp]\n- [Wayformer: Motion Forecasting via Simple & Efficient Attention Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.05844) [[Notes](paper_notes\u002Fwayformer.md)] [Behavior prediction, Waymo]\n\n## 2022-06 (3)\n- [BEVDet4D: Exploit Temporal Cues in Multi-camera 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17054) [[Notes](paper_notes\u002Fbevdet4d.md)] [BEVNet]\n- [BEVerse: Unified Perception and Prediction in Birds-Eye-View for Vision-Centric Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09743) [[Notes](paper_notes\u002Fbeverse.md)] [Jiwen Lu, BEVNet, perception + prediction]\n- [BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.13542) [[Notes](paper_notes\u002Fbevfusion.md)] [BEVNet, Han Song]\n\n## 2022-03 (1)\n- [BEVFormer: Learning Bird's-Eye-View Representation from Multi-Camera Images via Spatiotemporal Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17270) [[Notes](paper_notes\u002Fbevformer.md)] \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [BEVNet, Hongyang Li, Jifeng Dai]\n\n## 2022-02 (1)\n- [TNT: Target-driveN Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08294) [[Notes](paper_notes\u002Ftnt.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [prediction, Waymo, Hang Zhao]\n- [DenseTNT: End-to-end Trajectory Prediction from Dense Goal Sets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.09640) [[Notes](paper_notes\u002Fdense_tnt.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [prediction, Waymo, 1st place winner WOMD]\n\n## 2022-01 (1)\n- [Manydepth: The Temporal Opportunist: Self-Supervised Multi-Frame Monocular Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.14540) [[Notes](paper_notes\u002Fmanydepth.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [monodepth, Niantic]\n- [DEKR: Bottom-Up Human Pose Estimation Via Disentangled Keypoint Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.02300) [[Notes](paper_notes\u002Fdekr.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n\n## 2021-12 (5)\n- [BN-FFN-BN: Leveraging Batch Normalization for Vision Transformers](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021W\u002FNeurArch\u002Fpapers\u002FYao_Leveraging_Batch_Normalization_for_Vision_Transformers_ICCVW_2021_paper.pdf) [[Notes](paper_notes\u002Fbn_ffn_bn.md)] \u003Ckbd>ICCVW 2021\u003C\u002Fkbd> [BN, transformers]\n- [PowerNorm: Rethinking Batch Normalization in Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07845) [[Notes](paper_notes\u002Fpowernorm.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [BN, transformers]\n- [MultiPath++: Efficient Information Fusion and Trajectory Aggregation for Behavior 
Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.14973) [[Notes](paper_notes\u002Fmultipath++.md)] \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Waymo, behavior prediction]\n- [BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11790) [[Notes](paper_notes\u002Fbevdet.md)]\n- [Translating Images into Maps](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.00966) [[Notes](paper_notes\u002Ftranslating_images_to_maps.md)] \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [BEVNet, transformers]\n\n## 2021-11 (4)\n- [DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.06922) [[Notes](paper_notes\u002Fdetr3d.md)] \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [BEVNet, transformers]\n- [Robust-CVD: Robust Consistent Video Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05901) \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd> [[website](https:\u002F\u002Frobust-cvd.github.io\u002F)]\n- [MAE: Masked Autoencoders Are Scalable Vision Learners](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.06377) [[Notes](paper_notes\u002Fmae.md)] [Kaiming He, unsupervised learning]\n- [SimMIM: A Simple Framework for Masked Image Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09886) [[Notes](paper_notes\u002Fsimmim.md)] [MSRA, unsupervised learning, MAE]\n- [iBOT: Image BERT Pre-Training with Online Tokenizer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07832)\n\n## 2021-10 (3)\n- [STSU: Structured Bird's-Eye-View Traffic Scene Understanding from Onboard Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.01997) [[Notes](paper_notes\u002Fstsu.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEV feat stitching, Luc Van Gool]\n- [PanopticBEV: Bird's-Eye-View Panoptic Segmentation Using Monocular Frontal View Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03227) [[Notes](paper_notes\u002Fpanoptic_bev.md)] \u003Ckbd>RAL 2022\u003C\u002Fkbd> [BEVNet, vertical\u002Fhorizontal features]\n- [NEAT: Neural Attention Fields for End-to-End Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04456) [[Notes](paper_notes\u002Fneat.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [[supplementary](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FChitta2021ICCV_supplementary.pdf)] [BEVNet]\n\n\n## 2021-09 (11)\n- [DD3D: Is Pseudo-Lidar needed for Monocular 3D Object detection?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06417) [[Notes](paper_notes\u002Fdd3d.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D, Toyota]\n- [EfficientDet: Scalable and Efficient Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09070) [[Notes](paper_notes\u002Fefficientdet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [BiFPN, Tesla AI day]\n- [PnPNet: End-to-End Perception and Prediction with Tracking in the Loop](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14711) [[Notes](paper_notes\u002Fpnpnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Uber ATG]\n- [MP3: A Unified Model to Map, Perceive, Predict and Plan](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06806) [[Notes](paper_notes\u002Fmp3.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Uber, planning]\n- [BEV-Net: Assessing Social Distancing Compliance by Joint People Localization and Geometric Reasoning](http:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04931) [[Notes](paper_notes\u002Fbevnet_sdca.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEVNet, surveillance]\n- [LiDAR R-CNN: An Efficient and Universal 3D Object 
Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.15297) [[Notes](paper_notes\u002Flidar_rcnn.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TuSimple, Naiyan Wang]\n- [Corner Cases for Visual Perception in Automated Driving: Some Guidance on Detection Approaches](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.05897) [[Notes](paper_notes\u002Fcorner_case_vision_arxiv.md)] [corner cases]\n- [Systematization of Corner Cases for Visual Perception in Automated Driving](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9304789) [[Notes](paper_notes\u002Fcorner_case_vision_iv.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [corner cases]\n- [An Application-Driven Conceptualization of Corner Cases for Perception in Highly Automated Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03678) [[Notes](paper_notes\u002Fcorner_case_multisensor.md)] \u003Ckbd>IV 2021\u003C\u002Fkbd> [corner cases]\n- [PYVA: Projecting Your View Attentively: Monocular Road Scene Layout Estimation via Cross-view Transformation](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fhtml\u002FYang_Projecting_Your_View_Attentively_Monocular_Road_Scene_Layout_Estimation_via_CVPR_2021_paper.html) [[Notes](paper_notes\u002Fpyva.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [[Supplementary](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fsupplemental\u002FYang_Projecting_Your_View_CVPR_2021_supplemental.zip)] [BEVNet]\n- [YOLOF: You Only Look One-level Feature](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.09460) [[Notes](paper_notes\u002Fyolof.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [megvii]\n- [Perceiving Humans: from Monocular 3D Localization to Social Distancing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.00984) [[Notes](paper_notes\u002Fperceiving_humans.md)] \u003Ckbd>TITS 2021\u003C\u002Fkbd> [monoloco++]\n- [PifPaf: Composite Fields for Human Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06593) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Bird's-Eye-View Panoptic Segmentation Using Monocular Frontal View Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03227) [BEVNet]\n- [TransformerFusion: Monocular RGB Scene Reconstruction using Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191)\n- [Projecting Your View Attentively: Monocular Road Scene Layout Estimation via Cross-view Transformation](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FYang_Projecting_Your_View_Attentively_Monocular_Road_Scene_Layout_Estimation_via_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Multi-Modal Fusion Transformer for End-to-End Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09224) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Conditional DETR for Fast Training Convergence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152)\n- [Probabilistic and Geometric Depth: Detecting Objects in Perspective](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14160) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n\n\n## 2021-08 (11)\n- [EgoNet: Exploring Intermediate Representation for Monocular Vehicle Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.08464) [[Notes](paper_notes\u002Fegonet.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [MonoEF: Monocular 3D Object Detection: An Extrinsic Parameter Free Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15796) [[Notes](paper_notes\u002Fmonoef.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [GAC: Ground-aware Monocular 3D Object Detection for 
Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.00690) [[Notes](paper_notes\u002Fgac.md)] \u003Ckbd>RAL 2021\u003C\u002Fkbd> [mono3D]\n- [FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10956) [[Notes](paper_notes\u002Ffcos3d.md)] \u003Ckbd>ICCVW 2021\u003C\u002Fkbd> [mono3D, SenseTime]\n- [GUPNet: Geometry Uncertainty Projection Network for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13774) [[Notes](paper_notes\u002Fgupnet.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D, Wanli Ouyang]\n- [DARTS: Differentiable Architecture Search](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.09055) [[Notes](paper_notes\u002Fdarts.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd> [VGG author]\n- [FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable Neural Architecture Search](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03443) [[Notes](paper_notes\u002Ffbnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [DARTS]\n- [FBNetV2: Differentiable Neural Architecture Search for Spatial and Channel Dimensions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05565) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FBNetV3: Joint Architecture-Recipe Search using Predictor Pretraining](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.02049) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Perceiver: General Perception with Iterative Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03206) [[Notes](paper_notes\u002Fperceiver.md)] \u003Ckbd>ICML 2021\u003C\u002Fkbd> [transformers, multimodal]\n- [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14795) [[Notes](paper_notes\u002Fperceiver_io.md)]\n- [PillarMotion: Self-Supervised Pillar Motion Learning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08683)  [[Notes](paper_notes\u002Fpillar_motion.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Qcraft, Alan Yuille]\n- [SimTrack: Exploring Simple 3D Multi-Object Tracking for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.10312) [[Notes](paper_notes\u002Fsimtrack.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [QCraft, Alan Yuille]\n\n\n## 2021-07 (1)\n- [HDMapNet: An Online HD Map Construction and Evaluation Framework](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.06307) [[Notes](paper_notes\u002Fhdmapnet.md)] \u003Ckbd>CVPR 2021 workshop\u003C\u002Fkbd> [youtube video only, Li Auto]\n\n\n## 2021-06 (2)\n- [FIERY: Future Instance Prediction in Bird's-Eye View from Surround Monocular Cameras](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10490) [[Notes](paper_notes\u002Ffiery.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEVNet, perception + prediction]\n- [Baidu's CNN seg](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F35034215) [[Notes](paper_notes\u002Fcnn_seg.md)]\n\n## 2021-04 (5)\n- [Rethinking the Heatmap Regression for Bottom-up Human Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15175) [[Notes](paper_notes\u002Fswahr.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [megvii] \n- [CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.00324) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [The Overlooked Elephant of Object Detection: Open Set](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fhtml\u002FDhamija_The_Overlooked_Elephant_of_Object_Detection_Open_Set_WACV_2020_paper.html) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- 
[Class-Agnostic Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14204) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n- [OWOD: Towards Open World Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.02603) [[Notes](paper_notes\u002Fowod.md)] \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd>\n- [FsDet: Frustratingly Simple Few-Shot Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06957) \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [MonoFlex: Objects are Different: Flexible Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.02323) [[Notes](paper_notes\u002Fmonoflex.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D, Jiwen Lu, cropped]\n- [monoDLE: Delving into Localization Errors for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16237) [[Notes](paper_notes\u002Fmonodle.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [Exploring 2D Data Augmentation for 3D Monocular Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10786)\n- [OCM3D: Object-Centric Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06041) [mono3D]\n- [FSM: Full Surround Monodepth from Multiple Cameras](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00152) [[Notes](paper_notes\u002Ffsm.md)] \u003Ckbd>ICRA 2021\u003C\u002Fkbd> [monodepth, Xnet]\n\n\n## 2021-03 (4)\n- [CaDDN: Categorical Depth Distribution Network for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.01100) [[Notes](paper_notes\u002Fcaddn.md)] \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd> [mono3D, BEVNet]\n- [DSNT: Numerical Coordinate Regression with Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.07372) [[Notes](paper_notes\u002Fdsnt.md)] [differentiable spatial to numerical transform]\n- [Soft-Argmax: Human pose regression by combining indirect part detection and contextual information](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.02322)\n- [INSTA-YOLO: Real-Time Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06777) [[Notes](paper_notes\u002Finsta_yolo.md)] \u003Ckbd>ICML workshop 2020\u003C\u002Fkbd> [single stage instance segmentation]\n- [CenterNet2: Probabilistic two-stage detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07461) [[Notes](paper_notes\u002Fcenternet2.md)] [CenterNet, two-stage]\n\n\n## 2021-01 (7)\n- [Confluence: A Robust Non-IoU Alternative to Non-Maxima Suppression in Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.00257) [[Notes](paper_notes\u002Fconfluence.md)] [NMS]\n- [BoxInst: High-Performance Instance Segmentation with Box Annotations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02310) [[Notes](paper_notes\u002Fboxinst.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Chunhua Shen, Tian Zhi]\n- [3DSSD: Point-based 3D Single Stage Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10187) [[Notes](paper_notes\u002F3dssd.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [RepVGG: Making VGG-style ConvNets Great Again](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03697) [[Notes](paper_notes\u002Frepvgg.md)] [Megvii, Xiangyu Zhang, ACNet]\n- [ACNet: Strengthening the Kernel Skeletons for Powerful CNN via Asymmetric Convolution Blocks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03930) [[Notes](paper_notes\u002Facnet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [BEV-Feat-Stitching: Understanding Bird's-Eye View Semantic HD-Maps Using an Onboard Monocular 
Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03040) [[Notes](paper_notes\u002Fbev_feat_stitching.md)] [BEVNet, mono3D, Luc Van Gool]\n- [PSS: Object Detection Made Simpler by Eliminating Heuristic NMS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.11782) [[Notes](paper_notes\u002Fpss.md)] [Transformer, DETR]\n\n## 2020-12 (17)\n- [DeFCN: End-to-End Object Detection with Fully Convolutional Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03544) [[Notes](paper_notes\u002Fdefcn.md)] [Transformer, DETR]\n- [OneNet: End-to-End One-Stage Object Detection by Classification Cost](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05780) [[Notes](paper_notes\u002Fonenet.md)] [Transformer, DETR]\n- [Traffic Light Mapping, Localization, and State Detection for Autonomous Vehicles](http:\u002F\u002Fdriving.stanford.edu\u002Fpapers\u002FICRA2011.pdf) [[Notes](paper_notes\u002Ftfl_stanford.md)] \u003Ckbd>ICRA 2011\u003C\u002Fkbd> [traffic light, Sebastian Thrun]\n- [Towards lifelong feature-based mapping in semi-static environments](https:\u002F\u002Fstorage.googleapis.com\u002Fpub-tools-public-publication-data\u002Fpdf\u002F43966.pdf) [[Notes](paper_notes\u002Flifelong_feature_mapping_google.md)] \u003Ckbd>ICRA 2016\u003C\u002Fkbd>\n- [How to Keep HD Maps for Automated Driving Up To Date](http:\u002F\u002Fwww.lewissoft.com\u002Fpdf\u002FICRA2020\u002F1484.pdf) [[Notes](paper_notes\u002Fkeep_hd_maps_updated_bmw.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [BMW]\n- [Generalized Focal Loss V2: Learning Reliable Localization Quality Estimation for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12885) [[Notes](paper_notes\u002Fgfocalv2.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [focal loss]\n- [Visual SLAM for Automated Driving: Exploring the Applications of Deep Learning](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018_workshops\u002Fpapers\u002Fw9\u002FMilz_Visual_SLAM_for_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Fvslam_for_ad.md)] \u003Ckbd>CVPR 2018 workshop\u003C\u002Fkbd>\n- [Centroid Voting: Object-Aware Centroid Voting for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.09836) [[Notes](paper_notes\u002Fcentroid_voting.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [mono3D, geometry + appearance = distance]\n- [Monocular 3D Object Detection in Cylindrical Images from Fisheye Cameras](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03759) [[Notes](paper_notes\u002Fmono3d_fisheye.md)] [GM Israel, mono3D]\n- [DeepPS: Vision-Based Parking-Slot Detection: A DCNN-Based Approach and a Large-Scale Benchmark Dataset](https:\u002F\u002Fcslinzhang.github.io\u002Fdeepps\u002Fparkingslot.pdf) \u003Ckbd>TIP 2018\u003C\u002Fkbd> [Parking slot detection, PS2.0 dataset]\n- [PSDet: Efficient and Universal Parking Slot Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05528) [[Notes](paper_notes\u002Fpsdet.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [Zongmu, Parking slot detection]\n- [PatDNN: Achieving Real-Time DNN Execution on Mobile Devices with Pattern-based Weight Pruning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00138) [[Notes](paper_notes\u002Fpatdnn.md)] \u003Ckbd>ASPLOS 2020\u003C\u002Fkbd> [pruning]\n- [Scaled-YOLOv4: Scaling Cross Stage Partial Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.08036) [[Notes](paper_notes\u002Fscaled_yolov4.md)] [yolo]\n- [Yolov5 by Ultralytics](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fyolov5) [[Notes](paper_notes\u002Fyolov5.md)] [yolo, spatial2channel]\n- [PP-YOLO: An 
Effective and Efficient Implementation of Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.12099) [[Notes](paper_notes\u002Fpp_yolo.md)] [yolo, paddle-paddle, baidu]\n- [PointPainting: Sequential Fusion for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.10150.pdf) [[Notes](paper_notes\u002Fpoint_painting.md)] [nuScenes]\n- [MotionNet: Joint Perception and Motion Prediction for Autonomous Driving Based on Bird's Eye View Maps](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06754) [[Notes](paper_notes\u002Fmotionnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Unseen moving objects, BEV]\n- [Locating Objects Without Bounding Boxes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07564) [[Notes](paper_notes\u002Fobjects_without_bboxes.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [weighted Hausdorff distance, NMS-free]\n\n\n## 2020-11 (18)\n- [TSP: Rethinking Transformer-based Set Prediction for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10881) [[Notes](paper_notes\u002Ftsp.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [DETR, transformers, Kris Kitani]\n- [Sparse R-CNN: End-to-End Object Detection with Learnable Proposals](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12450) [[Notes](paper_notes\u002Fsparse_rcnn.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [DETR, Transformer]\n- [Unsupervised Monocular Depth Learning in Dynamic Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.16404) [[Notes](paper_notes\u002Flearn_depth_and_motion.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [LearnK improved ver, Google]\n- [MoNet3D: Towards Accurate Monocular 3D Object Localization in Real Time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16007) [[Notes](paper_notes\u002Fmonet3d.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [Mono3D, pairwise relationship]\n- [Argoverse: 3D Tracking and Forecasting with Rich Maps](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02620) [[Notes](paper_notes\u002Fargoverse.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [HD maps, dataset, CV lidar]\n- [The H3D Dataset for Full-Surround 3D Multi-Object Detection and Tracking in Crowded Urban Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.01568) [[Notes](paper_notes\u002Fh3d.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- [Cityscapes 3D: Dataset and Benchmark for 9 DoF Vehicle Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07864) \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [dataset, Daimler, mono3D]\n- [NYC3DCars: A Dataset of 3D Vehicles in Geographic Context](https:\u002F\u002Fwww.cs.cornell.edu\u002F~snavely\u002Fpublications\u002Fpapers\u002Fnyc3dcars_iccv13.pdf) \u003Ckbd>ICCV 2013\u003C\u002Fkbd>\n- [Towards Fully Autonomous Driving: Systems and Algorithms](https:\u002F\u002Fwww.ri.cmu.edu\u002Fwp-content\u002Fuploads\u002F2017\u002F12\u002Flevinson-iv2011.pdf) \u003Ckbd>IV 2011\u003C\u002Fkbd>\n- [Center3D: Center-based Monocular 3D Object Detection with Joint Depth Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13423) [[Notes](paper_notes\u002Fcenter3d.md)] [mono3D, LID+DepJoint]\n- [ZoomNet: Part-Aware Adaptive Zooming Neural Network for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00529) \u003Ckbd>AAAI 2020 oral\u003C\u002Fkbd> [mono3D] \n- [CenterFusion: Center-based Radar and Camera Fusion for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.04841) [[Notes](paper_notes\u002Fcenterfusion.md)] \u003Ckbd>WACV 2021\u003C\u002Fkbd> [early fusion, camera, radar]\n- [3D-LaneNet+: Anchor Free Lane Detection using a Semi-Local 
Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.01535) [[Notes](paper_notes\u002F3d_lanenet+.md)] \u003Ckbd>NeurIPS 2020 workshop\u003C\u002Fkbd> [GM Israel, 3D LLD]\n- [LSTR: End-to-end Lane Shape Prediction with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.04233) [[Notes](paper_notes\u002Flstr.md)] \u003Ckbd>WACV 2021\u003C\u002Fkbd> [LLD, transformers]\n- [PIXOR: Real-time 3D Object Detection from Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.06326) [[Notes](paper_notes\u002Fpixor.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (birds eye view)\n- [HDNET\u002FPIXOR++: Exploiting HD Maps for 3D Object Detection](http:\u002F\u002Fproceedings.mlr.press\u002Fv87\u002Fyang18b\u002Fyang18b.pdf) [[Notes](paper_notes\u002Fpixor++.md)] \u003Ckbd>CoRL 2018\u003C\u002Fkbd>\n- [CPNDet: Corner Proposal Network for Anchor-free, Two-stage Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13816) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [anchor free, two stage]\n- [MVF: End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06528) [[Notes](paper_notes\u002Fmvf.md)] \u003Ckbd>CoRL 2019\u003C\u002Fkbd> [Waymo, VoxelNet 1st author]\n- [Pillar-based Object Detection for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.10323) [[Notes](paper_notes\u002Fpillar_od.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [Training-Time-Friendly Network for Real-Time Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00700) \u003Ckbd>AAAI 2020\u003C\u002Fkbd> [anchor-free, fast training]\n- [Autonomous Driving with Deep Learning: A Survey of State-of-Art Technologies](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06091) [Review of autonomous stack, Yu Huang]\n- [Dense Monocular Depth Estimation in Complex Dynamic Scenes](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2016\u002Fpapers\u002FRanftl_Dense_Monocular_Depth_CVPR_2016_paper.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [Probabilistic Future Prediction for Video Scene Understanding](https:\u002F\u002Fanthonyhu.github.io\u002Fresearch\u002Fprobabilistic-future-prediction\u002F)\n- [AB3D: A Baseline for 3D Multi-Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03961) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [3D MOT]\n- [Spatial-Temporal Relation Networks for Multi-Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11489) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [MOT, feature location over time]\n- [Beyond Pixels: Leveraging Geometry and Shape Cues for Online Multi-Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.09298) \u003Ckbd>ICRA 2018\u003C\u002Fkbd> [MOT, IIT, 3D shape]\n- [ST-3D: Joint Spatial-Temporal Optimization for Stereo 3D Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.09305) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Peiliang Li, author of VINS and S3DOT]\n- [Augment Your Batch: Improving Generalization Through Instance Repetition](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FHoffer_Augment_Your_Batch_Improving_Generalization_Through_Instance_Repetition_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [RetinaTrack: Online Single Stage Joint Detection and Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13870) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [MOT]\n- [Object as Hotspots: An Anchor-Free 3D Object Detection Approach via Firing of Hotspots](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.12791)\n- 
[Gradient Centralization: A New Optimization Technique for Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01461) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [Depth Completion via Deep Basis Fitting](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10336) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [BTS: From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10326) [monodepth, supervised]\n- [The Edge of Depth: Explicit Constraints between Segmentation and Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.00171) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [monodepth, Xiaoming Liu]\n- [On the Continuity of Rotation Representations in Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.07035) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [rotational representation]\n- [VDO-SLAM: A Visual Dynamic Object-aware SLAM System](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.11052) \u003Ckbd>IJRR 2020\u003C\u002Fkbd>\n- [Dynamic SLAM: The Need For Speed](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08584)\n- [Pseudo RGB-D for Self-Improving Monocular SLAM and Depth Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10681) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [Traffic Light Mapping and Detection](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fen\u002F\u002Fpubs\u002Farchive\u002F37259.pdf) [[Notes](paper_notes\u002Ftfl_mapping_google.md)] \u003Ckbd>ICRA 2011\u003C\u002Fkbd> [traffic light, Google, Chris Urmson]\n- [Traffic light recognition exploiting map and localization at every stage](https:\u002F\u002Fweb.yonsei.ac.kr\u002Fjksuhr\u002Fpapers\u002FTraffic%20light%20recognition%20exploiting%20map%20and%20localization%20at%20every%20stage.pdf) [[Notes](paper_notes\u002Ftfl_exploting_map_korea.md)] \u003Ckbd>Expert Systems 2017\u003C\u002Fkbd> [traffic light, 鲜于明镐，徐在圭，郑浩奇]\n- [Traffic Light Recognition Using Deep Learning and Prior Maps for Autonomous Cars](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11886) [[Notes](paper_notes\u002Ftfl_lidar_map_building_brazil.md)] \u003Ckbd> IJCNN 2019\u003C\u002Fkbd> [traffic light, Espirito Santo Brazil]\n\n\n## 2020-10 (14)\n- [TSM: Temporal Shift Module for Efficient Video Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08383) [[Notes](paper_notes\u002Ftsm.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Song Han, video, object detection]\n- [WOD: Waymo Dataset: Scalability in Perception for Autonomous Driving: Waymo Open Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04838) [[Notes](paper_notes\u002Fwod.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04388) [[Notes](paper_notes\u002Fgfocal.md)] \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [classification as regression]\n- [A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.13592) \u003Ckbd>NeurIPS 2020 spotlight\u003C\u002Fkbd>\n- [Rethinking the Value of Labels for Improving Class-Imbalanced Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07529) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd>\n- [RepLoss: Repulsion Loss: Detecting Pedestrians in a Crowd](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.07752) [[Notes](paper_notes\u002Frep_loss.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [crowd detection, Megvii]\n- 
[Adaptive NMS: Refining Pedestrian Detection in a Crowd](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03629) [[Notes](paper_notes\u002Fadaptive_nms.md)] \u003Ckbd>CVPR 2019 oral\u003C\u002Fkbd> [crowd detection, NMS]\n- [AggLoss: Occlusion-aware R-CNN: Detecting Pedestrians in a Crowd](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.08407) [[Notes](paper_notes\u002Fagg_loss.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [crowd detection]\n- [CrowdDet: Detection in Crowded Scenes: One Proposal, Multiple Predictions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.09163) [[Notes](paper_notes\u002Fcrowd_det.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [crowd detection, Megvii, Earth mover's distance]\n- [R2-NMS: NMS by Representative Region: Towards Crowded Pedestrian Detection by Proposal Pairing](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12729) [[Notes](paper_notes\u002Fr2_nms.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Double Anchor R-CNN for Human Detection in a Crowd](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09998) [[Notes](paper_notes\u002Fdouble_anchor.md)] [head-body bundle]\n- [Review: AP vs MR](paper_notes\u002Fap_mr.md)\n- [SKU110K: Precise Detection in Densely Packed Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00853) [[Notes](paper_notes\u002Fsku110k.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [crowd detection, no occlusion]\n- [GossipNet: Learning non-maximum suppression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.02950) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> \n- [TLL: Small-scale Pedestrian Detection Based on Somatic Topology Localization and Temporal Feature Aggregation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.01438) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Learning Monocular 3D Vehicle Detection without 3D Bounding Box Labels](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.03506) \u003Ckbd>GCPR 2020\u003C\u002Fkbd> [mono3D, Daniel Cremers, TUM]\n- [CubifAE-3D: Monocular Camera Space Cubification on Autonomous Vehicles for Auto-Encoder based 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04080) [[Notes](paper_notes\u002Fcubifae_3d.md)] [mono3D, depth AE pretraining]\n- [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.04159) [[Notes](paper_notes\u002Fdeformable_detr.md)] \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [Jifeng Dai, DETR]\n- [ViT: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929) [[Notes](paper_notes\u002Fvit.md)] \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [BYOL: Bootstrap your own latent: A new approach to self-supervised Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07733) [self-supervised]\n\n## 2020-09 (15)\n- [SDFLabel: Autolabeling 3D Objects With Differentiable Rendering of SDF Shape Priors](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11288) [[Notes](paper_notes\u002Fsdflabel.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [TRI, differentiable rendering]\n- [DensePose: Dense Human Pose Estimation In The Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00434) [[Notes](paper_notes\u002Fdensepose.md)] \u003Ckbd>CVPR 2018 oral\u003C\u002Fkbd> [FAIR]\n- [NOCS: Normalized Object Coordinate Space for Category-Level 6D Object Pose and Size Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02970) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [monoDR: Monocular Differentiable Rendering for Self-Supervised 3D Object 
Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.14524) [[Notes](paper_notes\u002Fmonodr.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [TRI, mono3D]\n- [Lift, Splat, Shoot: Encoding Images From Arbitrary Camera Rigs by Implicitly Unprojecting to 3D](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.05711) [[Notes](paper_notes\u002Flift_splat_shoot.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [BEV-Net, Utoronto, Sanja Fidler]\n- [Implicit Latent Variable Model for Scene-Consistent Motion Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.12036) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG, Rachel Urtasun]\n- [FISHING Net: Future Inference of Semantic Heatmaps In Grids](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09917) [[Notes](paper_notes\u002Ffishing_net.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [BEV-Net, Mapping, Zoox]\n- [VPN: Cross-view Semantic Segmentation for Sensing Surroundings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03560) [[Notes](paper_notes\u002Fvpn.md)] \u003Ckbd>RAL 2020\u003C\u002Fkbd> [Bolei Zhou, BEV-Net]\n- [VED: Monocular Semantic Occupancy Grid Mapping with Convolutional Variational Encoder-Decoder Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.02176) [[Notes](paper_notes\u002Fved.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [BEV-Net]\n- [Cam2BEV: A Sim2Real Deep Learning Approach for the Transformation of Images from Multiple Vehicle-Mounted Cameras to a Semantically Segmented Image in Bird's Eye View](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04078) [[Notes](paper_notes\u002Fcam2bev.md)] \u003Ckbd>ITSC 2020\u003C\u002Fkbd> [BEV-Net] \n- [Learning to Look around Objects for Top-View Representations of Outdoor Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.10870) [[Notes](paper_notes\u002Flearning_to_look_around_objects.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [A Parametric Top-View Representation of Complex Road Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06152) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [FTM: Understanding Road Layout from Videos as a Whole](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.00822) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [KM3D-Net: Monocular 3D Detection with Geometric Constraints Embedding and Semi-supervised Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.00764) [[Notes](paper_notes\u002Fkm3d_net.md)] \u003Ckbd>RAL 2021\u003C\u002Fkbd> [RTM3D, Peixuan Li]\n- [InstanceMotSeg: Real-time Instance Motion Segmentation for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.07008) [[Notes](paper_notes\u002Finstance_mot_seg.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [motion segmentation]\n- [MPV-Nets: Monocular Plan View Networks for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06937) [[Notes](paper_notes\u002Fmpv_nets.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd> [BEV-Net]\n- [Class-Balanced Loss Based on Effective Number of Samples](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.05555) [[Notes](paper_notes\u002Fclass_balanced_loss.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Focal loss authors]\n- [Geometric Pretraining for Monocular Depth Estimation](http:\u002F\u002Flewissoft.com\u002Fpdf\u002FICRA2020\u002F0035.pdf) [[Notes](paper_notes\u002Fgeometric_pretraining.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd>\n- [Robust Traffic Light and Arrow Detection Using Digital Map with Spatial Prior Information for Automated 
Driving](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F4\u002F1181) [[Notes](paper_notes\u002Ftfl_robust_japan.md)] \u003Ckbd>Sensors 2020\u003C\u002Fkbd> [traffic light, 金沢]\n\n\n## 2020-08 (26)\n- [Feature-metric Loss for Self-supervised Learning of Depth and Egomotion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.10603) [[Notes](paper_notes\u002Ffeature_metric.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [feature-metric, local minima, monodepth]\n- [Depth-VO-Feat: Unsupervised Learning of Monocular Depth Estimation and Visual Odometry with Deep Feature Reconstruction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03893) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [feature-metric, monodepth]\n- [MonoResMatch: Learning monocular depth estimation infusing traditional stereo knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.04144) [[Notes](paper_notes\u002Fmonoresmatch.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [monodepth, local minima, cheap stereo GT]\n- [SGDepth: Self-Supervised Monocular Depth Estimation: Solving the Dynamic Object Problem by Semantic Guidance](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.06936) [[Notes](paper_notes\u002Fsgdepth.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Moving objects]\n- [Every Pixel Counts: Unsupervised Geometry Learning with Holistic 3D Motion Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.10556) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [dynamic objects, rigid and dynamic motion]\n- [Every Pixel Counts ++: Joint Learning of Geometry and Motion with 3D Holistic Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.06125) \u003Ckbd>TPAMI 2018\u003C\u002Fkbd>\n- [CC: Competitive Collaboration: Joint Unsupervised Learning of Depth, Camera Motion, Optical Flow and Motion Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.09806) [[Notes](paper_notes\u002Fcc.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [ObjMotionNet: Self-supervised Object Motion and Depth Estimation from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04250) [[Notes](paper_notes\u002Fobj_motion_net.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [object motion prediction, velocity prediction]\n- [Instance-wise Depth and Motion Learning from Monocular Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09351)\n- [Semantics-Driven Unsupervised Learning for Monocular Depth and Ego-Motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04371)\n- [Self-Supervised Joint Learning Framework of Depth Estimation via Implicit Cues](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09876)\n- [DF-Net: Unsupervised Joint Learning of Depth and Flow using Cross-Task Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.01649) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [LineNet: a Zoomable CNN for Crowdsourced High Definition Maps Modeling in Urban Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.05696) [mapping]\n- [Road-SLAM: Road Marking based SLAM with Lane-level Accuracy](https:\u002F\u002Fwww.naverlabs.com\u002Fimg\u002FautonomousDriving\u002Fintelligence\u002Fdissertation\u002FRoad-SLAM_Road%20Marking%20based%20SLAM%20with%20Lane-level%20Accuracy.pdf) [[Notes](paper_notes\u002Froad_slam.md)] [HD mapping]\n- [AVP-SLAM: Semantic Visual Mapping and Localization for Autonomous Vehicles in the Parking Lot](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01813) [[Notes](paper_notes\u002Favp_slam.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Huawei, HD mapping, Tong Qin, VINS author, autonomous valet parking]\n- [AVP-SLAM-Late-Fusion: Mapping and 
Localization using Semantic Road Marking with Centimeter-level Accuracy in Indoor Parking Lots](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8917529) [[Notes](paper_notes\u002Favp_slam_late_fusion.md)] \u003Ckbd>ITSC 2019\u003C\u002Fkbd>\n- [Lane markings-based relocalization on highway](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8917254) \u003Ckbd>ITSC 2019\u003C\u002Fkbd>\n- [DeepRoadMapper: Extracting Road Topology from Aerial Images](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FMattyus_DeepRoadMapper_Extracting_Road_ICCV_2017_paper.pdf) [[Notes](paper_notes\u002Fdeep_road_mapper.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [Uber ATG, NOT HD maps]\n- [RoadTracer: Automatic Extraction of Road Networks from Aerial Images](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FBastani_RoadTracer_Automatic_Extraction_CVPR_2018_paper.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [NOT HD maps]\n- [PolyMapper: Topological Map Extraction From Overhead Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.01497) [[Notes](paper_notes\u002Fpolymapper.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [mapping, polygon, NOT HD maps]\n- [HRAN: Hierarchical Recurrent Attention Networks for Structured Online Maps](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FHomayounfar_Hierarchical_Recurrent_Attention_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Fhran.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [HD mapping, highway, polyline loss, Chamfer distance]\n- [Deep Structured Crosswalk: End-to-End Deep Structured Models for Drawing Crosswalks](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FJustin_Liang_End-to-End_Deep_Structured_ECCV_2018_paper.pdf) [[Notes](paper_notes\u002Fdeep_structured_crosswalk.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [DeepBoundaryExtractor: Convolutional Recurrent Network for Road Boundary Extraction](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fhtml\u002FLiang_Convolutional_Recurrent_Network_for_Road_Boundary_Extraction_CVPR_2019_paper.html) [[Notes](paper_notes\u002Fdeep_boundary_extractor.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [HD mapping, boundary, polyline loss]\n- [DAGMapper: Learning to Map by Discovering Lane Topology](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FHomayounfar_DAGMapper_Learning_to_Map_by_Discovering_Lane_Topology_ICCV_2019_paper.pdf) [[Notes](paper_notes\u002Fdagmapper.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [HD mapping, highway, forks and merges, polyline loss]\n- [Sparse-HD-Maps: Exploiting Sparse Semantic HD Maps for Self-Driving Vehicle Localization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03274) [[Notes](paper_notes\u002Fsparse_hd_maps.md)] \u003Ckbd>IROS 2019 oral\u003C\u002Fkbd> [Uber ATG, metadata, mapping, localization]\n- [Aerial LaneNet: Lane Marking Semantic Segmentation in Aerial Imagery using Wavelet-Enhanced Cost-sensitive Symmetric Fully Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06904) \u003Ckbd>IEEE TGRS 2018\u003C\u002Fkbd>\n- [Monocular Localization with Vector HD Map (MLVHM): A Low-Cost Method for Commercial IVs](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F7\u002F1870\u002Fhtm) \u003Ckbd>Sensors 2020\u003C\u002Fkbd> [Tsinghua, 3D HD maps]\n- [PatchNet: Rethinking Pseudo-LiDAR Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.04582) 
[[Notes](paper_notes\u002Fpatchnet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [SenseTime, Wanli Ouyang]\n- [D4LCN: Learning Depth-Guided Convolutions for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04799) [[Notes](paper_notes\u002Fd4lcn.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [mono3D]\n- [MfS: Learning Stereo from Single Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.01484) [[Notes](paper_notes\u002Fmfs.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [mono for stereo, learn stereo matching with mono]\n- [BorderDet: Border Feature for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.11056) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd> [Megvii]\n- [Scale-Aware Trident Networks for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.01892) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [different heads for different scales]\n- [Learning Depth from Monocular Videos using Direct Methods](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.00175)\n- [Vid2Depth: Unsupervised Learning of Depth and Ego-Motion from Monocular Video Using 3D Geometric Constraints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05522) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [Google]\n- [NeRF in the Wild: Neural Radiance Fields for Unconstrained Photo Collections](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.02268)\n- [Supervising the new with the old: learning SFM from SFM](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMaria_Klodt_Supervising_the_new_ECCV_2018_paper.pdf) [[Notes](paper_notes\u002Flearn_sfm_from_sfm.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Neural RGB->D Sensing: Depth and Uncertainty from a Video Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02571) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [multi-frame monodepth]\n- [Don't Forget The Past: Recurrent Depth Estimation from Monocular Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02613) [multi-frame monodepth, RNN]\n- [Recurrent Neural Network for (Un-)supervised Learning of Monocular Video Visual Odometry and Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07087) [multi-frame monodepth, RNN]\n- [Exploiting temporal consistency for real-time video depth estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03706) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [multi-frame monodepth, RNN, indoor]\n- [SfM-Net: Learning of Structure and Motion from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.07804) [dynamic object, SfM]\n- [MB-Net: MergeBoxes for Real-Time 3D Vehicles Detection](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8500395) [[Notes](paper_notes\u002Fmb_net.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd> [mono3D, Daimler]\n- [BS3D: Beyond Bounding Boxes: Using Bounding Shapes for Real-Time 3D Vehicle Detection from Monocular RGB Images](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8814036\u002F) [[Notes](paper_notes\u002Fbs3d.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> [mono3D, Daimler]\n- [3D-GCK: Single-Shot 3D Detection of Vehicles from Monocular RGB Images via Geometrically Constrained Keypoints in Real-Time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.13084) [[Notes](paper_notes\u002F3d_gck.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [mono3D, Daimler]\n- [UR3D: Distance-Normalized Unified Representation for Monocular 3D Object Detection](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F6559_ECCV_2020_paper.php) [[Notes](paper_notes\u002Fur3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> 
[mono3D]\n- [DA-3Det: Monocular 3D Object Detection via Feature Domain Adaptation](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fpapers\u002F123540018.pdf) [[Notes](paper_notes\u002Fda_3det.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [mono3D]\n- [RAR-Net: Reinforced Axial Refinement Network for Monocular 3D Object Detection](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F2822_ECCV_2020_paper.php) [[Notes](paper_notes\u002Frarnet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [mono3D]\n\n\n## 2020-07 (25)\n- [CenterTrack: Tracking Objects as Points](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01177) [[Notes](paper_notes\u002Fcentertrack.md)] \u003Ckbd>ECCV 2020 spotlight\u003C\u002Fkbd> [camera based 3D MOD, MOT SOTA, CenterNet, video based object detection, Philipp Krähenbühl]\n- [CenterPoint: Center-based 3D Object Detection and Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11275) [[Notes](paper_notes\u002Fcenterpoint.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [lidar based 3D MOD, CenterNet]\n- [Tracktor: Tracking without bells and whistles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05625) [[Notes](paper_notes\u002Ftracktor.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Tracktor\u002FTracktor++, Laura Leal-Taixe@TUM]\n- [FairMOT: A Simple Baseline for Multi-Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01888) [[Notes](paper_notes\u002Ffairmot.md)]\n- [DeepMOT: A Differentiable Framework for Training Multiple Object Trackers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06618) [[Notes](paper_notes\u002Fdeepmot.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [trainable Hungarian, Laura Leal-Taixe@TUM]\n- [MPNTracker: Learning a Neural Solver for Multiple Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07515) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [trainable Hungarian, Laura Leal-Taixe@TUM]\n- [nuScenes: A multimodal dataset for autonomous driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11027) [[Notes](paper_notes\u002Fnuscenes.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [dataset, point cloud, radar]\n- [CBGS: Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09492) [[Notes](paper_notes\u002Fcbgs.md)] \u003Ckbd>CVPRW 2019\u003C\u002Fkbd> [Megvii, lidar, WAD challenge winner]\n- [AFDet: Anchor Free One Stage 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12671) and [Competition solution](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.15505.pdf) [[Notes](paper_notes\u002Fafdet.md)]  \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [Horizon robotics, lidar, winning for Waymo challenge] \n- Review of MOT and SOT [[Notes](paper_notes\u002Fmot_and_sot.md)]\n- [CrowdHuman: A Benchmark for Detecting Human in a Crowd](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.00123) [[Notes](paper_notes\u002Fcrowdhuman.md)] [megvii, pedestrian, dataset]\n- [WiderPerson: A Diverse Dataset for Dense Pedestrian Detection in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12118) [[Notes](paper_notes\u002Fwiderperson.md)] \u003Ckbd>TMM 2019\u003C\u002Fkbd> [dataset, pedestrian]\n- [Tsinghua-Daimler Cyclists: A New Benchmark for Vison-Based Cyclist Detection](http:\u002F\u002Fwww.gavrila.net\u002FPublications\u002Fiv16_cyclist_benchmark.pdf) [[Notes](paper_notes\u002Ftsinghua_daimler_cyclist.md)] \u003Ckbd>IV 2016\u003C\u002Fkbd> [dataset, cyclist Detection]\n- [Specialized Cyclist Detection Dataset: Challenging 
Real-World Computer Vision Dataset for Cyclist Detection Using a Monocular RGB Camera](https:\u002F\u002Fdrive.google.com\u002Fdrive\u002Fu\u002F0\u002Ffolders\u002F1inawrX9NVcchDQZepnBeJY4i9aAI5mg9) [[Notes](paper_notes\u002Fspecialized_cyclists.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> [Extension to KITTI]\n- [PointTrack: Segment as Points for Efficient Online Multi-Object Tracking and Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01550) [[Notes](paper_notes\u002Fpointtrack.md)] \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd> [MOTS]\n- [PointTrack++ for Effective Online Multi-Object Tracking and Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01549) [[Notes](paper_notes\u002Fpointtrack++.md)] \u003Ckbd>CVPR 2020 workshop\u003C\u002Fkbd> [CVPR2020 MOTS Challenge Winner. PointTrack++ ranks first on KITTI MOTS]\n- [SpatialEmbedding: Instance Segmentation by Jointly Optimizing Spatial Embeddings and Clustering Bandwidth](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11109) [[Notes](paper_notes\u002Fspatial_embedding.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [one-stage, instance segmentation]\n- [BA-Net: Dense Bundle Adjustment Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.04807) [[Notes](paper_notes\u002Fbanet.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd> [Bundle adjustment, multi-frame monodepth, feature-metric]\n- [DeepSFM: Structure From Motion Via Deep Bundle Adjustment](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09697) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd> [multi-frame monodepth, indoor scene]\n- [CVD: Consistent Video Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.15021) [[Notes](paper_notes\u002Fcvd.md)] \u003Ckbd>SIGGRAPH 2020\u003C\u002Fkbd> [multi-frame monodepth, online finetune]\n- [DeepV2D: Video to Depth with Differentiable Structure from Motion](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.04605) [[Notes](paper_notes\u002Fdeepv2d.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> [multi-frame monodepth, Jia Deng]\n- [GeoNet: Unsupervised Learning of Dense Depth, Optical Flow and Camera Pose](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.02276) [[Notes](paper_notes\u002Fgeonet.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [residual optical flow, monodepth, rigid and dynamic motion]\n- [GLNet: Self-supervised Learning with Geometric Constraints in Monocular Video: Connecting Flow, Depth, and Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05820) [[Notes](paper_notes\u002Fglnet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [online finetune, rigid and dynamic motion]\n- [Depth Hints: Self-Supervised Monocular Depth Hints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09051) [[Notes](paper_notes\u002Fdepth_hints.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [monodepth, local minima, cheap stereo GT]\n- [MonoUncertainty: On the uncertainty of self-supervised monocular depth estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.06209) [[Notes](paper_notes\u002Fmono_uncertainty.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [depth uncertainty]\n- [Self-Supervised Learning of Depth and Ego-motion with Differentiable Bundle Adjustment](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.13163) [[Notes](paper_notes\u002Fba_sfm_learner.md)] [Bundle adjustment, xmotors.ai, multi-frame monodepth]\n- [Kinematic 3D Object Detection in Monocular Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.09548) [[Notes](paper_notes\u002Fkinematic_mono3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [multi-frame mono3D, Xiaoming Liu]\n- [VelocityNet: Camera-based vehicle 
velocity estimation from monocular video](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.07094) [[Notes](paper_notes\u002Fvelocity_net.md)] \u003Ckbd>CVPR 2017 workshop\u003C\u002Fkbd> [monocular velocity estimation, CVPR 2017 challenge winner]\n- [Vehicle Centric VelocityNet: End-to-end Learning for Inter-Vehicle Distance and Relative Velocity Estimation in ADAS with a Monocular Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04082) [[Notes](paper_notes\u002Fvehicle_centric_velocity_net.md)] [monocular velocity estimation, monocular distance, SOTA]\n\n## 2020-06 (20)\n- [LeGO-LOAM: Lightweight and Ground-Optimized Lidar Odometry and Mapping on Variable Terrain](http:\u002F\u002Fpersonal.stevens.edu\u002F~benglot\u002FShan_Englot_IROS_2018_Preprint.pdf) [[Notes](paper_notes\u002Flego_loam.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> [lidar, mapping]\n- [PIE: A Large-Scale Dataset and Models for Pedestrian Intention Estimation and Trajectory Prediction](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FRasouli_PIE_A_Large-Scale_Dataset_and_Models_for_Pedestrian_Intention_Estimation_ICCV_2019_paper.pdf) [[Notes](paper_notes\u002Fpie.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [JAAD: Are They Going to Cross? A Benchmark Dataset and Baseline for Pedestrian\nCrosswalk Behavior](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017_workshops\u002Fpapers\u002Fw3\u002FRasouli_Are_They_Going_ICCV_2017_paper.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [Pedestrian Action Anticipation using Contextual Feature Fusion in Stacked RNNs](https:\u002F\u002Fbmvc2019.org\u002Fwp-content\u002Fuploads\u002Fpapers\u002F0283-paper.pdf) \u003Ckbd>BMVC 2019\u003C\u002Fkbd>\n- [Is the Pedestrian going to Cross? Answering by 2D Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.10580) \u003Ckbd>IV 2018\u003C\u002Fkbd>\n- [Intention Recognition of Pedestrians and Cyclists by 2D Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03858) \u003Ckbd>ITSC 2019\u003C\u002Fkbd> [skeleton, pedestrian, cyclist intention]\n- [Attentive Single-Tasking of Multiple Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08918) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [DETR: End-to-End Object Detection with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.12872) [[Notes](paper_notes\u002Fdetr.md)] \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd> [FAIR]\n- [Transformer: Attention Is All You Need](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762) [[Notes](paper_notes\u002Ftransformer.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [SpeedNet: Learning the Speediness in Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06130) [[Notes](paper_notes\u002Fspeednet.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [MonoPair: Monocular 3D Object Detection Using Pairwise Spatial Relationships](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00504) [[Notes](paper_notes\u002Fmonopair.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Mono3D, pairwise relationship]\n- [SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10111) [[Notes](paper_notes\u002Fsmoke.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [Mono3D, Zongmu]\n- [Vehicle Re-ID for Surround-view Camera System](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1e6y8wtHAricaEHS9CpasSGOx0aAxCGib\u002Fview) [[Notes](paper_notes\u002Freid_surround_fisheye.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [tireline, vehicle ReID, Zongmu]\n- [End-to-End Lane Marker 
Detection via Row-wise Classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08630) [[Notes](paper_notes\u002Fe2e_lmd.md)] [Qualcomm Korea, LLD as cls]\n- [Reliable multilane detection and classification by utilizing CNN as a regression network](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCVW_2018\u002Fpapers\u002F11133\u002FChougule_Reliable_multilane_detection_and_classification_by_utilizing_CNN_as_a_ECCVW_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [LLD as reg]\n- [SUPER: A Novel Lane Detection System](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07277) [[Notes](paper_notes\u002Fsuper.md)]\n- [Learning Lightweight Lane Detection CNNs by Self Attention Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00821) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [StixelNet: A Deep Convolutional Network for Obstacle Detection and Road Segmentation](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2015\u002Fpapers\u002Fpaper109\u002Fpaper109.pdf) \u003Ckbd>BMVC 2015\u003C\u002Fkbd>\n- [StixelNetV2: Real-time category-based and general obstacle detection for autonomous driving](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017_workshops\u002Fpapers\u002Fw3\u002FGarnett_Real-Time_Category-Based_and_ICCV_2017_paper.pdf) [[Notes](paper_notes\u002Fstixelnetv2.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [DS]\n- [Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.05158) [[Notes](paper_notes\u002Fsubpixel_conv.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd> [channel-to-pixel]\n- [Car Pose in Context: Accurate Pose Estimation with Ground Plane Constraints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04363) [mono3D]\n- [Self-Mono-SF: Self-Supervised Monocular Scene Flow Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04143) [[Notes](paper_notes\u002Fself_mono_sf.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [scene-flow, Stereo input]\n- [MEBOW: Monocular Estimation of Body Orientation In the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.13688) [[Notes](paper_notes\u002Fmebow.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [VG-NMS: Visibility Guided NMS: Efficient Boosting of Amodal Object Detection in Crowded Traffic Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.08547) [[Notes](paper_notes\u002Fvg_nms.md)] \u003Ckbd>NeurIPS 2019 workshop\u003C\u002Fkbd> [Crowded scene, NMS, Daimler]\n- [WYSIWYG: What You See is What You Get: Exploiting Visibility for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04986) [[Notes](paper_notes\u002Fwysiwyg.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [occupancy grid]\n- [Real-Time Panoptic Segmentation From Dense Detections](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01202) [[Notes](paper_notes\u002Frealtime_panoptic.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [bbox + semantic segmentation = panoptic segmentation, Toyota]\n- [Human-Centric Efficiency Improvements in Image Annotation for Autonomous Driving](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1DY95vfWBLKOOZZyq8gLDd0heZ6aBSdji\u002Fview) [[Notes](paper_notes\u002Fhuman_centric_annotation.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [efficient annotation]\n- [SurfelGAN: Synthesizing Realistic Sensor Data for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.03844) [[Notes](paper_notes\u002Fsurfel_gan.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Waymo, auto data generation, surfel]\n- [LiDARsim: Realistic 
LiDAR Simulation by Leveraging the Real World](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09348) [[Notes](paper_notes\u002Flidarsim.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Uber ATG, auto data generation, surfel]\n- [SuMa++: Efficient LiDAR-based Semantic SLAM](http:\u002F\u002Fwww.ipb.uni-bonn.de\u002Fwp-content\u002Fpapercite-data\u002Fpdf\u002Fchen2019iros.pdf) \u003Ckbd>IROS 2019\u003C\u002Fkbd> [semantic segmentation, lidar, SLAM]\n- [PON\u002FPyrOccNet: Predicting Semantic Map Representations from Images using Pyramid Occupancy Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13402) [[Notes](paper_notes\u002Fpyroccnet.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [BEV-Net, OFT]\n- [MonoLayout: Amodal scene layout from a single image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08394) [[Notes](paper_notes\u002Fmonolayout.md)] \u003Ckbd>WACV 2020\u003C\u002Fkbd> [BEV-Net]\n- [BEV-Seg: Bird’s Eye View Semantic Segmentation Using Geometry and Semantic Point Cloud](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11436) [[Notes](paper_notes\u002Fbev_seg.md)] \u003Ckbd>CVPR 2020 workshop\u003C\u002Fkbd> [BEV-Net, Mapping]\n- [A Geometric Approach to Obtain a Bird's Eye View from an Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02231) \u003Ckbd>ICCVW 2019\u003C\u002Fkbd> [mapping, geometry, Andrew Zisserman]\n- [FrozenDepth: Learning the Depths of Moving People by Watching Frozen People](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11111) [[Notes](paper_notes\u002Ffrozen_depth.md)] \u003Ckbd>CVPR 2019 oral\u003C\u002Fkbd>\n- [ORB-SLAM: a Versatile and Accurate Monocular SLAM System](https:\u002F\u002Farxiv.org\u002Fabs\u002F1502.00956) \u003Ckbd>TRO 2015\u003C\u002Fkbd>\n- [ORB-SLAM2: an Open-Source SLAM System for Monocular, Stereo and RGB-D Cameras](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.06475) \u003Ckbd>TRO 2016\u003C\u002Fkbd>\n- [CubeSLAM: Monocular 3D Object SLAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.00557) [[Notes](paper_notes\u002Fcube_slam.md)] \u003Ckbd>TRO 2019\u003C\u002Fkbd> [dynamic SLAM, orb slam + mono3D]\n- [ClusterVO: Clustering Moving Instances and Estimating Visual Odometry for Self and Surroundings](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12980) [[Notes](paper_notes\u002Fcluster_vo.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [general dynamic SLAM]\n- [S3DOT: Stereo Vision-based Semantic 3D Object and Ego-motion Tracking for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.02062) [[Notes](paper_notes\u002Fs3dot.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [Peiliang Li]\n- [Multi-object Monocular SLAM for Dynamic Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03528) [[Notes](paper_notes\u002Fmulti_object_mono_slam.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [monolayout authors]\n- [PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.02371) [[Notes](paper_notes\u002Fpwc_net.md)] \u003Ckbd>CVPR 2018 oral\u003C\u002Fkbd> [Optical flow]\n- [LiteFlowNet: A Lightweight Convolutional Neural Network for Optical Flow Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.07036) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [Optical flow]\n- [FlowNet: Learning Optical Flow With Convolutional Networks](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FDosovitskiy_FlowNet_Learning_Optical_ICCV_2015_paper.pdf) \u003Ckbd>ICCV 2015\u003C\u002Fkbd> [Optical flow]\n- [FlowNet 2.0: Evolution of Optical Flow 
Estimation with Deep Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.01925) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [Optical flow]\n- [ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.11431) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [semantic segmentation, lightweight]\n- [Mono-SF: Multi-View Geometry Meets Single-View Depth for Monocular Scene Flow Estimation of Dynamic Traffic Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06316) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [depth uncertainty]\n\n  \n## 2020-05 (19)\n- [Egocentric Vision-based Future Vehicle Localization for Intelligent Driving Assistance Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.07408) [[Notes](paper_notes\u002Fhevi.md)] [Honda] \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- [PackNet: 3D Packing for Self-Supervised Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02693) [[Notes](paper_notes\u002Fpacknet.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Scale aware depth]\n- [PackNet-SG: Semantically-Guided Representation Learning for Self-Supervised Monocular Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12319) [[Notes](paper_notes\u002Fpacknet_sg.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> [TRI, infinite-depth problem]\n- [TrianFlow: Towards Better Generalization: Joint Depth-Pose Learning without PoseNet](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01314) [[Notes](paper_notes\u002Ftrianflow.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Scale aware]\n- [Understanding the Limitations of CNN-based Absolute Camera Pose Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07504) [[Notes](paper_notes\u002Funderstanding_apr.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Drawbacks of PoseNet, MapNet, Laura Leal-Taixe@TUM]\n- [To Learn or Not to Learn: Visual Localization from Essential Matrices](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01293) [[Notes](paper_notes\u002Fto_learn_or_not.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [SIFT + 5 pt solver >> others for VO, Laura Leal-Taixe@TUM]\n- [DF-VO: Visual Odometry Revisited: What Should Be Learnt?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09803) [[Notes](paper_notes\u002Fdf_vo.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [Depth and Flow for accurate VO]\n- [D3VO: Deep Depth, Deep Pose and Deep Uncertainty for Monocular Visual Odometry](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.01060) [[Notes](paper_notes\u002Fd3vo.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Daniel Cremers, TUM, depth uncertainty]\n- [Network Slimming: Learning Efficient Convolutional Networks through Network Slimming](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.06519) [[Notes](paper_notes\u002Fnetwork_slimming.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [BatchNorm Pruning: Rethinking the Smaller-Norm-Less-Informative Assumption in Channel Pruning of Convolution Layers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00124) [[Notes](paper_notes\u002Fbatchnorm_pruning.md)] \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [Direct Sparse Odometry](https:\u002F\u002Farxiv.org\u002Fabs\u002F1607.02565) \u003Ckbd>PAMI 2018\u003C\u002Fkbd>\n- [Train in Germany, Test in The USA: Making 3D Object Detectors Generalize](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08139) [[Notes](paper_notes\u002Ftrain_in_germany.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [PseudoLidarV3: End-to-End Pseudo-LiDAR for Image-Based 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.03080) 
[[Notes](paper_notes\u002Fpseudo_lidar_v3.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [ATSS: Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02424) [[Notes](paper_notes\u002Fatss.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.08287) \u003Ckbd>AAAI 2020\u003C\u002Fkbd>\n- [Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.03572) [Journal version]\n- [YOLOv4: Optimal Speed and Accuracy of Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10934) [[Notes](paper_notes\u002Fyolov4.md)]\n- [CBN: Cross-Iteration Batch Normalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05712) [[Notes](paper_notes\u002Fcbn.md)]\n- [Stitcher: Feedback-driven Data Provider for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12432) [[Notes](paper_notes\u002Fstitcher.md)]\n- [SKNet: Selective Kernel Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06586) [[Notes](paper_notes\u002Fsknet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [CBAM: Convolutional Block Attention Module](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.06521) [[Notes](paper_notes\u002Fcbam.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> \n- [ResNeSt: Split-Attention Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.08955) [[Notes](paper_notes\u002Fresnest.md)]\n\n## 2020-04 (14)\n- [ChauffeurNet: Learning to Drive by Imitating the Best and Synthesizing the Worst](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03079.pdf) [[Notes](paper_notes\u002Fchauffeurnet.md)] \u003Ckbd>RSS 2019\u003C\u002Fkbd> [Waymo]\n- [IntentNet: Learning to Predict Intention from Raw Sensor Data](http:\u002F\u002Fwww.cs.toronto.edu\u002F~wenjie\u002Fpapers\u002Fintentnet_corl18.pdf) [[Notes](paper_notes\u002Fintentnet.md)] \u003Ckbd>CoRL 2018\u003C\u002Fkbd> [Uber ATG, perception and prediction, Lidar+Map]\n- [RoR: Rules of the Road: Predicting Driving Behavior with a Convolutional Model of Semantic Interactions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08945) [[Notes](paper_notes\u002Fror.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Zoox]\n- [MultiPath: Multiple Probabilistic Anchor Trajectory Hypotheses for Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05449) [[Notes](paper_notes\u002Fmultipath.md)] \u003Ckbd>CoRL 2019\u003C\u002Fkbd> [Waymo, authors from RoR and ChauffeurNet]\n- [NMP: End-to-end Interpretable Neural Motion Planner](http:\u002F\u002Fwww.cs.toronto.edu\u002F~wenjie\u002Fpapers\u002Fcvpr19\u002Fnmp.pdf) [[Notes](paper_notes\u002Fnmp.md)] \u003Ckbd>CVPR 2019 oral\u003C\u002Fkbd> [Uber ATG]\n- [Multimodal Trajectory Predictions for Autonomous Driving using Deep Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.10732) [[Notes](paper_notes\u002Fmultipath_uber.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [Henggang Cui, Multimodal, Uber ATG Pittsburgh]\n- [Uncertainty-aware Short-term Motion Prediction of Traffic Actors for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.05819) \u003Ckbd>WACV 2020\u003C\u002Fkbd> [Uber ATG Pittsburgh] \n- [TensorMask: A Foundation for Dense Object Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.12174) [[Notes](paper_notes\u002Ftensormask.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> 
[single-stage instance seg]\n- [BlendMask: Top-Down Meets Bottom-Up for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00309) [[Notes](paper_notes\u002Fblendmask.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [Mask Encoding for Single Shot Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11712) [[Notes](paper_notes\u002Fmeinst.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [single-stage instance seg, Chunhua Shen]\n- [PolarMask: Single Shot Instance Segmentation with Polar Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.13226) [[Notes](paper_notes\u002Fpolarmask.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [single-stage instance seg]\n- [SOLO: Segmenting Objects by Locations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04488) [[Notes](paper_notes\u002Fsolo.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [single-stage instance seg, Chunhua Shen]\n- [SOLOv2: Dynamic, Faster and Stronger](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.10152) [[Notes](paper_notes\u002Fsolov2.md)] [single-stage instance seg, Chunhua Shen]\n- [CondInst: Conditional Convolutions for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.05664) [[Notes](paper_notes\u002Fcondinst.md)] \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd> [single-stage instance seg, Chunhua Shen]\n- [CenterMask: Single Shot Instance Segmentation With Point Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04446) [[Notes](paper_notes\u002Fcentermask.md)]\u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n\n\n## 2020-03 (15)\n- [VPGNet: Vanishing Point Guided Network for Lane and Road Marking Detection and Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.06288) [[Notes](paper_notes\u002Fvpgnet.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [Which Tasks Should Be Learned Together in Multi-task Learning?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07553) [[Notes](paper_notes\u002Ftask_grouping.md)] [Stanford, MTL] \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [MGDA: Multi-Task Learning as Multi-Objective Optimization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04650) \u003Ckbd>NeurIPS 2018\u003C\u002Fkbd>\n- [Taskonomy: Disentangling Task Transfer Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.08328) [[Notes](paper_notes\u002Ftaskonomy.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Rethinking ImageNet Pre-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08883) [[Notes](paper_notes\u002Frethinking_pretraining.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Kaiming He]\n- [UnsuperPoint: End-to-end Unsupervised Interest Point Detector and Descriptor](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.04011) [[Notes](paper_notes\u002Funsuperpoint.md)] [superpoint]\n- [KP2D: Neural Outlier Rejection for Self-Supervised Keypoint Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10615) [[Notes](paper_notes\u002Fkp2d.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> (pointNet)\n- [KP3D: Self-Supervised 3D Keypoint Learning for Ego-motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03426) [[Notes](paper_notes\u002Fkp3d.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [Toyota, superpoint]\n- [NG-RANSAC: Neural-Guided RANSAC: Learning Where to Sample Model Hypotheses](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.04132) [[Notes](paper_notes\u002Fng_ransac.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [pointNet]\n- [Learning to Find Good Correspondences](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.05971) [[Notes](paper_notes\u002Flearning_correspondence.md)] \u003Ckbd>CVPR 
2018 Oral\u003C\u002Fkbd> (pointNet)\n- [RefinedMPL: Refined Monocular PseudoLiDAR for 3D Object Detection in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09712) [[Notes](paper_notes\u002Frefined_mpl.md)] [Huawei, Mono3D]\n- [DSP: Monocular 3D Object Detection with Decoupled Structured Polygon Estimation and Height-Guided Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01619) [[Notes](paper_notes\u002Fdsp.md)] \u003Ckbd>AAAI 2020\u003C\u002Fkbd> (SenseTime, Mono3D)\n- [Robust Lane Detection from Continuous Driving Scenes Using Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.02193) (LLD, LSTM)\n- [LaneNet: Towards End-to-End Lane Detection: an Instance Segmentation Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05591) [[Notes](paper_notes\u002Flanenet.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd> (LaneNet)\n- [3D-LaneNet: End-to-End 3D Multiple Lane Detection](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FGarnett_3D-LaneNet_End-to-End_3D_Multiple_Lane_Detection_ICCV_2019_paper.pdf) [[Notes](paper_notes\u002F3d_lanenet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Semi-Local 3D Lane Detection and Uncertainty Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.05257) [[Notes](paper_notes\u002Fsemilocal_3d_lanenet.md)] [GM Israel, 3D LLD]\n- [Gen-LaneNet: A Generalized and Scalable Approach for 3D Lane Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.10656) [[Notes](paper_notes\u002Fgen_lanenet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Apollo, 3D LLD]\n- [Long-Term On-Board Prediction of People in Traffic Scenes under Uncertainty](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.09026) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [Egocentric prediction]\n- [It’s Not All About Size: On the Role of Data Properties in Pedestrian Detection](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCVW_2018\u002Fpapers\u002F11129\u002FRasouli_Its_Not_All_About_Size_On_the_Role_of_Data_ECCVW_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [pedestrian]\n\n\n## 2020-02 (12)\n- [Associative Embedding: End-to-End Learning for Joint Detection and Grouping](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.05424) [[Notes](paper_notes\u002Fassociative_embedding.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Pixels to Graphs by Associative Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.07365) [[Notes](paper_notes\u002Fpixels_to_graphs.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Social LSTM: Human Trajectory Prediction in Crowded Spaces](http:\u002F\u002Fcvgl.stanford.edu\u002Fpapers\u002FCVPR16_Social_LSTM.pdf) [[Notes](paper_notes\u002Fsocial_lstm.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> \n- [Online Video Object Detection using Association LSTM](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FLu__Online_Video_ICCV_2017_paper.pdf) [[Notes](paper_notes\u002Fassociation_lstm.md)] [single stage, recurrent]\n- [SuperPoint: Self-Supervised Interest Point Detection and Description](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.07629) [[Notes](paper_notes\u002Fsuperpoint.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (channel-to-pixel, deep SLAM, Magic Leap)\n- [PointRend: Image Segmentation as Rendering](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08193) [[Notes](paper_notes\u002Fpointrend.md)] \u003Ckbd>CVPR 2020 Oral\u003C\u002Fkbd> [Kaiming He, FAIR]\n- [Multigrid: A Multigrid Method for Efficiently Training Video Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00998) 
[[Notes](paper_notes\u002Fmultigrid_training.md)] \u003Ckbd>CVPR 2020 Oral\u003C\u002Fkbd> [Kaiming He, FAIR]\n- [GhostNet: More Features from Cheap Operations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11907) [[Notes](paper_notes\u002Fghostnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FixRes: Fixing the train-test resolution discrepancy](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06423) [[Notes](paper_notes\u002Ffixres.md)] \u003Ckbd>NIPS 2019\u003C\u002Fkbd> [FAIR]\n- [MoVi-3D: Towards Generalization Across Depth for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08035) [[Notes](paper_notes\u002Fmovi_3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Virtual Cam, viewport, Mapillary\u002FFacebook, Mono3D] \n- [Amodal Completion and Size Constancy in Natural Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1509.08147) [[Notes](paper_notes\u002Famodal_completion.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd> (Amodal completion)\n- [MoCo: Momentum Contrast for Unsupervised Visual Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05722) [[Notes](paper_notes\u002Fmoco.md)] \u003Ckbd>CVPR 2020 Oral\u003C\u002Fkbd> [FAIR, Kaiming He]\n\n\n## 2020-01 (19)\n- [Double Descent: Reconciling modern machine learning practice and the bias-variance trade-off](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.11118) [[Notes](paper_notes\u002Fdouble_descent.md)] \u003Ckbd>PNAS 2019\u003C\u002Fkbd>\n- [Deep Double Descent: Where Bigger Models and More Data Hurt](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02292) [[Notes](paper_notes\u002Fdeep_double_descent.md)]\n- [Visualizing the Loss Landscape of Neural Nets](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.09913) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [The ApolloScape Open Dataset for Autonomous Driving and its Application](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.06184.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (dataset)\n- [ApolloCar3D: A Large 3D Car Instance Understanding Benchmark for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.12222) [[Notes](paper_notes\u002Fapollocar3d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Part-level Car Parsing and Reconstruction from a Single Street View](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10837) [[Notes](paper_notes\u002Fapollo_car_parts.md)] [Baidu]\n- [6D-VNet: End-to-end 6DoF Vehicle Pose Estimation from Monocular RGB Images](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FAutonomous%20Driving\u002FWu_6D-VNet_End-to-End_6-DoF_Vehicle_Pose_Estimation_From_Monocular_RGB_Images_CVPRW_2019_paper.pdf) [[Notes](paper_notes\u002F6d_vnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [RTM3D: Real-time Monocular 3D Detection from Object Keypoints for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03343) [[Notes](paper_notes\u002Frtm3d.md)] \u003Ckbd>ECCV 2020 spotlight\u003C\u002Fkbd>\n- [DORN: Deep Ordinal Regression Network for Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.02446) [[Notes](paper_notes\u002Fdorn.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [monodepth, supervised]\n- [D&T: Detect to Track and Track to Detect](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.03958) [[Notes](paper_notes\u002Fdetect_track.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> (from Feichtenhofer)\n- [CRF-Net: A Deep Learning-based Radar and Camera Sensor Fusion Architecture for Object Detection](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8916629\u002F) 
[[Notes](paper_notes\u002Fcrf_net.md)] \u003Ckbd>SDF 2019\u003C\u002Fkbd> (radar detection)\n- [RVNet: Deep Sensor Fusion of Monocular Camera and Radar for Image-based Obstacle Detection in Challenging Environments](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FVijay_John3\u002Fpublication\u002F335833918_RVNet_Deep_Sensor_Fusion_of_Monocular_Camera_and_Radar_for_Image-based_Obstacle_Detection_in_Challenging_Environments\u002Flinks\u002F5d7f164e92851c87c38b09f1\u002FRVNet-Deep-Sensor-Fusion-of-Monocular-Camera-and-Radar-for-Image-based-Obstacle-Detection-in-Challenging-Environments.pdf) [[Notes](paper_notes\u002Frvnet.md)] \u003Ckbd>PSIVT 2019\u003C\u002Fkbd>\n- [RRPN: Radar Region Proposal Network for Object Detection in Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.00526) [[Notes](paper_notes\u002Frrpn_radar.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [ROLO: Spatially Supervised Recurrent Convolutional Neural Networks for Visual Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1607.05781) [[Notes](paper_notes\u002Frolo.md)] \u003Ckbd>ISCAS 2016\u003C\u002Fkbd>\n- [Recurrent SSD: Recurrent Multi-frame Single Shot Detector for Video Object Detection](https:\u002F\u002Fwww.merl.com\u002Fpublications\u002Fdocs\u002FTR2018-137.pdf) [[Notes](paper_notes\u002Frecurrent_ssd.md)] \u003Ckbd>BMVC 2018\u003C\u002Fkbd> (Mitsubishi)\n- [Recurrent RetinaNet: A Video Object Detection Model Based on Focal Loss](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-030-04212-7_44) [[Notes](paper_notes\u002Frecurrent_retinanet.md)] \u003Ckbd>ICONIP 2018\u003C\u002Fkbd> (single stage, recurrent)\n- [Actions as Moving Points](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04608) [[Notes](paper_notes\u002Fmoc.md)] [not suitable for online]\n- [The PREVENTION dataset: a novel benchmark for PREdiction of VEhicles iNTentIONs](https:\u002F\u002Fdoi.org\u002F10.1109\u002FITSC.2019.8917433) [[Notes](paper_notes\u002Fprevention_dataset.md)] \u003Ckbd>ITSC 2019\u003C\u002Fkbd> [dataset, cut-in]\n- [Semi-Automatic High-Accuracy Labelling Tool for Multi-Modal Long-Range Sensor Dataset](https:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FIVS.2018.8500672) [[Notes](paper_notes\u002Fprevention_annotation.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd>\n- [Astyx dataset: Automotive Radar Dataset for Deep Learning Based 3D Object Detection](https:\u002F\u002Fwww.astyx.com\u002Ffileadmin\u002Fredakteur\u002Fdokumente\u002FAutomotive_Radar_Dataset_for_Deep_learning_Based_3D_Object_Detection.PDF) [[Notes](paper_notes\u002Fastyx_dataset.md)] \u003Ckbd>EuRAD 2019\u003C\u002Fkbd> (Astyx)\n- [Astyx camera radar: Deep Learning Based 3D Object Detection for Automotive Radar and Camera](https:\u002F\u002Fwww.astyx.net\u002Ffileadmin\u002Fredakteur\u002Fdokumente\u002FDeep_Learning_Based_3D_Object_Detection_for_Automotive_Radar_and_Camera.PDF) [[Notes](paper_notes\u002Fastyx_radar_camera_fusion.md)] \u003Ckbd>EuRAD 2019\u003C\u002Fkbd> (Astyx)\n\n\n## 2019-12 (12)\n- [How Do Neural Networks See Depth in Single Images?](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002Fvan_Dijk_How_Do_Neural_Networks_See_Depth_in_Single_Images_ICCV_2019_paper.pdf) [[Notes](paper_notes\u002Fwhat_monodepth_see.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Self-supervised Sparse-to-Dense: Self-supervised Depth Completion from LiDAR and Monocular Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00275) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (depth completion)\n- [DC: Depth Coefficients for Depth 
Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05421) [[Notes](paper_notes\u002Fdepth_coeff.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Xiaoming Liu, Multimodal]\n- [Parse Geometry from a Line: Monocular Depth Estimation with Partial Laser Observation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.02174.pdf) [[Notes](paper_notes\u002Fdepth_from_one_line.md)] \u003Ckbd>ICRA 2017\u003C\u002Fkbd>\n- [VO-Monodepth: Enhancing self-supervised monocular depth estimation with traditional visual odometry](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03127) [[Notes](paper_notes\u002Fvo_monodepth.md)] \u003Ckbd>3DV 2019\u003C\u002Fkbd> (sparse to dense)\n- [Probabilistic Object Detection: Definition and Evaluation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10800) [[Notes](paper_notes\u002Fpdq.md)]\n- [The Fishyscapes Benchmark: Measuring Blind Spots in Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03215) [[Notes](paper_notes\u002Ffishyscape.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [On Calibration of Modern Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.04599) [[Notes](paper_notes\u002Fcalib_modern_nn.md)] \u003Ckbd>ICML 2017\u003C\u002Fkbd> (Weinberger)\n- [Extreme clicking for efficient object annotation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.02750) [[Notes](paper_notes\u002Fextreme_clicking.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [Radar and Camera Early Fusion for Vehicle Detection in Advanced Driver Assistance Systems](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers\u002FRadar%20and%20Camera%20Early%20Fusion%20for%20Vehicle%20Detection%20in%20Advanced%20Driver%20Assistance%20Systems.pdf) [[Notes](paper_notes\u002Fradar_camera_qcom.md)] \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd> (radar)\n- [Deep Active Learning for Efficient Training of a LiDAR 3D Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10609) [[Notes](paper_notes\u002Fdeep_active_learning_lidar.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd>\n- [C3DPO: Canonical 3D Pose Networks for Non-Rigid Structure From Motion](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02533) [[Notes](paper_notes\u002Fc3dpo.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [YOLACT: Real-time Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02689) [[Notes](paper_notes\u002Fyolact.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [single-stage instance seg]\n- [YOLACT++: Better Real-time Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06218) [single-stage instance seg]\n\n\n## 2019-11 (20)\n- [Review of Image and Feature Descriptors](paper_notes\u002Freview_descriptors.md)\n- [Vehicle Detection With Automotive Radar Using Deep Learning on Range-Azimuth-Doppler Tensors](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FCVRSUAD\u002FMajor_Vehicle_Detection_With_Automotive_Radar_Using_Deep_Learning_on_Range-Azimuth-Doppler_ICCVW_2019_paper.pdf) [[Notes](paper_notes\u002Fradar_fft_qcom.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [GPP: Ground Plane Polling for 6DoF Pose Estimation of Objects on the Road](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.06666) [[Notes](paper_notes\u002Fgpp.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [UCSD, Trevidi, mono 3DOD]\n- [MVRA: Multi-View Reprojection Architecture for Orientation Estimation](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FADW\u002FChoi_Multi-View_Reprojection_Architecture_for_Orientation_Estimation_ICCVW_2019_paper.pdf) 
[[Notes](paper_notes\u002Fmvra.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [YOLOv3: An Incremental Improvement](https:\u002F\u002Fpjreddie.com\u002Fmedia\u002Ffiles\u002Fpapers\u002FYOLOv3.pdf)\n- [Gaussian YOLOv3: An Accurate and Fast Object Detector Using Localization Uncertainty for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.04620) [[Notes](paper_notes\u002Fgaussian_yolov3.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (Detection with Uncertainty)\n- [Bayesian YOLOv3: Uncertainty Estimation in One-Stage Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.10296) [[Notes](paper_notes\u002Fbayesian_yolov3.md)] [DriveU]\n- [Towards Safe Autonomous Driving: Capture Uncertainty in the Deep Neural Network For Lidar 3D Vehicle Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.05132) [[Notes](paper_notes\u002Ftowards_safe_ad.md)] \u003Ckbd>ITSC 2018\u003C\u002Fkbd> (DriveU)\n- [Leveraging Heteroscedastic Aleatoric Uncertainties for Robust Real-Time LiDAR 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.05590) [[Notes](paper_notes\u002Ftowards_safe_ad2.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> (DriveU)\n- [Can We Trust You? On Calibration of a Probabilistic Object Detector for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12358) [[Notes](paper_notes\u002Ftowards_safe_ad_calib.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd> (DriveU)\n- [LaserNet: An Efficient Probabilistic 3D Object Detector for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.08701) [[Notes](paper_notes\u002Flasernet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (uncertainty)\n- [LaserNet KL: Learning an Uncertainty-Aware Object Detector for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11375) [[Notes](paper_notes\u002Flasernet_kl.md)] [LaserNet with KL divergence]\n- [IoUNet: Acquisition of Localization Confidence for Accurate Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.11590) [[Notes](paper_notes\u002Fiou_net.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [gIoU: Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09630) [[Notes](paper_notes\u002Fgiou.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [The Lovász-Softmax loss: A tractable surrogate for the optimization of the intersection-over-union measure in neural networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.08790) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [IoU as loss]\n- [KL Loss: Bounding Box Regression with Uncertainty for Accurate Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.08545) [[Notes](paper_notes\u002Fkl_loss.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [CAM-Convs: Camera-Aware Multi-Scale Convolutions for Single-View Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02028) [[Notes](paper_notes\u002Fcam_conv.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [BayesOD: A Bayesian Approach for Uncertainty Estimation in Deep Object Detectors](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.03838) [[Notes](paper_notes\u002Fbayes_od.md)]\n- [TW-SMNet: Deep Multitask Learning of Tele-Wide Stereo Matching](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04463) [[Notes](paper_notes\u002Ftwsm_net.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [Accurate Uncertainties for Deep Learning Using Calibrated Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00263) [[Notes](paper_notes\u002Fdl_regression_calib.md)] \u003Ckbd>ICML 2018\u003C\u002Fkbd>\n- 
[Calibrating Uncertainties in Object Localization Task](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.11210) [[Notes](paper_notes\u002F2dod_calib.md)] \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [SMWA: On the Over-Smoothing Problem of CNN Based Disparity Estimation](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FChen_On_the_Over-Smoothing_Problem_of_CNN_Based_Disparity_Estimation_ICCV_2019_paper.pdf) [[Notes](paper_notes\u002Fsmwa.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Multimodal, depth estimation]\n- [Sparse-to-Dense: Depth Prediction from Sparse Depth Samples and a Single Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.07492) [[Notes](paper_notes\u002Fsparse_to_dense.md)] \u003Ckbd>ICRA 2018\u003C\u002Fkbd> (depth completion)\n\n\n## 2019-10 (18)\n- [Review of monocular object detection](paper_notes\u002Freview_mono_3dod.md)\n- [Review of 2D 3D constraints in Mono 3DOD](paper_notes\u002Fmono_3dod_2d3d_constraints.md)\n- [MonoGRNet 2: Monocular 3D Object Detection via Geometric Reasoning on Keypoints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05618) [[Notes](paper_notes\u002Fmonogrnet_russian.md)] [estimates depth from keypoints]\n- [Deep MANTA: A Coarse-to-fine Many-Task Network for joint 2D and 3D vehicle analysis from monocular image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.07570) [[Notes](paper_notes\u002Fdeep_manta.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [SS3D: Monocular 3D Object Detection and Box Fitting Trained End-to-End Using Intersection-over-Union Loss](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08070) [[Notes](paper_notes\u002Fss3d.md)] [regresses distance from images, centernet like]\n- [GS3D: An Efficient 3D Object Detection Framework for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10955) [[Notes](paper_notes\u002Fgs3d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [M3D-RPN: Monocular 3D Region Proposal Network for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.06038) [[Notes](paper_notes\u002Fm3d_rpn.md)] \u003Ckbd>ICCV 2019 oral\u003C\u002Fkbd> [3D anchors, cyclists, Xiaoming Liu]\n- [TLNet: Triangulation Learning Network: from Monocular to Stereo 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01193) [[Notes](paper_notes\u002Ftlnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [A Survey on 3D Object Detection Methods for Autonomous Driving Applications](http:\u002F\u002Fwrap.warwick.ac.uk\u002F114314\u002F1\u002FWRAP-survey-3D-object-detection-methods-autonomous-driving-applications-Arnold-2019.pdf) [[Notes](paper_notes\u002F3dod_review.md)] \u003Ckbd>TITS 2019\u003C\u002Fkbd> [Review]\n- [BEV-IPM: Deep Learning based Vehicle Position and Orientation Estimation via Inverse Perspective Mapping Image](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8814050) [[Notes](paper_notes\u002Fbev_od_ipm.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd>\n- [ForeSeE: Task-Aware Monocular Depth Estimation for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07701) [[Notes](paper_notes\u002Fforesee_mono3dod.md)] \u003Ckbd>AAAI 2020 oral\u003C\u002Fkbd> [successor to pseudo-lidar, mono 3DOD SOTA]\n- [Obj-dist: Learning Object-specific Distance from a Monocular Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04182) [[Notes](paper_notes\u002Fobj_dist_iccv2019.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (xmotors.ai + NYU) [monocular distance]\n- [DisNet: A novel method for distance estimation from monocular 
camera](https:\u002F\u002Fproject.inria.fr\u002Fppniv18\u002Ffiles\u002F2018\u002F10\u002Fpaper22.pdf) [[Notes](paper_notes\u002Fdisnet.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> [monocular distance]\n- [BirdGAN: Learning 2D to 3D Lifting for Object Detection in 3D for Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08494) [[Notes](paper_notes\u002Fbirdgan.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd> \n- [Shift R-CNN: Deep Monocular 3D Object Detection with Closed-Form Geometric Constraints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09970) [[Notes](paper_notes\u002Fshift_rcnn.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [3D-RCNN: Instance-level 3D Object Reconstruction via Render-and-Compare](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FKundu_3D-RCNN_Instance-Level_3D_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002F3d_rcnn.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Deep Optics for Monocular Depth Estimation and 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08601) [[Notes](paper_notes\u002Fdeep_optics.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [MonoLoco: Monocular 3D Pedestrian Localization and Uncertainty Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06059) [[Notes](paper_notes\u002Fmonoloco.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Joint Monocular 3D Vehicle Detection and Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10742) [[Notes](paper_notes\u002Fmono_3d_tracking.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (Berkeley DeepDrive)\n- [CasGeo: 3D Bounding Box Estimation for Autonomous Vehicles by Cascaded Geometric Constraints and Depurated 2D Detections Using 3D Results](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01867) [[Notes](paper_notes\u002Fcasgeo.md)]\n\n\n\n## 2019-09 (17)\n- [Slimmable Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.08928) [[Notes](paper_notes\u002Fslimmable_networks.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd>\n- [Universally Slimmable Networks and Improved Training Techniques](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05134) [[Notes](paper_notes\u002Funiversal_slimmable.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [AutoSlim: Towards One-Shot Architecture Search for Channel Numbers](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11728)\n- [Once for All: Train One Network and Specialize it for Efficient Deployment](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.09791.pdf)\n- [DOTA: A Large-scale Dataset for Object Detection in Aerial Images](https:\u002F\u002Fvision.cornell.edu\u002Fse3\u002Fwp-content\u002Fuploads\u002F2018\u002F03\u002F2666.pdf) [[Notes](paper_notes\u002Fdota.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (rotated bbox)\n- [RoiTransformer: Learning RoI Transformer for Oriented Object Detection in Aerial Images](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FDing_Learning_RoI_Transformer_for_Oriented_Object_Detection_in_Aerial_Images_CVPR_2019_paper.pdf) [[Notes](paper_notes\u002Froi_transformer.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (rotated bbox)\n- [RRPN: Arbitrary-Oriented Scene Text Detection via Rotation Proposals](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.01086) \u003Ckbd>TMM 2018\u003C\u002Fkbd>\n- [R2CNN: Rotational Region CNN for Orientation Robust Scene Text Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.09579) (rotated bbox)\n- [TI white paper: Webinar: mmWave Radar for Automotive and Industrial 
applications\n](https:\u002F\u002Ftraining.ti.com\u002Fepd-pro-rap-mmwaveradar-adh-tr-webinar-eu) [[Notes](paper_notes\u002Fti_mmwave_radar_webinar.md)] [TI, radar]\n- [Federated Learning: Strategies for Improving Communication Efficiency](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.05492) [[Notes](paper_notes\u002Ffederated_learning_comm.md)] \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [sort: Simple Online and Realtime Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1602.00763) [[Notes](paper_notes\u002Fsort.md)] \u003Ckbd>ICIP 2016\u003C\u002Fkbd>\n- [deep-sort: Simple Online and Realtime Tracking with a Deep Association Metric](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.07402) [[Notes](paper_notes\u002Fdeep_sort.md)]\n- [MT-CNN: Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks](https:\u002F\u002Fkpzhang93.github.io\u002FMTCNN_face_detection_alignment\u002F) [[Notes](paper_notes\u002Fmtcnn.md)] \u003Ckbd>SPL 2016\u003C\u002Fkbd> (real time, facial landmark)\n- [RetinaFace: Single-stage Dense Face Localisation in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.00641) [[Notes](paper_notes\u002Fretina_face.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [joint object and landmark detection]\n- [SC-SfM-Learner: Unsupervised Scale-consistent Depth and Ego-motion Learning from Monocular Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10553) [[Notes](paper_notes\u002Fsc_sfm_learner.md)] \u003Ckbd>NIPS 2019\u003C\u002Fkbd>\n- [SiamMask: Fast Online Object Tracking and Segmentation: A Unifying Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.05050) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (tracking, segmentation, label propagation)\n- [Review of Kálmán Filter](https:\u002F\u002Fwww.bzarg.com\u002Fp\u002Fhow-a-kalman-filter-works-in-pictures\u002F) (from Tim Babb, Pixar Animation) [[Notes](paper_notes\u002Fkalman_filter.md)]\n- [R-FCN: Object Detection via Region-based Fully Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.06409) [[Notes](paper_notes\u002Frfcn.md)] \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [Guided backprop: Striving for Simplicity: The All Convolutional Net](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.6806.pdf) [[Notes](paper_notes\u002Fguided_backprop.md)] \u003Ckbd>ICLR 2015\u003C\u002Fkbd>\n- [Occlusion-Net: 2D\u002F3D Occluded Keypoint Localization Using Graph Networks](http:\u002F\u002Fwww.cs.cmu.edu\u002F~mvo\u002Findex_files\u002FPapers\u002FONet_19.pdf) [[Notes](paper_notes\u002Focclusion_net.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Boxy Vehicle Detection in Large Images](https:\u002F\u002Fboxy-dataset.com\u002Fboxy\u002Findex) [[Notes](paper_notes\u002Fboxy.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [FQNet: Deep Fitting Degree Scoring Network for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.12681) [[Notes](paper_notes\u002Ffqnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Mono 3DOD, Jiwen Lu]\n\n\n## 2019-08 (18)\n- [Mono3D: Monocular 3D Object Detection for Autonomous Driving](https:\u002F\u002Fwww.cs.toronto.edu\u002F~urtasun\u002Fpublications\u002Fchen_etal_cvpr16.pdf) [[Notes](paper_notes\u002Fmono3d.md)] \u003Ckbd>CVPR2016\u003C\u002Fkbd>\n- [MonoDIS: Disentangling Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.12365) [[Notes](paper_notes\u002Fmonodis.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Pseudo lidar-e2e: Monocular 3D Object Detection with Pseudo-LiDAR Point Cloud](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.09847) 
[[Notes](paper_notes\u002Fpseudo_lidar_e2e.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (pseudo-lidar with 2d and 3d consistency loss, better than PL and worse than PL++, SOTA for pure mono3D)\n- [MonoGRNet: A Geometric Reasoning Network for Monocular 3D Object Localization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10247) [[Notes](paper_notes\u002Fmonogrnet.md)] \u003Ckbd>AAAI 2019\u003C\u002Fkbd> (SOTA of Mono3DOD, MLF \u003C MonoGRNet \u003C Pseudo-lidar)\n- [MLF: Multi-Level Fusion based 3D Object Detection from Monocular Images](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FXu_Multi-Level_Fusion_Based_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Fmlf.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (precursor to pseudo-lidar)\n- [ROI-10D: Monocular Lifting of 2D Detection to 6D Pose and Metric Shape](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.02781) [[Notes](paper_notes\u002Froi10d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [AM3D: Accurate Monocular 3D Object Detection via Color-Embedded 3D Reconstruction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11444) [[Notes](paper_notes\u002Fam3d.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [similar to pseudo-lidar, color-enhanced]\n- [Mono3D++: Monocular 3D Vehicle Detection with Two-Scale 3D Hypotheses and Task Priors](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.03446) [[Notes](paper_notes\u002Fmono3d++.md)] (from Stefano Soatto) \u003Ckbd>AAAI 2019\u003C\u002Fkbd>\n- [Deep Metadata Fusion for Traffic Light to Lane Assignment](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=8613841) [[Notes](paper_notes\u002Fdeep_lane_association.md)] \u003Ckbd>IEEE RA-L 2019\u003C\u002Fkbd> (traffic lights association)\n- [Automatic Traffic Light to Ego Vehicle Lane Association at Complex Intersections](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8569421) \u003Ckbd>ITSC 2019\u003C\u002Fkbd> (traffic lights association)\n- [Distant Vehicle Detection Using Radar and Vision](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10951)[[Notes](paper_notes\u002Fdistant_object_radar.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [radar, vision, radar tracklets fusion]\n- [Distance Estimation of Monocular Based on Vehicle Pose Information](https:\u002F\u002Fiopscience.iop.org\u002Farticle\u002F10.1088\u002F1742-6596\u002F1168\u002F3\u002F032040\u002Fpdf) [[Notes](paper_notes\u002Fdistance_estimation_pose_radar.md)]\n- [Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry and Semantics](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.07115) [[Notes](paper_notes\u002Funcertainty_multitask.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (Alex Kendall)\n- [GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.02257) [[Notes](paper_notes\u002Fgradnorm.md)] \u003Ckbd>ICML 2018\u003C\u002Fkbd> (multitask)\n- [DTP: Dynamic Task Prioritization for Multitask Learning](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMichelle_Guo_Focus_on_the_ECCV_2018_paper.pdf) [[Notes](paper_notes\u002Fdtp.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [multitask, Stanford]\n- [Will this car change the lane? 
- Turn signal recognition in the frequency domain](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F6856477\u002F) [[Notes](paper_notes\u002Ftsl_frequency.md)] \u003Ckbd>IV 2014\u003C\u002Fkbd>\n- [Complex-YOLO: Real-time 3D Object Detection on Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06199) [[Notes](paper_notes\u002Fcomplex_yolo.md)] (BEV detection only)\n- [Complexer-YOLO: Real-Time 3D Object Detection and Tracking on Semantic Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07537) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (sensor fusion and tracking)\n- [An intriguing failing of convolutional neural networks and the CoordConv solution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03247) [[Notes](paper_notes\u002Fcoord_conv.md)] \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n\n\n\n## 2019-07 (19)\n- [Deep Parametric Continuous Convolutional Neural Networks](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FWang_Deep_Parametric_Continuous_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Fparametric_cont_conv.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (@Uber, sensor fusion)\n- [ContFuse: Deep Continuous Fusion for Multi-Sensor 3D Object Detection](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMing_Liang_Deep_Continuous_Fusion_ECCV_2018_paper.pdf) [[Notes](paper_notes\u002Fcontfuse.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [Uber ATG, sensor fusion, BEV]\n- [Fast and Furious: Real Time End-to-End 3D Detection, Tracking and Motion Forecasting with a Single Convolutional Net](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FLuo_Fast_and_Furious_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Ffaf.md)] \u003Ckbd>CVPR 2018 oral\u003C\u002Fkbd> [lidar only, perception and prediction]\n- [LearnK: Depth from Videos in the Wild: Unsupervised Monocular Depth Learning from Unknown Cameras](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.04998.pdf) [[Notes](paper_notes\u002Flearnk.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [monocular depth estimation, intrinsic estimation, SOTA]\n- [monodepth: Unsupervised Monocular Depth Estimation with Left-Right Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.03677) [[Notes](paper_notes\u002Fmonodepth.md)] \u003Ckbd>CVPR 2017 oral\u003C\u002Fkbd> (monocular depth estimation, stereo for training)\n- [Struct2depth: Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.06152) [[Notes](paper_notes\u002Fstruct2depth.md)] \u003Ckbd>AAAI 2019\u003C\u002Fkbd> [monocular depth estimation, estimating movement of dynamic object, infinite depth problem, online finetune]\n- [Unsupervised Learning of Geometry with Edge-aware Depth-Normal Consistency](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.03665.pdf) [[Notes](paper_notes\u002Fedge_aware_depth_normal.md)] \u003Ckbd>AAAI 2018\u003C\u002Fkbd> (monocular depth estimation, static assumption, surface normal)\n- [LEGO Learning Edge with Geometry all at Once by Watching Videos](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.05648.pdf) [[Notes](paper_notes\u002Flego.md)] \u003Ckbd>CVPR 2018 spotlight\u003C\u002Fkbd> (monocular depth estimation, static assumption, surface normal)\n- [Object Detection and 3D Estimation via an FMCW Radar Using a Fully Convolutional Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.05394) [[Notes](paper_notes\u002Fradar_3d_od_fcn.md)] (radar, RD map, OD, Arxiv 201902) \n- [A study 
on Radar Target Detection Based on Deep Neural Networks](https:\u002F\u002Fwww.researchgate.net\u002Fpublication\u002F330748053_A_Study_on_Radar_Target_Detection_Based_on_Deep_Neural_Networks) [[Notes](paper_notes\u002Fradar_target_detection_tsinghua.md)] (radar, RD map, OD) \n- [2D Car Detection in Radar Data with PointNets](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08414) [[Notes](paper_notes\u002Fradar_detection_pointnet.md)] (from Ulm Univ, radar, point cloud, OD, Arxiv 201904) \n- [Learning Confidence for Out-of-Distribution Detection in Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.04865) [[Notes](paper_notes\u002Flearning_ood_conf.md)] (budget to cheat)\n- [A Deep Learning Approach to Traffic Lights: Detection, Tracking, and Classification](assets\u002Fpapers\u002Fbosch_traffic_lights.pdf) [[Notes](paper_notes\u002Fbosch_traffic_lights.md)] \u003Ckbd>ICRA 2017\u003C\u002Fkbd> (Bosch, traffic lights)\n- [How hard can it be? Estimating the difficulty of visual search in an image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.08280) [[Notes](paper_notes\u002Fhow_hard_can_it_be.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.07830.pdf) [[Notes](paper_notes\u002Fdeep_fusion_review.md)] (review from Bosch)\n- [Review of monocular 3d object detection](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F57029694) (blog from 知乎)\n- [Deep3dBox: 3D Bounding Box Estimation Using Deep Learning and Geometry](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.00496) [[Notes](paper_notes\u002Fdeep3dbox.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [Zoox]\n- [MonoPSR: Monocular 3D Object Detection Leveraging Accurate Proposals and Shape Reconstruction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01690) [[Notes](paper_notes\u002Fmonopsr.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [OFT: Orthographic Feature Transform for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08188) [[Notes](paper_notes\u002Foft.md)] \u003Ckbd>BMVC 2019\u003C\u002Fkbd> [Convert camera to BEV, Alex Kendall]\n\n\n## 2019-06 (12)\n- [MixMatch: A Holistic Approach to Semi-Supervised Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02249) [[Notes](paper_notes\u002FMixMatch.md)]\n- [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.11946.pdf) [[Notes](paper_notes\u002Fefficientnet.md)] \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision?](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.04977) [[Notes](paper_notes\u002Funcertainty_bdl.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Bayesian SegNet: Model Uncertainty in Deep Convolutional Encoder-Decoder Architectures for Scene Understanding](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02680.pdf) [[Notes](paper_notes\u002Fbayesian_segnet.md)]\u003Ckbd>BMVC 2017\u003C\u002Fkbd>\n- [TrafficPredict: Trajectory Prediction for Heterogeneous Traffic-Agents](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.02146.pdf) [[Notes](paper_notes\u002Ftrafficpredict.md)] \u003Ckbd>AAAI 2019 oral\u003C\u002Fkbd>\n- [Deep Depth Completion of a Single RGB-D Image](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.09326.pdf) [[Notes](paper_notes\u002Fdeep_depth_completion_rgbd.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (indoor)\n- [DeepLiDAR: Deep Surface Normal 
Guided Depth Prediction for Outdoor Scene from Sparse LiDAR Data and Single Color Image](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.00488v2.pdf) [[Notes](paper_notes\u002Fdeeplidar.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (outdoor)\n- [SfMLearner: Unsupervised Learning of Depth and Ego-Motion from Video](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~tinghuiz\u002Fprojects\u002FSfMLearner\u002Fcvpr17_sfm_final.pdf) [[Notes](paper_notes\u002Fsfm_learner.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Monodepth2: Digging Into Self-Supervised Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.01260) [[Notes](paper_notes\u002Fmonodepth2.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Niantic]\n- [DeepSignals: Predicting Intent of Drivers Through Visual Signals](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01333.pdf) [[Notes](paper_notes\u002Fdeep_signals.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (@Uber, turn signal detection)\n- [FCOS: Fully Convolutional One-Stage Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01355) [[Notes](paper_notes\u002Ffcos.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Chunhua Shen]\n- [Pseudo-LiDAR++: Accurate Depth for 3D Object Detection in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06310) [[Notes](paper_notes\u002Fpseudo_lidar++.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd>\n- [MMF: Multi-Task Multi-Sensor Fusion for 3D Object Detection](http:\u002F\u002Fwww.cs.toronto.edu\u002F~byang\u002Fpapers\u002Fmmf.pdf) [[Notes](paper_notes\u002Fmmf.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (@Uber, sensor fusion)\n\n\n## 2019-05 (18)\n- [CenterNet: Objects as points](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07850) (from ExtremeNet authors) [[Notes](paper_notes\u002Fcenternet.md)]\n- [CenterNet: Object Detection with Keypoint Triplets](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08189) [[Notes](paper_notes\u002Fcenternet_cas.md)]\n- [Object Detection based on Region Decomposition and Assembly](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08225) [[Notes](paper_notes\u002Fobject_detection_region_decomposition.md)] \u003Ckbd>AAAI 2019 \u003C\u002Fkbd>\n- [The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03635) [[Notes](paper_notes\u002Flottery_ticket_hypothesis.md)] \u003Ckbd>ICLR 2019 \u003C\u002Fkbd>\n- [M2Det: A Single-Shot Object Detector based on Multi-Level Feature Pyramid Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04533) [[Notes](paper_notes\u002Fm2det.md)] \u003Ckbd>AAAI 2019 \u003C\u002Fkbd>\n- [Deep Radar Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.12187) [[Notes](paper_notes\u002Fdeep_radar_detector.md)] \u003Ckbd>RadarCon 2019\u003C\u002Fkbd>\n- [Semantic Segmentation on Radar Point Clouds](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8455344) [[Notes](paper_notes\u002Fradar_point_semantic_seg.md)] (from Daimler AG) \u003Ckbd>FUSION 2018\u003C\u002Fkbd>\n- [Pruning Filters for Efficient ConvNets](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.08710.pdf) [[Notes](paper_notes\u002Fpruning_filters.md)] \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [Layer-compensated Pruning for Resource-constrained Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.00518.pdf) [[Notes](paper_notes\u002Flayer_compensated_pruning.md)] \u003Ckbd>NIPS 2018 talk\u003C\u002Fkbd>\n- [LeGR: Filter Pruning via Learned Global Ranking](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.12368.pdf) 
[[Notes](paper_notes\u002Flegr.md)] \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07392.pdf) [[Notes](paper_notes\u002Fnas_fpn.md)] \u003Ckbd>CVPR 2019 \u003C\u002Fkbd>\n- [AutoAugment: Learning Augmentation Policies from Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.09501) [[Notes](paper_notes\u002Fautoaugment.md)] \u003Ckbd>CVPR 2019 \u003C\u002Fkbd>\n- [Path Aggregation Network for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.01534.pdf) [[Notes](paper_notes\u002Fpanet.md)] \u003Ckbd>CVPR 2018 \u003C\u002Fkbd>\n- [Channel Pruning for Accelerating Very Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.06168.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> (Face++, Yihui He) [[Notes](paper_notes\u002Fchannel_pruning_megvii.md)]\n- [AMC: AutoML for Model Compression and Acceleration on Mobile Devices](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.03494.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> (Song Han, Yihui He)\n- [MobileNetV3: Searching for MobileNetV3](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02244) [[Notes](paper_notes\u002Fmobilenets_v3.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [MnasNet: Platform-Aware Neural Architecture Search for Mobile](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.11626.pdf) [[Notes](paper_notes\u002Fmnasnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> \n- [Rethinking the Value of Network Pruning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.05270.pdf) \u003Ckbd>ICLR 2019\u003C\u002Fkbd>\n\n## 2019-04 (12)\n- [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.04381) (MobileNets v2) [[Notes](paper_notes\u002Fmobilenets_v2.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [A New Performance Measure and Evaluation Benchmark for Road Detection Algorithms](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FFritsch2013ITSC.pdf) [[Notes](paper_notes\u002Fkitti_lane.md)] \u003Ckbd>ITSC 2013\u003C\u002Fkbd>\n- [MultiNet: Real-time Joint Semantic Reasoning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.07695.pdf) [[Notes](paper_notes\u002Fmultinet_raquel.md)]\n- [Optimizing the Trade-off between Single-Stage and Two-Stage Object Detectors using Image Difficulty Prediction](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08707.pdf) (Very nice illustration of 1 and 2 stage object detection)\n- [Light-Head R-CNN: In Defense of Two-Stage Object Detector](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.07264.pdf) [[Notes](paper_notes\u002Flighthead_rcnn.md)] (from Megvii)\n- [CSP: High-level Semantic Feature Detection: A New Perspective for Pedestrian Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02948) [[Notes](paper_notes\u002Fcsp_pedestrian.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [center and scale prediction, anchor-free, near SOTA pedestrian]\n- [Review of Anchor-free methods (Zhihu blog): Object Detection in the Anchor-Free Era](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F62103812) [Anchor-free deep learning methods for object detection](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F64563186) [My Slides on CSP](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1_dUfxv63108bZXUnVYPIOAdEIkRZw5BR9-rOp-Ni0X0\u002F)\n- [DenseBox: Unifying Landmark Localization with End to End Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.04874.pdf)\n- [CornerNet: Detecting Objects as Paired 
Keypoints](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.01244.pdf) [[Notes](paper_notes\u002Fcornernet.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [ExtremeNet: Bottom-up Object Detection by Grouping Extreme and Center Points](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.08043.pdf) [[Notes](paper_notes\u002Fextremenet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [FSAF: Feature Selective Anchor-Free Module for Single-Shot Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.00621.pdf) [[Notes](paper_notes\u002Ffsaf_detection.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [FoveaBox: Beyond Anchor-based Object Detector](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.03797v1.pdf) (anchor-free) [[Notes](paper_notes\u002Ffoveabox.md)]\n\n\n\n## 2019-03 (19)\n- [Bag of Freebies for Training Object Detection Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.04103.pdf) [[Notes](paper_notes\u002Fbag_of_freebies_object_detection.md)]\n- [mixup: Beyond Empirical Risk Minimization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.09412.pdf) [[Notes](paper_notes\u002Fmixup.md)] \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [Multi-view Convolutional Neural Networks for 3D Shape Recognition](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FSu_Multi-View_Convolutional_Neural_ICCV_2015_paper.pdf) (MVCNN) [[Notes](paper_notes\u002Fmvcnn.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd> \n- [3D ShapeNets: A Deep Representation for Volumetric Shapes](http:\u002F\u002F3dshapenets.cs.princeton.edu\u002Fpaper.pdf) [[Notes](paper_notes\u002F3d_shapenets.md)] \u003Ckbd>CVPR 2015\u003C\u002Fkbd>\n- [Volumetric and Multi-View CNNs for Object Classification on 3D Data](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03265.pdf) [[Notes](paper_notes\u002Fvol_vs_mvcnn.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [Group Normalization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08494.pdf) [[Notes](paper_notes\u002Fgroupnorm.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Spatial Transformer Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.02025.pdf) [[Notes](paper_notes\u002Fstn.md)] \u003Ckbd>NIPS 2015\u003C\u002Fkbd>\n- [Frustum PointNets for 3D Object Detection from RGB-D Data](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.08488.pdf) (F-PointNet) [[Notes](paper_notes\u002Ffrustum_pointnet.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> \n- [Dynamic Graph CNN for Learning on Point Clouds](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.07829.pdf) [[Notes](paper_notes\u002Fedgeconv.md)]\n- [PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.04244) (SOTA for 3D object detection) [[Notes](paper_notes\u002Fpoint_rcnn.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [MV3D: Multi-View 3D Object Detection Network for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.07759) [[Notes](paper_notes\u002Fmv3d.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> (Baidu, sensor fusion, BV proposal)\n- [AVOD: Joint 3D Proposal Generation and Object Detection from View Aggregation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.02294) [[Notes](paper_notes\u002Favod.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> (sensor fusion, multiview proposal)\n- [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.04861) [[Notes](paper_notes\u002Fmobilenets.md)]\n- [Pseudo-LiDAR from Visual Depth Estimation: Bridging the Gap in 3D Object Detection for 
Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.07179) [[Notes](paper_notes\u002Fpseudo_lidar.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.06396.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (Apple, first end-to-end point cloud encoding to grid)\n- [SECOND: Sparsely Embedded Convolutional Detection](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F18\u002F10\u002F3337\u002Fpdf) \u003Ckbd>Sensors 2018\u003C\u002Fkbd> (builds on VoxelNet)\n- [PointPillars: Fast Encoders for Object Detection from Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.05784) [[Notes](paper_notes\u002Fpoint_pillars.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (builds on SECOND)\n- [Are we ready for Autonomous Driving? The KITTI Vision Benchmark Suite](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FGeiger2012CVPR.pdf) [[Notes](paper_notes\u002Fkitti.md)] \u003Ckbd>CVPR 2012\u003C\u002Fkbd>\n- [Vision meets Robotics: The KITTI Dataset](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FGeiger2013IJRR.pdf) [[Notes](paper_notes\u002Fkitti.md)] \u003Ckbd>IJRR 2013\u003C\u002Fkbd>\n\n\n## 2019-02 (9)\n- [Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.07750.pdf) (I3D) [[Notes](paper_notes\u002Fquo_vadis_i3d.md)]\u003Ckbd>Video\u003C\u002Fkbd> \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Initialization Strategies of Spatio-Temporal Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.07274.pdf) [[Notes](paper_notes\u002Fquo_vadis_i3d.md)] \u003Ckbd>Video\u003C\u002Fkbd>\n- [Detect-and-Track: Efficient Pose Estimation in Videos](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.09184.pdf) [[Notes](paper_notes\u002Fquo_vadis_i3d.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> \u003Ckbd>Video\u003C\u002Fkbd>\n- [Deep Learning Based Rib Centerline Extraction and Labeling](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1809.07082) [[Notes](paper_notes\u002Frib_centerline_philips.md)] \u003Ckbd>MI\u003C\u002Fkbd> \u003Ckbd>MICCAI 2018\u003C\u002Fkbd>\n- [SlowFast Networks for Video Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03982.pdf) [[Notes](paper_notes\u002Fslowfast.md)] \u003Ckbd>ICCV 2019 Oral\u003C\u002Fkbd>\n- [Aggregated Residual Transformations for Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.05431.pdf) (ResNeXt) [[Notes](paper_notes\u002Fresnext.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Beyond the pixel plane: sensing and learning in 3D](https:\u002F\u002Fthegradient.pub\u002Fbeyond-the-pixel-plane-sensing-and-learning-in-3d\u002F) (blog, [Chinese version](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F44386618))\n- [VoxNet: A 3D Convolutional Neural Network for Real-Time Object Recognition](https:\u002F\u002Fwww.ri.cmu.edu\u002Fpub_files\u002F2015\u002F9\u002Fvoxnet_maturana_scherer_iros15.pdf) (VoxNet) [[Notes](paper_notes\u002Fvoxnet.md)]\n- [PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.00593.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [[Notes](paper_notes\u002Fpointnet.md)]\n- [PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.02413.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd> [[Notes](paper_notes\u002Fpointnet++.md)]\n- [Review of Geometric deep learning frontiers (from 
知乎)](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F36888114) (Up to CVPR 2018)\n\n\n## 2019-01 (10)\n- [DQN: Human-level control through deep reinforcement learning (Nature DQN paper)](https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fdqn\u002FDQNNaturePaper.pdf) [[Notes](paper_notes\u002Fnature_dqn_paper.md)] \u003Ckbd>DRL\u003C\u002Fkbd>\n- [Retina U-Net: Embarrassingly Simple Exploitation of Segmentation Supervision for Medical Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.08661.pdf) [[Notes](paper_notes\u002Fretina_unet.md)] \u003Ckbd>MI\u003C\u002Fkbd>\n- [Panoptic Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.00868.pdf) [[Notes](paper_notes\u002Fpanoptic_segmentation.md)] \u003Ckbd>PanSeg\u003C\u002Fkbd>\n- [Panoptic Feature Pyramid Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.02446.pdf) [[Notes](paper_notes\u002Fpanoptic_fpn.md)] \u003Ckbd>PanSeg\u003C\u002Fkbd> \n- [Attention-guided Unified Network for Panoptic Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03904.pdf) [[Notes](paper_notes\u002FAUNet_panoptic.md)] \u003Ckbd>PanSeg\u003C\u002Fkbd>\n- [Bag of Tricks for Image Classification with Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.01187.pdf) [[Notes](paper_notes\u002Fbag_of_tricks_cnn.md)] \u003Ckbd>CLS\u003C\u002Fkbd>\n- [Deep Reinforcement Learning for Vessel Centerline Tracing in Multi-modality 3D Volumes](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-030-00937-3_86) [[Notes](paper_notes\u002Fdrl_vessel_centerline.md)] \u003Ckbd>DRL\u003C\u002Fkbd> \u003Ckbd>MI\u003C\u002Fkbd>\n- [Deep Reinforcement Learning for Flappy Bird](http:\u002F\u002Fcs229.stanford.edu\u002Fproj2015\u002F362_report.pdf) [[Notes](paper_notes\u002Fdrl_flappy.md)] \u003Ckbd>DRL\u003C\u002Fkbd>\n- [Long-Term Feature Banks for Detailed Video Understanding](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05038.pdf) [[Notes](paper_notes\u002Flong_term_feat_bank.md)] \u003Ckbd>Video\u003C\u002Fkbd> \n- [Non-local Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.07971.pdf) [[Notes](paper_notes\u002Fnon_local_net.md)] \u003Ckbd>Video\u003C\u002Fkbd> \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n\n\n## 2018\n- [Mask R-CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1703.06870.pdf)\n- [Cascade R-CNN: Delving into High Quality Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.00726.pdf)\n- [Focal Loss for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1708.02002.pdf) (RetinaNet) [[Notes](paper_notes\u002Ffocal_loss.md)]\n- [Squeeze-and-Excitation Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1709.01507) (SENet)\n- [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10196.pdf)\n- [Deformable Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.06211) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [build on R-FCN]\n- [Learning Region Features for Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.07066.pdf)\n\n## 2017 and before\n- [Learning notes on Deep Learning](Learning_notes.md)\n- [List of Papers on Machine Learning](List_of_Machine_Learning_Papers.md)\n- [Notes of Literature Review on CNN in CV](paper_notes\u002Fcnn_papers.md) This is the notes for all the papers in the recommended list [here](papers_and_books_to_start.md)\n- [Notes of Literature Review (Others)](misc.md)\n- [Notes on how to set up DL\u002FML 
environment](ML_DL_environment_Setup.md)\n- [Useful setup notes](installation_log.md)\n\n## Papers to Read\nHere is the list of papers waiting to be read. \n### Deep Learning in general\n- [SqueezeDet: Unified, Small, Low Power Fully Convolutional Neural Networks for Real-Time Object Detection for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.01051.pdf)\n- [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.02677.pdf)\n- [ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness](https:\u002F\u002Fopenreview.net\u002Fforum?id=Bygh9j09KX) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [Approximating CNNs with Bag-of-local-Features models works surprisingly well on ImageNet](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkfMWhAqYQ) (BagNet) [blog](https:\u002F\u002Fblog.evjang.com\u002F2019\u002F02\u002Fbagnet.html) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.09820v2.pdf)\n- [Understanding deep learning requires rethinking generalization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.03530.pdf)\n- [Gradient Reversal: Unsupervised Domain Adaptation by Backpropagation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.7495) \u003Ckbd>ICML 2015\u003C\u002Fkbd>\n\n### Self-training\n- [Rethinking Pre-training and Self-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06882) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [Quoc Le]\n\n### 2D Object Detection and Segmentation\n- [Mask Scoring R-CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.00241.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Training Region-based Object Detectors with Online Hard Example Mining](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03540.pdf)\n- [Gliding vertex on the horizontal bounding box for multi-oriented object detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09358)\n- [ONCE: Incremental Few-Shot Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04668) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Domain Adaptive Faster R-CNN for Object Detection in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03243) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Foggy Cityscapes: Semantic Foggy Scene Understanding with Synthetic Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.07819) \u003Ckbd>IJCV 2018\u003C\u002Fkbd>\n- [Foggy Cityscapes ECCV: Model Adaptation with Synthetic and Real Data for Semantic Dense Foggy Scene Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.01265) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Dropout Sampling for Robust Object Detection in Open-Set Conditions](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.06677) \u003Ckbd>ICRA 2018\u003C\u002Fkbd> (Niko Sünderhauf)\n- [Hybrid Task Cascade for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.07518) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (cascaded mask RCNN)\n- [Evaluating Merging Strategies for Sampling-based Uncertainty Techniques in Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.06006) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (Niko Sünderhauf)\n- [A Unified Panoptic Segmentation Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.03784.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> \u003Ckbd>PanSeg\u003C\u002Fkbd>\n- [Model Vulnerability to Distributional Shifts over Image Transformation 
Sets](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.11900.pdf) (CVPR workshop) [tl:dr](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FMachineLearning\u002Fcomments\u002Fb81uwq\u002Fr_model_vulnerability_to_distributional_shifts\u002F)\n- [Automatic adaptation of object detectors to new domains using self-training](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07305.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (find corner case and boost)\n- [Missing Labels in Object Detection](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FWeakly%20Supervised%20Learning%20for%20Real-World%20Computer%20Vision%20Applications\u002FXu_Missing_Labels_in_Object_Detection_CVPRW_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [DenseBox: Unifying Landmark Localization with End to End Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.04874.pdf)\n- [Circular Object Detection in Polar Coordinates for 2D LIDAR Data](https:\u002F\u002Fwww.researchgate.net\u002Fpublication\u002F309365539_Circular_Object_Detection_in_Polar_Coordinates_for_2D_LIDAR_DataCCPR2016) \u003Ckbd>CCPR 2016\u003C\u002Fkbd>\n- [LFFD: A Light and Fast Face Detector for Edge Devices](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10633) [Lightweight, face detection, car detection]\n- [UnitBox: An Advanced Object Detection Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1608.01471) \u003Ckbd>ACM MM 2016\u003C\u002Fkbd> [Ln IoU loss, Thomas Huang]\n\n\n### Fisheye\n- [Generalized Object Detection on Fisheye Cameras for Autonomous Driving: Dataset, Representations and Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02124) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n\n\n### Video Understanding\n- [Learning Spatiotemporal Features with 3D Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.0767.pdf) (C3D)  \u003Ckbd>Video \u003C\u002Fkbd>\u003Ckbd>ICCV 2015 \u003C\u002Fkbd>\n- [AVA: A Video Dataset of Spatio-temporally Localized Atomic Visual Actions](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.08421.pdf)\n- [Spatiotemporal Residual Networks for Video Action Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.02155.pdf) (decouple spatiotemporal) \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [Learning Spatio-Temporal Representation with Pseudo-3D Residual Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.10305.pdf) (P3D, decouple spatiotemporal) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [A Closer Look at Spatiotemporal Convolutions for Action Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.11248.pdf) (decouple spatiotemporal) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.04851.pdf) (decouple spatiotemporal) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.09577.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [AGSS-VOS: Attention Guided Single-Shot Video Object Segmentation](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FLin_AGSS-VOS_Attention_Guided_Single-Shot_Video_Object_Segmentation_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [One-Shot Video Object Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.05198) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Looking Fast and Slow: Memory-Guided Mobile Video Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10172) 
\u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Towards High Performance Video Object Detection](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FZhu_Towards_High_Performance_CVPR_2018_paper.pdf) [[Notes](paper_notes\u002Fhigh_performance_video_od.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Towards High Performance Video Object Detection for Mobiles](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.05830) [[Notes](paper_notes\u002Fhigh_performance_video_od_mobile.md)]\n- [Temporally Distributed Networks for Fast Video Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01800) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [efficient video segmentation]\n- [Memory Enhanced Global-Local Aggregation for Video Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12063) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [efficient video object detection]\n- [Co-occurrence Feature Learning from Skeleton Data for Action Recognition and\nDetection with Hierarchical Aggregation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.06055) \u003Ckbd>IJCAI 2018 oral\u003C\u002Fkbd> [video skeleton]\n- [RST-MODNet: Real-time Spatio-temporal Moving Object Detection for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00438) \u003Ckbd>NeurIPS 2019 workshop\u003C\u002Fkbd>\n- [Long-term Recurrent Convolutional Networks for Visual Recognition and Description](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.4389) \u003Ckbd>CVPR 2015 oral\u003C\u002Fkbd>\n- [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1608.00859) \u003Ckbd>ECCV 2016\u003C\u002Fkbd>\n- [TRN: Temporal Relational Reasoning in Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.08496) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [X3D: Expanding Architectures for Efficient Video Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04730) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [FAIR]\n- [Temporal-Context Enhanced Detection of Heavily Occluded Pedestrians](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FWu_Temporal-Context_Enhanced_Detection_of_Heavily_Occluded_Pedestrians_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [pedestrian, video]\n- [Flow-guided feature aggregation for video object detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.10025) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [video, object detection]\n- [3D human pose estimation in video with temporal convolutions and\nsemi-supervised training](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FPavllo_3D_Human_Pose_Estimation_in_Video_With_Temporal_Convolutions_and_CVPR_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [mono3D pose estimation from video]\n- [OmegaNet: Distilled Semantics for Comprehensive Scene Understanding from Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14030) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Object Detection in Videos with Tubelet Proposal Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1702.06355) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [video object detection]\n- [T-CNN: Tubelets with Convolutional Neural Networks for Object Detection from Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1604.02532) [video object detection]\n- [Flow-Guided Feature Aggregation for Video Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.10025) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [Jifeng Dai]\n\n\n### Pruning and Compression\n- [Efficient Deep Learning 
Inference based on Model Compression](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018_workshops\u002Fpapers\u002Fw33\u002FZhang_Efficient_Deep_Learning_CVPR_2018_paper.pdf) (Model Compression)\n- [Neural Network Distiller](https:\u002F\u002Fintellabs.github.io\u002Fdistiller\u002Falgo_pruning.html) [Intel]\n\n\n### Architecture Improvements\n- [Concurrent Spatial and Channel Squeeze & Excitation in Fully Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.02579.pdf)\n- [CBAM: Convolutional Block Attention Module](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.06521.pdf)\n\n### Reinforcement Learning\n- [Playing Atari with Deep Reinforcement Learning](https:\u002F\u002Fwww.cs.toronto.edu\u002F~vmnih\u002Fdocs\u002Fdqn.pdf) \u003Ckbd>NIPS 2013 \u003C\u002Fkbd>\n- [Multi-Scale Deep Reinforcement Learning for Real-Time 3D-Landmark Detection in CT Scan](http:\u002F\u002Fcomaniciu.net\u002FPapers\u002FMultiscaleDeepReinforcementLearning_PAMI18.pdf)\n- [An Artificial Agent for Robust Image Registration](https:\u002F\u002Fwww.aaai.org\u002Focs\u002Findex.php\u002FAAAI\u002FAAAI17\u002Fpaper\u002Fdownload\u002F14751\u002F14296)\n\n### 3D Perception\n- [3D-CNN: 3D Convolutional Neural Networks for Landing Zone Detection from LiDAR](https:\u002F\u002Fwww.ri.cmu.edu\u002Fpub_files\u002F2015\u002F3\u002Fmaturana-root.pdf)\n- [Generative and Discriminative Voxel Modeling with Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.04236.pdf)\n- [Orientation-boosted Voxel Nets for 3D Object Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03351.pdf) (ORION) \u003Ckbd>BMVC 2017\u003C\u002Fkbd>\n- [GIFT: A Real-time and Scalable 3D Shape Search Engine](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.01879.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [3D Shape Segmentation with Projective Convolutional Networks](https:\u002F\u002Fpeople.cs.umass.edu\u002F~kalo\u002Fpapers\u002Fshapepfcn\u002F) (ShapePFCN) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Learning Local Shape Descriptors from Part Correspondences With Multi-view Convolutional Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.04496.pdf)\n- [Open3D: A Modern Library for 3D Data Processing](http:\u002F\u002Fwww.open3d.org\u002Fwordpress\u002Fwp-content\u002Fpaper.pdf)\n- [Multimodal Deep Learning for Robust RGB-D Object Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.06821.pdf) \u003Ckbd>IROS 2015\u003C\u002Fkbd>\n- [FlowNet3D: Learning Scene Flow in 3D Point Clouds](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.01411.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.06760.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (Neighbors Do Help: Deeply Exploiting Local Structures of Point Clouds)\n- [PU-Net: Point Cloud Upsampling Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.06761.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Recurrent Slice Networks for 3D Segmentation of Point Clouds](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.04402.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [SPLATNet: Sparse Lattice Networks for Point Cloud Processing](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.08275.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Convolutional Neural Networks on Graphs with Fast Localized Spectral Filtering](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.09375.pdf) \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [Semi-Supervised Classification with Graph Convolutional 
Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.02907.pdf) \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [Geometric Matrix Completion with Recurrent Multi-Graph Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.06803.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Graph Attention Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10903.pdf) \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [3D-SSD: Learning Hierarchical Features from RGB-D Images for Amodal 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.00238.pdf) (3D SSD)\n- [Escape from Cells: Deep Kd-Networks for the Recognition of 3D Point Cloud Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01222.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [Shape Completion using 3D-Encoder-Predictor CNNs and Shape Synthesis](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.00101.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [IPOD: Intensive Point-based Object Detector for Point Cloud](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05276.pdf)\n- [Amodal Detection of 3D Objects: Inferring 3D Bounding Boxes from 2D Ones in RGB-Depth Images](https:\u002F\u002Fcis.temple.edu\u002F~latecki\u002FPapers\u002FDengCVPR2017.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [2D-Driven 3D Object Detection in RGB-D Images](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FLahoud_2D-Driven_3D_Object_ICCV_2017_paper.pdf)\n- [Associate-3Ddet: Perceptual-to-Conceptual Association for 3D Point Cloud Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04356) [classify occluded object]\n\n### Stereo and Flow\n- [PSMNet: Pyramid Stereo Matching Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08669.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Stereo R-CNN based 3D Object Detection for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.09738.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Deep Rigid Instance Scene Flow](https:\u002F\u002Fpeople.csail.mit.edu\u002Fweichium\u002Fpapers\u002Fcvpr19-dsisf\u002Fpaper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Upgrading Optical Flow to 3D Scene Flow through Optical Expansion](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FYang_Upgrading_Optical_Flow_to_3D_Scene_Flow_Through_Optical_Expansion_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Learning Multi-Object Tracking and Segmentation from Automatic Annotations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02096) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [automatic MOTS annotation]\n\n\n### Traffic light and traffic sign\n- [Traffic-Sign Detection and Classification in the Wild](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2016\u002Fpapers\u002FZhu_Traffic-Sign_Detection_and_CVPR_2016_paper.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd> [Tsinghua, Tencent, traffic signs]\n- [A Hierarchical Deep Architecture and Mini-Batch Selection Method For Joint Traffic Sign and Light Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07987) \u003Ckbd>IEEE CRV 2018\u003C\u002Fkbd> [U Toronto]\n- [Detecting Traffic Lights by Single Shot Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.02523) \u003Ckbd>ITSC 2018\u003C\u002Fkbd>\n- [DeepTLR: A single Deep Convolutional Network for Detection and Classification of Traffic 
Lights](https:\u002F\u002Fsci-hub.st\u002F10.1109\u002FIVS.2016.7535408) \u003Ckbd>IV 2016\u003C\u002Fkbd>\n- [Evaluating State-of-the-art Object Detector on Challenging Traffic Light Data](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2017_workshops\u002Fw9\u002Fpapers\u002FJensen_Evaluating_State-Of-The-Art_Object_CVPR_2017_paper.pdf) \u003Ckbd>CVPR 2017 workshop\u003C\u002Fkbd>\n- [Traffic light recognition in varying illumination using deep learning and saliency map](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FVijay_John3\u002Fpublication\u002F265014373_Traffic_Light_Recognition_in_Varying_Illumination_using_Deep_Learning_and_Saliency_Map\u002Flinks\u002F56aac00408ae8f3865666102.pdf) \u003Ckbd>ITSC 2014\u003C\u002Fkbd> [traffic light]\n- [Traffic light recognition using high-definition map features](https:\u002F\u002Fsci-hub.st\u002Fhttps:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fabs\u002Fpii\u002FS0921889018301234) \u003Ckbd>RAS 2019\u003C\u002Fkbd>\n- [Vision for Looking at Traffic Lights: Issues, Survey, and Perspectives](http:\u002F\u002Fcvrr.ucsd.edu\u002Fpublications\u002F2016\u002FtrafficSignalsITSTrans2016.pdf) \u003Ckbd>TITS 2015\u003C\u002Fkbd>\n\n### Datasets and Surveys\n- [The DriveU Traffic Light Dataset: Introduction and Comparison with Existing Datasets](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8460737) \u003Ckbd>ICRA 2018\u003C\u002Fkbd> \n- [The Oxford Radar RobotCar Dataset: A Radar Extension to the Oxford RobotCar Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01300)\n- [Vision for Looking at Traffic Lights: Issues, Survey, and Perspectives](http:\u002F\u002Fcvrr.ucsd.edu\u002Fpublications\u002F2016\u002FtrafficSignalsITSTrans2016.pdf) (traffic light survey, UCSD LISA)\n- [Review of Graph Spectrum Theory](paper_notes\u002Fgraph_spectrum.md) (WIP)\n- [3D Deep Learning Tutorial at CVPR 2017](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8CenT_4HWyY) [[Notes](paper_notes\u002F3ddl_cvpr2017.md)] - (WIP)\n- [A Survey on Neural Architecture Search](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01392.pdf)\n- [Network pruning tutorial](https:\u002F\u002Fjacobgil.github.io\u002Fdeeplearning\u002Fpruning-deep-learning) (blog)\n- [GNN tutorial at CVPR 2019](https:\u002F\u002Fxiaolonw.github.io\u002Fgraphnn\u002F)\n- [Large Scale Interactive Motion Forecasting for Autonomous Driving : The Waymo Open Motion Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10133) [Waymo, prediction dataset]\n- [PANDA: A Gigapixel-level Human-centric Video Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04852) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [WoodScape: A multi-task, multi-camera fisheye dataset for autonomous driving](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FYogamani_WoodScape_A_Multi-Task_Multi-Camera_Fisheye_Dataset_for_Autonomous_Driving_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Valeo]\n\n### Unsupervised depth estimation\n- [Sparse and Dense Data with CNNs: Depth Completion and Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.00769.pdf) \u003Ckbd>3DV 2018\u003C\u002Fkbd>\n- [Depth Map Prediction from a Single Image using a Multi-Scale Deep Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.2283.pdf) \u003Ckbd>NIPS 2014\u003C\u002Fkbd> (Eigen et al)\n- [Learning Depth from Monocular Videos using Direct Methods](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.00175) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (monocular depth 
estimation)\n- [Virtual-Normal: Enforcing geometric constraints of virtual normal for depth prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12209) [[Notes](paper_notes\u002Fvirtual_normal.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (better generation of PL)\n- [Spatial Correspondence with Generative Adversarial Network: Learning Depth from Monocular Videos](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FWu_Spatial_Correspondence_With_Generative_Adversarial_Network_Learning_Depth_From_Monocular_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Unsupervised Collaborative Learning of Keyframe Detection and Visual Odometry Towards Monocular Deep SLAM](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FSheng_Unsupervised_Collaborative_Learning_of_Keyframe_Detection_and_Visual_Odometry_Towards_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Visualization of Convolutional Neural Networks for Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03380) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n\n### Indoor Depth\n- [Fast and Accurate Recovery of Occluding Contours in Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08598) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd> [indoor]\n- [Multi-Loss Rebalancing Algorithm for Monocular Depth Estimation](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F2890_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [indoor depth]\n- [Disambiguating Monocular Depth Estimation with a Single Transient](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F3668_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [additional laser sensor, indoor depth]\n- [Guiding Monocular Depth Estimation Using Depth-Attention Volume](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F5491_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [indoor depth]\n- [Improving Monocular Depth Estimation by Leveraging Structural Awareness and Complementary Datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.11256) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [indoor depth]\n- [CLIFFNet for Monocular Depth Estimation with Hierarchical Embedding Loss](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F3365_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [indoor depth]\n\n\n### lidar\n- [PointSIFT: A SIFT-like Network Module for 3D Point Cloud Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.00652.pdf) (pointnet alternative, backbone)\n- [Vehicle Detection from 3D Lidar Using Fully Convolutional Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.07916.pdf) (VeloFCN) \u003Ckbd>RSS 2016\u003C\u002Fkbd> \n- [KPConv: Flexible and Deformable Convolution for Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08889) (from the authors of PointNet)\n- [PointCNN: Convolution On X-Transformed Points](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.07791.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [L3-Net: Towards Learning based LiDAR Localization for Autonomous Driving](https:\u002F\u002Fsongshiyu01.github.io\u002Fpdf\u002FL3Net_W.Lu_Y.Zhou_S.Song_CVPR2019.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [RoarNet: A Robust 3D Object Detection based on RegiOn Approximation Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.03818) (sensor fusion, 3D mono proposal, 
refined in point cloud)\n- [DeLS-3D: Deep Localization and Segmentation with a 3D Semantic Map](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1805.04949.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Frustum ConvNet: Sliding Frustums to Aggregate Local Point-Wise Features for Amodal 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.01864) \u003Ckbd>IROS 2019\u003C\u002Fkbd>\n- [PointRNN: Point Recurrent Neural Network for Moving Point Cloud Processing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.08287)\n- [Gated2Depth: Real-time Dense Lidar from Gated Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.04997) \u003Ckbd>ICCV 2019 oral\u003C\u002Fkbd>\n- [A Multi-Sensor Fusion System for Moving Object Detection and Tracking in Urban Driving Environments](http:\u002F\u002Fwww.cs.cmu.edu\u002F~youngwoo\u002Fdoc\u002Ficra-14-sensor-fusion.pdf) \u003Ckbd>ICRA 2014\u003C\u002Fkbd>\n- [PointFusion: Deep Sensor Fusion for 3D Bounding Box Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.10871) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [sensor fusion, Zoox]\n- [Deep Hough Voting for 3D Object Detection in Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09664) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Charles Qi]\n- [StixelNet: A Deep Convolutional Network for Obstacle Detection and Road Segmentation](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2015\u002Fpapers\u002Fpaper109\u002Fpaper109.pdf)\n- [PolarNet: An Improved Grid Representation for Online LiDAR Point Clouds Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14032) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Depth Sensing Beyond LiDAR Range](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.03048) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [wide baseline stereo with trifocal]\n- [Probabilistic Semantic Mapping for Urban Autonomous Driving Applications](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04894) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [lidar mapping]\n- [RandLA-Net: Efficient Semantic Segmentation of Large-Scale Point Clouds](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11236) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [lidar segmentation]\n- [PolarNet: An Improved Grid Representation for Online LiDAR Point Clouds Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14032) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [lidar segmentation]\n- [OctSqueeze: Octree-Structured Entropy Model for LiDAR Compression](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07178) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [lidar compression]\n- [MuSCLE: Multi Sweep Compression of LiDAR using Deep Entropy Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.07590) \u003Ckbd>NeurIPS 2020 oral\u003C\u002Fkbd> [lidar compression]\n\n### Egocentric bbox prediction\n- [Long-Term On-Board Prediction of People in Traffic Scenes under Uncertainty](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.09026) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [on-board bbox prediction]\n- [Unsupervised Traffic Accident Detection in First-Person Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.00618) \u003Ckbd>IROS 2019\u003C\u002Fkbd> (Honda)\n- [NEMO: Future Object Localization Using Noisy Ego Priors](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08150) (Honda)\n- [Robust Aleatoric Modeling for Future Vehicle Localization](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FPrecognition\u002FHudnell_Robust_Aleatoric_Modeling_for_Future_Vehicle_Localization_CVPRW_2019_paper.pdf) (perspective)\n- [Multiple 
Object Forecasting: Predicting Future Object Locations in Diverse Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11944) \u003Ckbd>WACV 2020\u003C\u002Fkbd> (perspective bbox, pedestrian)\n- [Using panoramic videos for multi-person localization and tracking in a 3D panoramic coordinate](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10535)\n\n### Lane Detection\n- [End-to-end Lane Detection through Differentiable Least-Squares Fitting](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00293) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Line-CNN: End-to-End Traffic Line Detection With Line Proposal Unit](https:\u002F\u002Fdoi.org\u002F10.1109\u002FTITS.2019.2890870) \u003Ckbd>TITS 2019\u003C\u002Fkbd> [object-like proposals]\n- [Detecting Lane and Road Markings at A Distance with Perspective Transformer Layers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08550) [3D LLD]\n- [Ultra Fast Structure-aware Deep Lane Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.11757) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [lane detection]\n- [A Novel Approach for Detecting Road Based on Two-Stream Fusion Fully Convolutional Network](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8500551\u002F) (convert camera to BEV)\n- [FastDraw: Addressing the Long Tail of Lane Detection by Adapting a Sequential Prediction Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.04354)\n\n### Tracking\n- [RetinaTrack: Online Single Stage Joint Detection and Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13870) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Computer Vision for Autonomous Vehicles: Problems, Datasets and State of the Art](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.05519v2) (latest update in Dec 2019)\n- [Simultaneous Identification and Tracking of Multiple People Using Video and IMUs](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FBMTT\u002FHenschel_Simultaneous_Identification_and_Tracking_of_Multiple_People_Using_Video_and_CVPRW_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Detect-and-Track: Efficient Pose Estimation in Videos](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FGirdhar_Detect-and-Track_Efficient_Pose_CVPR_2018_paper.pdf)\n- [TrackNet: Simultaneous Object Detection and Tracking and Its Application in Traffic Video Analysis](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.01466)\n- [Video Action Transformer Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.02707) \u003Ckbd>CVPR 2019 oral\u003C\u002Fkbd>\n- [Online Real-time Multiple Spatiotemporal Action Localisation and Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.08563) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [Multi-object tracking: a roundup of recent papers and open-source code (Zhihu)](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F65177442)\n- [GNN3DMOT: Graph Neural Network for 3D Multi-Object Tracking with Multi-Feature Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07327) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [3DMOT, CMU, Kris Kitani]\n- [Chained-Tracker: Chaining Paired Attentive Regression Results for End-to-End Joint Multiple-Object Detection and Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.14557) \u003Ckbd>ECCV 2020 spotlight\u003C\u002Fkbd> [MOT, Tencent]\n- [Towards Real-Time Multi-Object Tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12605) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [MOT]\n- [Probabilistic 3D Multi-Object Tracking for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.05673) [TRI]\n\n### keypoints: pose and 
face\n- [Probabilistic Face Embeddings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09658) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Data Uncertainty Learning in Face Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11339) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Self-Supervised Learning of Interpretable Keypoints From Unlabelled Videos](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FJakab_Self-Supervised_Learning_of_Interpretable_Keypoints_From_Unlabelled_Videos_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [VGG, self-supervised, interpretable, discriminator]\n\n\n### General DL\n- [Revisiting Small Batch Training for Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.07612)\n- [ICML2019 workshop: Adaptive and Multitask Learning: Algorithms & Systems](https:\u002F\u002Ficml.cc\u002FConferences\u002F2019\u002FScheduleMultitrack?event=3504) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [Adaptive Scheduling for Multi-Task Learning](https:\u002F\u002Fmarcpickett.com\u002Fcl2018\u002FCL-2018_paper_82.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd> (NMT)\n- [Polar Transformer Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.01889) \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [Measuring Calibration in Deep Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01685) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Sampling-free Epistemic Uncertainty Estimation Using Approximated Variance Propagation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00598) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (epistemic uncertainty)\n- [Making Convolutional Networks Shift-Invariant Again](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11486) \u003Ckbd>ICML\u003C\u002Fkbd>\n- [Using Self-Supervised Learning Can Improve Model Robustness and Uncertainty](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.12340) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd>\n- [Understanding deep learning requires rethinking generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.03530) \u003Ckbd>ICLR 2017\u003C\u002Fkbd> [ICLR best paper]\n- [A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.02136) \u003Ckbd>ICLR 2017\u003C\u002Fkbd> (NLL score as anomaly score)\n- [Unsupervised Feature Learning via Non-Parametric Instance-level Discrimination](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002FCameraReady\u002F0801.pdf) \u003Ckbd>CVPR 2018 spotlight\u003C\u002Fkbd> (Stella Yu)\n- [Theoretical insights into the optimization landscape of over-parameterized shallow neural networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.04926) \u003Ckbd>TIP 2018\u003C\u002Fkbd>\n- [The Power of Interpolation: Understanding the Effectiveness of SGD in Modern Over-parametrized Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.06559) \u003Ckbd>ICML 2018\u003C\u002Fkbd>\n- [Designing Network Design Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13678) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Moco2: Improved Baselines with Momentum Contrastive Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04297)\n- [SGD on Neural Networks Learns Functions of Increasing Complexity](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.11604) \u003Ckbd>NIPS 2019\u003C\u002Fkbd> (SGD learns a linear classifier first)\n- [Pay attention to the activations: a modular attention mechanism for fine-grained image recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.13075)\n- [A Mixed 
Classification-Regression Framework for 3D Pose Estimation from 2D Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.03225) \u003Ckbd>BMVC 2018\u003C\u002Fkbd> (multi-bin, what's new?)\n- [In-Place Activated BatchNorm for Memory-Optimized Training of DNNs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.02616) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (optimized BatchNorm + ReLU)\n- [FCNN: Fourier Convolutional Neural Networks](http:\u002F\u002Fecmlpkdd2017.ijs.si\u002Fpapers\u002FpaperID11.pdf) (FFT as CNN)\n- [Visualizing the Loss Landscape of Neural Nets](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F7875-visualizing-the-loss-landscape-of-neural-nets.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [Xception: Deep Learning with Depthwise Separable Convolutions](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1610.02357.pdf) (Xception)\n- [Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry and Semantics](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.07115) (uncertainty)\n- [Learning to Drive from Simulation without Real World Labels](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03823) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (domain adaptation, sim2real)\n- [Filter Response Normalization Layer: Eliminating Batch Dependence in the Training of Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09737) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [Switchable Whitening for Deep Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09739) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [domain adaptation]\n- [Visual Chirality](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09512) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [best paper nominee]\n- [Generalized ODIN: Detecting Out-of-Distribution Image Without Learning From Out-of-Distribution Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11297) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Self-training with Noisy Student improves ImageNet classification](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04252) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [distillation]\n- [Keep it Simple: Image Statistics Matching for Domain Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.12551) \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [Domain adaptation for 2D mod bbox]\n- [Epipolar Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04551) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Yihui He]\n- [Scalable Uncertainty for Computer Vision With Functional Variational Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03396) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [epistemic uncertainty with one fwd pass]\n\n\n### Mono3D\n- [3DOP: 3D Object Proposals for Accurate Object Class Detection](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F5644-3d-object-proposals-for-accurate-object-class-detection) \u003Ckbd>NIPS 2015\u003C\u002Fkbd>\n- [DirectShape: Photometric Alignment of Shape Priors for Visual Vehicle Pose and Shape Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10097)\n- [Eliminating the Blind Spot: Adapting 3D Object Detection and Monocular Depth Estimation to 360° Panoramic Imagery](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.06253) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> (Monocular 3D object detection and depth estimation)\n- [Towards Scene Understanding: Unsupervised Monocular Depth Estimation with Semantic-aware 
Representation](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FChen_Towards_Scene_Understanding_Unsupervised_Monocular_Depth_Estimation_With_Semantic-Aware_Representation_CVPR_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [unified conditional decoder]\n- [DDP: Dense Depth Posterior from Single Image and Sparse Range](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10034) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Augmented Reality Meets Computer Vision : Efficient Data Generation for Urban Driving Scenes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.01566) \u003Ckbd>IJCV 2018\u003C\u002Fkbd> (data augmentation with AR, Toyota)\n- [Exploring the Capabilities and Limits of 3D Monocular Object Detection -- A Study on Simulation and Real World Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07424) \u003Ckbd>IITS\u003C\u002Fkbd>\n- [Towards Scene Understanding with Detailed 3D Object Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.5935) \u003Ckbd>IJCV 2014\u003C\u002Fkbd> (keypoint, 3D bbox annotation)\n- [Deep Cuboid Detection: Beyond 2D Bounding Boxes](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.10010) (Magic Leap)\n- [Viewpoints and Keypoints](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.6067) (Malik)\n- [Lifting Object Detection Datasets into 3D](https:\u002F\u002Farxiv.org\u002Fabs\u002F1503.06465) (PASCAL)\n- [3D Object Class Detection in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F1503.05038) (keypoint based)\n- [Fast Single Shot Detection and Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.05590) \u003Ckbd>3DV 2016\u003C\u002Fkbd> (SSD + pose, Wei Liu)\n- [Virtual KITTI 2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.10773)\n- [Deep Supervision with Shape Concepts for Occlusion-Aware 3D Object Parsing](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.02699) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Render for CNN: Viewpoint Estimation in Images Using CNNs Trained with Rendered 3D Model Views](https:\u002F\u002Farxiv.org\u002Fabs\u002F1505.05641) \u003Ckbd>ICCV 2015 Oral\u003C\u002Fkbd>\n- [Real-Time Seamless Single Shot 6D Object Pose Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.08848) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Practical Deep Stereo (PDS): Toward applications-friendly deep stereo matching](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.01677) \u003Ckbd>NIPS 2018\u003C\u002Fkbd> [disparity estimation]\n- [Self-supervised Sparse-to-Dense: Self-supervised Depth Completion from LiDAR and Monocular Camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00275) \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- [Learning Depth with Convolutional Spatial Propagation Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02695) (Baidu, depth from SPN) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Just Go with the Flow: Self-Supervised Scene Flow Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00497) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Scene flow, Lidar]\n- [Online Depth Learning against Forgetting in Monocular Videos](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FZhang_Online_Depth_Learning_Against_Forgetting_in_Monocular_Videos_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [monodepth]\n- [Self-Supervised Deep Visual Odometry with Online Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.06136) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [DF-VO, TrianFlow, meta-learning]\n- [Self-supervised Monocular Trained Depth Estimation using 
Self-attention and Discrete Disparity Volume](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13951) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Online Depth Learning against Forgetting in Monocular Videos](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FZhang_Online_Depth_Learning_Against_Forgetting_in_Monocular_Videos_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [monodepth, online learning]\n- [SDC-Depth: Semantic Divide-and-Conquer Network for Monocular Depth Estimation](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FWang_SDC-Depth_Semantic_Divide-and-Conquer_Network_for_Monocular_Depth_Estimation_CVPR_2020_paper.pdf)  \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [monodepth, semantic]\n- [Inferring Distributions Over Depth from a Single Image](http:\u002F\u002Fwww.contrib.andrew.cmu.edu\u002F~gengshay\u002Fwordpress\u002Fwp-content\u002Fuploads\u002F2018\u002F11\u002Firos_monodepth_uncertainty.pdf) \u003Ckbd>TRO\u003C\u002Fkbd> [Depth confidence, stitching them together]\n- [Novel View Synthesis of Dynamic Scenes with Globally Coherent Depths](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01294) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [The Edge of Depth: Explicit Constraints between Segmentation and Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.00171) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Xiaoming Liu, multimodal, depth bleeding] \n\n### Radar Perception\n- [MV-RSS: Multi-View Radar Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16214) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [Classification of Objects in Polarimetric Radar Images Using CNNs at 77 GHz](http:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FAPMC.2017.8251453) (Radar, polar)\n- [CNNs for Interference Mitigation and Denoising in Automotive Radar Using Real-World Data](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers\u002FCNNs%20for%20Interference%20Mitigation%20and%20Denoising%20in%20Automotive%20Radar%20Using%20Real-World%20Data.pdf) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd> (radar)\n- [Road Scene Understanding by Occupancy Grid Learning from Sparse Radar Clusters using Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00415) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (radar)\n- [RadarNet: Exploiting Radar for Robust Perception of Dynamic Objects](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.14366) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n- [Depth Estimation from Monocular Images and Sparse Radar Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.00058) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Camera + Radar for monodepth, nuscenes]\n- [RPR: Radar-Camera Sensor Fusion for Joint Object Detection and Distance Estimation in Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08428) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [radar proposal refinement]\n- [Warping of Radar Data into Camera Image for Cross-Modal Supervision in Automotive Applications](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12809)\n\n### SLAM\n- [PoseNet: A Convolutional Network for Real-Time 6-DOF Camera Relocalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1505.07427) [[Notes](paper_notes\u002Fposenet.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd>\n- [PoseNet2: Modelling Uncertainty in Deep Learning for Camera Relocalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1509.05909) \u003Ckbd>ICRA 2016\u003C\u002Fkbd>\n- [PoseNet3: Geometric Loss Functions for Camera Pose Regression with Deep 
Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.00390) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [EssNet: Convolutional neural network architecture for geometric matching](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.05593) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [NC-EssNet: Neighbourhood Consensus Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.10510) \u003Ckbd>NeurIPS 2018\u003C\u002Fkbd>\n- [Reinforced Feature Points: Optimizing Feature Detection and Description for a High-Level Task](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00623) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Eric Brachmann, ngransac]\n- [Unsupervised Learning of Depth and Ego-Motion from Monocular Video Using 3D Geometric Constraints](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.05522.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [DynSLAM: Robust Dense Mapping for Large-Scale Dynamic Environments](https:\u002F\u002Fsiegedog.com\u002Fdynslam\u002F) [dynamic SLAM, Andreas Geiger] \u003Ckbd>ICRA 2018\u003C\u002Fkbd>\n- [GCNv2: Efficient Correspondence Prediction for Real-Time SLAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.11046) \u003Ckbd>LRA 2019\u003C\u002Fkbd> [Superpoint + orb slam]\n- [Real-time Scalable Dense Surfel Mapping](Real-time Scalable Dense Surfel Mapping) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [dense reconstruction, monodepth]\n- [Dynamic SLAM: The Need For Speed](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08584)\n- [GSLAM: A General SLAM Framework and Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.07995) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n\n### Radar Perception\n- [Seeing Around Street Corners: Non-Line-of-Sight Detection and Tracking In-the-Wild Using Doppler Radar](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FScheiner_Seeing_Around_Street_Corners_Non-Line-of-Sight_Detection_and_Tracking_In-the-Wild_Using_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Daimler]\n- [Radar+RGB Attentive Fusion for Robust Object Detection in Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.13642) \u003Ckbd>ICIP 2020\u003C\u002Fkbd>\n- [Spatial Attention Fusion for Obstacle Detection Using MmWave Radar and Vision Sensor](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F4\u002F956) \u003Ckbd>Sensors 2020\u003C\u002Fkbd> [radar, camera, early fusion]\n\n### Reviews and Surveys\n- [A Survey on Deep Learning for Localization and Mapping: Towards the Age of Spatial Machine Intelligence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12567)\n- [Monocular Depth Estimation Based On Deep Learning: An Overview](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06620)\n\n### Beyond Perception in Autonomous Driving\n- [Uncertainty Guided Multi-Scale Residual Learning-using a Cycle Spinning CNN for Single Image De-Raining](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11129) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Learn to Combine Modalities in Multimodal Deep Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.11730) (sensor fusion, general DL)\n- [Safe Trajectory Generation For Complex Urban Environments Using Spatio-temporal Semantic Corridor](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.09788) \u003Ckbd>LRA 2019\u003C\u002Fkbd> [Motion planning]\n- [DAgger: Driving Policy Transfer via Modularity and Abstraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.09364) \u003Ckbd>CoRL 2018\u003C\u002Fkbd> [DAgger, Imitation Learning]\n- [Efficient Uncertainty-aware Decision-making for Automated Driving Using Guided 
Branching](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [Motion planning]\n- [Calibration of Heterogeneous Sensor Systems](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.11445.pdf)\n- [Intro: Sensor Fusion for ADAS, data fusion in autonomous driving (from Zhihu)](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F40967227) (Up to CVPR 2018)\n- [YUVMultiNet: Real-time YUV multi-task CNN for autonomous driving](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.05673.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (Real Time, Low Power)\n- [Deep Fusion of Heterogeneous Sensor Modalities for the Advancements of ADAS to Autonomous Vehicles](http:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FVLSI-DAT.2018.8373245)\n- [Temporal Coherence for Active Learning in Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11757) \u003Ckbd>ICCVW 2019\u003C\u002Fkbd> [active learning, temporal coherence]\n- [R-TOD: Real-Time Object Detector with Minimized End-to-End Delay for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.06372) \u003Ckbd>RTSS 2020\u003C\u002Fkbd> [perception system design]\n\n### Prediction and Planning\n- [Learning Lane Graph Representations for Motion Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13732) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n- [DSDNet: Deep Structured self-Driving Network](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.06041) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n\n### Annotation and Tooling\n- [Temporal Coherence for Active Learning in Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11757) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd>\n- [Leveraging Pre-Trained 3D Object Detection Models For Fast Ground Truth Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.06072) \u003Ckbd>ITSC 2018\u003C\u002Fkbd> [UToronto, autolabeling]\n- [Learning Multi-Object Tracking and Segmentation From Automatic Annotations](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FPorzi_Learning_Multi-Object_Tracking_and_Segmentation_From_Automatic_Annotations_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Autolabeling]\n- [Canonical Surface Mapping via Geometric Cycle Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10043) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [TIDE: A General Toolbox for Identifying Object Detection Errors](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08115) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [tools]\n\n### Low level DL\n- [Self-Supervised Camera Self-Calibration from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.03325) [TRI, intrinsic calibration, fisheye\u002Fpinhole]\n\n### Early NLP papers\n- [A Convolutional Neural Network for Modelling Sentences](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1404.2188.pdf) \u003Ckbd>ACL 2014\u003C\u002Fkbd>\n- [FastText: Bag of Tricks for Efficient Text Classification](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1607.01759.pdf) \u003Ckbd>ACL 2017\u003C\u002Fkbd>\n- [Siamese recurrent architectures for learning sentence similarity](https:\u002F\u002Fwww.aaai.org\u002Focs\u002Findex.php\u002FAAAI\u002FAAAI16\u002Fpaper\u002Fdownload\u002F12195\u002F12023) \u003Ckbd>AAAI 2016\u003C\u002Fkbd>\n- [Efficient Estimation of Word Representations in Vector Space](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1301.3781.pdf) \u003Ckbd>ICLR 2013\u003C\u002Fkbd>\n- [Neural Machine Translation by Jointly Learning to Align and Translate](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.0473.pdf) \u003Ckbd>ICLR 2015\u003C\u002Fkbd>\n- 
[Transformers: Attention Is All You Need](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.03762.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n\n\n\n\n## Non-DL\n- [Collection of papers on ad recommendation systems](https:\u002F\u002Fgithub.com\u002Fwzhe06\u002FAd-papers)\n- [UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.03426) [[Notes](paper_notes\u002Fumap.md)] (dimension reduction, better than t-SNE)\n\n## Technical Debt\n- [Review Notes of Classical Key Points and Descriptors](paper_notes\u002Fclassical_keypoints.md)\n- CRF\n- [Visual SLAM and Visual Odometry](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007%2Fs40903-015-0032-7.pdf)\n- ORB SLAM\n- Bundle Adjustment\n- 3D vision\n- [SLAM\u002FVIO study notes](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F34995102)\n- [Design Patterns](https:\u002F\u002Frefactoring.guru\u002Fdesign-patterns\u002Fpython)\n\n\n## To be organized (the CVPR 2021 and ICCV 2021 pile to be read)\n- [Capturing Omni-Range Context for Omnidirectional Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.05687) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [UP-DETR: Unsupervised Pre-training for Object Detection with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.09094) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [transformers]\n- [DCL: Dense Label Encoding for Boundary Discontinuity Free Rotation Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.09670) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [4D Panoptic LiDAR Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.12472) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TUM]\n- [CanonPose: Self-Supervised Monocular 3D Human Pose Estimation in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14679) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Fast and Accurate Model Scaling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.06877) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [FAIR]\n- [Cylinder3D: Cylindrical and Asymmetrical 3D Convolution Networks for LiDAR Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10033) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [lidar semantic segmentation]\n- [LiDAR R-CNN: An Efficient and Universal 3D Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.15297) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TuSimple, Lidar]\n- [PREDATOR: Registration of 3D Point Clouds with Low Overlap](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.13005) \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd> \n- [DBB: Diverse Branch Block: Building a Convolution as an Inception-like Unit](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13425) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [RepVGG, ACNet, Xiaohan Ding, Megvii] \n- [GrooMeD-NMS: Grouped Mathematically Differentiable NMS for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.17202) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [DDMP: Depth-conditioned Dynamic Message Propagation for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16470) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [M3DSSD: Monocular 3D Single Stage Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13164) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [MonoRUn: Monocular 3D Object Detection by Reconstruction and Uncertainty Propagation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12605) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [HVPR: Hybrid Voxel-Point Representation for Single-stage 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00902) 
\u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Lidar]\n- [PLUME: Efficient 3D Object Detection from Stereo Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06594) [Yan Wang, Uber ATG]\n- [V2F-Net: Explicit Decomposition of Occluded Pedestrian Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03106) [crowded, pedestrian, megvii]\n- [IP-basic: In Defense of Classical Image Processing: Fast Depth Completion on the CPU](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00036) \u003Ckbd>CRV 2018\u003C\u002Fkbd>\n- [Revisiting Feature Alignment for One-stage Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01570) [cls+reg]\n- [Per-frame mAP Prediction for Continuous Performance Monitoring of Object Detection During Deployment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08650) \u003Ckbd>WACV 2021\u003C\u002Fkbd> [SafetyNet]\n- [TSD: Revisiting the Sibling Head in Object Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07540) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [sensetime, cls+reg]\n- [1st Place Solutions for OpenImage2019 -- Object Detection and Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07557) [sensetime, cls+reg, 1st place OpenImage2019]\n- [Enabling spatio-temporal aggregation in Birds-Eye-View Vehicle Estimation](https:\u002F\u002Fcvssp.org\u002FPersonal\u002FOscarMendez\u002Fpapers\u002Fpdf\u002FSahaICRA2021.pdf) \u003Ckbd>ICRA 2021\u003C\u002Fkbd>\n- [End-to-end Lane Detection through Differentiable Least-Squares Fitting](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00293) \u003Ckbd>ICCV workshop 2019\u003C\u002Fkbd>\n- [Revisiting ResNets: Improved Training and Scaling Strategies](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07579)\n- [Multi-Modality Cut and Paste for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12741)\n- [LD: Localization Distillation for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.12252)\n- [PolyTransform: Deep Polygon Transformer for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02801) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [single stage instance segmentation]\n- [ROAD: The ROad event Awareness Dataset for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11585)\n- [LidarMTL: A Simple and Efficient Multi-task Network for 3D Object Detection and Road Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04056) [lidar MTL]\n- [High-Performance Large-Scale Image Recognition Without Normalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06171) \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [Ground-aware Monocular 3D Object Detection for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.00690) \u003Ckbd>RA-L\u003C\u002Fkbd> [mono3D]\n- [Demystifying Pseudo-LiDAR for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05796) [mono3d]\n- [Pseudo-labeling for Scalable 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.02093) [Waymo]\n- [LLA: Loss-aware Label Assignment for Dense Pedestrian Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.04307) [Megvii]\n- [VectorNet: Encoding HD Maps and Agent Dynamics from Vectorized Representation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04259) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Waymo] \n- [CoverNet: Multimodal Behavior Prediction using Trajectory Sets](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10298) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [prediction, nuScenes]\n- [SplitNet: Divide and 
Co-training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14660)\n- [VoVNet: An Energy and GPU-Computation Efficient Backbone Network for Real-Time Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09730) \u003Ckbd>CVPR 2019 workshop\u003C\u002Fkbd>\n- [Isometric Neural Networks: Non-discriminative data or weak model? On the relative importance of data and model resolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03205) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd> [spatial2channel]\n- [TResNet](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13630) \u003Ckbd>WACV 2021\u003C\u002Fkbd> [spatial2channel]\n- [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.08287) \u003Ckbd>AAAI 2020\u003C\u002Fkbd> [DIOU, NMS]\n- [RegNet: Designing Network Design Spaces](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13678) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [FAIR]\n- [On Network Design Spaces for Visual Recognition](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13214) [FAIR]\n- [Lane Endpoint Detection and Position Accuracy Evaluation for Sensor Fusion-Based Vehicle Localization on Highways](https:\u002F\u002Fwww.ncbi.nlm.nih.gov\u002Fpmc\u002Farticles\u002FPMC6308985\u002F) \u003Ckbd>Sensors 2018\u003C\u002Fkbd> [lane endpoints]\n- [Map-Matching-Based Cascade Landmark Detection and Vehicle Localization](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=8826538) \u003Ckbd>IEEE Access 2019\u003C\u002Fkbd> [lane endpoints]\n- [GCNet: End-to-End Learning of Geometry and Context for Deep Stereo Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.04309) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [disparity estimation, Alex Kendall, cost volume]\n- [Traffic Control Gesture Recognition for Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.16072) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Daimler]\n- [Perceiving 3D Human-Object Spatial Arrangements from a Single Image in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15649) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [OrcVIO: Object residual constrained Visual-Inertial Odometry](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15107) [dynamic SLAM, very mathematical]\n- [InfoFocus: 3D Object Detection for Autonomous Driving with Dynamic Information Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.08556) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [DA4AD: End-to-End Deep Attention-based Visual Localization for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03026) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [Towards Lightweight Lane Detection by Optimizing Spatial Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08311) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd> [LLD]\n- [Multi-Frame to Single-Frame: Knowledge Distillation for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.11859) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd> [lidar]\n- [DeepIM: Deep iterative matching for 6d pose estimation](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FYi_Li_DeepIM_Deep_Iterative_ECCV_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [pose estimation]\n- [Monocular Depth Prediction through Continuous 3D Loss](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.09763) \u003Ckbd>IROS 2020\u003C\u002Fkbd>\n- [Multi-Task Learning for Dense Prediction Tasks: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13379) [MTL, Luc Van Gool]\n- [Dynamic Task Weighting Methods 
for Multi-task Networks in Autonomous Driving Systems](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02223) \u003Ckbd>ITSC 2020 oral\u003C\u002Fkbd> [MTL]\n- [NeurAll: Towards a Unified Model for Visual Perception in Automated Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.03589) \u003Ckbd>ITSC 2019 oral\u003C\u002Fkbd> [MTL]\n- [Deep Evidential Regression](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002Faab085461de182608ee9f607f3f7d18f-Paper.pdf) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [one-pass aleatoric\u002Fepistemic uncertainty]\n- [Estimating Drivable Collision-Free Space from Monocular Video](http:\u002F\u002Fwww.cs.toronto.edu\u002F~yaojian\u002FfreeSpace.pdf) \u003Ckbd>WACV 2015\u003C\u002Fkbd> [Drivable space]\n- [Visualization of Convolutional Neural Networks for Monocular Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03380) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [monodepth]\n- [Differentiable Rendering: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12057) [differentiable rendering, TRI]\n- [SAFENet: Self-Supervised Monocular Depth Estimation with Semantic-Aware\nFeature Extraction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02893) [monodepth, semantics, Naver labs]\n- [Toward Interactive Self-Annotation For Video Object Bounding Box: Recurrent Self-Learning And Hierarchical Annotation Based Framework](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fpapers\u002FLe_Toward_Interactive_Self-Annotation_For_Video_Object_Bounding_Box_Recurrent_Self-Learning_WACV_2020_paper.pdf) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [Towards Good Practice for CNN-Based Monocular Depth Estimation](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fpapers\u002FFang_Towards_Good_Practice_for_CNN-Based_Monocular_Depth_Estimation_WACV_2020_paper.pdf) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [Self-Supervised Scene De-occlusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.02788) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [TP-LSD: Tri-Points Based Line Segment Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.05505)\n- [Data Distillation: Towards Omni-Supervised Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.04440) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [Kaiming He, FAIR]\n- [MiDas: Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.01341) [monodepth, dynamic object, synthetic dataset]\n- [Semantics-Driven Unsupervised Learning for Monocular Depth and Ego-Motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04371) [monodepth]\n- [Towards Lightweight Lane Detection by Optimizing Spatial Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08311) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd>\n- [Synthetic-to-Real Domain Adaptation for Lane Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.04023) [GM Israel, LLD]\n- [PolyLaneNet: Lane Estimation via Deep Polynomial Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10924) \u003Ckbd>ICPR 2020\u003C\u002Fkbd> [polynomial, LLD]\n- [Learning Universal Shape Dictionary for Realtime Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.01050)\n- [End-to-End Video Instance Segmentation with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14503) [DETR, transformers]\n- [Score-CAM: Score-Weighted Visual Explanations for Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01279) 
\u003Ckbd>CVPR 2020 workshop\u003C\u002Fkbd>\n- [When and Why Test-Time Augmentation Works](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.11156)\n- [Footprints and Free Space from a Single Color Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06376) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd> [Parking use, footprint]\n- [Driving among Flatmobiles: Bird-Eye-View occupancy grids from a monocular camera for holistic trajectory planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.04047) [BEV, only predict footprint]\n- [Rethinking Classification and Localization for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.06493) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Monocular 3D Object Detection with Sequential Feature Association and Depth Hint Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14589) [mono3D]\n- [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.07177)\n- [ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05258)\n- [MVSNet: Depth Inference for Unstructured Multi-view Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.02505) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [Recurrent MVSNet for High-resolution Multi-view Stereo Depth Inference](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.10556) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Deep learning + MVS, Vidar, same author MVSNet]\n- [Artificial Dummies for Urban Dataset Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.08274) \u003Ckbd>AAAI 2021\u003C\u002Fkbd>\n- [DETR for Pedestrian Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.06785) [transformer, pedestrian detection]\n- [Multi-Modality Cut and Paste for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12741) [SenseTime]\n- [Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15840) [transformer, semantic segmentation]\n- [TransPose: Towards Explainable Human Pose Estimation by Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.14214) [transformer, pose estimation]\n- [Seesaw Loss for Long-Tailed Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.10032)\n- [SWA Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12645) [Stochastic Weights Averaging (SWA)]\n- [3D Object Detection with Pointformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.11409)\n- [Toward Transformer-Based Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09958) [DETR-like]\n- [Boosting Monocular Depth Estimation with Lightweight 3D Point Fusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.10296) [dense SfM]\n- [Vision Global Localization with Semantic Segmentation and Interest Feature Points](http:\u002F\u002Fras.papercept.net\u002Fimages\u002Ftemp\u002FIROS\u002Ffiles\u002F1899.pdf)\n- [Transformer Interpretability Beyond Attention Visualization](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09838) [transformers]\n- [Scaling Semantic Segmentation Beyond 1K Classes on a Single GPU](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.07489)\n- [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.02334)\n- [Empirical Upper Bound in Object 
Detection and More](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12451)\n- [Generalized Object Detection on Fisheye Cameras for Autonomous Driving: Dataset, Representations and Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02124) [Fisheye, Senthil Yogamani]\n- [Monocular 3D Object Detection with Sequential Feature Association and Depth Hint Augmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14589) [mono3D]\n- [SOSD-Net: Joint Semantic Object Segmentation and Depth Estimation from Monocular images](http:\u002F\u002Farxiv.org\u002Fabs\u002F2101.07422) [Jiwen Lu, monodepth]\n- [Sparse Auxiliary Networks for Unified Monocular Depth Prediction and Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16690) [TRI]\n- [Linformer: Self-Attention with Linear Complexity](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04768)\n- [Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.00825) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [PCT: Point cloud transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09688) \u003Ckbd>Computational Visual Media 2021\u003C\u002Fkbd>\n- [DDT: Unsupervised Object Discovery and Co-Localization by Deep Descriptor Transforming](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06397) \u003Ckbd>IJCAI 2017\u003C\u002Fkbd>\n- [Hierarchical Road Topology Learning for Urban Map-less Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00084) [Mercedes]\n- [Probabilistic Future Prediction for Video Scene Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06409) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Alex Kendall]\n- [Detecting 32 Pedestrian Attributes for Autonomous Vehicles](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02647) [VRU, MTL]\n- [Cascaded deep monocular 3D human pose estimation with evolutionary training data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07778) \u003Ckbd>CVPR 2020 oral\u003C\u002Fkbd>\n- [MonoGeo: Learning Geometry-Guided Depth via Projective Modeling for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13931) [mono3D]\n- [Aug3D-RPN: Improving Monocular 3D Object Detection by Synthetic Images with Virtual Depth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13269) [mono3D]\n- [Neighbor-Vote: Improving Monocular 3D Object Detection through Neighbor Distance Voting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02493) [mono3D]\n- [Lite-FPN for Keypoint-based Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00268) [mono3D]\n- [Lidar Point Cloud Guided Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09035)\n- [Vision Transformers for Dense Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13413) [Vladlen Koltun, Intel]\n- [Efficient Transformers: A Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.06732)\n- [Do Vision Transformers See Like Convolutional Neural Networks?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.08810)\n- [Progressive Coordinate Transforms for Monocular 3D Object Detection](http:\u002F\u002Farxiv.org\u002Fabs\u002F2108.05793) [mono3D]\n- [AutoShape: Real-Time Shape-Aware Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.11127) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D]\n- [BlazePose: On-device Real-time Body Pose tracking](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.10204)\n\n\n## TODO\n- [Socratic Models: Composing Zero-Shot Multimodal Reasoning with 
Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00598) [Andy Zeng]\n- [Large Language Models as General Pattern Machines](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.04721) [Embodied AI]\n- RetinaGAN: An Object-aware Approach to Sim-to-Real Transfer\n- [PlaNet: Learning Latent Dynamics for Planning from Pixels](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04551) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [Dreamer: Dream to Control: Learning Behaviors by Latent Imagination](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603) \u003Ckbd>ICLR 2020 oral\u003C\u002Fkbd>\n- [DreamerV2: Mastering Atari with Discrete World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02193) \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [World models]\n- [DreamerV3: Mastering Diverse Domains through World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104)\n- [DayDreamer: World Models for Physical Robot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [JEPA: A Path Towards Autonomous Machine Intelligence](https:\u002F\u002Fopenreview.net\u002Fpdf?id=BZ5a1r-kVsf)\n- [I-JEPA: Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Runway Gen-1: Structure and Content-Guided Video Synthesis with Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.03011)\n- [IL Difficulty Model: Embedding Synthetic Off-Policy Experience for Autonomous Driving via Zero-Shot Curricula](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01375) \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Waymo]\n- [Decision Transformer: Reinforcement Learning via Sequence Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01345) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [LLM for planning]\n- [LID: Pre-Trained Language Models for Interactive Decision-Making](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01771) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [LLM for planning]\n- [Planning with Large Language Models via Corrective Re-prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.09935) \u003Ckbd>NeurIPS 2022 Workshop\u003C\u002Fkbd>\n- [Object as Query: Equipping Any 2D Object Detector with 3D Detection Ability](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.02364) \u003Ckbd>ICCV 2023\u003C\u002Fkbd> [TuSimple]\n- [Speculative Sampling: Accelerating Large Language Model Decoding with Speculative Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01318) [Accelerated LLM, DeepMind]\n- [Inference with Reference: Lossless Acceleration of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04487) [Accelerated LLM, Microsoft]\n- [EPSILON: An Efficient Planning System for Automated Vehicles in Highly Interactive Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07993) \u003Ckbd>T-RO 2021\u003C\u002Fkbd>\n- [Efficient Uncertainty-aware Decision-making for Automated Driving Using Guided Branching](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) \u003Ckbd>ICRA 2020\u003C\u002Fkbd>\n- [StreamPETR: Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11926)\n- [SSCNet: Semantic Scene Completion from a Single Depth Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.08974) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01416) 
\u003Ckbd>ICCV 2019\u003C\u002Fkbd> \n- [PixPro: Propagate Yourself: Exploring Pixel-Level Consistency for Unsupervised Visual Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10043) [self-supervised]\n- [Pixel-Wise Contrastive Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.00218) [self-supervised]\n- [VICRegL: Self-Supervised Learning of Local Visual Features](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01571) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [ImageBind: One Embedding Space To Bind Them All](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05665) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [KEMP: Keyframe-Based Hierarchical End-to-End Deep Model for Long-Term Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.04624) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Planning]\n- [Deep Interactive Motion Prediction and Planning: Playing Games with Motion Prediction Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02392) \u003Ckbd>L4DC\u003C\u002Fkbd> [Planning]\n- [GameFormer: Game-theoretic Modeling and Learning of Transformer-based Interactive Prediction and Planning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05760) [Planning]\n- [LookOut: Diverse Multi-Future Prediction and Planning for Self-Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06547) [Planning, Raquel]\n- [DIPP: Differentiable Integrated Motion Prediction and Planning with Learnable Cost Function for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.10422) [Planning]\n- [Imitation Is Not Enough: Robustifying Imitation with Reinforcement Learning for Challenging Driving Scenarios](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11419) [Planning, Waymo]\n- [Hierarchical Model-Based Imitation Learning for Planning in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09539) \u003Ckbd>IROS 2022\u003C\u002Fkbd> [Planning, Waymo]\n- [Symphony: Learning Realistic and Diverse Agents for Autonomous Driving Simulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.03195) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Planning, Waymo]\n- [JFP: Joint Future Prediction with Interactive Multi-Agent Modeling for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08710) [Planning, Waymo]\n- [MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.06278) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [3D Semantic Scene Completion: a Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07466) \u003Ckbd>IJCV 2022\u003C\u002Fkbd>\n- [DETIC: Detecting Twenty-thousand Classes using Image-level Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.02605) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [Atlas: End-to-End 3D Scene Reconstruction from Posed Images](https:\u002F\u002Fgithub.com\u002Fmagicleap\u002FAtlas) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [TransformerFusion: Monocular RGB Scene Reconstruction using Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [SimpleOccupancy: A Simple Attempt for 3D Occupancy Estimation in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10076) [Occupancy Network]\n- [OccDepth: A Depth-Aware Method for 3D Semantic Scene Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13540) [Occupancy Network, stereo]\n- [Fast-BEV: Towards Real-time On-vehicle Bird's-Eye View Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07870) 
\u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [Fast-BEV: A Fast and Strong Bird's-Eye View Perception Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12511) \n- [ProphNet: Efficient Agent-Centric Motion Forecasting with Anchor-Informed Proposals](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12071) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Qcraft, prediction]\n- [Motion Transformer with Global Intention Localization and Local Movement Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.13508) \u003Ckbd>NeurIPS 2022 Oral\u003C\u002Fkbd>\n- [P4P: Conflict-Aware Motion Prediction for Planning in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01634)\n- [MultiPath++: Efficient Information Fusion and Trajectory Aggregation for Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.14973)\n- [ViP3D: End-to-end Visual Trajectory Prediction via 3D Agent Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01582)\n- [SAM: Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643) [FAIR]\n- [GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling for Multi-view 3D Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11325)\n- [Motion Prediction using Trajectory Sets and Self-Driving Domain Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04767) [Encode Road requirement to prediction]\n- [Transformer Feed-Forward Layers Are Key-Value Memories](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.14913) \u003Ckbd>EMNLP 2021\u003C\u002Fkbd>\n- [BEV-LaneDet: a Simple and Effective 3D Lane Detection Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06006) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [BEVNet]\n- [Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05970) [BEVNet, megvii]\n- [VAD: Vectorized Scene Representation for Efficient Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12077) [Horizon]\n- [A Simple Attempt for 3D Occupancy Estimation in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10076)\n- [BEVPoolv2: A Cutting-edge Implementation of BEVDet Toward Deployment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.17111) [BEVDet, PhiGent]\n- [NVRadarNet: Real-Time Radar Obstacle and Free Space Detection for Autonomous Driving](http:\u002F\u002Faixpaper.com\u002Fview\u002Fnvradarnet_realtime_radar_obstacle_and_free_space_detection_for_autonomous_driving)\n- [GraspNet-1Billion: A Large-Scale Benchmark for General Object Grasping](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FFang_GraspNet-1Billion_A_Large-Scale_Benchmark_for_General_Object_Grasping_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Cewu Lu]\n- [AnyGrasp: Robust and Efficient Grasp Perception in Spatial and Temporal Domains](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08333) [Cewu Lu]\n- [Point Cloud Forecasting as a Proxy for 4D Occupancy Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13130)\n- [HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory Prediction via Scene Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09753)\n- [MTR: Motion Transformer with Global Intention Localization and Local Movement Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.13508) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [UVTR: Unifying Voxel-based Representation with Transformer for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.00630) 
[BEVFusion, Megvii, BEVNet, camera + lidar]\n- [Don't Use Large Mini-Batches, Use Local SGD](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.07217) \u003Ckbd>ICLR 2020\u003C\u002Fkbd>\n- [Grokking: Generalization beyond Overfitting on small algorithmic datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.02177)\n- [Progress measures for grokking via mechanistic interpretability]()\n- [Understanding deep learning requires rethinking generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.03530) \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [Unifying Grokking and Double Descent](https:\u002F\u002Fopenreview.net\u002Fforum?id=JqtHMZtqWm)\n- [Deep Interactive Motion Prediction and Planning: Playing Games with Motion Prediction Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02392) \u003Ckbd>L4DC 2022\u003C\u002Fkbd>\n- [Interactive Prediction and Planning for Autonomous Driving: from Algorithms to Fundamental Aspects](https:\u002F\u002Fescholarship.org\u002Fuc\u002Fitem\u002F0vf4q2x1) [PhD thesis of Wei Zhan, 2019]\n- [Lyft1001: One Thousand and One Hours: Self-driving Motion Prediction Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14480) [Lyft Level 5, prediction dataset]\n- [PCAccumulation: Dynamic 3D Scene Analysis by Point Cloud Accumulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.12394) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [UniSim: A Neural Closed-Loop Sensor Simulator](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FYang_UniSim_A_Neural_Closed-Loop_Sensor_Simulator_CVPR_2023_paper.pdf) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [simulation, Raquel]\n- [GeoSim: Realistic Video Simulation via Geometry-Aware Composition for Self-Driving](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FChen_GeoSim_Realistic_Video_Simulation_via_Geometry-Aware_Composition_for_Self-Driving_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Accelerating Reinforcement Learning for Autonomous Driving using Task-Agnostic and Ego-Centric Motion Skills](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.12072) [Driving Skill]\n- [Efficient Reinforcement Learning for Autonomous Driving with Parameterized Skills and Priors](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04412) \u003Ckbd>RSS 2023\u003C\u002Fkbd> [Driving Skill]\n- [Neural Map Prior for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08481) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Track Anything: Segment Anything Meets Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11968)\n- [Self-Supervised Camera Self-Calibration from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.03325) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [TRI, calibration]\n- [Real-time Online Video Detection with Temporal Smoothing Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09236) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [ConvLSTM-style cross-attention]\n- [NeRF-Supervised Deep Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17603) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [GET3D: A Generative Model of High Quality 3D Textured Shapes Learned from Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11163) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [OmniObject3D: Large-Vocabulary 3D Object Dataset for Realistic Perception, Reconstruction 
and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07525) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Ego-Body Pose Estimation via Ego-Head Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04636) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [PanoOcc: Unified Occupancy Representation for Camera-based 3D Panoptic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.10013)\n- [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n- [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n- [VideoChat: Chat-Centric Video Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06355)\n- [CoBEVT: Cooperative Bird's Eye View Semantic Segmentation with Sparse Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.02202) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [BEVFormer v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition via Perspective Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10439) [BEVNet, Jifeng Dai]\n- [Fast-BEV: Towards Real-time On-vehicle Bird’s-Eye View Perception](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers2022\u002FFast-BEV:%20Towards%20Real-time%20On-vehicle%20Bird's-Eye%20View%20Perception.pdf) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [Traj++: Human Trajectory Forecasting in Crowds: A Deep Learning Perspective](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.03639) \u003Ckbd>TITS 2021\u003C\u002Fkbd>\n- [Data Driven Prediction Architecture for Autonomous Driving and its Application on Apollo Platform](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06715) \u003Ckbd>IV 2020\u003C\u002Fkbd> [Baidu]\n- [THOMAS: Trajectory Heatmap Output with learned Multi-Agent Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.06607) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [Learning Lane Graph Representations for Motion Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13732) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [Identifying Driver Interactions via Conditional Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09959) \u003Ckbd>ICRA 2021\u003C\u002Fkbd> [Waymo]\n- [Trajectron++: Dynamically-Feasible Trajectory Forecasting With Heterogeneous Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03093) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [TPNet: Trajectory Proposal Network for Motion Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12255) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [GOHOME: Graph-Oriented Heatmap Output for future Motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01827)\n- [PECNet: It Is Not the Journey but the Destination: Endpoint Conditioned Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.02025) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [From Goals, Waypoints & Paths To Long Term Human Trajectory Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.01526) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [PRECOG: PREdiction Conditioned On Goals in Visual Multi-Agent Settings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01296) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [PiP: Planning-informed Trajectory Prediction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11476) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [MultiPath: Multiple Probabilistic Anchor Trajectory Hypotheses for Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05449) \u003Ckbd>CoRL 
2019\u003C\u002Fkbd>\n- [LaPred: Lane-Aware Prediction of Multi-Modal Future Trajectories of Dynamic Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00249) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [PRIME: Learning to Predict Vehicle Trajectories with Model-based Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04027) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n- [A Flexible and Explainable Vehicle Motion Prediction and Inference Framework Combining Semi-Supervised AOG and ST-LSTM](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1109\u002FTITS.2020.3016304) \u003Ckbd>TITS 2020\u003C\u002Fkbd>\n- [Multi-Modal Trajectory Prediction of Surrounding Vehicles with Maneuver based LSTMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.05499) \u003Ckbd>IV 2018\u003C\u002Fkbd> [Trivedi]\n- [HYPER: Learned Hybrid Trajectory Prediction via Factored Inference and Adaptive Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.02344) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [Trajectory Prediction with Linguistic Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.09741) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [What-If Motion Prediction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.10587)\n- [End-to-end Contextual Perception and Prediction with Interaction Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.05927) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Auxiliary collision loss, scene compliant pred]\n- [SafeCritic: Collision-Aware Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06673) \u003Ckbd>BMVC 2019\u003C\u002Fkbd> [IRL, scene compliant pred]\n- [Large Scale Interactive Motion Forecasting for Autonomous Driving: The Waymo Open Motion Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10133) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [Waymo]\n- [Interaction-Based Trajectory Prediction Over a Hybrid Traffic Graph](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.12916) \u003Ckbd>IROS 2020\u003C\u002Fkbd>\n- [Joint Interaction and Trajectory Prediction for Autonomous Driving using Graph Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07882) \u003Ckbd>NeurIPS 2019 workshop\u003C\u002Fkbd>\n- [Fast Risk Assessment for Autonomous Vehicles Using Learned Models of Agent Futures](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13458) \u003Ckbd>Robotics: science and systems 2020\u003C\u002Fkbd>\n- [Monocular 3D Object Detection: An Extrinsic Parameter Free Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15796) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [PJLab]\n- [UniFormer: Unified Multi-view Fusion Transformer for Spatial-Temporal Representation in Bird's-Eye-View](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.08536) [BEVFormer, BEVNet, Temporal]\n- [GitNet: geometric prior-based transformation for bird's eye view segmentation]()\n- [WBF: weighted box fusion: ensembling boxes from different object detection modules]()\n- [NNI: auto parameter finding algorithm]()\n- [BEVFormer++: Improving BEVFormer for 3D Camera-only Object Detection](https:\u002F\u002Fstorage.googleapis.com\u002Fwaymo-uploads\u002Ffiles\u002Fresearch\u002F3DCam\u002F3DCam_BEVFormer.pdf) [Waymo open dataset challenge 1st place in mono3d]\n- [LET-3D-AP: Longitudinal Error Tolerant 3D Average Precision for Camera-Only 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07705) [Waymo open dataset challenge official metric]\n- [High-Level Interpretation of Urban Road Maps Fusing Deep Learning-Based Pixelwise Scene Segmentation and Digital Navigation 
Maps](https:\u002F\u002Fpdfs.semanticscholar.org\u002F44ac\u002F01c0d356f22e7ee883f8e4ac2cccf199f68d.pdf) \u003Ckbd>Journal of Advanced Transportation 2018\u003C\u002Fkbd>\n- [A Hybrid Vision-Map Method for Urban Road Detection](https:\u002F\u002Fdownloads.hindawi.com\u002Fjournals\u002Fjat\u002F2017\u002F7090549.pdf) \u003Ckbd>Journal of Advanced Transportation 2017\u003C\u002Fkbd>\n- [Terminology and Analysis of Map Deviations in Urban Domains: Towards Dependability for HD Maps in Automated Vehicles](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FChristopher-Plachetka\u002Fpublication\u002F348367176_Terminology_and_Analysis_of_Map_Deviations_in_Urban_Domains_Towards_Dependability_for_HD_Maps_in_Automated_Vehicles\u002Flinks\u002F607d523f907dcf667babc06b\u002FTerminology-and-Analysis-of-Map-Deviations-in-Urban-Domains-Towards-Dependability-for-HD-Maps-in-Automated-Vehicles.pdf) \u003Ckbd>IV 2020\u003C\u002Fkbd>\n- [TIME WILL TELL: NEW OUTLOOKS AND A BASELINE FOR TEMPORAL MULTI-VIEW 3D OBJECT DETECTION](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02443)\n- [Conditional DETR for Fast Training Convergence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12329) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [DN-DETR: Accelerate DETR Training by Introducing Query DeNoising](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.01305) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03605)\n- [Trajectory Forecasting from Detection with Uncertainty-Aware Motion Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01478) [Ouyang Wanli]\n- [Vision-based Uneven BEV Representation Learning with Polar Rasterization and Surface Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.01878) [BEVNet, polar]\n- [MUTR3D: A Multi-camera Tracking Framework via 3D-to-2D Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.00613) [BEVNet, tracking] \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd> [Hang Zhao]\n- [ST-P3: End-to-end Vision-based Autonomous Driving via Spatial-Temporal Feature Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.07601) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [Hongyang Li]\n- [GKT: Efficient and Robust 2D-to-BEV Representation Learning via Geometry-guided Kernel Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04584) [BEVNet, Horizon]\n- [SiamRPN: High Performance Visual Tracking with Siamese Region Proposal Network](https:\u002F\u002Fyan-junjie.github.io\u002Fpublication\u002Fdblp-confcvpr-li-ywzh-18\u002Fdblp-confcvpr-li-ywzh-18.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [TPLR: Topology Preserving Local Road Network Estimation from Single Onboard Camera Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10155) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [STSU, Luc Van Gool]\n- [LaRa: Latents and Rays for Multi-Camera Bird's-Eye-View Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13294) [Valeo, BEVNet, polar]\n- [PolarDETR: Polar Parametrization for Vision-based Surround-View 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.10965) [BEVNet]\n- [Exploring Geometric Consistency for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05858) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View 
General-Purpose 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01178) \u003Ckbd>WACV 2022\u003C\u002Fkbd> [mono3D]\n- [Learning to Predict 3D Lane Shape and Camera Pose from a Single Image via Geometry Constraints](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.15351) \u003Ckbd>AAAI 2022\u003C\u002Fkbd>\n- [Detecting Lane and Road Markings at A Distance with Perspective Transformer Layers](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9294383) \u003Ckbd>ICICN 2021\u003C\u002Fkbd> [BEVNet, lane line]\n- [Unsupervised Labeled Lane Markers Using Maps](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FCVRSUAD\u002FBehrendt_Unsupervised_Labeled_Lane_Markers_Using_Maps_ICCVW_2019_paper.pdf) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd> [Bosch, 2D lane line]\n- [M3DeTR: Multi-representation, Multi-scale, Mutual-relation 3D Object Detection with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.11896) [Lidar detection, Waymo open dataset] \u003Ckbd>WACV 2022\u003C\u002Fkbd>\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048) [lane line dataset]\n- [Robust Monocular 3D Lane Detection With Dual Attention](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9506296) \u003Ckbd>ICIP 2021\u003C\u002Fkbd>\n- [OcclusionFusion: Occlusion-aware Motion Estimation for Real-time Dynamic 3D Reconstruction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07977) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.02178) \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [lightweight Transformers]\n- [XFormer: Lightweight Vision Transformer with Cross Feature Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.07268) [Samsung]\n- [CenterFormer: Center-based Transformer for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05588) \u003Ckbd>ECCV 2022 oral\u003C\u002Fkbd> [TuSimple]\n- [LidarMultiNet: Towards a Unified Multi-task Network for LiDAR Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09385) [2022 Waymo Open Dataset, TuSimple]\n- [MTRA: 1st Place Solution for 2022 Waymo Open Dataset Challenge - Motion Prediction](https:\u002F\u002Fstorage.googleapis.com\u002Fwaymo-uploads\u002Ffiles\u002Fresearch\u002FMotionPred\u002FMotionPrediction_MTRA.pdf) [Waymo open dataset challenge 1st place in motion prediction]\n- [BEVSegFormer: Bird's Eye View Semantic Segmentation From Arbitrary Camera Rigs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.04050) [BEVNet]\n- [Panoptic SegFormer: Delving Deeper into Panoptic Segmentation with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.03814) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [nVidia]\n- [Efficiently Identifying Task Groupings for Multi-Task Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04617) \u003Ckbd>NeurIPS 2021 spotlight\u003C\u002Fkbd> [MTL]\n- [Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05482) [Google, Golden Backbone]\n- [\"The Pedestrian next to the Lamppost\" Adaptive Object Graphs for Better Instantaneous Mapping](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02944) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [GitNet: Geometric Prior-based Transformation for Birds-Eye-View 
Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07733) [BEVNet, Baidu]\n- [FUTR3D: A Unified Sensor Fusion Framework for 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.10642) [Hang Zhao]\n- [GitNet: Geometric Prior-based Transformation for Birds-Eye-View Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07733) [BEVNet]\n- [MonoFormer: Towards Generalization of self-supervised monocular depth estimation with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11083) [monodepth]\n- [Time3D: End-to-End Joint Monocular 3D Object Detection and Tracking for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14882)\n- [cosFormer: Rethinking Softmax in Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.08791) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [StretchBEV: Stretching Future Instance Prediction Spatially and Temporally](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.13641) [BEVNet, prediction]\n- [Scene Representation in Bird’s-Eye View from Surrounding Cameras with Transformers](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022W\u002FWAD\u002Fpapers\u002FZhao_Scene_Representation_in_Birds-Eye_View_From_Surrounding_Cameras_With_Transformers_CVPRW_2022_paper.pdf) [BEVNet, LLD] \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd>\n- [Multi-Frame Self-Supervised Depth with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07616) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [It's About Time: Analog Clock Reading in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09162) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Andrew Zisserman]\n- [SurroundDepth: Entangling Surrounding Views for Self-Supervised Multi-Camera Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03636) \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Jiwen Lu]\n- [ONCE-3DLanes: Building Monocular 3D Lane Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.00301) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048) \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd> [3D LLD]\n- [Multi-modal 3D Human Pose Estimation with 2D Weak Supervision in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12141) \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd>\n- [A Simple Baseline for BEV Perception Without LiDAR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07959) [TRI, BEVNet, vision+radar]\n- [Reconstruct from Top View: A 3D Lane Detection Approach based on Geometry Structure Prior](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022W\u002FWAD\u002Fpapers\u002FLi_Reconstruct_From_Top_View_A_3D_Lane_Detection_Approach_Based_CVPRW_2022_paper.pdf) \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd>\n- [RIDDLE: Lidar Data Compression with Range Image Deep Delta Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01738) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Waymo, Charles Qi]\n- [Occupancy Flow Fields for Motion Forecasting in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03875) \u003Ckbd>RAL 2022\u003C\u002Fkbd> [Waymo occupancy flow challenge]\n- [Safe Local Motion Planning with Self-Supervised Freespace Forecasting](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FHu_Safe_Local_Motion_Planning_With_Self-Supervised_Freespace_Forecasting_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [数据闭环的核心 - Auto-labeling 
方案分享](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F533907821)\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048)\n- [LETR: Line Segment Detection Using Transformers without Edges](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.01909) \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd>\n- [HDMapGen: A Hierarchical Graph Generative Model of High Definition Maps](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FMi_HDMapGen_A_Hierarchical_Graph_Generative_Model_of_High_Definition_Maps_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [HD mapping]\n- [SketchRNN: A Neural Representation of Sketch Drawings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.03477) [David Ha]\n- [PolyGen: An Autoregressive Generative Model of 3D Meshes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10880) \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [SOLQ: Segmenting Objects by Learning Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.02351) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Megvii, end-to-end, instance segmentation]\n- [MonoViT: Self-Supervised Monocular Depth Estimation with a Vision Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.03543) \u003Ckbd>3DV 2022\u003C\u002Fkbd>\n- [MVSTER: Epipolar Transformer for Efficient Multi-View Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07346) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [MOVEDepth: Crafting Monocular Cues and Velocity Guidance for Self-Supervised Multi-Frame Depth Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.09170) [MVS + monodepth]\n- [SurroundDepth: Entangling Surrounding Views for Self-Supervised Multi-Camera Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03636)\n- [Scene Transformer: A unified architecture for predicting multiple agent trajectories](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.08417) [prediction, Waymo] \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [SSIA: Monocular Depth Estimation with Self-supervised Instance Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05821) [VGG team, TTR, test time refinement, CVD]\n- [CoMoDA: Continuous Monocular Depth Adaptation Using Past Experiences](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FWACV2021\u002Fpapers\u002FKuznietsov_CoMoDA_Continuous_Monocular_Depth_Adaptation_Using_Past_Experiences_WACV_2021_paper.pdf) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n- [MonoRec: Semi-supervised dense reconstruction in dynamic environments from a single moving camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.11814) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Daniel Cremers]\n- [Plenoxels: Radiance Fields without Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05131)\n- [Lidar with Velocity: Motion Distortion Correction of Point Clouds from Oscillating Scanning Lidars](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09497) [Livox, ISEE]\n- [NWD: A Normalized Gaussian Wasserstein Distance for Tiny Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13389)\n- [Towards Optimal Strategies for Training Self-Driving Perception Models in Simulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07971) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Sanja Fidler]\n- [Insta-DM: Learning Monocular Depth in Dynamic Scenes via Instance-Aware Projection Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.02629) \u003Ckbd>AAAI 2021\u003C\u002Fkbd>\n- [Instance-wise Depth and Motion Learning from Monocular 
Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09351) \u003Ckbd>NeurIPS 2020 workshop\u003C\u002Fkbd> [[website](https:\u002F\u002Fsites.google.com\u002Fsite\u002Fseokjucv\u002Fhome\u002Finstadm)]\n- [NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08934) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [BARF: Bundle-Adjusting Neural Radiance Fields](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06405) \u003Ckbd>ICCV 2021 oral\u003C\u002Fkbd>\n- [NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01129) \u003Ckbd>ICCV 2021 oral\u003C\u002Fkbd>\n- [YOLinO: Generic Single Shot Polyline Detection in Real Time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14420) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd> [lld]\n- [MonoRCNN: Geometry-based Distance Decomposition for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03775) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [MonoCInIS: Camera Independent Monocular 3D Object Detection using Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.00464) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd>\n- [PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13192) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Waymo challenge 2nd place]\n- [Geometry-based Distance Decomposition for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03775) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D]\n- [Offboard 3D Object Detection from Point Cloud Sequences](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.05073) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Charles Qi] \n- [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02466) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd>\n- [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.03496)\n- [Probabilistic Anchor Assignment with IoU Prediction for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.08103) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [FOVEA: Foveated Image Magnification for Autonomous Navigation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.12102) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [Argo]\n- [PifPaf: Composite Fields for Human Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06593) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Monocular 3D Localization of Vehicles in Road Scenes](https:\u002F\u002Favvision.xyz\u002Ficcv21\u002Fpapers\u002F1\u002FCameraReady\u002F01.pdf) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd> [mono3D, tracking]\n- [TransformerFusion: Monocular RGB Scene Reconstruction using Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191)\n- [Conditional DETR for Fast Training Convergence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152)\n- [Anchor DETR: Query Design for Transformer-Based Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07107) [megvii]\n- [PGD: Probabilistic and Geometric Depth: Detecting Objects in Perspective](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14160) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n- [Adaptive Wing Loss for Robust Face Alignment via Heatmap Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07399) \n- [What Makes for End-to-End Object Detection?](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fsun21b.html) 
\u003Ckbd>PMLR 2021\u003C\u002Fkbd>\n- [Instances as Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01928) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [instance segmentation]\n- [One Million Scenes for Autonomous Driving: ONCE Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11037) [Huawei]\n- [NVS-MonoDepth: Improving Monocular Depth Prediction with Novel View Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12577) \u003Ckbd>3DV 2021\u003C\u002Fkbd>\n- [Is 2D Heatmap Representation Even Necessary for Human Pose Estimation?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03332)\n- [Topology Preserving Local Road Network Estimation from Single Onboard Camera Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10155) [BEVNet, Luc Van Gool]\n- [Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16452) [Small LLM prompting, Microsoft]\n- [CoT: Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [ToT: Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601) [[Notes](paper_notes\u002Ftot.md)] \u003Ckbd>NeurIPS 2023 Oral\u003C\u002Fkbd>\n- [Cumulative Reasoning with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.04371)\n- [A Survey of Techniques for Maximizing LLM Performance](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=ahnGLM-RC1Y&ab_channel=OpenAI) [OpenAI]\n- [Drive AGI](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveAGI)\n- [Harnessing the Power of Multi-Modal LLMs for Autonomy](https:\u002F\u002Fwww.ghostautonomy.com\u002Fblog\u002Fmllms-for-autonomy) [Ghost Autonomy]\n- [Language to Rewards for Robotic Skill Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.08647)\n- [ALOHA: Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13705)\n- [LLM-Grounder: Open-Vocabulary 3D Visual Grounding with Large Language Model as an Agent](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12311) [UM]\n- [LM-Nav: Robotic Navigation with Large Pre-Trained Models of Language, Vision, and Action](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.04429) [Sergey Levine]\n- [A Survey of Embodied AI: From Simulators to Research Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04918) \u003Ckbd>IEEE TETCI 2021\u003C\u002Fkbd>\n- [Habitat Challenge 2021](https:\u002F\u002Faihabitat.org\u002Fchallenge\u002F2021\u002F)\n- [Video ChatCaptioner: Towards Enriched Spatiotemporal Descriptions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04227)\n- [DoReMi: Grounding Language Model by Detecting and Recovering from Plan-Execution Misalignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.00329) [Jianyu Chen]\n- [The Power of Scale for Parameter-Efficient Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691) \u003Ckbd>EMNLP 2021\u003C\u002Fkbd>\n- [Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.07207) \u003Ckbd>ICML 2022\u003C\u002Fkbd>\n- [ProgPrompt: Generating Situated Robot Task Plans using Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302) \u003Ckbd>ICRA 2023\u003C\u002Fkbd>\n- [Perceiver-Actor: A Multi-Task Transformer for Robotic 
Manipulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05451) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.07339) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [LLM Quant]\n- [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00978) [Song Han, LLM Quant]\n- [RoFormer: Enhanced Transformer with Rotary Position Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09864)\n- [CoDi: Any-to-Any Generation via Composable Diffusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11846) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd>\n- [What if a Vacuum Robot has an Arm?](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10202493) \u003Ckbd>UR 2023\u003C\u002Fkbd>\n- [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14135)\n- [GPT in 60 Lines of NumPy](https:\u002F\u002Fjaykmody.com\u002Fblog\u002Fgpt-from-scratch\u002F)\n- [Speeding up the GPT - KV cache](https:\u002F\u002Fwww.dipkumar.dev\u002Fbecoming-the-unbeatable\u002Fposts\u002Fgpt-kvcache\u002F)\n- [LLM Parameter Counting](https:\u002F\u002Fkipp.ly\u002Ftransformer-param-count\u002F)\n- [Transformer Inference Arithmetic](https:\u002F\u002Fkipp.ly\u002Ftransformer-inference-arithmetic\u002F#kv-cache)\n- [ALBEF: Align before Fuse: Vision and Language Representation Learning with Momentum Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.07651) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Junnan Li]\n- [CLIP: Learning Transferable Visual Models From Natural Language Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [OpenAI]\n- [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12086) \u003Ckbd>ICML 2022\u003C\u002Fkbd> [Junnan Li]\n- [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597) [Junnan Li]\n- [MOO: Open-World Object Manipulation using Pre-trained Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.00905) [Google Robotics, end-to-end visuomotor]\n- [VC-1: Where are we in the search for an Artificial Visual Cortex for Embodied Intelligence?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.18240)\n- [CLIPort: What and Where Pathways for Robotic Manipulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.12098) \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [Nvidia, end-to-end visuomotor]\n- [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.17323) \u003Ckbd>ICLR 2023\u003C\u002Fkbd>\n- [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10438) \u003Ckbd>ICML 2023\u003C\u002Fkbd> [Song Han, LLM Quant]\n- [SAPIEN: A SimulAted Part-based Interactive ENvironment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08515) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FiLM: Visual Reasoning with a General Conditioning Layer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.07871) \u003Ckbd>AAAI 2018\u003C\u002Fkbd>\n- [TokenLearner: What Can 8 Learned Tokens Do for Images and Videos?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11297) 
\u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [QLoRA: Efficient Finetuning of Quantized LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14314)\n- [OVO: Open-Vocabulary Occupancy](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16133)\n- [Code Llama: Open Foundation Models for Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12950)\n- [Chinchilla: Training Compute-Optimal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.15556) [DeepMind]\n- [GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13245)\n- [RoFormer: Enhanced Transformer with Rotary Position Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09864)\n- [RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.00595)\n- [Perceiver-Actor: A Multi-Task Transformer for Robotic Manipulation]()\n- [VIMA: General Robot Manipulation with Multimodal Prompts]()\n- [An Attention Free Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) [Apple]\n- [PDDL Planning with Pretrained Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11014) [MIT, Leslie Kaelbling]\n- [Task and Motion Planning with Large Language Models for Object Rearrangement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.06247) \u003Ckbd>IROS 2023\u003C\u002Fkbd>\n\n","# 论文笔记\n本仓库包含我对深度学习和机器学习相关论文的阅读笔记。灵感来源于 [Denny Britz](https:\u002F\u002Fgithub.com\u002Fdennybritz\u002Fdeeplearning-papernotes) 和 [Daniel Takeshi](https:\u002F\u002Fgithub.com\u002FDanielTakeshi\u002FPaper_Notes)。一个使用 Github Pages 生成的极简网页可以在这里找到：[https:\u002F\u002Fpatrick-llgc.github.io\u002FLearning-Deep-Learning\u002F](https:\u002F\u002Fpatrick-llgc.github.io\u002FLearning-Deep-Learning\u002F)。\n\n## 关于我\n我叫 [Patrick Langechuan Liu](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fpatrick-llgc\u002F)。在物理学领域接受了约十年的训练与研究后，我发现了自己对深度学习和自动驾驶的热情。\n\n目前我在 NVIDIA 担任 AI 总监，领导 NVIDIA 端到端自动驾驶项目 Alpamayo 的机器学习建模工作。\n\n## 应该读什么\n如果你是计算机视觉领域深度学习的新手，不知道从何入手，我建议你在最初的一个月左右深入研读 [这份论文列表](start\u002Ffirst_cnn_papers.md)。我当时就是这样做的（[查看我的笔记](start\u002Ffirst_cnn_papers_notes.md)），效果非常好。\n\n这里还有一份 [可靠的论文来源列表](trusty.md)，以防我找不到更多论文来读。\n\n## 我的主题评论文章\n我定期更新我的博客专栏 [The Thinking Car](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car)。\n\n- [自动驾驶感知工程师的规划速成课程](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fa-crash-course-of-planning-for-perception-engineers-in-autonomous-driving-ede324d78717)\n- [量产级自动驾驶中的 BEV 感知](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fbev-perception-in-mass-production-autonomous-driving-c6e3f1e46ae0)\n- [中国量产级自动驾驶面临的挑战](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fchallenges-of-mass-production-autonomous-driving-in-china-407c7e2dc5d8)\n- [面向自动驾驶的视觉中心语义占用预测](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fvision-centric-semantic-occupancy-prediction-for-autonomous-driving-16a46dbd6f65)（[相关论文笔记](topics\u002Ftopic_occupancy_network.md)）\n- [自动驾驶中的可行驶空间——行业视角](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fdrivable-space-in-autonomous-driving-the-industry-7a4624b94d41)\n- [自动驾驶中的可行驶空间——学术界回顾](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fdrivable-space-in-autonomous-driving-a-review-of-academia-ef1a6aa4dc15)\n- [自动驾驶中的可行驶空间——概念解析](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fdrivable-space-in-autonomous-driving-the-concept-df699bb8682f)\n- [自动驾驶中基于 Transformer 的单目 BEV 
感知](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fmonocular-bev-perception-with-transformers-in-autonomous-driving-c41e4a893944)（[相关论文笔记](topics\u002Ftopic_transformers_bev.md)）\n- [图解 MLP 与 Transformer 在深度学习张量重塑中的差异](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fillustrated-difference-between-mlp-and-transformers-for-tensor-reshaping-52569edaf89)\n- [自动驾驶中的单目 3D 车道线检测](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fmonocular-3d-lane-line-detection-in-autonomous-driving-4d7cdfabf3b6)（[相关论文笔记](topics\u002Ftopic_3d_lld.md)）\n- [拥挤场景下的深度学习目标检测](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fdeep-learning-based-object-detection-in-crowded-scenes-1c9fddbd7bc4)（[相关论文笔记](topics\u002Ftopic_crowd_detection.md)）\n- [自动驾驶中的单目鸟瞰图语义分割](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fmonocular-birds-eye-view-semantic-segmentation-for-autonomous-driving-ee2f771afb59)（[相关论文笔记](topics\u002Ftopic_bev_segmentation.md)）\n- [自动驾驶地图构建中的深度学习](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fdeep-learning-in-mapping-for-autonomous-driving-9e33ee951a44)\n- [自动驾驶中的单目动态物体 SLAM](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fmonocular-dynamic-object-slam-in-autonomous-driving-f12249052bf1)\n- [自动驾驶中单目 3D 目标检测综述](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fmonocular-3d-object-detection-in-autonomous-driving-2476a3c7f57e)\n- [自监督关键点学习综述](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fself-supervised-keypoint-learning-aade18081fc3)\n- [单阶段实例分割综述](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fsingle-stage-instance-segmentation-a-review-1eeb66e0cc49)\n- [自定进度多任务学习综述](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fself-paced-multitask-learning-76c26e9532d0)\n- [带有异构元数据的卷积神经网络](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fconvolutional-neural-networks-with-heterogeneous-metadata-2af9241218a9)\n- [自动驾驶中将 2D 目标检测提升至 3D](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fgeometric-reasoning-based-cuboid-generation-in-monocular-3d-object-detection-5ee2996270d1)\n- [多模态回归](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car\u002Fanchors-and-multi-bin-loss-for-multi-modal-target-regression-647ea1974617)\n\n## AI 播客笔记\n- [WhynotTV 对 OpenAI 的翁家笠的采访](https:\u002F\u002Fwww.bilibili.com\u002Fvideo\u002FBV1darmBcE4A\u002F) [[笔记](podcast\u002Fjialiweng.md)] [采访，Jiali Weng\u002F翁家笠，OpenAI，WhynotTV]\n\n## 主题速记本\n这一部分包含一些快速笔记（类似 git-gist），供未来的自己参考。\n\n- [计算硬件](gist\u002Fcompute_hardware.md)\n- [注意力掩码](gist\u002Fattention_mask.md)\n\n\n## 2025-02 (1)\n- [DFlash：用于 Flash 规范性解码的块扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.06036)\n- [VLM4VLA：在视觉-语言-动作模型中重新审视视觉-语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.03309) [[笔记](paper_notes\u002Fvlm4vla.md)] [Qwen 团队]\n\n## 2025-01 (10)\n- [逐步内化：从显式CoT到隐式CoT——循序渐进地学习内化CoT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14838) [[笔记](paper_notes\u002Fstepwise_internalization.md)] [Yejin Choi, 隐式CoT]\n- [Coconut：在连续潜在空间中训练大型语言模型进行推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.06769) [[笔记](paper_notes\u002Fcoconut.md)] \u003Ckbd>COLM 2025\u003C\u002Fkbd> [Yuandong Tian, 隐式CoT]\n- [DLCM：动态大型概念模型——自适应语义空间中的潜在推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.24617) [[笔记](paper_notes\u002Fdlcm.md)] [Xingwei Qu, 字节跳动，隐式CoT → 可解释的概念模型]\n- [潜在推理综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.06203) [[笔记](paper_notes\u002Flatent_cot_horizon.md)] [Xingwei Qu, 字节跳动]\n- [大型语言扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.09992)\n- [Dream 
7B：扩散型大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.15487)\n- [Seed Diffusion：具有高速推理能力的大规模扩散语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.02193) [字节跳动 Seed]\n- [MMaDA：多模态大型扩散语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15809) [字节跳动 Seed]\n- [Fast-dLLM：通过启用KV缓存和并行解码实现无需训练的扩散LLM加速](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.22618) [[笔记](paper_notes\u002Ffast_dllm.md)] [Song Han, Enze Xie]\n- [Fast-dLLM v2：高效的块扩散LLM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.26328) [[笔记](paper_notes\u002Ffast_dllm_v2.md)] [Song Han, Enze Xie]\n- [Efficient-DLM：从自回归到扩散语言模型，并在速度上更进一步](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.14067) [[笔记](paper_notes\u002Fefficient_dlm.md)] [Pavlo, Song Han, Nvidia]\n- [TiDAR：用扩散思考，用自回归表达](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.08923) [[笔记](paper_notes\u002Ftidar.md)] [Nvidia, Pavlo]\n- [DFlash：用于闪电式推测解码的块扩散](https:\u002F\u002Fz-lab.ai\u002Fprojects\u002Fdflash\u002F)\n- [Dream 7B：扩散型大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.15487)\n- [块扩散：介于自回归与扩散语言模型之间](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09573) \u003Ckbd>ICLR 2025 口头报告\u003C\u002Fkbd>\n- [SD3：扩展修正流变换器以实现高分辨率图像合成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.03206) [Patrick Esser, Stable Diffusion]\n- [Wan：开放且先进的大规模视频生成模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.20314) [通义万相]\n- [RHO-1：并非所有token都是你需要的](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.07965) [[笔记](paper_notes\u002Frho1.md)] \u003Ckbd>NeurIPS 2024 口头报告\u003C\u002Fkbd>\n- [WOD-E2E：Waymo开放数据集，用于复杂长尾场景下的端到端驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.26125) [[笔记](paper_notes\u002Fwod_e2e.md)] [Waymo]\n- [LVP：大型视频规划器实现通用机器人控制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.15840) [[笔记](paper_notes\u002Flvp.md)]\n- [通过参数融合对视觉-语言-动作机器人策略进行稳健微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.08333)\n- [Epona：用于自动驾驶的自回归扩散世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.24113) \u003Ckbd>ICCV 2025\u003C\u002Fkbd> [Horizon]\n- [HaMeR：用Transformer重建三维手部](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.05251) [手部网格恢复，伯克利，单目重建MANO参数化]\n- [MegaSaM：从随意动态视频中准确、快速且稳健地提取结构与运动信息](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.04463) \u003Ckbd>CVPR 2025 最佳论文短名单\u003C\u002Fkbd> [SaM：结构与运动] \n\n\n## 2025-12 (0)\n- [DDPM：去噪扩散概率模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11239) [伯克利，Jonathan Ho，Pieter Abbeel]\n- [DDIM：去噪扩散隐式模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02502) \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [斯坦福，Stefano Ermon]\n- [score SDE：基于分数的生成建模，通过随机微分方程实现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.13456) \u003Ckbd>ICLR 2021 口头报告\u003C\u002Fkbd> [斯坦福，Yang Song，Stefano Ermon]\n- [改进的DDPM：改进的去噪扩散概率模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.09672) [OpenAI，Prafulla Dhariwal]\n- [分类器引导：扩散模型在图像生成方面超越GAN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.05233) [OpenAI，Prafulla Dhariwal]\n- [LDM：潜在扩散模型——使用潜在扩散模型实现高分辨率图像合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10752) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Stable Diffusion v1，Patrick Esser]\n- [CFG：无分类器扩散引导](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.12598) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Jonathan Ho，Google Brain]\n- [修正流：流畅而快速的流——学习用修正流生成和传输数据](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03003) [Xingchao Liu，UT Austin，比flow matching论文更早]\n- [用于生成建模的flow matching](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02747) [FAIR]\n- [DiT：基于Transformer的可扩展扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.09748) [Saining Xie]\n- 
[OT-CFM：利用小批量最优运输改进和推广基于flow的生成模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.00482) \u003Ckbd>TMLR 2023\u003C\u002Fkbd> [Alex Tong]\n- [一步式生成建模的平均流](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.13447) [Kaiming He]\n- [iMF：改进的平均流——关于快速生成模型的挑战](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.02012) [Kaiming He]\n\n## 2025-09 (2)\n- [Cosmos-Reason1：从物理常识到具身推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.15558) [[笔记](paper_notes\u002Fcosmos_reason1.md)] [Nvidia]\n- [CoVLA：面向自动驾驶的综合视觉-语言-动作数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.10845) [[笔记](paper_notes\u002Fcovla.md)] \u003Ckbd>WACV 2025\u003C\u002Fkbd> [约80小时的真实世界驾驶视频，配有语言和轨迹标注；最大的用于自动驾驶的VLA数据集]\n- [SimLingo：基于语言-动作对齐的纯视觉闭环自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.09594) \u003Ckbd>CVPR 2025\u003C\u002Fkbd> [2024年CARLA挑战赛第一名；在CARLA LB 2.0和Bench2Drive上达到SOTA水平；被AutoVLA引用]\n- [AutoVLA：一种用于端到端自动驾驶的视觉-语言-动作模型，具备自适应推理和强化学习微调能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.13757) \u003Ckbd>arXiv 2025-06\u003C\u002Fkbd> [轨迹分词；双思维模式（快速与慢速CoT）；GRPO微调；在nuPlan、nuScenes、Waymo、CARLA上进行评估；引用了SimLingo]\n- [DriveAgent-R1：通过混合思维和主动感知推进基于VLM的自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.20879) \u003Ckbd>arXiv 2025-07\u003C\u002Fkbd> [混合思维（文本与工具基CoT）+主动感知；三阶段强化学习训练；基于DriveVLM lineage构建]\n- [AgentThink：面向自动驾驶的视觉-语言模型中工具增强型思维链推理的统一框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15298) [Kangan Qian, Sicong Jiang, Xiaomi] \u003Ckbd>EMNLP25\u003C\u002Fkbd>\n- [VERDI：嵌入VLM的自动驾驶推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15925) \u003Ckbd>arXiv 2025-05\u003C\u002Fkbd> [将VLM推理蒸馏到模块化的AD栈中；对感知、预测、规划进行对齐；在不增加VLM推理成本的情况下提升nuScenes性能]\n- [Poutine：视觉-语言-轨迹预训练与强化后训练实现稳健的端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.11234) \u003Ckbd>arXiv 2025-06\u003C\u002Fkbd> [30亿参数的VLM，在83小时CoVLA和11小时Waymo长尾数据上训练；经过RL微调（GRPO）；在Waymo基于视觉的E2E驾驶挑战赛中获得第一名（RFS=7.99）]\n- [ReasonPlan：面向闭环自动驾驶的统一场景预测与决策推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.20024) \u003Ckbd>arXiv 2025-05\u003C\u002Fkbd> [思维链规划；在Bench2Drive上显著提升了闭环性能]\n- [DiffVLA：用于自动驾驶的视觉-语言引导扩散规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.19381) \u003Ckbd>arXiv 2025-05\u003C\u002Fkbd> [VLM引导的扩散轨迹规划；在2025年自动驾驶大挑战中表现优异]\n- [VLAD：一种VLM增强型自动驾驶框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.01284) \u003Ckbd>ITSC 2025\u003C\u002Fkbd> [VLM为E2E控制器生成高层指令；提升可解释性和规划安全性]\n- [DriveAction（基准测试）：DriveAction——探索VLA模型中类人驾驶决策的基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.24044) \u003Ckbd>arXiv 2025-06\u003C\u002Fkbd> [以行动为核心的评估，涵盖各类驾驶场景的问答对；被VLA4AD综述引用；正逐渐受到关注]\n- [DINOv3](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.10104) [高分辨率Dino]\n- [LiveVLM：通过面向流式传输的KV缓存和检索实现高效的在线视频理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.15269)\n- [DexUMI：将人类手作为灵巧操作的通用操控接口](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.21864) [灵巧手数据采集，Shuran Song，Jim Fan]\n- [DEXOP：一种用于机器人复制人类灵巧操作的装置](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.04441) [RSS 2025最佳论文奖]\n- [HAD数据集：为自动驾驶车辆提供人车交互建议的基础](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.06978) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [John Canny，本田研究院，2019年，VLA OG]\n- [RAD：通过大规模基于3DGS的强化学习训练端到端驾驶策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.13144) \u003Ckbd>NeurIPS 2025\u003C\u002Fkbd> [Horizon]\n- [PerceptionLM：用于精细视觉理解的开放获取数据与模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.13180) [FAIR]\n- [Difix3D+：利用单步扩散模型改进3D重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.01774) \u003Ckbd>CVPR 2025最佳论文候选\u003C\u002Fkbd> [Nvidia，Sanja Fidler]\n- [π∗0.6：一款从经验中学习的VLA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.14759) [VLA + RL]\n- 
[π𝚁𝙻：面向基于流的视觉-语言-动作模型的在线RL微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.25889) [VLA + RL]\n- [VLA-RL：通过可扩展的强化学习迈向精通且通用的机器人操作](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.18719) [VLA + RL]\n- [GR-RL：为长时程机器人操作实现灵巧与精准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.01801) [VLA + RL]\n- [GR-3技术报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.15493) [VLA，字节跳动]\n- [行动切块中的多时间尺度混合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2511.19433)\n\n## 2025-06 (1)\n- [V-JEPA 2: 自监督视频模型实现理解、预测与规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.09985) [LeCun]\n- [V-JEPA: 重新审视基于特征预测的视频视觉表征学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.08471) \u003Ckbd>ICLR 2025\u003C\u002Fkbd>\n- [I-JEPA: 基于联合嵌入预测架构的图像自监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243) \u003Ckbd>ICCV 2023\u003C\u002Fkbd>\n- [PlaNet: 从像素中学习用于规划的潜在动力学](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04551)\n- [DreamerV1: 梦想到控制：通过潜在想象学习行为](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603)\n- [DreamerV2: 使用离散世界模型掌握雅达利游戏](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02193) \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [DreamerV3: 通过世界模型掌握多样化领域](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104) \u003Ckbd>Nature 2025\u003C\u002Fkbd>\n- [DayDreamer: 用于物理机器人学习的世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [Dynalang: 学习用语言建模世界](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.01399) \u003Ckbd>ICML 2024\u003C\u002Fkbd>\n- [将世界分词为对象级知识，以应对自动驾驶中的长尾事件](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.00959) [[笔记](paper_notes\u002Ftoken_ad.md)] [Marco Pavone, Nvidia]\n- [SparseDrive: 基于稀疏场景表示的端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.19620) [[笔记](sparse_drive.md)] \u003Ckbd>ICRA 2025\u003C\u002Fkbd> [Horizon]\n- [HE-Drive: 基于视觉语言模型的人类式端到端驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.05051) \u003Ckbd>IROS 2025\u003C\u002Fkbd> [Horizon]\n- [GPT-Driver: 使用GPT学习驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01415) [NeurIPS 2023, Hang Zhao]\n- [使用LLM驾驶：融合对象级向量模态实现可解释的自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01957) [[笔记](paper_notes\u002Fdriving_with_llms.md)] \u003Ckbd>ICRA 2024\u003C\u002Fkbd> [Wayve]\n- [PARA-Drive: 用于实时自动驾驶的并行化架构](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fhtml\u002FWeng_PARA-Drive_Parallelized_Architecture_for_Real-time_Autonomous_Driving_CVPR_2024_paper.html) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Marco Pavone, NVidia]\n- [PDM-Closed: 摆脱关于基于学习的车辆运动规划的误解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07962) [[笔记](paper_notes\u002Fpdm_closed.md)] \u003Ckbd>CoRL 2023\u003C\u002Fkbd>\n- [Ego-MLP: 开环端到端自动驾驶是否只需车辆状态？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.03031) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [AD-MLP: 重新思考nuScenes数据集中端到端自动驾驶的开环评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10430) [Baidu]\n- [GAIA-2: 用于自动驾驶的可控多视角生成式世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.20523) [Wayve]\n- [摄像头作为相对位置编码](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.10496)\n\n## 2025-04\n- [Scenario Dreamer: 用于生成驾驶仿真环境的向量化潜在扩散模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22496) \u003Ckbd>CVPR 2025\u003C\u002Fkbd>\n- [Hi Robot: 基于分层视觉-语言-动作模型的开放式指令遵循](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.19417) [Physical Intelligence]\n- [利用人类反馈强化学习微调生成式轨迹模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.10434) [Li Auto, RLHF]\n- [TokenFLEX: 面向灵活视觉令牌推理的统一VLM训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.03154) [Li Auto]\n- [快中慢：一种结合快速操作与慢速推理的双系统基础模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.01953)\n- [STORM: 
用于大规模室外场景的时空重建模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.00602)\n\n## 2024-12 (0)\n- [VLM-AD：通过视觉-语言模型监督实现端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14446) [Cruise]\n- [GPD-1：面向驾驶的生成式预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.08643) [PhiGent]\n- [Transformer推理优化工具集](https:\u002F\u002Fastralord.github.io\u002Fposts\u002Ftransformer-inference-optimization-toolset\u002F)\n- [空间中的思考：多模态大语言模型如何感知、记忆和回忆空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14171) [李飞飞]\n- [探查视觉基础模型的三维感知能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.08636) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [iVideoGPT：交互式VideoGPT是可扩展的世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15223) \u003Ckbd>NeurIPS 2024\u003C\u002Fkbd>\n- [CarLLaVA：仅使用摄像头的闭环驾驶用视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10165) [Wayve]\n- [提示之痕：增强自动驾驶中多模态LLM的视觉表征](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.13076) [DeepRoute]\n- [LAW：利用潜在世界模型提升端到端自动驾驶性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.08481)\n- [TCP：轨迹引导的端到端自动驾驶控制预测——简单而强大的基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08129) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [E2E规划，Hongyang]\n- [越差越好：视觉分词中的压缩-生成权衡](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16326)\n- [RoGs：基于网格高斯的大规模道路表面重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.14342)\n- [RoMe：通过网格表示实现大规模道路表面重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11368)\n- [SLEDGE：利用生成模型与规则化交通合成驾驶环境](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.17933) \u003Ckbd>ECCV 2024\u003C\u002Fkbd>\n- [Lookahead：通过前瞻解码打破LLM推理的顺序依赖](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02057) [specdec]\n- [EAGLE：推测采样需要重新思考特征不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.15077) [specdec]\n- [EAGLE-2：利用动态草稿树加速语言模型推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.16858) [specdec]\n- [Medusa：具有多个解码头的简单LLM推理加速框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.10774)\n- [RealGen：用于可控交通场景的检索增强生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.13303) \u003Ckbd>ECCV 2024\u003C\u002Fkbd>\n- [MobileVLM V2：更快更强的视觉语言模型基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.03766)\n- [开源π0](https:\u002F\u002Fwww.physicalintelligence.company\u002Fblog\u002Fopenpi) [PI，工业界]\n- [Helix：用于通用人形机器人控制的视觉-语言-行动模型](https:\u002F\u002Fwww.figure.ai\u002Fnews\u002Fhelix) [Figure，工业界]\n- [AM-RADIO：凝聚型视觉基础模型——将所有领域归为单一模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.06709v5) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [Transfusion：用一个多模态模型预测下一个标记并扩散图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.11039)\n- [iVideoGPT：交互式VideoGPT是可扩展的世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.15223) \u003Ckbd>NeurIPS 2024\u003C\u002Fkbd>\n- [MetaMorph：通过指令微调实现多模态理解和生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14164)\n- [WORLDMEM：带有记忆的长期一致世界模拟](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.12369) [长期记忆]\n- [PADriver：迈向个性化自动驾驶](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2505.05240) [旷视科技，个性化驾驶]\n\n\n## 2024-11 (1)\n- [关于基础模型的机会与风险](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07258) [[笔记](paper_notes\u002Fopportunities_foundation_models.md)]\n- [π0：用于通用机器人控制的视觉-语言-行动流模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.24164) [Physical Intelligence，VLA]\n- [EMMA：面向自动驾驶的端到端多模态模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.23262) [Waymo，VLA]\n- [Depth Anything：释放大规模无标注数据的力量](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.10891) \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [Depth Anything V2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09414) \u003Ckbd>NeurIPS 2024\u003C\u002Fkbd>\n- 
[CarLLaVA：仅使用摄像头的闭环驾驶用视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10165)\n- [LVSM：具有最小3D归纳偏置的大规模视图合成模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17242) [场景分词]\n- [NAVSIM：数据驱动的非反应式自动驾驶车辆仿真与基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.15349) \u003Ckbd>NeurIPS 2024\u003C\u002Fkbd>\n- [借助大语言模型策略自适应实现全场景驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.05932) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Marco Pavone]\n- [一致性模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.01469) [扩散加速，OpenAI，杨松]\n- [VILA：关于视觉语言模型的预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.07533) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Song Han，Yao Lu]\n\n## 2024-06 (8)\n- [LINGO-1：探索用于自动驾驶的自然语言](https:\u002F\u002Fwayve.ai\u002Fthinking\u002Flingo-natural-language-autonomous-driving\u002F) [[笔记](paper_notes\u002Flingo_1.md)] [Wayve，开环世界模型]\n- [LINGO-2：用自然语言驾驶](https:\u002F\u002Fwayve.ai\u002Fthinking\u002Flingo-2-driving-with-language\u002F) [[笔记](paper_notes\u002Flingo_2.md)] [Wayve，闭环世界模型]\n- [OpenVLA：一个开源的视觉-语言-行动模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09246) [开源RT-2]\n- [告别基于学习的车辆运动规划的误解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.07962) \u003Ckbd>CoRL 2023\u003C\u002Fkbd> [简单的非学习基线]\n- [QuAD：基于查询的可解释神经网络自动驾驶运动规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.01486) [Waabi]\n- [MPDM：自动驾驶中动态不确定环境下的多策略决策](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F7139412) [[笔记](paper_notes\u002Fmpdm.md)] \u003Ckbd>ICRA 2015\u003C\u002Fkbd> [行为规划，密歇根大学，May Autonomy]\n- [MPDM2：基于变点的行为预测的自动驾驶多策略决策](https:\u002F\u002Fwww.roboticsproceedings.org\u002Frss11\u002Fp43.pdf) [[笔记](paper_notes\u002Fmpdm2.md)] \u003Ckbd>RSS 2015\u003C\u002Fkbd> [行为规划]\n- [MPDM3：基于变点行为预测的自动驾驶多策略决策：理论与实验](https:\u002F\u002Flink.springer.com\u002Farticle\u002F10.1007\u002Fs10514-017-9619-z) \u003Ckbd>RSS 2017\u003C\u002Fkbd> [行为规划]\n- [EUDM：使用引导分支的高效不确定性感知自动驾驶决策](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) [[笔记](paper_notes\u002Feudm.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [丁文超、沈劭杰，行为规划]\n- [TPP：基于学习行为模型的树状策略规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.11902) \u003Ckbd>ICRA 2023\u003C\u002Fkbd> [马可·帕沃内、英伟达，行为规划]\n- [MARC：自动驾驶中的多策略与风险敏感应急规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12021) [[笔记](paper_notes\u002Fmarc.md)] \u003Ckbd>RAL 2023\u003C\u002Fkbd> [沈劭杰，行为规划]\n- [EPSILON：高度交互环境中自动驾驶车辆的高效规划系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07993) \u003Ckbd>TRO 2021\u003C\u002Fkbd> [丁文超，pnc百科全书]\n- [trajdata：多个人类轨迹数据集的统一接口](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.13924) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd> [马可·帕沃内、英伟达]\n- [利用非线性优化进行静态障碍物规避的最优车辆轨迹规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09466) [小鹏]\n- [面向自动驾驶车辆的行为与轨迹联合可学习规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.04586) [[笔记](paper_notes\u002Fjoint_learned_bptp.md)] \u003Ckbd>IROS 2019 口头报告\u003C\u002Fkbd> [Uber ATG，行为规划、运动规划]\n- [通过潜在世界模型增强端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.08481)\n- [OccNeRF：在无激光雷达环境下推进3D占用预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.09243) [陆继文]\n- [RenderOcc：以视觉为中心的3D占用预测，辅以2D渲染监督](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.09502) \u003Ckbd>ICRA 2024\u003C\u002Fkbd>\n- [EmerNeRF：通过自监督实现涌现式时空场景分解](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.02077) [Sanja、Marco、NV]\n- [FB-OCC：基于前后视图变换的3D占用预测](https:\u002F\u002Fopendrivelab.com\u002Fe2ead\u002FAD23Challenge\u002FTrack_3_NVOCC.pdf?=&linkId=100000205404832)\n- [Trajeglish：将交通建模视为下一个标记预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.04535) \u003Ckbd>ICLR 2024\u003C\u002Fkbd>\n- 
[交叉路口的自动驾驶策略：场景、现状与未来展望](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.13052) \u003Ckbd>ITSC 2021\u003C\u002Fkbd>\n- [基于学习的在线变道意图预测方法](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F6629564\u002F) \u003Ckbd>IV 2013\u003C\u002Fkbd> [SVM，LC意图预测]\n- [复杂城市场景下的基于交通流的众包地图](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10171417) \u003Ckbd>RAL 2023\u003C\u002Fkbd> [丁文超、华为，众包地图]\n- [FlowMap：利用交通流为开放空间中的自动驾驶车辆生成路径](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01622) \u003Ckbd>ICRA 2023\u003C\u002Fkbd>\n- [混合A*：未知半结构化环境中自动驾驶车辆的路径规划](https:\u002F\u002Fwww.semanticscholar.org\u002Fpaper\u002FPath-Planning-for-Autonomous-Vehicles-in-Unknown-Dolgov-Thrun\u002F0e8c927d9c2c46b87816a0f8b7b8b17ed1263e9c) \u003Ckbd>IJRR 2010\u003C\u002Fkbd> [Dolgov、Thrun，搜索]\n- [弗雷内坐标系下动态街道场景的最优轨迹生成](https:\u002F\u002Fwww.semanticscholar.org\u002Fpaper\u002FOptimal-trajectory-generation-for-dynamic-street-in-Werling-Ziegler\u002F6bda8fc13bda8cffb3bb426a73ce5c12cc0a1760) \u003Ckbd>ICRA 2010\u003C\u002Fkbd> [Werling、Thrun，采样] [规划人员必读]\n- [不依赖弗雷内坐标系的弯道自动驾驶：一种基于笛卡尔坐标的轨迹规划方法](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9703250) \u003Ckbd>TITS 2022\u003C\u002Fkbd>\n- [百度Apollo EM运动规划器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.08048) [[笔记](paper_notes\u002Fapollo_em_planner.md)][优化]\n- [基于改进混合A*的智能汽车时空联合规划方法](https:\u002F\u002Fwww.qichegongcheng.com\u002FCN\u002Fabstract\u002Fabstract1500.shtml) \u003Ckbd>汽车工程：规划&决策2023年\u003C\u002Fkbd> [联合优化，搜索]\n- [提升受限动态环境中自动驾驶车辆的时空轨迹规划速度与平滑度](https:\u002F\u002Fjournals.sagepub.com\u002Fdoi\u002Fabs\u002F10.1177\u002F0954407020906627) \u003Ckbd>JAE 2020\u003C\u002Fkbd> [联合优化，搜索]\n- [面向自动驾驶道路行驶的聚焦轨迹规划](https:\u002F\u002Fwww.ri.cmu.edu\u002Fpub_files\u002F2013\u002F6\u002FIV2013-Tianyu.pdf) \u003Ckbd>IV 2013\u003C\u002Fkbd> [联合优化，迭代]\n- [SSC：利用时空语义走廊为复杂城市环境生成安全轨迹](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.09788) \u003Ckbd>RAL 2019\u003C\u002Fkbd> [联合优化，SSC，丁文超，运动规划]\n- [AlphaGo：借助深度神经网络和树搜索掌握围棋](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fnature16961) [[笔记](paper_notes\u002Falphago.md)] \u003Ckbd>Nature 2016\u003C\u002Fkbd> [DeepMind，MCTS]\n- [AlphaZero：一种通用强化学习算法，通过自我对弈掌握国际象棋、将棋和围棋](https:\u002F\u002Fwww.science.org\u002Fdoi\u002Ffull\u002F10.1126\u002Fscience.aar6404) \u003Ckbd>Science 2017\u003C\u002Fkbd> [DeepMind]\n- [MuZero：通过基于学习模型的规划掌握Atari游戏、围棋、国际象棋和将棋](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-020-03051-4) \u003Ckbd>Nature 2020\u003C\u002Fkbd> [DeepMind]\n- [无需搜索的特级大师级国际象棋](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.04494) [DeepMind]\n- [面向自动驾驶的安全多智能体强化学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.03295) [Mobileye，欲望与轨迹优化]\n- [全面的反应式安全：只要有策略就不需要轨迹](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.00198) \u003Ckbd>IROS 2022\u003C\u002Fkbd> [大放，Qcraft]\n- [BEVGPT：用于自动驾驶预测、决策和规划的生成式预训练大型模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.10357) \u003Ckbd>AAAI 2024\u003C\u002Fkbd>\n- [LLM-MCTS：大型语言模型作为大规模任务规划的常识知识](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14078) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd>\n- [HiVT：用于多智能体运动预测的层次化向量Transformer](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022\u002Fpapers\u002FZhou_HiVT_Hierarchical_Vector_Transformer_for_Multi-Agent_Motion_Prediction_CVPR_2022_paper.pdf) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [周子康，以智能体为中心，运动预测]\n- [QCNet：以查询为中心的轨迹预测](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FZhou_Query-Centric_Trajectory_Prediction_CVPR_2023_paper.pdf) [[笔记](paper_notes\u002Fqcnet.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> 
[周子康，以场景为中心，运动预测]\n\n## 2024-03 (11)\n- [Genie: 生成式交互环境](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.15391) [[笔记](paper_notes\u002Fgenie.md)] [DeepMind, 世界模型]\n- [DriveDreamer: 面向自动驾驶的真实世界驱动型世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.09777) [[笔记](paper_notes\u002Fdrive_dreamer.md)] [陆继文, 世界模型]\n- [WorldDreamer: 通过预测掩码标记实现视频生成的通用世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09985) [[笔记](paper_notes\u002Fworld_dreamer.md)] [陆继文, 世界模型]\n- [VideoPoet: 用于零样本视频生成的大语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14125) [类似Sora，但使用LLM，而非世界模型]\n- [对齐你的潜在空间：基于潜在扩散模型的高分辨率视频合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08818) [[笔记](paper_notes\u002Fvideo_ldm.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Sanja, Nvidia, VideoLDM, 视频预测]\n- [视频预训练（VPT）：通过观看无标签在线视频学习行动](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.11795) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [[笔记](paper_notes\u002Fvpt.md)] [OpenAI]\n- [MineDojo: 构建具有互联网规模知识的开放式具身智能体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08853) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [NVidia, 杰出论文奖]\n- [类人机器人运动作为下一个标记预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.19469) [[笔记](paper_notes\u002Flocomotion_next_token_pred.md)] [伯克利, EAI]\n- [RPT: 基于感觉运动预训练的机器人学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.10007) [[笔记](paper_notes\u002Frpt.md)] \u003Ckbd>CoRL 2023 口头报告\u003C\u002Fkbd> [伯克利, EAI]\n- [MVP: 基于掩码视觉预训练的真实世界机器人学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.03109) [[笔记](paper_notes\u002Fmvp.md)] \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [伯克利, EAI]\n- [BC-Z: 基于机器人模仿学习的零样本任务泛化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.02005) [[笔记](paper_notes\u002Fbc_z.md)] \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [Eric Jang, 1X]\n- [GenAD: 自动驾驶的通用预测模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.09630) [[笔记](paper_notes\u002Fgenad.md)] \u003Ckbd>CVPR 2024\u003C\u002Fkbd>\n- [HG-DAgger: 带有人类专家的交互式模仿学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02890) [DAgger]\n- [DriveGAN: 向可控高质量神经仿真迈进](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.15060) [[笔记](paper_notes\u002Fdrive_gan.md)] \u003Ckbd>CVPR 2021 口头报告\u003C\u002Fkbd> [Nvidia, Sanja]\n- [VideoGPT: 使用VQ-VAE和Transformer进行视频生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10157) [[笔记](paper_notes\u002Fvideogpt.md)] [Pieter Abbeel]\n- [LLM、视觉分词器与视觉智能，由江璐撰写](https:\u002F\u002Fmp.weixin.qq.com\u002Fs\u002FHamz5XMT1tSZHKdPaCBTKg) [[笔记](paper_notes\u002Fllm_vision_intel.md)] [采访江璐]\n- [AV2.0: 重新构想自动驾驶汽车](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.05805) [[笔记](paper_notes\u002Fav20.md)] [Wayve, Alex Kendall]\n- [端到端自动驾驶的仿真](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8fivoXbT1Ao&ab_channel=Wayve) [Wayve, 技术分享, E2E]\n- [E2E横向规划](https:\u002F\u002Fblog.comma.ai\u002Fend-to-end-lateral-planning\u002F) [Comma.ai, E2E规划]\n- [在视觉表征学习中学习和利用世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.00504) [LeCun, JEPA系列]\n- [LVM: 序列建模赋能大型视觉模型的可扩展学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.00785) [大型视觉模型, Jitendra Malik]\n- [LWM: 基于RingAttention的百万级视频与语言世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.08268) [Pieter Abbeel]\n- [OccWorld: 学习用于自动驾驶的3D占用世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16038) [陆继文, 世界模型]\n- [GenAD: 生成式端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.11502)\n- [Transfuser: 用于端到端自动驾驶的多模态融合Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09224) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [E2E规划, Geiger]\n- [使用LLM驾驶：融合对象级向量模态以实现可解释的自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01957) [Wayve, LLM + AD]\n- [LingoQA: 
用于自动驾驶的视频问答](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14115) [Wayve, LLM + AD]\n- [Panacea: 用于自动驾驶的全景且可控视频生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16813) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Megvii]\n- [PlanT: 基于对象级表示的可解释规划Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.14222) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [场景即占用](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.02851) \u003Ckbd>ICCV 2023\u003C\u002Fkbd>\n- [从模型到复合AI系统的转变](https:\u002F\u002Fbair.berkeley.edu\u002Fblog\u002F2024\u002F02\u002F18\u002Fcompound-ai-systems\u002F)\n- [Roach: 通过模仿强化学习教练实现端到端城市驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.08265) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [从所有车辆中学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11934) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [LBC: 通过作弊学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.12294) \u003Ckbd>CoRL 2019\u003C\u002Fkbd>\n- [从轨道上的世界中学习驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00636) \u003Ckbd>ICCV 2021 口头报告\u003C\u002Fkbd> [Philipp Krähenbühl]\n- [从所有车辆中学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11934) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Philipp Krähenbühl]\n- [VADv2: 基于概率规划的端到端矢量化自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.13243) [Horizon]\n- [VQ-VAE: 神经离散表征学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.00937) \u003Ckbd>NeurIPS 2017\u003C\u002Fkbd> [图像分词器]\n- [VQ-GAN: 为高分辨率图像合成驯服Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09841) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [图像分词器]\n- [ViT-VQGAN: 改进的VQGAN用于向量化图像建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04627) \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [图像分词器]\n- [MaskGIT: 掩码生成式图像Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.04200) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [LLM，非自回归]\n- [MAGVIT: 掩码生成式视频Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.05199) \u003Ckbd>CVPR 2023亮点\u003C\u002Fkbd> [视频分词器]\n- [MAGVIT-v2: 语言模型胜过扩散——分词器是视觉生成的关键](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.05737) \u003Ckbd>ICLR 2024\u003C\u002Fkbd> [视频分词器]\n- [Sora: 关于大型视觉模型的背景、技术、局限性及机遇的综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.17177) [Sora的逆向工程]\n- [GLaM: 基于专家混合的高效语言模型扩展](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.06905) \u003Ckbd>ICML 2022\u003C\u002Fkbd> [MoE, LLM]\n- [基于分布特化的专家进行终身语言预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.12281) \u003Ckbd>ICML 2023\u003C\u002Fkbd> [MoE, LLM]\n- [DriveLM: 用语言驱动](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.14150) [李洪洋]\n- [MotionLM: 多智能体运动预测作为语言建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16534) \u003Ckbd>ICCV 2023\u003C\u002Fkbd> [Waymo, LLM + AD]\n- CubeLLM: 将2D\u002F3D与语言对齐\n- EmerNeRF: ICLR 2024\n- 用于自动驾驶的语言代理\n- [迈向驾驶场景理解：用于学习驾驶员行为与因果关系的数据集]\n- [DriveDreamer-2: 增强LLM的世界模型，用于多样化驾驶视频生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.06845)\n- [DriveWorld: 基于世界模型的4D预训练场景理解，用于自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04390) \u003Ckbd>CVPR 2024\u003C\u002Fkbd> [Zheng Zhu]\n- [Sora是世界模拟器吗？关于通用世界模型及更广泛领域的全面综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.03520) [Zheng Zhu]\n\n## 2024-02 (7)\n- [端到端自动驾驶：挑战与前沿](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16927) [[笔记](paper_notes\u002Fe2e_review_hongyang.md)] [Hongyang Li, 上海人工智能实验室]\n- [DriveVLM: 自动驾驶与大型视觉-语言模型的融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12289) [[笔记](paper_notes\u002Fdrivevlm.md)] [Hang Zhao]\n- [DriveGPT4: 基于大语言模型的可解释端到端自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01412) [[笔记](paper_notes\u002Fdrivegpt4.md)] [香港大学]\n- [GAIA-1: 
用于自动驾驶的生成式世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17080) [[笔记](paper_notes\u002Fgaia_1.md)] [Wayve, 视觉基础模型]\n- [ADriver-I: 通用自动驾驶世界模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.13549) [[笔记](paper_notes\u002Fadriver_i.md)] [旷视科技, Xiangyu]\n- [Drive-WM: 驾驶未来：基于世界模型的多视角视觉预测与规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.17918) [[笔记](paper_notes\u002Fdrive_wm.md)]\n- [X]() [[笔记](paper_notes\u002Fx.md)] [E2E规划]\n\n\n## 2023-12 (4)\n- [ChatGPT用于机器人：设计原则与模型能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.17582) [[笔记](paper_notes\u002Fprompt_craft.md)] [微软，机器人LLM]\n- [RoboVQA: 机器人领域的多模态长时序推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.00899) [[笔记](paper_notes\u002Frobovqa.md)] [谷歌DeepMind，机器人LLM]\n- [ChatGPT赋能多种环境下的长步长机器人控制：案例应用](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10235949) [微软机器人]\n- [GPT-4V(ision)用于机器人：基于人类演示的多模态任务规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.12015) [[笔记](paper_notes\u002Fgpt4v_robotics.md)] [机器人LLM，微软机器人]\n- [LLM-Brain: LLM作为机器人大脑：统一自我中心记忆与控制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.09349) [[笔记](paper_notes\u002Fllm_brain.md)]\n- [Voyager: 基于大语言模型的开放式具身智能体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16291) [[笔记](paper_notes\u002Fvoyager.md)] [Reasoning Critique, Linxi Jim Fan]\n\n## 2023-09 (3)\n- [RetNet: 持久化网络：大语言模型中Transformer的继任者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.08621) [[笔记](paper_notes\u002Fretnet.md)] [MSRA]\n- [Transformers are RNNs: 具有线性注意力的快速自回归Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16236) [[笔记](paper_notes\u002Ftransformers_are_rnns.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [线性注意力]\n- [AFT: 无注意力Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) [[笔记](paper_notes\u002Faft.md)] [Apple]\n\n\n## 2023-08 (3)\n- [RT-1: 用于大规模真实世界控制的机器人Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.06817) [[笔记](paper_notes\u002Frt1.md)] [DeepMind]\n- [RT-2: 视觉-语言-动作模型将网络知识迁移到机器人控制中](https:\u002F\u002Frobotics-transformer2.github.io\u002Fassets\u002Frt2.pdf) [[笔记](paper_notes\u002Frt2.md)] [DeepMind，端到端视觉运动]\n- [RWKV: 为Transformer时代重塑RNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13048) [[笔记](paper_notes\u002Frwkv.md)]\n\n## 2023-07 (6)\n- [MILE: 基于模型的模仿学习用于城市驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.07729) [[笔记](paper_notes\u002Fmile.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Alex Kendall]\n- [PaLM-E: 具身多模态语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03378) [[笔记](paper_notes\u002Fpalm_e.md)] [谷歌机器人]\n- [VoxPoser: 基于语言模型的可组合3D价值地图用于机器人操作](https:\u002F\u002Fvoxposer.github.io\u002Fvoxposer.pdf) [[笔记](paper_notes\u002Fvoxposer.md)] [Feifei Li]\n- [CaP: 代码即策略：用于具身控制的语言模型程序](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.07753) [[笔记](paper_notes\u002Fcap.md)] [[项目](https:\u002F\u002Fcode-as-policies.github.io\u002F)]\n- [ProgPrompt: 使用大语言模型生成情境化的机器人任务计划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302) \u003Ckbd>ICRA 2023\u003C\u002Fkbd>\n- [TidyBot: 基于大语言模型的个性化机器人助手](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05658) [[笔记](paper_notes\u002Ftidybot.md)] [[项目](https:\u002F\u002Ftidybot.cs.princeton.edu\u002F)]\n- [SayCan: 做我能做到的，而不是我说的：将语言 grounding 到机器人的 affordances 中](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.01691) [[笔记](paper_notes\u002Fsaycan.md)] [[项目](https:\u002F\u002Fsay-can.github.io\u002F)]\n\n\n## 2023-06 (5)\n- [上海人工智能实验室的端到端综述](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FEnd-to-end-Autonomous-Driving)\n- [Pix2seq v2: 视觉任务的统一序列接口](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07669) 
[[笔记](paper_notes\u002Fpix2seq_v2.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Geoffrey Hinton]\n- 🦩 [Flamingo: 用于少样本学习的视觉语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.14198) [[笔记](paper_notes\u002Fflamingo.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [DeepMind]\n- 😼 [Gato: 通用智能体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.06175) [[笔记](paper_notes\u002Fgato.md)] \u003Ckbd>TMLR 2022\u003C\u002Fkbd> [DeepMind]\n- [BC-SAC: 模仿还不够：通过强化学习增强模仿，以应对复杂的驾驶场景](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11419) [[笔记](paper_notes\u002Fbc_sac.md)] \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [Waymo]\n- [MGAIL-AD: 用于自动驾驶规划的分层基于模型的模仿学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09539) [[笔记](paper_notes\u002Fmgail_ad.md)] \u003Ckbd>IROS 2022\u003C\u002Fkbd> [Waymo]\n\n\n\n## 2023-05 (7)\n- [SurroundOcc: 多摄像头3D占用预测用于自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.09551) [[笔记](paper_notes\u002Fsurroundocc.md)] [Occupancy Network, Wei Yi, Jiwen Lu]\n- [Occ3D: 自动驾驶领域的大规模3D占用预测基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14365) [[笔记](paper_notes\u002Focc3d.md)] [Occupancy Network, Zhao Hang]\n- [Occupancy Networks: 在函数空间中学习3D重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03828) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [[笔记](paper_notes\u002Foccupancy_networks.md)] [Andreas Geiger]\n- [OccFormer: 双路径Transformer用于基于视觉的3D语义占用预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05316) [Occupancy Network, PhiGent]\n- [Pix2seq: 用于目标检测的语言建模框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.10852) [[笔记](paper_notes\u002Fpix2seq.md)] \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [Geoffrey Hinton]\n- [VisionLLM: 大语言模型也是面向视觉任务的开放式解码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11175) [[笔记](paper_notes\u002Fvision_llm.md)] [Jifeng Dai]\n- [HuggingGPT: 使用ChatGPT及其在Hugging Face中的伙伴解决AI任务](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17580) [[笔记](paper_notes\u002Fhugging_gpt.md)]\n\n\n## 2023-04 (1)\n- [UniAD: 以规划为导向的自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.10156) [[笔记](paper_notes\u002Funiad.md)] \u003Ckbd>CVPR 2023最佳论文\u003C\u002Fkbd> [BEV, e2e, Hongyang Li]\n\n## 2023-03 (5)\n- [GPT-4 技术报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.08774) [[笔记](paper_notes\u002Fgpt4.md)] [OpenAI, GPT]\n- [OpenOccupancy: 周围语义占用感知的大规模基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03991) [[笔记](paper_notes\u002Fopenoccupancy.md)] [Occupancy Network, Lu Jiwen]\n- [VoxFormer: 基于相机的 3D 语义场景补全的稀疏体素 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.12251) [[笔记](paper_notes\u002Fvoxformer.md)] \u003Ckbd>CVPR 2023 亮点\u003C\u002Fkbd> [Occupancy Network, Nvidia]\n- [MonoScene: 单目 3D 语义场景补全](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.00726) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [[笔记](paper_notes\u002Fmonoscene.md)] [Occupancy Network, 单目摄像头]\n- [CoReNet: 从单张 RGB 图像中进行连贯的 3D 场景重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12989) [[笔记](paper_notes\u002Fcorenet.md)] \u003Ckbd>ECCV 2020 口头报告\u003C\u002Fkbd>\n\n\n## 2023-02 (4)\n- [我们会耗尽数据吗？机器学习中数据集扩展极限分析](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.04325) [[笔记](paper_notes\u002Fout_of_data.md)] [Epoch.ai 行业报告]\n- [Codex: 针对代码训练的大型语言模型评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03374) [[笔记](paper_notes\u002Fcodex.md)] [GPT, OpenAI]\n- [InstructGPT: 通过人类反馈训练语言模型遵循指令](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.02155) [[笔记](paper_notes\u002Finstructgpt.md)] [GPT, OpenAI]\n- [TPVFormer: 基于视觉的 3D 语义占用预测的三视角视图](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.07817) [[笔记](paper_notes\u002Ftpvformer.md)] \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Occupancy 
Network, Lu Jiwen]\n\n\n## 2023-01 (2)\n- [PPGeo: 通过自监督几何建模进行端到端自动驾驶策略预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.01006) [[笔记](paper_notes\u002Fppgeo.md)] \u003Ckbd>ICLR 2023\u003C\u002Fkbd>\n- [nuPlan: 面向自动驾驶车辆的闭环 ML 导航规划基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11810) [[笔记](paper_notes\u002Fnuplan.md)]\n\n\n\n## 2022-11 (1)\n- [M2I: 从分解的边际轨迹预测到交互式预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.11884) [[笔记](paper_notes\u002Fm2i.md)] \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n\n\n## 2022-10 (1)\n- [深入探讨鸟瞰图感知的难点：综述、评估与实践指南](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05324) [[笔记](paper_notes\u002Fdelving_bev.md)] [PJLab]\n\n## 2022-09 (3)\n- [ViP3D: 通过 3D 代理查询实现端到端视觉轨迹预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01582) [[笔记](paper_notes\u002Fvip3d.md)] [BEV, 感知+预测, Zhao Hang]\n- [MapTR: 面向在线矢量化高清地图构建的结构化建模与学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.14437) [[笔记](paper_notes\u002Fmaptr.md)] [Horizon, BEVNet]\n- [StopNet: 面向城市自动驾驶的可扩展轨迹与占用预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.00991) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [MOTR: 基于 Transformer 的端到端多目标跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.03247) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [Megvii, 多目标跟踪]\n- [Anchor DETR: 基于 Transformer 的目标检测查询设计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07107) [[笔记](paper_notes\u002Fanchor_detr.md)] \u003Ckbd>AAAI 2022\u003C\u002Fkbd> [Megvii]\n\n\n## 2022-08 (1)\n- [HOME: 用于未来运动估计的热图输出](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.10968) [[笔记](paper_notes\u002Fhome.md)] \u003Ckbd>ITSC 2021\u003C\u002Fkbd> [行为预测, 华为巴黎]\n\n## 2022-07 (8)\n- [PersFormer: 基于透视 Transformer 和 OpenLane 基准的 3D 车道线检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.11089) [[笔记](paper_notes\u002Fpersformer.md)] [BEVNet, 车道线]\n- [VectorMapNet: 端到端矢量化高清地图学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.08920) [[笔记](paper_notes\u002Fvectormapnet.md)] [BEVNet, LLD, Zhao Hang]\n- [PETR: 用于多视角 3D 目标检测的位置嵌入变换](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05625) [[笔记](paper_notes\u002Fpetr.md)] \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [BEVNet]\n- [PETRv2: 一种统一的多摄像头图像 3D 感知框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01256) [[笔记](paper_notes\u002Fpetrv2.md)] [BEVNet, MegVii]\n- [M^2BEV: 多摄像头联合 3D 目标检测与分割，采用统一的鸟瞰图表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.05088) [[笔记](paper_notes\u002Fm2bev.md)] [BEVNet, Nvidia]\n- [BEVDepth: 用于多视角 3D 目标检测的可靠深度获取](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.10092) [[笔记](paper_notes\u002Fbevdepth.md)] [BEVNet, NuScenes SOTA, MegVii]\n- [CVT: 用于实时地图视图语义分割的跨视角 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.02833) [[笔记](paper_notes\u002Fcvt.md)] \u003Ckbd>CVPR 2022 口头报告\u003C\u002Fkbd> [UTAustin, Philipp]\n- [Wayformer: 基于简单高效的注意力网络进行运动预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.05844) [[笔记](paper_notes\u002Fwayformer.md)] [行为预测, Waymo]\n\n## 2022-06 (3)\n- [BEVDet4D: 在多摄像头 3D 目标检测中利用时间线索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17054) [[笔记](paper_notes\u002Fbevdet4d.md)] [BEVNet]\n- [BEVerse: 面向以视觉为中心的自动驾驶的鸟瞰图统一感知与预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09743) [[笔记](paper_notes\u002Fbeverse.md)] [Lu Jiwen, BEVNet, 感知+预测]\n- [BEVFusion: 具有统一鸟瞰图表示的多任务多传感器融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.13542) [[笔记](paper_notes\u002Fbevfusion.md)] [BEVNet, Song Han]\n\n## 2022-03 (1)\n- [BEVFormer: 通过时空 Transformer 从多摄像头图像中学习鸟瞰图表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.17270) [[笔记](paper_notes\u002Fbevformer.md)] \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [BEVNet, Li Hongyang, Dai Jifeng]\n\n## 2022-02 
(1)\n- [TNT: 目标驱动的轨迹预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08294) [[笔记](paper_notes\u002Ftnt.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [预测, Waymo, Zhao Hang]\n- [DenseTNT: 基于密集目标集合的端到端轨迹预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.09640) [[笔记](paper_notes\u002Fdense_tnt.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [预测, Waymo, WOMD 第一名]\n\n## 2022-01 (1)\n- [Manydepth: 时间机会主义者：自监督多帧单目深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.14540) [[笔记](paper_notes\u002Fmanydepth.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [单目深度, Niantic]\n- [DEKR: 基于解耦关键点回归的自下而上人体姿态估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.02300) [[笔记](paper_notes\u002Fdekr.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n\n## 2021年12月 (5)\n- [BN-FFN-BN：为视觉Transformer利用批归一化](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021W\u002FNeurArch\u002Fpapers\u002FYao_Leveraging_Batch_Normalization_for_Vision_Transformers_ICCVW_2021_paper.pdf) [[笔记](paper_notes\u002Fbn_ffn_bn.md)] \u003Ckbd>ICCVW 2021\u003C\u002Fkbd> [BN, Transformer]\n- [PowerNorm：重新思考Transformer中的批归一化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07845) [[笔记](paper_notes\u002Fpowernorm.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [BN, Transformer]\n- [MultiPath++：用于行为预测的高效信息融合与轨迹聚合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.14973) [[笔记](paper_notes\u002Fmultipath++.md)] \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Waymo, 行为预测]\n- [BEVDet：鸟瞰视角下的高性能多摄像头3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.11790) [[笔记](paper_notes\u002Fbevdet.md)]\n- [将图像转换为地图](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.00966) [[笔记](paper_notes\u002Ftranslating_images_to_maps.md)] \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [BEVNet, Transformer]\n\n## 2021年11月 (4)\n- [DETR3D：通过3D到2D查询从多视角图像进行3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.06922) [[笔记](paper_notes\u002Fdetr3d.md)] \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [BEVNet, Transformer]\n- [Robust-CVD：鲁棒一致的视频深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05901) \u003Ckbd>CVPR 2021口头报告\u003C\u002Fkbd> [[官网](https:\u002F\u002Frobust-cvd.github.io\u002F)]\n- [MAE：掩码自编码器是可扩展的视觉学习者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.06377) [[笔记](paper_notes\u002Fmae.md)] [何恺明, 无监督学习]\n- [SimMIM：一种简单的掩码图像建模框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09886) [[笔记](paper_notes\u002Fsimmim.md)] [MSRA, 无监督学习, MAE]\n- [iBOT：使用在线分词器进行图像BERT预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07832)\n\n## 2021年10月 (3)\n- [STSU：基于车载图像的结构化鸟瞰交通场景理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.01997) [[笔记](paper_notes\u002Fstsu.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEV特征拼接, Luc Van Gool]\n- [PanopticBEV：使用单目前视图像进行鸟瞰全景分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03227) [[笔记](paper_notes\u002Fpanoptic_bev.md)] \u003Ckbd>RAL 2022\u003C\u002Fkbd> [BEVNet, 垂直\u002F水平特征]\n- [NEAT：用于端到端自动驾驶的神经注意力场](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04456) [[笔记](paper_notes\u002Fneat.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [[补充材料](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FChitta2021ICCV_supplementary.pdf)] [BEVNet]\n\n\n## 2021年9月 (11)\n- [DD3D：单目3D目标检测是否需要伪激光雷达？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06417) [[笔记](paper_notes\u002Fdd3d.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D, 丰田]\n- [EfficientDet：可扩展且高效的物体检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09070) [[笔记](paper_notes\u002Fefficientdet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [BiFPN, 特斯拉AI日]\n- [PnPNet：闭环跟踪下的端到端感知与预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.14711) [[笔记](paper_notes\u002Fpnpnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Uber ATG]\n- 
[MP3：一个统一的模型，用于地图构建、感知、预测和规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06806) [[笔记](paper_notes\u002Fmp3.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Uber, 规划]\n- [BEV-Net：通过联合人员定位和几何推理评估社交距离遵守情况](http:\u002F\u002Farxiv.org\u002Fabs\u002F2110.04931) [[笔记](paper_notes\u002Fbevnet_sdca.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEVNet, 监控]\n- [LiDAR R-CNN：一种高效通用的3D目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.15297) [[笔记](paper_notes\u002Flidar_rcnn.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TuSimple, 王乃岩]\n- [自动驾驶中视觉感知的边缘案例：关于检测方法的一些指导](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.05897) [[笔记](paper_notes\u002Fcorner_case_vision_arxiv.md)] [边缘案例]\n- [自动驾驶中视觉感知边缘案例的系统化研究](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9304789) [[笔记](paper_notes\u002Fcorner_case_vision_iv.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [边缘案例]\n- [高度自动化驾驶中感知的边缘案例的应用驱动概念化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03678) [[笔记](paper_notes\u002Fcorner_case_multisensor.md)] \u003Ckbd>IV 2021\u003C\u002Fkbd> [边缘案例]\n- [PYVA：专注地投射你的视野：通过跨视图变换进行单目道路场景布局估计](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fhtml\u002FYang_Projecting_Your_View_Attentively_Monocular_Road_Scene_Layout_Estimation_via_CVPR_2021_paper.html) [[笔记](paper_notes\u002Fpyva.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [[补充材料](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fsupplemental\u002FYang_Projecting_Your_View_CVPR_2021_supplemental.zip)] [BEVNet]\n- [YOLOF：你只需查看单一层次的特征](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.09460) [[笔记](paper_notes\u002Fyolof.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [旷视科技]\n- [感知人类：从单目3D定位到社交距离](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.00984) [[笔记](paper_notes\u002Fperceiving_humans.md)] \u003Ckbd>TITS 2021\u003C\u002Fkbd> [monoloco++]\n- [PifPaf：用于人体姿态估计的复合场](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06593) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [使用单目前视图像进行鸟瞰全景分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.03227) [BEVNet]\n- [TransformerFusion：使用Transformer进行单目RGB场景重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191)\n- [专注地投射你的视野：通过跨视图变换进行单目道路场景布局估计](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FYang_Projecting_Your_View_Attentively_Monocular_Road_Scene_Layout_Estimation_via_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [用于端到端自动驾驶的多模态融合Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09224) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [用于快速训练收敛的条件DETR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152)\n- [概率与几何深度：在透视中检测物体](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14160) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n\n## 2021年8月 (11)\n- [EgoNet: 探索用于单目车辆位姿估计的中间表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.08464) [[笔记](paper_notes\u002Fegonet.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [MonoEF: 单目3D目标检测：一种无需外参的方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15796) [[笔记](paper_notes\u002Fmonoef.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [GAC: 地面感知的自动驾驶单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.00690) [[笔记](paper_notes\u002Fgac.md)] \u003Ckbd>RAL 2021\u003C\u002Fkbd> [mono3D]\n- [FCOS3D: 全卷积单阶段单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10956) [[笔记](paper_notes\u002Ffcos3d.md)] \u003Ckbd>ICCVW 2021\u003C\u002Fkbd> [mono3D, 感知技术]\n- [GUPNet: 用于单目3D目标检测的几何不确定性投影网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13774) [[笔记](paper_notes\u002Fgupnet.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D, Wanli Ouyang]\n- [DARTS: 
可微架构搜索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.09055) [[笔记](paper_notes\u002Fdarts.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd> [VGG作者]\n- [FBNet: 基于可微神经架构搜索的硬件感知高效卷积网络设计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03443) [[笔记](paper_notes\u002Ffbnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [DARTS]\n- [FBNetV2: 针对空间和通道维度的可微神经架构搜索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05565) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FBNetV3: 使用预测器预训练的联合架构-配方搜索](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.02049) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [Perceiver: 基于迭代注意力的通用感知模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.03206) [[笔记](paper_notes\u002Fperceiver.md)] \u003Ckbd>ICML 2021\u003C\u002Fkbd> [transformers, 多模态]\n- [Perceiver IO: 一种用于结构化输入与输出的通用架构](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14795) [[笔记](paper_notes\u002Fperceiver_io.md)]\n- [PillarMotion: 自主驾驶中的自监督柱状运动学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08683) [[笔记](paper_notes\u002Fpillar_motion.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Qcraft, Alan Yuille]\n- [SimTrack: 探索用于自主驾驶的简单3D多目标跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.10312) [[笔记](paper_notes\u002Fsimtrack.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [QCraft, Alan Yuille]\n\n\n## 2021年7月 (1)\n- [HDMapNet: 在线高清地图构建与评估框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.06307) [[笔记](paper_notes\u002Fhdmapnet.md)] \u003Ckbd>CVPR 2021研讨会\u003C\u002Fkbd> [仅YouTube视频，理想汽车]\n\n\n## 2021年6月 (2)\n- [FIERY: 基于环绕单目相机的鸟瞰图未来实例预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10490) [[笔记](paper_notes\u002Ffiery.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [BEVNet, 感知+预测]\n- [百度的CNN分割](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F35034215) [[笔记](paper_notes\u002Fcnn_seg.md)]\n\n## 2021年4月 (5)\n- [重新思考自下而上的人体姿态估计中的热图回归](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15175) [[笔记](paper_notes\u002Fswahr.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [旷视科技] \n- [CrowdPose: 高效拥挤场景姿态估计及新基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.00324) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [目标检测中被忽视的大象：开放集](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fhtml\u002FDhamija_The_Overlooked_Elephant_of_Object_Detection_Open_Set_WACV_2020_paper.html) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [类别无关的目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14204) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n- [OWOD: 朝着开放世界目标检测迈进](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.02603) [[笔记](paper_notes\u002Fowod.md)] \u003Ckbd>CVPR 2021口头报告\u003C\u002Fkbd>\n- [FsDet: 令人沮丧的简单少样本目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06957) \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [MonoFlex: 物体各不相同：灵活的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.02323) [[笔记](paper_notes\u002Fmonoflex.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D, Jiwen Lu, 裁剪版]\n- [monoDLE: 深入研究单目3D目标检测中的定位误差](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16237) [[笔记](paper_notes\u002Fmonodle.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [探索用于3D单目目标检测的2D数据增强](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10786)\n- [OCM3D: 以物体为中心的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06041) [mono3D]\n- [FSM: 基于多摄像头的全环绕单目深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00152) [[笔记](paper_notes\u002Ffsm.md)] \u003Ckbd>ICRA 2021\u003C\u002Fkbd> [单目深度, Xnet]\n\n\n## 2021年3月 (4)\n- [CaDDN: 用于单目3D目标检测的分类深度分布网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.01100) [[笔记](paper_notes\u002Fcaddn.md)] \u003Ckbd>CVPR 2021口头报告\u003C\u002Fkbd> [mono3D, BEVNet]\n- [DSNT: 
基于卷积神经网络的数值坐标回归](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.07372) [[笔记](paper_notes\u002Fdsnt.md)] [可微的空间到数值转换]\n- [Soft-Argmax: 结合间接部件检测和上下文信息的人体姿态回归](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.02322)\n- [INSTA-YOLO: 实时实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06777) [[笔记](paper_notes\u002Finsta_yolo.md)] \u003Ckbd>ICML研讨会2020\u003C\u002Fkbd> [单阶段实例分割]\n- [CenterNet2: 概率论两阶段检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07461) [[笔记](paper_notes\u002Fcenternet2.md)] [CenterNet, 两阶段]\n\n\n## 2021年1月 (7)\n- [Confluence: 目标检测中鲁棒的非IoU替代方案，用于非极大值抑制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.00257) [[笔记](paper_notes\u002Fconfluence.md)] [NMS]\n- [BoxInst: 基于边界框标注的高性能实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02310) [[笔记](paper_notes\u002Fboxinst.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Chunhua Shen, Tian Zhi]\n- [3DSSD: 基于点云的3D单阶段目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10187) [[笔记](paper_notes\u002F3dssd.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [RepVGG: 让VGG风格的卷积网络再次伟大](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.03697) [[笔记](paper_notes\u002Frepvgg.md)] [旷视科技, Xiangyu Zhang, ACNet]\n- [ACNet: 通过不对称卷积块强化卷积核骨架，打造强大CNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03930) [[笔记](paper_notes\u002Facnet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [BEV-Feat-Stitching: 利用车载单目相机理解鸟瞰图语义高清地图](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03040) [[笔记](paper_notes\u002Fbev_feat_stitching.md)] [BEVNet, mono3D, Luc Van Gool]\n- [PSS: 通过消除启发式NMS使目标检测更简单](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.11782) [[笔记](paper_notes\u002Fpss.md)] [Transformer, DETR]\n\n## 2020-12 (17)\n- [DeFCN：基于全卷积网络的端到端目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.03544) [[笔记](paper_notes\u002Fdefcn.md)] [Transformer, DETR]\n- [OneNet：通过分类代价实现端到端单阶段目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05780) [[笔记](paper_notes\u002Fonenet.md)] [Transformer, DETR]\n- [自动驾驶车辆中的交通信号灯地图构建、定位与状态检测](http:\u002F\u002Fdriving.stanford.edu\u002Fpapers\u002FICRA2011.pdf) [[笔记](paper_notes\u002Ftfl_stanford.md)] \u003Ckbd>ICRA 2011\u003C\u002Fkbd> [交通信号灯, Sebastian Thrun]\n- [面向半静态环境的终身特征地图构建](https:\u002F\u002Fstorage.googleapis.com\u002Fpub-tools-public-publication-data\u002Fpdf\u002F43966.pdf) [[笔记](paper_notes\u002Flifelong_feature_mapping_google.md)] \u003Ckbd>ICRA 2016\u003C\u002Fkbd>\n- [如何保持高精地图在自动驾驶中的实时更新](http:\u002F\u002Fwww.lewissoft.com\u002Fpdf\u002FICRA2020\u002F1484.pdf) [[笔记](paper_notes\u002Fkeep_hd_maps_updated_bmw.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [BMW]\n- [广义焦点损失V2：学习密集目标检测中可靠的定位质量估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12885) [[笔记](paper_notes\u002Fgfocalv2.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [焦点损失]\n- [自动驾驶中的视觉SLAM：探索深度学习的应用](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018_workshops\u002Fpapers\u002Fw9\u002FMilz_Visual_SLAM_for_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Fvslam_for_ad.md)] \u003Ckbd>CVPR 2018研讨会\u003C\u002Fkbd>\n- [质心投票：面向单目3D目标检测的对象感知质心投票方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.09836) [[笔记](paper_notes\u002Fcentroid_voting.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [mono3D, 几何+外观=距离]\n- [基于鱼眼相机柱状图像的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03759) [[笔记](paper_notes\u002Fmono3d_fisheye.md)] [GM Israel, mono3D]\n- [DeepPS：基于视觉的停车位检测——一种基于DCNN的方法及大规模基准数据集](https:\u002F\u002Fcslinzhang.github.io\u002Fdeepps\u002Fparkingslot.pdf) \u003Ckbd>TIP 2018\u003C\u002Fkbd> [停车位检测, PS2.0数据集]\n- [PSDet：高效通用的停车位检测方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.05528) 
[[笔记](paper_notes\u002Fpsdet.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [Zongmu, 停车位检测]\n- [PatDNN：基于模式的权重剪枝实现移动设备上的实时DNN执行](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00138) [[笔记](paper_notes\u002Fpatdnn.md)] \u003Ckbd>ASPLOS 2020\u003C\u002Fkbd> [剪枝]\n- [Scaled-YOLOv4：跨阶段部分网络的尺度化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.08036) [[笔记](paper_notes\u002Fscaled_yolov4.md)] [yolo]\n- [Ultralytics的Yolov5](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fyolov5) [[笔记](paper_notes\u002Fyolov5.md)] [yolo, 空间转通道]\n- [PP-YOLO：一种有效且高效的物体检测器实现](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.12099) [[笔记](paper_notes\u002Fpp_yolo.md)] [yolo, paddle-paddle, 百度]\n- [PointPainting：用于3D目标检测的序列融合方法](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.10150.pdf) [[笔记](paper_notes\u002Fpoint_painting.md)] [nuScenes]\n- [MotionNet：基于鸟瞰图地图的自动驾驶联合感知与运动预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06754) [[笔记](paper_notes\u002Fmotionnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [未见的运动物体, BEV]\n- [无需边界框的对象定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07564) [[笔记](paper_notes\u002Fobjects_without_bboxes.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [加权豪斯多夫距离, 无NMS]\n\n## 2020-11 (18)\n- [TSP: 重新思考基于 Transformer 的集合预测用于目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10881) [[笔记](paper_notes\u002Ftsp.md)] \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [DETR, transformers, Kris Kitani]\n- [Sparse R-CNN: 基于可学习提议的端到端目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.12450) [[笔记](paper_notes\u002Fsparse_rcnn.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [DETR, Transformer]\n- [动态场景中的无监督单目深度学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.16404) [[笔记](paper_notes\u002Flearn_depth_and_motion.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [LearnK改进版，Google]\n- [MoNet3D: 实现实时精确的单目3D目标定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.16007) [[笔记](paper_notes\u002Fmonet3d.md)] \u003Ckbd>ICML 2020\u003C\u002Fkbd> [Mono3D, 成对关系]\n- [Argoverse: 带丰富地图的3D跟踪与预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.02620) [[笔记](paper_notes\u002Fargoverse.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [高清地图，数据集，CV激光雷达]\n- [H3D数据集：用于拥挤城市场景中全方位3D多目标检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.01568) [[笔记](paper_notes\u002Fh3d.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- [Cityscapes 3D: 用于9自由度车辆检测的数据集和基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07864) \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [数据集，戴姆勒，mono3D]\n- [NYC3DCars: 地理背景下3D车辆的数据集](https:\u002F\u002Fwww.cs.cornell.edu\u002F~snavely\u002Fpublications\u002Fpapers\u002Fnyc3dcars_iccv13.pdf) \u003Ckbd>ICCV 2013\u003C\u002Fkbd>\n- [迈向完全自动驾驶：系统与算法](https:\u002F\u002Fwww.ri.cmu.edu\u002Fwp-content\u002Fuploads\u002F2017\u002F12\u002Flevinson-iv2011.pdf) \u003Ckbd>IV 2011\u003C\u002Fkbd>\n- [Center3D: 基于中心的单目3D目标检测与联合深度理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13423) [[笔记](paper_notes\u002Fcenter3d.md)] [mono3D, LID+DepJoint]\n- [ZoomNet: 面向3D目标检测的部件感知自适应缩放神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00529) \u003Ckbd>AAAI 2020口头报告\u003C\u002Fkbd> [mono3D] \n- [CenterFusion: 基于中心的雷达与相机融合用于3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.04841) [[笔记](paper_notes\u002Fcenterfusion.md)] \u003Ckbd>WACV 2021\u003C\u002Fkbd> [早期融合，相机，雷达]\n- [3D-LaneNet+: 使用半局部表示的无锚车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.01535) [[笔记](paper_notes\u002F3d_lanenet+.md)] \u003Ckbd>NeurIPS 2020研讨会\u003C\u002Fkbd> [GM以色列，3D LLD]\n- [LSTR: 基于Transformer的端到端车道形状预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.04233) [[笔记](paper_notes\u002Flstr.md)] \u003Ckbd>WACV 2021\u003C\u002Fkbd> 
[LLD，transformers]\n- [PIXOR: 来自点云的实时3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.06326) [[笔记](paper_notes\u002Fpixor.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (鸟瞰视角)\n- [HDNET\u002FPIXOR++: 利用高清地图进行3D目标检测](http:\u002F\u002Fproceedings.mlr.press\u002Fv87\u002Fyang18b\u002Fyang18b.pdf) [[笔记](paper_notes\u002Fpixor++.md)] \u003Ckbd>CoRL 2018\u003C\u002Fkbd>\n- [CPNDet: 用于无锚、两阶段目标检测的角点提议网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13816) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [无锚，两阶段]\n- [MVF: 用于激光雷达点云中3D目标检测的端到端多视角融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06528) [[笔记](paper_notes\u002Fmvf.md)] \u003Ckbd>CoRL 2019\u003C\u002Fkbd> [Waymo，VoxelNet第一作者]\n- [面向自动驾驶的柱状体目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.10323) [[笔记](paper_notes\u002Fpillar_od.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [训练友好的实时目标检测网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.00700) \u003Ckbd>AAAI 2020\u003C\u002Fkbd> [无锚，快速训练]\n- [深度学习驱动的自动驾驶：最新技术综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06091) [自动驾驶技术栈综述，Yu Huang]\n- [复杂动态场景中的密集单目深度估计](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2016\u002Fpapers\u002FRanftl_Dense_Monocular_Depth_CVPR_2016_paper.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [视频场景理解中的概率未来预测](https:\u002F\u002Fanthonyhu.github.io\u002Fresearch\u002Fprobabilistic-future-prediction\u002F)\n- [AB3D: 3D多目标跟踪的基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.03961) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [3D MOT]\n- [用于多目标跟踪的时空关系网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11489) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [MOT，随时间变化的特征位置]\n- [超越像素：利用几何与形状线索进行在线多目标跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.09298) \u003Ckbd>ICRA 2018\u003C\u002Fkbd> [MOT，IIT，3D形状]\n- [ST-3D: 立体3D目标跟踪的联合时空优化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.09305) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Peiliang Li，VINS和S3DOT的作者]\n- [增强你的批次：通过实例重复提升泛化能力](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FHoffer_Augment_Your_Batch_Improving_Generalization_Through_Instance_Repetition_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [RetinaTrack: 在线单阶段联合检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13870) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [MOT]\n- [以热点为目标：通过热点激发实现无锚3D目标检测方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.12791)\n- [梯度中心化：一种用于深度神经网络的新优化技术](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01461) \u003Ckbd>ECCV 2020口头报告\u003C\u002Fkbd>\n- [基于深度基底拟合的深度补全](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10336) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [BTS: 由大到小：用于单目深度估计的多尺度局部平面引导](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10326) [monodepth，有监督]\n- [深度的边界：分割与深度之间的显式约束](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.00171) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [monodepth，Xiaoming Liu]\n- [关于神经网络中旋转表示的连续性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.07035) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [旋转表示]\n- [VDO-SLAM: 一种视觉动态对象感知SLAM系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.11052) \u003Ckbd>IJRR 2020\u003C\u002Fkbd>\n- [动态SLAM：速度的必要性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08584)\n- [伪RGB-D用于自我改进的单目SLAM和深度预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10681) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [交通信号灯映射与检测](https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fen\u002F\u002Fpubs\u002Farchive\u002F37259.pdf) [[笔记](paper_notes\u002Ftfl_mapping_google.md)] \u003Ckbd>ICRA 2011\u003C\u002Fkbd> [交通信号灯，Google，Chris Urmson]\n- 
[在每个阶段都利用地图和定位进行交通信号灯识别](https:\u002F\u002Fweb.yonsei.ac.kr\u002Fjksuhr\u002Fpapers\u002FTraffic%20light%20recognition%20exploiting%20map%20and%20localization%20at%20every%20stage.pdf) [[笔记](paper_notes\u002Ftfl_exploting_map_korea.md)] \u003Ckbd>专家系统2017\u003C\u002Fkbd> [交通信号灯，鲜于明镐，徐在圭，郑浩奇]\n- [利用深度学习和先验地图进行自动驾驶汽车的交通信号灯识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11886) [[笔记](paper_notes\u002Ftfl_lidar_map_building_brazil.md)] \u003Ckbd>IJCNN 2019\u003C\u002Fkbd> [交通信号灯，巴西圣埃斯皮里图州]\n\n## 2020-10 (14)\n- [TSM：用于高效视频理解的时序移位模块](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08383) [[笔记](paper_notes\u002Ftsm.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Song Han，视频，目标检测]\n- [WOD：自动驾驶感知任务中的可扩展性——Waymo开放数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04838) [[笔记](paper_notes\u002Fwod.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [广义焦点损失：为密集目标检测学习高质量且分布均匀的边界框](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04388) [[笔记](paper_notes\u002Fgfocal.md)] \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [分类作为回归]\n- [一种基于排序的平衡损失函数：统一目标检测中的分类与定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.13592) \u003Ckbd>NeurIPS 2020 Spotlight\u003C\u002Fkbd>\n- [重新思考标签在改善类别不平衡学习中的价值](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07529) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd>\n- [RepLoss：斥力损失：人群中的行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.07752) [[笔记](paper_notes\u002Frep_loss.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [人群检测，旷视科技]\n- [自适应NMS：优化人群中的行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03629) [[笔记](paper_notes\u002Fadaptive_nms.md)] \u003Ckbd>CVPR 2019 口头报告\u003C\u002Fkbd> [人群检测，NMS]\n- [AggLoss：遮挡感知的R-CNN：人群中的行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.08407) [[笔记](paper_notes\u002Fagg_loss.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [人群检测]\n- [CrowdDet：拥挤场景下的检测：一个提案，多个预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.09163) [[笔记](paper_notes\u002Fcrowd_det.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [人群检测，旷视科技，地球移动距离]\n- [R2-NMS：基于代表性区域的NMS：通过提案配对实现拥挤场景下的行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12729) [[笔记](paper_notes\u002Fr2_nms.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [双锚点R-CNN：人群中的行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09998) [[笔记](paper_notes\u002Fdouble_anchor.md)] [头部-身体捆绑]\n- [综述：AP与MR](paper_notes\u002Fap_mr.md)\n- [SKU110K：密集排列场景中的精确检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00853) [[笔记](paper_notes\u002Fsku110k.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [人群检测，无遮挡]\n- [GossipNet：学习非极大值抑制](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.02950) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [TLL：基于躯体拓扑定位和时间特征聚合的小尺度行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.01438) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [无需3D边界框标签的学习单目3D车辆检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.03506) \u003Ckbd>GCPR 2020\u003C\u002Fkbd> [单目3D，Daniel Cremers，慕尼黑工业大学]\n- [CubifAE-3D：自动驾驶车辆上的单目相机空间立方化，用于基于自编码器的3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04080) [[笔记](paper_notes\u002Fcubifae_3d.md)] [单目3D，深度自编码器预训练]\n- [可变形DETR：用于端到端目标检测的可变形Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.04159) [[笔记](paper_notes\u002Fdeformable_detr.md)] \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [Jifeng Dai，DETR]\n- [ViT：一张图像相当于16×16个词：大规模图像识别中的Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929) [[笔记](paper_notes\u002Fvit.md)] \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [BYOL：自举你的潜在表示：一种新的自监督学习方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07733) [自监督]\n\n## 2020-09 (15)\n- [SDFLabel: 使用可微分渲染的 SDF 形状先验自动标注 3D 对象](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11288) 
[[笔记](paper_notes\u002Fsdflabel.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [TRI, 可微分渲染]\n- [DensePose: 野外密集人体姿态估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00434) [[笔记](paper_notes\u002Fdensepose.md)] \u003Ckbd>CVPR 2018 口头报告\u003C\u002Fkbd> [FAIR]\n- [NOCS: 面向类别级 6D 对象位姿与尺寸估计的归一化对象坐标空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02970) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [monoDR: 用于自监督 3D 物体检测的单目可微分渲染](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.14524) [[笔记](paper_notes\u002Fmonodr.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [TRI, mono3D]\n- [Lift, Splat, Shoot: 通过隐式反投影到 3D 编码来自任意相机阵列的图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.05711) [[笔记](paper_notes\u002Flift_splat_shoot.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [BEV-Net, Utoronto, Sanja Fidler]\n- [用于场景一致性运动预测的隐式潜在变量模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.12036) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG, Raquel Urtasun]\n- [FISHING Net: 格网中语义热图的未来推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09917) [[笔记](paper_notes\u002Ffishing_net.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [BEV-Net, Mapping, Zoox]\n- [VPN: 用于环境感知的跨视角语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.03560) [[笔记](paper_notes\u002Fvpn.md)] \u003Ckbd>RAL 2020\u003C\u002Fkbd> [Bolei Zhou, BEV-Net]\n- [VED: 基于卷积变分编码器-解码器网络的单目语义占用网格建图](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.02176) [[笔记](paper_notes\u002Fved.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [BEV-Net]\n- [Cam2BEV: 一种 Sim2Real 深度学习方法，用于将多路车载摄像头拍摄的图像转换为鸟瞰视角下的语义分割图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04078) [[笔记](paper_notes\u002Fcam2bev.md)] \u003Ckbd>ITSC 2020\u003C\u002Fkbd> [BEV-Net] \n- [学习如何环绕物体以获取户外场景的俯视图表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.10870) [[笔记](paper_notes\u002Flearning_to_look_around_objects.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [复杂道路场景的参数化俯视图表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.06152) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [FTM: 将视频作为一个整体来理解道路布局](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.00822) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [BEV-Net, UCSD, Manmohan Chandraker]\n- [KM3D-Net: 基于几何约束嵌入和半监督训练的单目 3D 检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.00764) [[笔记](paper_notes\u002Fkm3d_net.md)] \u003Ckbd>RAL 2021\u003C\u002Fkbd> [RTM3D, Peixuan Li]\n- [InstanceMotSeg: 用于自动驾驶的实时实例运动分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.07008) [[笔记](paper_notes\u002Finstance_mot_seg.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [运动分割]\n- [MPV-Nets: 用于自动驾驶的单目平面视图网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.06937) [[笔记](paper_notes\u002Fmpv_nets.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd> [BEV-Net]\n- [基于有效样本数的类别平衡损失](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.05555) [[笔记](paper_notes\u002Fclass_balanced_loss.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [焦点损失作者]\n- [单目深度估计的几何预训练](http:\u002F\u002Flewissoft.com\u002Fpdf\u002FICRA2020\u002F0035.pdf) [[笔记](paper_notes\u002Fgeometric_pretraining.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd>\n- [利用带有空间先验信息的数字地图实现自动驾驶中的鲁棒交通信号灯和箭头检测](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F4\u002F1181) [[笔记](paper_notes\u002Ftfl_robust_japan.md)] \u003Ckbd>Sensors 2020\u003C\u002Fkbd> [交通信号灯, 金泽]\n\n## 2020-08 (26)\n- [用于深度和自运动自监督学习的特征度量损失](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.10603) [[笔记](paper_notes\u002Ffeature_metric.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [特征度量、局部极小值、单目深度]\n- [Depth-VO-Feat：通过深度特征重建进行单目深度估计与视觉里程计的无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03893) 
\u003Ckbd>CVPR 2018\u003C\u002Fkbd> [特征度量、单目深度]\n- [MonoResMatch：融合传统立体视觉知识的单目深度估计学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.04144) [[笔记](paper_notes\u002Fmonoresmatch.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [单目深度、局部极小值、廉价立体匹配真值]\n- [SGDepth：自监督单目深度估计——通过语义引导解决动态物体问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.06936) [[笔记](paper_notes\u002Fsgdepth.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [动态物体]\n- [每个像素都重要：基于整体三维运动理解的无监督几何学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.10556) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [动态物体、刚体与非刚体运动]\n- [每个像素都重要 ++：结合三维整体理解的几何与运动联合学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.06125) \u003Ckbd>TPAMI 2018\u003C\u002Fkbd>\n- [CC：竞争协作——深度、相机运动、光流及运动分割的联合无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.09806) [[笔记](paper_notes\u002Fcc.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [ObjMotionNet：从视频中进行自监督的对象运动与深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04250) [[笔记](paper_notes\u002Fobj_motion_net.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [对象运动预测、速度预测]\n- [基于单目视频的实例级深度与运动学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09351)\n- [语义驱动的单目深度与自运动估计无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04371)\n- [基于隐式线索的深度估计自监督联合学习框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09876)\n- [DF-Net：利用跨任务一致性进行深度与光流的无监督联合学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.01649) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [LineNet：一种可缩放的CNN，用于城市环境中众包高清地图建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.05696) [地图构建]\n- [Road-SLAM：基于车道线标记的SLAM，实现车道级精度](https:\u002F\u002Fwww.naverlabs.com\u002Fimg\u002FautonomousDriving\u002Fintelligence\u002Fdissertation\u002FRoad-SLAM_Road%20Marking%20based%20SLAM%20with%20Lane-level%20Accuracy.pdf) [[笔记](paper_notes\u002Froad_slam.md)] [高清地图]\n- [AVP-SLAM：面向停车场内自动驾驶车辆的语义视觉建图与定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01813) [[笔记](paper_notes\u002Favp_slam.md)] \u003Ckbd>IROS 2020\u003C\u002Fkbd> [华为、高清地图、Tong Qin、VINS作者、自动代客泊车]\n- [AVP-SLAM-后融合：在室内停车场中利用语义车道线标记实现厘米级精度的建图与定位](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8917529) [[笔记](paper_notes\u002Favp_slam_late_fusion.md)] \u003Ckbd>ITSC 2019\u003C\u002Fkbd>\n- [基于车道线标记的高速公路重定位](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8917254) \u003Ckbd>ITSC 2019\u003C\u002Fkbd>\n- [DeepRoadMapper：从航空影像中提取道路拓扑结构](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FMattyus_DeepRoadMapper_Extracting_Road_ICCV_2017_paper.pdf) [[笔记](paper_notes\u002Fdeep_road_mapper.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [Uber ATG，非高清地图]\n- [RoadTracer：自动从航空影像中提取道路网络](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FBastani_RoadTracer_Automatic_Extraction_CVPR_2018_paper.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [非高清地图]\n- [PolyMapper：从俯视图像中提取拓扑地图](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.01497) [[笔记](paper_notes\u002Fpolymapper.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [地图构建、多边形，非高清地图]\n- [HRAN：用于结构化在线地图的层次递归注意力网络](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FHomayounfar_Hierarchical_Recurrent_Attention_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Fhran.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [高清地图、高速公路、折线损失、Chamfer距离]\n- [Deep Structured Crosswalk：端到端深度结构化模型用于绘制人行横道](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FJustin_Liang_End-to-End_Deep_Structured_ECCV_2018_paper.pdf) [[笔记](paper_notes\u002Fdeep_structured_crosswalk.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- 
[DeepBoundaryExtractor：用于道路边界提取的卷积循环网络](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fhtml\u002FLiang_Convolutional_Recurrent_Network_for_Road_Boundary_Extraction_CVPR_2019_paper.html) [[笔记](paper_notes\u002Fdeep_boundary_extractor.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [高清地图、边界、折线损失]\n- [DAGMapper：通过发现车道拓扑结构来学习地图构建](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FHomayounfar_DAGMapper_Learning_to_Map_by_Discovering_Lane_Topology_ICCV_2019_paper.pdf) [[笔记](paper_notes\u002Fdagmapper.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [高清地图、高速公路、分叉与汇合、折线损失]\n- [稀疏高清地图：利用稀疏语义高清地图进行自动驾驶车辆定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03274) [[笔记](paper_notes\u002Fsparse_hd_maps.md)] \u003Ckbd>IROS 2019 口头报告\u003C\u002Fkbd> [Uber ATG、元数据、地图构建、定位]\n- [Aerial LaneNet：使用小波增强的成本敏感对称全卷积神经网络进行航空影像中的车道线语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06904) \u003Ckbd>IEEE TGRS 2018\u003C\u002Fkbd>\n- [基于矢量高清地图的单目定位（MLVHM）：一种低成本的商用IVs方法](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F7\u002F1870\u002Fhtm) \u003Ckbd>Sensors 2020\u003C\u002Fkbd> [清华大学、3D高清地图]\n- [PatchNet：重新思考伪LiDAR表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.04582) [[笔记](paper_notes\u002Fpatchnet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [商汤科技、Wanli Ouyang]\n- [D4LCN：用于单目3D目标检测的深度引导卷积学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04799) [[笔记](paper_notes\u002Fd4lcn.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [单目3D]\n- [MfS：从单张图像中学习立体匹配](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.01484) [[笔记](paper_notes\u002Fmfs.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [单目转立体、用单目学习立体匹配]\n- [BorderDet：密集目标检测的边界特征](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.11056) \u003Ckbd>ECCV 2020 口头报告\u003C\u002Fkbd> [旷视科技]\n- [尺度感知三叉戟网络用于目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.01892) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [不同尺度对应不同分支]\n- [利用直接法从单目视频中学习深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.00175)\n- [Vid2Depth：利用3D几何约束从单目视频中进行深度与自运动的无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05522) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [谷歌]\n- [NeRF in the Wild：适用于无约束照片集的神经辐射场](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.02268)\n- [以旧驭新：从SFM中学习SFM](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMaria_Klodt_Supervising_the_new_ECCV_2018_paper.pdf) [[笔记](paper_notes\u002Flearn_sfm_from_sfm.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [神经RGB->D传感：从视频摄像头获取深度与不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.02571) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [多帧单目深度]\n- [勿忘过去：从单目视频中进行递归深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02613) [多帧单目深度、RNN]\n- [用于单目视频视觉里程计与深度的（无）监督学习的递归神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07087) [多帧单目深度、RNN]\n- [利用时间一致性实现实时视频深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03706) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [多帧单目深度、RNN、室内场景]\n- [SfM-Net：从视频中学习结构与运动](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.07804) [动态物体、SfM]\n- [MB-Net：用于实时3D车辆检测的MergeBoxes](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8500395) [[笔记](paper_notes\u002Fmb_net.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd> [单目3D、戴姆勒]\n- [BS3D：超越边界框——使用包围形状实现从单目RGB图像中实时3D车辆检测](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8814036\u002F) [[笔记](paper_notes\u002Fbs3d.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> [单目3D、戴姆勒]\n- [3D-GCK：通过几何约束关键点实现实时单目RGB图像中的单次3D车辆检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.13084) [[笔记](paper_notes\u002F3d_gck.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [单目3D、戴姆勒]\n- 
[UR3D：用于单目3D目标检测的距离归一化统一表示](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F6559_ECCV_2020_paper.php) [[笔记](paper_notes\u002Fur3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [单目3D]\n- [DA-3Det：通过特征域适应进行单目3D目标检测](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fpapers\u002F123540018.pdf) [[笔记](paper_notes\u002Fda_3det.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [单目3D]\n- [RAR-Net：用于单目3D目标检测的强化轴向精炼网络](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F2822_ECCV_2020_paper.php) [[笔记](paper_notes\u002Frarnet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [单目3D]\n\n## 2020-07 (25)\n- [CenterTrack: 将目标跟踪为点](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01177) [[笔记](paper_notes\u002Fcentertrack.md)] \u003Ckbd>ECCV 2020 spotlight\u003C\u002Fkbd> [基于摄像头的3D多对象检测与跟踪SOTA，CenterNet，视频目标检测，Philipp Krähenbühl]\n- [CenterPoint: 基于中心点的3D目标检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11275) [[笔记](paper_notes\u002Fcenterpoint.md)] \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [基于激光雷达的3D多对象检测，CenterNet]\n- [Tracktor: 不花哨的跟踪方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05625) [[笔记](paper_notes\u002Ftracktor.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Tracktor\u002FTracktor++，Laura Leal-Taixe@TUM]\n- [FairMOT: 多目标跟踪的简单基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01888) [[笔记](paper_notes\u002Ffairmot.md)]\n- [DeepMOT: 用于训练多目标跟踪器的可微分框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06618) [[笔记](paper_notes\u002Fdeepmot.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [可训练的匈牙利算法，Laura Leal-Taixe@TUM]\n- [MPNTracker: 学习多目标跟踪的神经求解器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07515) \u003Ckbd>CVPR 2020口头报告\u003C\u002Fkbd> [可训练的匈牙利算法，Laura Leal-Taixe@TUM]\n- [nuScenes: 用于自动驾驶的多模态数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11027) [[笔记](paper_notes\u002Fnuscenes.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [数据集，点云，雷达]\n- [CBGS: 面向点云3D目标检测的类别平衡分组与采样](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.09492) [[笔记](paper_notes\u002Fcbgs.md)] \u003Ckbd>CVPRW 2019\u003C\u002Fkbd> [旷视科技，激光雷达，WAD挑战赛冠军]\n- [AFDet: 无锚框的一阶段3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12671) 和 [竞赛解决方案](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.15505.pdf) [[笔记](paper_notes\u002Fafdet.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [地平线机器人，激光雷达，Waymo挑战赛冠军] \n- 多目标跟踪与单目标跟踪综述 [[笔记](paper_notes\u002Fmot_and_sot.md)]\n- [CrowdHuman: 人群中的行人检测基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.00123) [[笔记](paper_notes\u002Fcrowdhuman.md)] [旷视科技，行人，数据集]\n- [WiderPerson: 野外密集行人检测的多样化数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12118) [[笔记](paper_notes\u002Fwiderperson.md)] \u003Ckbd>TMM 2019\u003C\u002Fkbd> [数据集，行人]\n- [清华大学-戴姆勒骑自行车者数据集：基于视觉的骑自行车者检测新基准](http:\u002F\u002Fwww.gavrila.net\u002FPublications\u002Fiv16_cyclist_benchmark.pdf) [[笔记](paper_notes\u002Ftsinghua_daimler_cyclist.md)] \u003Ckbd>IV 2016\u003C\u002Fkbd> [数据集，骑自行车者检测]\n- [专业骑自行车者检测数据集：使用单目RGB相机进行骑自行车者检测的具有挑战性的真实世界计算机视觉数据集](https:\u002F\u002Fdrive.google.com\u002Fdrive\u002Fu\u002F0\u002Ffolders\u002F1inawrX9NVcchDQZepnBeJY4i9aAI5mg9) [[笔记](paper_notes\u002Fspecialized_cyclists.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> [KITTI的扩展]\n- [PointTrack: 以点的形式进行分割，实现高效的在线多目标跟踪与分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01550) [[笔记](paper_notes\u002Fpointtrack.md)] \u003Ckbd>ECCV 2020口头报告\u003C\u002Fkbd> [MOTS]\n- [PointTrack++：高效在线多目标跟踪与分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.01549) [[笔记](paper_notes\u002Fpointtrack++.md)] \u003Ckbd>CVPR 2020研讨会\u003C\u002Fkbd> 
[CVPR2020 MOTS挑战赛冠军。PointTrack++在KITTI MOTS榜单中排名第一]\n- [SpatialEmbedding: 通过联合优化空间嵌入和聚类带宽实现实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11109) [[笔记](paper_notes\u002Fspatial_embedding.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [单阶段，实例分割]\n- [BA-Net: 密集束调整网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.04807) [[笔记](paper_notes\u002Fbanet.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd> [束调整，多帧单目深度估计，特征度量]\n- [DeepSFM: 基于深度束调整的运动结构](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09697) \u003Ckbd>ECCV 2020口头报告\u003C\u002Fkbd> [多帧单目深度估计，室内场景]\n- [CVD: 一致的视频深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.15021) [[笔记](paper_notes\u002Fcvd.md)] \u003Ckbd>SIGGRAPH 2020\u003C\u002Fkbd> [多帧单目深度估计，在线微调]\n- [DeepV2D: 基于可微分运动结构的视频转深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.04605) [[笔记](paper_notes\u002Fdeepv2d.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> [多帧单目深度估计，Jia Deng]\n- [GeoNet: 无监督学习密集深度、光流和相机位姿](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.02276) [[笔记](paper_notes\u002Fgeonet.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [残差光流，单目深度，刚体与动态运动]\n- [GLNet: 单目视频中基于几何约束的自监督学习：连接光流、深度和相机](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.05820) [[笔记](paper_notes\u002Fglnet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [在线微调，刚体与动态运动]\n- [Depth Hints: 自监督单目深度提示](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09051) [[笔记](paper_notes\u002Fdepth_hints.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [单目深度，局部极小值，廉价立体GT]\n- [MonoUncertainty: 关于自监督单目深度估计的不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.06209) [[笔记](paper_notes\u002Fmono_uncertainty.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [深度不确定性]\n- [基于可微分束调整的深度与自我运动的自监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.13163) [[笔记](paper_notes\u002Fba_sfm_learner.md)] [束调整，xmotors.ai，多帧单目深度]\n- [单目视频中的运动学3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.09548) [[笔记](paper_notes\u002Fkinematic_mono3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [多帧单目3D，Xiaoming Liu]\n- [VelocityNet: 基于摄像头的车辆速度估计（来自单目视频)](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.07094) [[笔记](paper_notes\u002Fvelocity_net.md)] \u003Ckbd>CVPR 2017研讨会\u003C\u002Fkbd> [单目速度估计，CVPR 2017挑战赛冠军]\n- [以车辆为中心的VelocityNet: 基于单目摄像头的ADAS中车距与相对速度端到端学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04082) [[笔记](paper_notes\u002Fvehicle_centric_velocity_net.md)] [单目速度估计，单目距离，SOTA]\n\n## 2020-06 (20)\n- [LeGO-LOAM：轻量级且针对地面优化的可变地形激光雷达里程计与建图](http:\u002F\u002Fpersonal.stevens.edu\u002F~benglot\u002FShan_Englot_IROS_2018_Preprint.pdf) [[笔记](paper_notes\u002Flego_loam.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> [激光雷达, 建图]\n- [PIE：用于行人意图估计和轨迹预测的大规模数据集及模型](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FRasouli_PIE_A_Large-Scale_Dataset_and_Models_for_Pedestrian_Intention_Estimation_ICCV_2019_paper.pdf) [[笔记](paper_notes\u002Fpie.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [JAAD：他们会过马路吗？行人过街行为基准数据集与基线](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017_workshops\u002Fpapers\u002Fw3\u002FRasouli_Are_They_Going_ICCV_2017_paper.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [基于堆叠RNN中上下文特征融合的行人动作预判](https:\u002F\u002Fbmvc2019.org\u002Fwp-content\u002Fuploads\u002Fpapers\u002F0283-paper.pdf) \u003Ckbd>BMVC 2019\u003C\u002Fkbd>\n- [行人会过马路吗？通过2D姿态估计来回答](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.10580) \u003Ckbd>IV 2018\u003C\u002Fkbd>\n- [基于2D姿态估计的行人与骑行者意图识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.03858) \u003Ckbd>ITSC 2019\u003C\u002Fkbd> [骨架, 行人, 骑行者意图]\n- [多任务的专注单任务处理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08918) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- 
[DETR：基于Transformer的端到端目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.12872) [[笔记](paper_notes\u002Fdetr.md)] \u003Ckbd>ECCV 2020 口头报告\u003C\u002Fkbd> [FAIR]\n- [Transformer：注意力就是你所需要的](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762) [[笔记](paper_notes\u002Ftransformer.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [SpeedNet：学习视频中的速度感](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06130) [[笔记](paper_notes\u002Fspeednet.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- [MonoPair：利用成对空间关系的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.00504) [[笔记](paper_notes\u002Fmonopair.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Mono3D, 成对关系]\n- [SMOKE：基于关键点估计的单阶段单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10111) [[笔记](paper_notes\u002Fsmoke.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [Mono3D, Zongmu]\n- [环视摄像头系统的车辆Re-ID](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1e6y8wtHAricaEHS9CpasSGOx0aAxCGib\u002Fview) [[笔记](paper_notes\u002Freid_surround_fisheye.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [轮胎线, 车辆Re-ID, Zongmu]\n- [通过逐行分类实现端到端车道标记检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08630) [[笔记](paper_notes\u002Fe2e_lmd.md)] [高通韩国, LLD作为分类器]\n- [利用CNN作为回归网络进行可靠的多车道检测与分类](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCVW_2018\u002Fpapers\u002F11133\u002FChougule_Reliable_multilane_detection_and_classification_by_utilizing_CNN_as_a_ECCVW_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [LLD作为回归器]\n- [SUPER：一种新型车道检测系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07277) [[笔记](paper_notes\u002Fsuper.md)]\n- [通过自注意力蒸馏学习轻量级车道检测CNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00821) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [StixelNet：用于障碍物检测和道路分割的深度卷积网络](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2015\u002Fpapers\u002Fpaper109\u002Fpaper109.pdf) \u003Ckbd>BMVC 2015\u003C\u002Fkbd>\n- [StixelNetV2：面向自动驾驶的实时类别化通用障碍物检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017_workshops\u002Fpapers\u002Fw3\u002FGarnett_Real-Time_Category-Based_and_ICCV_2017_paper.pdf) [[笔记](paper_notes\u002Fstixelnetv2.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [DS]\n- [使用高效的亚像素卷积神经网络实现单张图像和视频的实时超分辨率](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.05158) [[笔记](paper_notes\u002Fsubpixel_conv.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd> [通道转像素]\n- [上下文中的汽车姿态：结合地面约束的精确姿态估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04363) [mono3D]\n- [Self-Mono-SF：自监督单目场景流估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04143) [[笔记](paper_notes\u002Fself_mono_sf.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [场景流, 立体输入]\n- [MEBOW：野外环境下的单目身体朝向估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.13688) [[笔记](paper_notes\u002Fmebow.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [VG-NMS：可见性引导的NMS：在拥挤交通场景中高效提升非模态目标检测效果](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.08547) [[笔记](paper_notes\u002Fvg_nms.md)] \u003Ckbd>NeurIPS 2019研讨会\u003C\u002Fkbd> [拥挤场景, NMS, 戴姆勒]\n- [WYSIWYG：所见即所得：利用可见性进行3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04986) [[笔记](paper_notes\u002Fwysiwyg.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [占用栅格]\n- [基于密集检测的实时全景分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01202) [[笔记](paper_notes\u002Frealtime_panoptic.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [边界框 + 语义分割 = 全景分割, 丰田]\n- [面向自动驾驶的以人为核心的图像标注效率提升](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1DY95vfWBLKOOZZyq8gLDd0heZ6aBSdji\u002Fview) [[笔记](paper_notes\u002Fhuman_centric_annotation.md)] \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [高效标注]\n- 
[SurfelGAN：为自动驾驶合成逼真的传感器数据](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.03844) [[笔记](paper_notes\u002Fsurfel_gan.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [Waymo, 自动数据生成, surfel]\n- [LiDARsim：借助真实世界实现逼真的激光雷达仿真](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09348) [[笔记](paper_notes\u002Flidarsim.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [Uber ATG, 自动数据生成, surfel]\n- [SuMa++：高效的基于激光雷达的语义SLAM](http:\u002F\u002Fwww.ipb.uni-bonn.de\u002Fwp-content\u002Fpapercite-data\u002Fpdf\u002Fchen2019iros.pdf) \u003Ckbd>IROS 2019\u003C\u002Fkbd> [语义分割, 激光雷达, SLAM]\n- [PON\u002FPyrOccNet：利用金字塔占用网络从图像预测语义地图表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13402) [[笔记](paper_notes\u002Fpyroccnet.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [BEV-Net, OFT]\n- [MonoLayout：从单张图像重建非模态场景布局](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08394) [[笔记](paper_notes\u002Fmonolayout.md)] \u003Ckbd>WACV 2020\u003C\u002Fkbd> [BEV-Net]\n- [BEV-Seg：利用几何与语义点云进行鸟瞰视角语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.11436) [[笔记](paper_notes\u002Fbev_seg.md)] \u003Ckbd>CVPR 2020研讨会\u003C\u002Fkbd> [BEV-Net, 建图]\n- [一种从图像获取鸟瞰视角的几何方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02231) \u003Ckbd>ICCVW 2019\u003C\u002Fkbd> [建图, 几何, Andrew Zisserman]\n- [FrozenDepth：通过观察静止的人来学习运动中人的深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11111) [[笔记](paper_notes\u002Ffrozen_depth.md)] \u003Ckbd>CVPR 2019 口头报告\u003C\u002Fkbd>\n- [ORB-SLAM：一个多功能且精确的单目SLAM系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F1502.00956) \u003Ckbd>TRO 2015\u003C\u002Fkbd>\n- [ORB-SLAM2：适用于单目、立体和RGB-D相机的开源SLAM系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.06475) \u003Ckbd>TRO 2016\u003C\u002Fkbd>\n- [CubeSLAM：单目3D物体SLAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.00557) [[笔记](paper_notes\u002Fcube_slam.md)] \u003Ckbd>TRO 2019\u003C\u002Fkbd> [动态SLAM, orb slam + mono3D]\n- [ClusterVO：聚类移动实例并估计自身及周围环境的视觉里程计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12980) [[笔记](paper_notes\u002Fcluster_vo.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [通用动态SLAM]\n- [S3DOT：基于立体视觉的语义3D目标与自我运动跟踪，用于自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.02062) [[笔记](paper_notes\u002Fs3dot.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [Peiliang Li]\n- [用于动态环境的多目标单目SLAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.03528) [[笔记](paper_notes\u002Fmulti_object_mono_slam.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [monolayout作者]\n- [PWC-Net：利用金字塔、变形和代价体积的光流CNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.02371) [[笔记](paper_notes\u002Fpwc_net.md)] \u003Ckbd>CVPR 2018 口头报告\u003C\u002Fkbd> [光流]\n- [LiteFlowNet：用于光流估计的轻量级卷积神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.07036) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [光流]\n- [FlowNet：用卷积网络学习光流](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FDosovitskiy_FlowNet_Learning_Optical_ICCV_2015_paper.pdf) \u003Ckbd>ICCV 2015\u003C\u002Fkbd> [光流]\n- [FlowNet 2.0：深度网络下光流估计的发展](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.01925) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [光流]\n- [ESPNetv2：一种轻量、节能且通用的卷积神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.11431) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [语义分割, 轻量化]\n- [Mono-SF：多视角几何与单视角深度相结合，用于动态交通场景的单目场景流估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.06316) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [深度不确定性]\n\n## 2020-05 (19)\n- [基于自我中心视觉的未来车辆定位用于智能驾驶辅助系统](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.07408) [[笔记](paper_notes\u002Fhevi.md)] [本田] \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- 
[PackNet：用于自监督单目深度估计的3D打包方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02693) [[笔记](paper_notes\u002Fpacknet.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [尺度感知深度]\n- [PackNet-SG：面向自监督单目深度估计的语义引导表征学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.12319) [[笔记](paper_notes\u002Fpacknet_sg.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> [TRI，无穷大深度问题]\n- [TrianFlow：迈向更好的泛化能力——无需PoseNet的联合深度-位姿学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01314) [[笔记](paper_notes\u002Ftrianflow.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [尺度感知]\n- [理解基于CNN的绝对相机位姿回归的局限性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.07504) [[笔记](paper_notes\u002Funderstanding_apr.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [PoseNet、MapNet的缺点，Laura Leal-Taixe@TUM]\n- [学还是不学：从本质矩阵进行视觉定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01293) [[笔记](paper_notes\u002Fto_learn_or_not.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [SIFT + 5点求解器 >> 其他VO方法，Laura Leal-Taixe@TUM]\n- [DF-VO：重访视觉里程计——究竟应该学习什么？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.09803) [[笔记](paper_notes\u002Fdf_vo.md)] \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [深度与光流用于精确的VO]\n- [D3VO：用于单目视觉里程计的深度、位姿及不确定性深度学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.01060) [[笔记](paper_notes\u002Fd3vo.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [Daniel Cremers, TUM, 深度不确定性]\n- [网络瘦身：通过网络瘦身学习高效的卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.06519) [[笔记](paper_notes\u002Fnetwork_slimming.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [BatchNorm剪枝：重新思考卷积层通道剪枝中“范数越小信息量越少”的假设](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00124) [[笔记](paper_notes\u002Fbatchnorm_pruning.md)] \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [直接稀疏里程计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1607.02565) \u003Ckbd>PAMI 2018\u003C\u002Fkbd>\n- [在德国训练，在美国测试：使3D目标检测器具备更强的泛化能力](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.08139) [[笔记](paper_notes\u002Ftrain_in_germany.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [PseudoLidarV3：用于基于图像的3D目标检测的端到端伪LiDAR方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.03080) [[笔记](paper_notes\u002Fpseudo_lidar_v3.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [ATSS：通过自适应训练样本选择弥合基于锚框与无锚框检测之间的差距](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02424) [[笔记](paper_notes\u002Fatss.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- [距离IoU损失：更快更好的边界框回归学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.08287) \u003Ckbd>AAAI 2020\u003C\u002Fkbd>\n- [增强目标检测和实例分割中模型学习与推理中的几何因素](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.03572) [期刊版]\n- [YOLOv4：目标检测的最佳速度与精度](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10934) [[笔记](paper_notes\u002Fyolov4.md)]\n- [CBN：跨迭代批归一化](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.05712) [[笔记](paper_notes\u002Fcbn.md)]\n- [Stitcher：面向目标检测的反馈驱动数据提供者](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12432) [[笔记](paper_notes\u002Fstitcher.md)]\n- [SKNet：选择性卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06586) [[笔记](paper_notes\u002Fsknet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [CBAM：卷积块注意力模块](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.06521) [[笔记](paper_notes\u002Fcbam.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [ResNeSt：分裂注意力网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.08955) [[笔记](paper_notes\u002Fresnest.md)]\n\n## 2020-04 (14)\n- [ChauffeurNet：通过模仿最佳并合成最差来学习驾驶](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03079.pdf) [[笔记](paper_notes\u002Fchauffeurnet.md)] \u003Ckbd>RSS 2019\u003C\u002Fkbd> [Waymo]\n- [IntentNet：从原始传感器数据中学习预测意图](http:\u002F\u002Fwww.cs.toronto.edu\u002F~wenjie\u002Fpapers\u002Fintentnet_corl18.pdf) [[笔记](paper_notes\u002Fintentnet.md)] 
\u003Ckbd>CoRL 2018\u003C\u002Fkbd> [Uber ATG，感知与预测，激光雷达+地图]\n- [RoR：道路规则：利用语义交互的卷积模型预测驾驶行为](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08945) [[笔记](paper_notes\u002Fror.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Zoox]\n- [MultiPath：用于行为预测的多条概率性锚定轨迹假设](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05449) [[笔记](paper_notes\u002Fmultipath.md)] \u003Ckbd>CoRL 2019\u003C\u002Fkbd> [Waymo，作者来自RoR和ChauffeurNet]\n- [NMP：端到端可解释的神经运动规划器](http:\u002F\u002Fwww.cs.toronto.edu\u002F~wenjie\u002Fpapers\u002Fcvpr19\u002Fnmp.pdf) [[笔记](paper_notes\u002Fnmp.md)] \u003Ckbd>CVPR 2019 口头报告\u003C\u002Fkbd> [Uber ATG]\n- [使用深度卷积网络进行自动驾驶的多模态轨迹预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.10732) [[笔记](paper_notes\u002Fmultipath_uber.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [Henggang Cui，多模态，Uber ATG匹兹堡]\n- [面向自动驾驶的交通参与者短期运动预测中的不确定性感知](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.05819) \u003Ckbd>WACV 2020\u003C\u002Fkbd> [Uber ATG匹兹堡]\n- [TensorMask：密集目标分割的基础](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.12174) [[笔记](paper_notes\u002Ftensormask.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [单阶段实例分割]\n- [BlendMask：自顶向下结合自底向上实现实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.00309) [[笔记](paper_notes\u002Fblendmask.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- [用于单次射击实例分割的掩码编码](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11712) [[笔记](paper_notes\u002Fmeinst.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [单阶段实例分割，Chunhua Shen]\n- [PolarMask：采用极坐标表示的单次射击实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.13226) [[笔记](paper_notes\u002Fpolarmask.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [单阶段实例分割]\n- [SOLO：按位置分割目标](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.04488) [[笔记](paper_notes\u002Fsolo.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [单阶段实例分割，Chunhua Shen]\n- [SOLOv2：动态、更快、更强](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.10152) [[笔记](paper_notes\u002Fsolov2.md)] [单阶段实例分割，Chunhua Shen]\n- [CondInst：用于实例分割的条件卷积](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.05664) [[笔记](paper_notes\u002Fcondinst.md)] \u003Ckbd>ECCV 2020 口头报告\u003C\u002Fkbd> [单阶段实例分割，Chunhua Shen]\n- [CenterMask：采用点表示的单次射击实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04446) [[笔记](paper_notes\u002Fcentermask.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n\n## 2020-03 (15)\n- [VPGNet: 基于消失点引导的车道线与道路标记检测与识别网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.06288) [[笔记](paper_notes\u002Fvpgnet.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [多任务学习中哪些任务应该一起学习？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.07553) [[笔记](paper_notes\u002Ftask_grouping.md)] [斯坦福大学，MTL] \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [MGDA: 多任务学习作为多目标优化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.04650) \u003Ckbd>NeurIPS 2018\u003C\u002Fkbd>\n- [Taskonomy: 解耦任务迁移学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.08328) [[笔记](paper_notes\u002Ftaskonomy.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [重新思考ImageNet预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08883) [[笔记](paper_notes\u002Frethinking_pretraining.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [何恺明]\n- [UnsuperPoint: 端到端无监督兴趣点检测与描述](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.04011) [[笔记](paper_notes\u002Funsuperpoint.md)] [superpoint]\n- [KP2D: 自监督关键点学习中的神经异常值剔除](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.10615) [[笔记](paper_notes\u002Fkp2d.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd> (pointNet)\n- [KP3D: 用于自我运动估计的自监督3D关键点学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.03426) [[笔记](paper_notes\u002Fkp3d.md)] \u003Ckbd>CoRL 2020\u003C\u002Fkbd> [丰田，superpoint]\n- [NG-RANSAC: 
神经引导的RANSAC：学习在哪里采样模型假设](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.04132) [[笔记](paper_notes\u002Fng_ransac.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [pointNet]\n- [学习寻找良好的对应关系](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.05971) [[笔记](paper_notes\u002Flearning_correspondence.md)] \u003Ckbd>CVPR 2018 口头报告\u003C\u002Fkbd> (pointNet)\n- [RefinedMPL: 面向自动驾驶中3D目标检测的改进单目伪LiDAR](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09712) [[笔记](paper_notes\u002Frefined_mpl.md)] [华为，Mono3D]\n- [DSP: 基于解耦结构化多边形估计和高度引导深度估计的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.01619) [[笔记](paper_notes\u002Fdsp.md)] \u003Ckbd>AAAI 2020\u003C\u002Fkbd> (商汤科技，Mono3D)\n- [利用深度神经网络从连续驾驶场景中鲁棒地检测车道线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.02193) (LLD, LSTM)\n- [LaneNet: 向端到端车道线检测迈进：一种实例分割方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.05591) [[笔记](paper_notes\u002Flanenet.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd> (LaneNet)\n- [3D-LaneNet: 端到端3D多车道线检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FGarnett_3D-LaneNet_End-to-End_3D_Multiple_Lane_Detection_ICCV_2019_paper.pdf) [[笔记](paper_notes\u002F3d_lanenet.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [半局部3D车道线检测与不确定性估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.05257) [[笔记](paper_notes\u002Fsemilocal_3d_lanenet.md)] [GM以色列，3D LLD]\n- [Gen-LaneNet: 一种通用且可扩展的3D车道线检测方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.10656) [[笔记](paper_notes\u002Fgen_lanenet.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Apollo，3D LLD]\n- [交通场景中不确定条件下的人群长期车载预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.09026) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [视角预测]\n- [不仅仅是规模问题：数据特性在行人检测中的作用](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCVW_2018\u002Fpapers\u002F11129\u002FRasouli_Its_Not_All_About_Size_On_the_Role_of_Data_ECCVW_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [行人]\n\n\n## 2020-02 (12)\n- [关联嵌入：用于联合检测与分组的端到端学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.05424) [[笔记](paper_notes\u002Fassociative_embedding.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [通过关联嵌入从像素到图](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.07365) [[笔记](paper_notes\u002Fpixels_to_graphs.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Social LSTM: 人群密集区域中的人类轨迹预测](http:\u002F\u002Fcvgl.stanford.edu\u002Fpapers\u002FCVPR16_Social_LSTM.pdf) [[笔记](paper_notes\u002Fsocial_lstm.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> \n- [使用关联LSTM进行在线视频目标检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FLu__Online_Video_ICCV_2017_paper.pdf) [[笔记](paper_notes\u002Fassociation_lstm.md)] [单阶段，循环]\n- [SuperPoint: 自监督兴趣点检测与描述](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.07629) [[笔记](paper_notes\u002Fsuperpoint.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (通道到像素，深度SLAM，Magic Leap)\n- [PointRend: 将图像分割视为渲染](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08193) [[笔记](paper_notes\u002Fpointrend.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [何恺明，FAIR]\n- [Multigrid: 一种高效训练视频模型的多尺度方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00998) [[笔记](paper_notes\u002Fmultigrid_training.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [何恺明，FAIR]\n- [GhostNet: 以低成本操作获得更多特征](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11907) [[笔记](paper_notes\u002Fghostnet.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FixRes: 解决训练与测试分辨率不一致的问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06423) [[笔记](paper_notes\u002Ffixres.md)] \u003Ckbd>NIPS 2019\u003C\u002Fkbd> [FAIR]\n- [MoVi-3D: 向单目3D目标检测中跨深度的泛化迈进](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.08035) 
[[笔记](paper_notes\u002Fmovi_3d.md)] \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Virtual Cam，视口，Mapillary\u002FFacebook，Mono3D] \n- [自然场景中的非模态补全与大小恒常性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1509.08147) [[笔记](paper_notes\u002Famodal_completion.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd> (非模态补全)\n- [MoCo: 用于无监督视觉表征学习的动量对比](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.05722) [[笔记](paper_notes\u002Fmoco.md)] \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [FAIR，何恺明]\n\n## 2020-01 (19)\n- [双重下降：调和现代机器学习实践与偏差-方差权衡](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.11118) [[笔记](paper_notes\u002Fdouble_descent.md)] \u003Ckbd>PNAS 2019\u003C\u002Fkbd>\n- [深度双重下降：为何更大的模型和更多数据会带来负面影响](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02292) [[笔记](paper_notes\u002Fdeep_double_descent.md)]\n- [神经网络损失景观的可视化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.09913) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [ApolloScape 自动驾驶开放数据集及其应用](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.06184.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (数据集)\n- [ApolloCar3D：面向自动驾驶的大规模3D汽车实例理解基准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.12222) [[笔记](paper_notes\u002Fapollocar3d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [基于单张街景图像的部件级汽车解析与重建](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10837) [[笔记](paper_notes\u002Fapollo_car_parts.md)] [百度]\n- [6D-VNet：从单目RGB图像端到端估计车辆6自由度位姿](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FAutonomous%20Driving\u002FWu_6D-VNet_End-to-End_6-DoF_Vehicle_Pose_Estimation_From_Monocular_RGB_Images_CVPRW_2019_paper.pdf) [[笔记](paper_notes\u002F6d_vnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [RTM3D：面向自动驾驶的基于目标关键点的实时单目3D检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03343) [[笔记](paper_notes\u002Frtm3d.md)] \u003Ckbd>ECCV 2020 spotlight\u003C\u002Fkbd>\n- [DORN：用于单目深度估计的深度序数回归网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.02446) [[笔记](paper_notes\u002Fdorn.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [monodepth, 监督]\n- [D&T：先检测后跟踪，再由跟踪反推检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.03958) [[笔记](paper_notes\u002Fdetect_track.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> (来自Feichtenhofer)\n- [CRF-Net：基于深度学习的雷达与摄像头传感器融合架构，用于目标检测](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8916629\u002F) [[笔记](paper_notes\u002Fcrf_net.md)] \u003Ckbd>SDF 2019\u003C\u002Fkbd> (雷达检测)\n- [RVNet：单目摄像头与雷达的深度传感器融合，用于复杂环境下的基于图像的目标检测](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FVijay_John3\u002Fpublication\u002F335833918_RVNet_Deep_Sensor_Fusion_of_Monocular_Camera_and_Radar_for_Image-based_Obstacle_Detection_in_Challenging_Environments\u002Flinks\u002F5d7f164e92851c87c38b09f1\u002FRVNet-Deep-Sensor-Fusion-of-Monocular-Camera-and-Radar-for-Image-based-Obstacle-Detection-in-Challenging-Environments.pdf) [[笔记](paper_notes\u002Frvnet.md)] \u003Ckbd>PSIVT 2019\u003C\u002Fkbd>\n- [RRPN：用于自动驾驶车辆目标检测的雷达区域建议网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.00526) [[笔记](paper_notes\u002Frrpn_radar.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [ROLO：用于视觉目标跟踪的空间监督循环卷积神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1607.05781) [[笔记](paper_notes\u002Frolo.md)] \u003Ckbd>ISCAS 2016\u003C\u002Fkbd>\n- [循环SSD：用于视频目标检测的多帧单次检测器](https:\u002F\u002Fwww.merl.com\u002Fpublications\u002Fdocs\u002FTR2018-137.pdf) [[笔记](paper_notes\u002Frecurrent_ssd.md)] \u003Ckbd>BMVC 2018\u003C\u002Fkbd> (三菱)\n- [循环RetinaNet：基于焦点损失的视频目标检测模型](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-030-04212-7_44) [[笔记](paper_notes\u002Frecurrent_retinanet.md)] \u003Ckbd>ICONIP 2018\u003C\u002Fkbd> (单阶段，循环)\n- 
[动作即移动点](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.04608) [[笔记](paper_notes\u002Fmoc.md)] [不适合在线使用]\n- [PREVENTION 数据集：一种用于预测车辆意图的新基准](https:\u002F\u002Fdoi.org\u002F10.1109\u002FITSC.2019.8917433) [[笔记](paper_notes\u002Fprevention_dataset.md)] \u003Ckbd>ITSC 2019\u003C\u002Fkbd> [数据集，切入]\n- [用于多模态远距离传感器数据集的半自动高精度标注工具](https:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FIVS.2018.8500672) [[笔记](paper_notes\u002Fprevention_annotation.md)] \u003Ckbd>IV 2018\u003C\u002Fkbd>\n- [Astyx 数据集：用于基于深度学习的3D目标检测的车载雷达数据集](https:\u002F\u002Fwww.astyx.com\u002Ffileadmin\u002Fredakteur\u002Fdokumente\u002FAutomotive_Radar_Dataset_for_Deep_learning_Based_3D_Object_Detection.PDF) [[笔记](paper_notes\u002Fastyx_dataset.md)] \u003Ckbd>EuRAD 2019\u003C\u002Fkbd> (Astyx)\n- [Astyx 摄像头雷达：基于深度学习的车载雷达与摄像头3D目标检测](https:\u002F\u002Fwww.astyx.net\u002Ffileadmin\u002Fredakteur\u002Fdokumente\u002FDeep_Learning_Based_3D_Object_Detection_for_Automotive_Radar_and_Camera.PDF) [[笔记](paper_notes\u002Fastyx_radar_camera_fusion.md)] \u003Ckbd>EuRAD 2019\u003C\u002Fkbd> (Astyx)\n\n## 2019年12月 (12篇)\n- [神经网络如何从单张图像中感知深度？](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002Fvan_Dijk_How_Do_Neural_Networks_See_Depth_in_Single_Images_ICCV_2019_paper.pdf) [[笔记](paper_notes\u002Fwhat_monodepth_see.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [自监督稀疏转稠密：基于LiDAR和单目相机的自监督深度补全](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00275) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> （深度补全）\n- [DC：用于深度补全的深度系数](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05421) [[笔记](paper_notes\u002Fdepth_coeff.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Xiaoming Liu, 多模态]\n- [从一条线中解析几何：结合部分激光观测的单目深度估计](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.02174.pdf) [[笔记](paper_notes\u002Fdepth_from_one_line.md)] \u003Ckbd>ICRA 2017\u003C\u002Fkbd>\n- [VO-Monodepth：利用传统视觉里程计增强自监督单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.03127) [[笔记](paper_notes\u002Fvo_monodepth.md)] \u003Ckbd>3DV 2019\u003C\u002Fkbd> （稀疏转稠密）\n- [概率目标检测：定义与评估](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10800) [[笔记](paper_notes\u002Fpdq.md)]\n- [Fishyscapes基准：衡量语义分割中的盲点](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03215) [[笔记](paper_notes\u002Ffishyscape.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [论现代神经网络的校准](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.04599) [[笔记](paper_notes\u002Fcalib_modern_nn.md)] \u003Ckbd>ICML 2017\u003C\u002Fkbd> （Weinberger）\n- [极端点击法用于高效的目标标注](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.02750) [[笔记](paper_notes\u002Fextreme_clicking.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [雷达与摄像头在高级驾驶辅助系统中的早期融合用于车辆检测](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers\u002FRadar%20and%20Camera%20Early%20Fusion%20for%20Vehicle%20Detection%20in%20Advanced%20Driver%20Assistance%20Systems.pdf) [[笔记](paper_notes\u002Fradar_camera_qcom.md)] \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd> （雷达）\n- [用于高效训练LiDAR 3D目标检测器的深度主动学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10609) [[笔记](paper_notes\u002Fdeep_active_learning_lidar.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd>\n- [C3DPO：用于非刚性运动恢复结构的规范3D姿态网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02533) [[笔记](paper_notes\u002Fc3dpo.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [YOLACT：实时实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02689) [[笔记](paper_notes\u002Fyolact.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [单阶段实例分割]\n- [YOLACT++：更优的实时实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.06218) [单阶段实例分割]\n\n\n## 2019年11月 (20篇)\n- [图像与特征描述子综述](paper_notes\u002Freview_descriptors.md)\n- 
[基于距离-方位-多普勒张量的深度学习进行车载雷达车辆检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FCVRSUAD\u002FMajor_Vehicle_Detection_With_Automotive_Radar_Using_Deep_Learning_on_Range-Azimuth-Doppler_ICCVW_2019_paper.pdf) [[笔记](paper_notes\u002Fradar_fft_qcom.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [GPP：地面平面投票法用于道路上物体的6DoF位姿估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.06666) [[笔记](paper_notes\u002Fgpp.md)] \u003Ckbd>IV 2020\u003C\u002Fkbd> [UCSD, Trevidi, 单目3DOD]\n- [MVRA：用于姿态估计的多视角重投影架构](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FADW\u002FChoi_Multi-View_Reprojection_Architecture_for_Orientation_Estimation_ICCVW_2019_paper.pdf) [[笔记](paper_notes\u002Fmvra.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [YOLOv3：一项渐进式改进](https:\u002F\u002Fpjreddie.com\u002Fmedia\u002Ffiles\u002Fpapers\u002FYOLOv3.pdf)\n- [高斯YOLOv3：一种利用定位不确定性实现精准快速目标检测的自动驾驶专用检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.04620) [[笔记](paper_notes\u002Fgaussian_yolov3.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> （带不确定性的检测）\n- [贝叶斯YOLOv3：单阶段目标检测中的不确定性估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.10296) [[笔记](paper_notes\u002Fbayesian_yolov3.md)] [DriveU]\n- [迈向安全自动驾驶：在用于LiDAR 3D车辆检测的深度神经网络中捕捉不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.05132) [[笔记](paper_notes\u002Ftowards_safe_ad.md)] \u003Ckbd>ITSC 2018\u003C\u002Fkbd> （DriveU）\n- [利用异方差随机不确定性实现鲁棒的实时LiDAR 3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.05590) [[笔记](paper_notes\u002Ftowards_safe_ad2.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd> （DriveU）\n- [我们能信任你吗？关于自动驾驶用概率目标检测器的校准问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12358) [[笔记](paper_notes\u002Ftowards_safe_ad_calib.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd> （DriveU）\n- [LaserNet：一种高效的概率3D目标检测器用于自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.08701) [[笔记](paper_notes\u002Flasernet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> （不确定性）\n- [LaserNet KL：学习一种面向不确定性的自动驾驶目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.11375) [[笔记](paper_notes\u002Flasernet_kl.md)] [带有KL散度的LaserNet]\n- [IoUNet：获取定位置信度以实现精准目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.11590) [[笔记](paper_notes\u002Fiou_net.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [gIoU：广义交并比：用于边界框回归的度量与损失](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.09630) [[笔记](paper_notes\u002Fgiou.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [洛瓦兹Softmax损失：神经网络中优化交并比度量的可处理替代损失函数](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.08790) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [将IoU作为损失]\n- [KL损失：带有不确定性的边界框回归，用于精准目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.08545) [[笔记](paper_notes\u002Fkl_loss.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [CAM-Convs：面向单视图深度的相机感知多尺度卷积](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02028) [[笔记](paper_notes\u002Fcam_conv.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [BayesOD：一种用于深度目标检测器中不确定性估计的贝叶斯方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.03838) [[笔记](paper_notes\u002Fbayes_od.md)]\n- [TW-SMNet：远程宽基线立体匹配的深度多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.04463) [[笔记](paper_notes\u002Ftwsm_net.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [通过校准回归实现深度学习中的精准不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00263) [[笔记](paper_notes\u002Fdl_regression_calib.md)] \u003Ckbd>ICML 2018\u003C\u002Fkbd>\n- [校准目标定位任务中的不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.11210) [[笔记](paper_notes\u002F2dod_calib.md)] \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- 
[SMWA：关于基于CNN的视差估计过平滑问题的研究](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FChen_On_the_Over-Smoothing_Problem_of_CNN_Based_Disparity_Estimation_ICCV_2019_paper.pdf) [[笔记](paper_notes\u002Fsmwa.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [多模态，深度估计]\n- [稀疏转稠密：从稀疏深度样本和单张图像中预测深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.07492) [[笔记](paper_notes\u002Fsparse_to_dense.md)] \u003Ckbd>ICRA 2018\u003C\u002Fkbd> （深度补全）\n\n## 2019-10 (18)\n- [单目目标检测综述](paper_notes\u002Freview_mono_3dod.md)\n- [单目3D目标检测中的2D-3D约束综述](paper_notes\u002Fmono_3dod_2d3d_constraints.md)\n- [MonoGRNet 2：基于关键点几何推理的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.05618) [[笔记](paper_notes\u002Fmonogrnet_russian.md)] [从关键点估计深度]\n- [Deep MANTA：一种从单目图像中进行联合2D和3D车辆分析的粗到精多任务网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.07570) [[笔记](paper_notes\u002Fdeep_manta.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [SS3D：使用交并比损失端到端训练的单目3D目标检测与包围盒拟合](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.08070) [[笔记](paper_notes\u002Fss3d.md)] [从图像回归距离，类似CenterNet]\n- [GS3D：面向自动驾驶的高效3D目标检测框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10955) [[笔记](paper_notes\u002Fgs3d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [M3D-RPN：用于目标检测的单目3D区域建议网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.06038) [[笔记](paper_notes\u002Fm3d_rpn.md)] \u003Ckbd>ICCV 2019 口头报告\u003C\u002Fkbd> [3D锚框，骑行者，Xiaoming Liu]\n- [TLNet：三角测量学习网络——从单目到双目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.01193) [[笔记](paper_notes\u002Ftlnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [自动驾驶应用中的3D目标检测方法综述](http:\u002F\u002Fwrap.warwick.ac.uk\u002F114314\u002F1\u002FWRAP-survey-3D-object-detection-methods-autonomous-driving-applications-Arnold-2019.pdf) [[笔记](paper_notes\u002F3dod_review.md)] \u003Ckbd>TITS 2019\u003C\u002Fkbd> [综述]\n- [BEV-IPM：基于逆透视映射图像的深度学习车辆位置与方向估计](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8814050) [[笔记](paper_notes\u002Fbev_od_ipm.md)] \u003Ckbd>IV 2019\u003C\u002Fkbd>\n- [ForeSeE：面向3D目标检测的任务感知单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.07701) [[笔记](paper_notes\u002Fforesee_mono3dod.md)] \u003Ckbd>AAAI 2020 口头报告\u003C\u002Fkbd> [伪激光雷达的继任者，单目3D目标检测SOTA]\n- [Obj-dist：从单目图像中学习特定于物体的距离](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.04182) [[笔记](paper_notes\u002Fobj_dist_iccv2019.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (xmotors.ai + NYU) [单目距离]\n- [DisNet：一种基于单目相机的距离估计新方法](https:\u002F\u002Fproject.inria.fr\u002Fppniv18\u002Ffiles\u002F2018\u002F10\u002Fpaper22.pdf) [[笔记](paper_notes\u002Fdisnet.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> [单目距离]\n- [BirdGAN：用于自动驾驶车辆3D目标检测的2D到3D提升学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08494) [[笔记](paper_notes\u002Fbirdgan.md)] \u003Ckbd>IROS 2019\u003C\u002Fkbd>\n- [Shift R-CNN：具有闭式几何约束的深度单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.09970) [[笔记](paper_notes\u002Fshift_rcnn.md)] \u003Ckbd>ICIP 2019\u003C\u002Fkbd>\n- [3D-RCNN：通过渲染与比较实现实例级3D目标重建](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FKundu_3D-RCNN_Instance-Level_3D_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002F3d_rcnn.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [用于单目深度估计和3D目标检测的深度光学](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08601) [[笔记](paper_notes\u002Fdeep_optics.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [MonoLoco：单目3D行人定位与不确定性估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06059) [[笔记](paper_notes\u002Fmonoloco.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [联合单目3D车辆检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10742) 
[[笔记](paper_notes\u002Fmono_3d_tracking.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (伯克利DeepDrive)\n- [CasGeo：基于级联几何约束及利用3D结果净化2D检测的自动驾驶车辆3D边界框估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01867) [[笔记](paper_notes\u002Fcasgeo.md)]\n\n## 2019-09 (17)\n- [可裁剪神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.08928) [[笔记](paper_notes\u002Fslimmable_networks.md)] \u003Ckbd>ICLR 2019\u003C\u002Fkbd>\n- [通用可裁剪网络及改进的训练技术](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.05134) [[笔记](paper_notes\u002Funiversal_slimmable.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [AutoSlim：面向通道数的一次性架构搜索](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11728)\n- [Once for All：训练一个网络并将其专门化以实现高效部署](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1908.09791.pdf)\n- [DOTA：用于航空影像目标检测的大规模数据集](https:\u002F\u002Fvision.cornell.edu\u002Fse3\u002Fwp-content\u002Fuploads\u002F2018\u002F03\u002F2666.pdf) [[笔记](paper_notes\u002Fdota.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>（旋转边界框）\n- [RoiTransformer：学习用于航空影像中定向目标检测的RoI Transformer](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FDing_Learning_RoI_Transformer_for_Oriented_Object_Detection_in_Aerial_Images_CVPR_2019_paper.pdf) [[笔记](paper_notes\u002Froi_transformer.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>（旋转边界框）\n- [RRPN：通过旋转提议进行任意方向场景文本检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.01086) \u003Ckbd>TMM 2018\u003C\u002Fkbd>\n- [R2CNN：用于鲁棒定向场景文本检测的旋转区域卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.09579)（旋转边界框）\n- [TI 白皮书：网络研讨会——用于汽车和工业应用的毫米波雷达\n](https:\u002F\u002Ftraining.ti.com\u002Fepd-pro-rap-mmwaveradar-adh-tr-webinar-eu) [[笔记](paper_notes\u002Fti_mmwave_radar_webinar.md)] [TI，雷达]\n- [联邦学习：提升通信效率的策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.05492) [[笔记](paper_notes\u002Ffederated_learning_comm.md)] \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [SORT：简单在线实时跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1602.00763) [[笔记](paper_notes\u002Fsort.md)] \u003Ckbd>ICIP 2016\u003C\u002Fkbd>\n- [Deep SORT：基于深度关联度量的简单在线实时跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.07402) [[笔记](paper_notes\u002Fdeep_sort.md)]\n- [MT-CNN：利用多任务级联卷积网络进行人脸检测与对齐](https:\u002F\u002Fkpzhang93.github.io\u002FMTCNN_face_detection_alignment\u002F) [[笔记](paper_notes\u002Fmtcnn.md)] \u003Ckbd>SPL 2016\u003C\u002Fkbd>（实时，面部关键点）\n- [RetinaFace：野外单阶段密集人脸定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.00641) [[笔记](paper_notes\u002Fretina_face.md)] \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [联合目标与关键点检测]\n- [SC-SfM-Learner：从单目视频中无监督地学习尺度一致的深度与自运动](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.10553) [[笔记](paper_notes\u002Fsc_sfm_learner.md)] \u003Ckbd>NIPS 2019\u003C\u002Fkbd>\n- [SiamMask：快速在线目标跟踪与分割——一种统一的方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.05050) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>（跟踪、分割、标签传播）\n- [卡尔曼滤波器综述](https:\u002F\u002Fwww.bzarg.com\u002Fp\u002Fhow-a-kalman-filter-works-in-pictures\u002F)（来自蒂姆·巴布，皮克斯动画）[[笔记](paper_notes\u002Fkalman_filter.md)]\n- [R-FCN：基于区域的全卷积网络进行目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1605.06409) [[笔记](paper_notes\u002Frfcn.md)] \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [引导反向传播：追求简洁：全卷积网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.6806.pdf) [[笔记](paper_notes\u002Fguided_backprop.md)] \u003Ckbd>ICLR 2015\u003C\u002Fkbd>\n- [Occlusion-Net：使用图网络进行2D\u002F3D遮挡关键点定位](http:\u002F\u002Fwww.cs.cmu.edu\u002F~mvo\u002Findex_files\u002FPapers\u002FONet_19.pdf) [[笔记](paper_notes\u002Focclusion_net.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [大型图像中的Boxy车辆检测](https:\u002F\u002Fboxy-dataset.com\u002Fboxy\u002Findex) 
[[笔记](paper_notes\u002Fboxy.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [FQNet：用于单目3D目标检测的深度拟合度评分网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.12681) [[笔记](paper_notes\u002Ffqnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [单目3D目标检测，陆继文]\n\n## 2019-08 (18)\n- [Mono3D：面向自动驾驶的单目3D目标检测](https:\u002F\u002Fwww.cs.toronto.edu\u002F~urtasun\u002Fpublications\u002Fchen_etal_cvpr16.pdf) [[笔记](paper_notes\u002Fmono3d.md)] \u003Ckbd>CVPR2016\u003C\u002Fkbd>\n- [MonoDIS：解耦单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.12365) [[笔记](paper_notes\u002Fmonodis.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Pseudo lidar-e2e：基于伪LiDAR点云的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.09847) [[笔记](paper_notes\u002Fpseudo_lidar_e2e.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>（采用2D与3D一致性损失的伪LiDAR方法，效果优于PL但逊于PL++，为纯单目3D检测领域的SOTA）\n- [MonoGRNet：用于单目3D目标定位的几何推理网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.10247) [[笔记](paper_notes\u002Fmonogrnet.md)] \u003Ckbd>AAAI 2019\u003C\u002Fkbd>（Mono3DOD领域SOTA，MLF \u003C MonoGRNet \u003C 伪LiDAR）\n- [MLF：基于多级融合的单目图像3D目标检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FXu_Multi-Level_Fusion_Based_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Fmlf.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>（伪LiDAR的前身）\n- [ROI-10D：将2D检测提升至6D位姿与度量尺度形状的单目方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.02781) [[笔记](paper_notes\u002Froi10d.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [AM3D：通过颜色嵌入式3D重建实现高精度单目3D目标检测——面向自动驾驶](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.11444) [[笔记](paper_notes\u002Fam3d.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>【类似伪LiDAR，但加入了颜色增强】\n- [Mono3D++：基于双尺度3D假设与任务先验的单目车辆3D检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.03446) [[笔记](paper_notes\u002Fmono3d++.md)]（由Stefano Soatto提出）\u003Ckbd>AAAI 2019\u003C\u002Fkbd>\n- [用于交通信号灯到车道分配的深度元数据融合](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=8613841) [[笔记](paper_notes\u002Fdeep_lane_association.md)] \u003Ckbd>IEEE RA-L 2019\u003C\u002Fkbd>（交通信号灯关联）\n- [复杂交叉口下交通信号灯到自车车道的自动关联](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8569421) \u003Ckbd>ITSC 2019\u003C\u002Fkbd>（交通信号灯关联）\n- [基于雷达与视觉的远距离车辆检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10951)[[笔记](paper_notes\u002Fdistant_object_radar.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd>【雷达、视觉及雷达轨迹片段融合】\n- [基于车辆姿态信息的单目距离估计](https:\u002F\u002Fiopscience.iop.org\u002Farticle\u002F10.1088\u002F1742-6596\u002F1168\u002F3\u002F032040\u002Fpdf) [[笔记](paper_notes\u002Fdistance_estimation_pose_radar.md)]\n- [利用不确定性加权损失进行场景几何与语义的多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.07115) [[笔记](paper_notes\u002Funcertainty_multitask.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>（Alex Kendall）\n- [GradNorm：用于深度多任务网络中自适应损失平衡的梯度归一化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.02257) [[笔记](paper_notes\u002Fgradnorm.md)] \u003Ckbd>ICML 2018\u003C\u002Fkbd>（多任务）\n- [DTP：多任务学习中的动态任务优先级设置](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMichelle_Guo_Focus_on_the_ECCV_2018_paper.pdf) [[笔记](paper_notes\u002Fdtp.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>【多任务，斯坦福大学】\n- [这辆车会变道吗？——频域中的转向灯识别](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F6856477\u002F) [[笔记](paper_notes\u002Ftsl_frequency.md)] \u003Ckbd>IV 2014\u003C\u002Fkbd>\n- [Complex-YOLO：点云上的实时3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.06199) [[笔记](paper_notes\u002Fcomplex_yolo.md)]（仅BEV检测）\n- [Complexer-YOLO：语义点云上的实时3D目标检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07537) \u003Ckbd>CVPR 
2019\u003C\u002Fkbd>（传感器融合与跟踪）\n- [卷积神经网络的一个有趣缺陷及CoordConv解决方案](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.03247) [[笔记](paper_notes\u002Fcoord_conv.md)] \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n\n## 2019-07 (19)\n- [深度参数化连续卷积神经网络](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FWang_Deep_Parametric_Continuous_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Fparametric_cont_conv.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (@Uber, 传感器融合)\n- [ContFuse: 用于多传感器3D目标检测的深度连续融合](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FMing_Liang_Deep_Continuous_Fusion_ECCV_2018_paper.pdf) [[笔记](paper_notes\u002Fcontfuse.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [Uber ATG, 传感器融合, BEV]\n- [快与狂：基于单个卷积网络的实时端到端3D检测、跟踪和运动预测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FLuo_Fast_and_Furious_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Ffaf.md)] \u003Ckbd>CVPR 2018 口头报告\u003C\u002Fkbd> [仅激光雷达, 感知与预测]\n- [LearnK: 来自野外视频的深度：来自未知相机的无监督单目深度学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.04998.pdf) [[笔记](paper_notes\u002Flearnk.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [单目深度估计, 内参估计, SOTA]\n- [monodepth: 具有左右一致性约束的无监督单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.03677) [[笔记](paper_notes\u002Fmonodepth.md)] \u003Ckbd>CVPR 2017 口头报告\u003C\u002Fkbd> (单目深度估计, 使用立体匹配进行训练)\n- [Struct2depth: 无需传感器的深度预测：利用结构信息从单目视频中进行无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.06152) [[笔记](paper_notes\u002Fstruct2depth.md)] \u003Ckbd>AAAI 2019\u003C\u002Fkbd> [单目深度估计, 动态物体运动估计, 无穷远深度问题, 在线微调]\n- [基于边缘感知的深度-法线一致性约束的无监督几何学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.03665.pdf) [[笔记](paper_notes\u002Fedge_aware_depth_normal.md)] \u003Ckbd>AAAI 2018\u003C\u002Fkbd> (单目深度估计, 静态假设, 表面法线)\n- [LEGO：通过观看视频一次性学习边缘与几何](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.05648.pdf) [[笔记](paper_notes\u002Flego.md)] \u003Ckbd>CVPR 2018 点亮论文\u003C\u002Fkbd> (单目深度估计, 静态假设, 表面法线)\n- [基于全卷积网络的FMCW雷达目标检测与3D估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.05394) [[笔记](paper_notes\u002Fradar_3d_od_fcn.md)] (雷达, RD图, 目标检测, Arxiv 201902)\n- [基于深度神经网络的雷达目标检测研究](https:\u002F\u002Fwww.researchgate.net\u002Fpublication\u002F330748053_A_Study_on_Radar_Target_Detection_Based_on_Deep_Neural_Networks) [[笔记](paper_notes\u002Fradar_target_detection_tsinghua.md)] (雷达, RD图, 目标检测)\n- [使用PointNets在雷达数据中进行2D车辆检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08414) [[笔记](paper_notes\u002Fradar_detection_pointnet.md)] (来自乌尔姆大学, 雷达, 点云, 目标检测, Arxiv 201904)\n- [神经网络中针对分布外检测的学习置信度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.04865) [[笔记](paper_notes\u002Flearning_ood_conf.md)] (预算作弊)\n- [交通信号灯的深度学习方法：检测、跟踪与分类](assets\u002Fpapers\u002Fbosch_traffic_lights.pdf) [[笔记](paper_notes\u002Fbosch_traffic_lights.md)] \u003Ckbd>ICRA 2017\u003C\u002Fkbd> (博世, 交通信号灯)\n- [这能有多难？估计图像中视觉搜索的难度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.08280) [[笔记](paper_notes\u002Fhow_hard_can_it_be.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [面向自动驾驶的深度多模态目标检测与语义分割：数据集、方法与挑战](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.07830.pdf) [[笔记](paper_notes\u002Fdeep_fusion_review.md)] (博世综述)\n- [单目3D目标检测综述](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F57029694) (知乎博客)\n- [Deep3dBox: 基于深度学习和几何的3D边界框估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.00496) [[笔记](paper_notes\u002Fdeep3dbox.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [Zoox]\n- [MonoPSR: 利用精确提案和形状重建的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01690) [[笔记](paper_notes\u002Fmonopsr.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [OFT: 
用于单目3D目标检测的正射特征变换](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.08188) [[笔记](paper_notes\u002Foft.md)] \u003Ckbd>BMVC 2019\u003C\u002Fkbd> [将相机转换为BEV, Alex Kendall]\n\n\n## 2019-06 (12)\n- [MixMatch: 半监督学习的整体方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02249) [[笔记](paper_notes\u002FMixMatch.md)]\n- [EfficientNet: 重新思考卷积神经网络的模型缩放](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.11946.pdf) [[笔记](paper_notes\u002Fefficientnet.md)] \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [我们在计算机视觉的贝叶斯深度学习中需要哪些不确定性？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.04977) [[笔记](paper_notes\u002Funcertainty_bdl.md)] \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [Bayesian SegNet: 场景理解中深度卷积编码器-解码器架构中的模型不确定性](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1511.02680.pdf) [[笔记](paper_notes\u002Fbayesian_segnet.md)] \u003Ckbd>BMVC 2017\u003C\u002Fkbd>\n- [TrafficPredict: 异构交通参与者的轨迹预测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.02146.pdf) [[笔记](paper_notes\u002Ftrafficpredict.md)] \u003Ckbd>AAAI 2019 口头报告\u003C\u002Fkbd>\n- [单张RGB-D图像的深度补全](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.09326.pdf) [[笔记](paper_notes\u002Fdeep_depth_completion_rgbd.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (室内)\n- [DeepLiDAR: 基于稀疏激光雷达数据和单张彩色图像的室外场景表面法向引导深度预测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.00488v2.pdf) [[笔记](paper_notes\u002Fdeeplidar.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (室外)\n- [SfMLearner: 从视频中无监督学习深度与自身运动](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~tinghuiz\u002Fprojects\u002FSfMLearner\u002Fcvpr17_sfm_final.pdf) [[笔记](paper_notes\u002Fsfm_learner.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [Monodepth2: 深入探讨自监督单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.01260) [[笔记](paper_notes\u002Fmonodepth2.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Niantic]\n- [DeepSignals: 通过视觉信号预测驾驶员意图](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01333.pdf) [[笔记](paper_notes\u002Fdeep_signals.md)] \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (@Uber, 转向灯检测)\n- [FCOS: 全卷积一阶段目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01355) [[笔记](paper_notes\u002Ffcos.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Chunhua Shen]\n- [Pseudo-LiDAR++: 自动驾驶中3D目标检测的精确深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.06310) [[笔记](paper_notes\u002Fpseudo_lidar++.md)] \u003Ckbd>ICLR 2020\u003C\u002Fkbd>\n- [MMF: 用于3D目标检测的多任务多传感器融合](http:\u002F\u002Fwww.cs.toronto.edu\u002F~byang\u002Fpapers\u002Fmmf.pdf) [[笔记](paper_notes\u002Fmmf.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (@Uber, 传感器融合)\n\n## 2019年5月 (18篇)\n- [CenterNet：将目标视为点](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07850)（来自ExtremeNet作者）[[笔记](paper_notes\u002Fcenternet.md)]\n- [CenterNet：基于关键点三元组的目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08189) [[笔记](paper_notes\u002Fcenternet_cas.md)]\n- [基于区域分解与组装的目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.08225) [[笔记](paper_notes\u002Fobject_detection_region_decomposition.md)] \u003Ckbd>AAAI 2019 \u003C\u002Fkbd>\n- [彩票假说：寻找稀疏且可训练的神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03635) [[笔记](paper_notes\u002Flottery_ticket_hypothesis.md)] \u003Ckbd>ICLR 2019 \u003C\u002Fkbd>\n- [M2Det：基于多级特征金字塔网络的单阶段目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04533) [[笔记](paper_notes\u002Fm2det.md)] \u003Ckbd>AAAI 2019 \u003C\u002Fkbd>\n- [深度雷达检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.12187) [[笔记](paper_notes\u002Fdeep_radar_detector.md)] \u003Ckbd>RadarCon 2019\u003C\u002Fkbd>\n- [雷达点云上的语义分割](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8455344) [[笔记](paper_notes\u002Fradar_point_semantic_seg.md)]（来自戴姆勒公司）\u003Ckbd>FUSION 
2018\u003C\u002Fkbd>\n- [用于高效卷积神经网络的滤波器剪枝](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.08710.pdf) [[笔记](paper_notes\u002Fpruning_filters.md)] \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [面向资源受限卷积神经网络的层补偿式剪枝](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.00518.pdf) [[笔记](paper_notes\u002Flayer_compensated_pruning.md)] \u003Ckbd>NIPS 2018演讲\u003C\u002Fkbd>\n- [LeGR：通过学习的全局排序进行滤波器剪枝](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.12368.pdf) [[笔记](paper_notes\u002Flegr.md)] \u003Ckbd>CVPR 2020口头报告\u003C\u002Fkbd>\n- [NAS-FPN：为目标检测学习可扩展的特征金字塔架构](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07392.pdf) [[笔记](paper_notes\u002Fnas_fpn.md)] \u003Ckbd>CVPR 2019 \u003C\u002Fkbd>\n- [AutoAugment：从数据中学习增强策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.09501) [[笔记](paper_notes\u002Fautoaugment.md)] \u003Ckbd>CVPR 2019 \u003C\u002Fkbd>\n- [用于实例分割的路径聚合网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.01534.pdf) [[笔记](paper_notes\u002Fpanet.md)] \u003Ckbd>CVPR 2018 \u003C\u002Fkbd>\n- [用于加速超深神经网络的通道剪枝](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1707.06168.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>（旷视科技，何毅辉）[[笔记](paper_notes\u002Fchannel_pruning_megvii.md)]\n- [AMC：面向移动设备的模型压缩与加速自动化机器学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.03494.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>（韩松，何毅辉）\n- [MobileNetV3：搜索MobileNetV3](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.02244) [[笔记](paper_notes\u002Fmobilenets_v3.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [MnasNet：面向移动端的平台感知型神经架构搜索](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.11626.pdf) [[笔记](paper_notes\u002Fmnasnet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [重新思考网络剪枝的价值](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.05270.pdf) \u003Ckbd>ICLR 2019\u003C\u002Fkbd>\n\n## 2019年4月 (12篇)\n- [MobileNetV2：倒残差与线性瓶颈](https:\u002F\u002Farxiv.org\u002Fabs\u002F1801.04381)（MobileNets v2）[[笔记](paper_notes\u002Fmobilenets_v2.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [一种新的性能度量与道路检测算法评估基准](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FFritsch2013ITSC.pdf) [[笔记](paper_notes\u002Fkitti_lane.md)] \u003Ckbd>ITSC 2013\u003C\u002Fkbd>\n- [MultiNet：面向自动驾驶的实时联合语义推理](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.07695.pdf) [[笔记](paper_notes\u002Fmultinet_raquel.md)]\n- [利用图像难度预测优化单阶段与双阶段目标检测器之间的权衡](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08707.pdf)（对1阶段和2阶段目标检测有很好的说明）\n- [Light-Head R-CNN：捍卫双阶段目标检测器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.07264.pdf) [[笔记](paper_notes\u002Flighthead_rcnn.md)]（来自旷视科技）\n- [CSP：高层语义特征检测——行人检测的新视角](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.02948) [[笔记](paper_notes\u002Fcsp_pedestrian.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [中心与尺度预测、无锚框，接近SOTA水平的行人检测]\n- [无锚框方法综述（知乎博客）目标检测：Anchor-Free时代](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F62103812) [Anchor free深度学习的目标检测方法](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F64563186) [关于CSP的我的幻灯片](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F1_dUfxv63108bZXUnVYPIOAdEIkRZw5BR9-rOp-Ni0X0\u002F)\n- [DenseBox：统一地标定位与端到端目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.04874.pdf)\n- [CornerNet：将目标检测为成对的关键点](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.01244.pdf) [[笔记](paper_notes\u002Fcornernet.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [ExtremeNet：通过聚类极端点和中心点进行自下而上的目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.08043.pdf) [[笔记](paper_notes\u002Fextremenet.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [FSAF：用于单阶段目标检测的特征选择性无锚模块](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.00621.pdf) [[笔记](paper_notes\u002Ffsaf_detection.md)] \u003Ckbd>CVPR 
2019\u003C\u002Fkbd>\n- [FoveaBox：超越基于锚框的目标检测器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.03797v1.pdf)（无锚框）[[笔记](paper_notes\u002Ffoveabox.md)]\n\n## 2019-03 (19)\n- [用于目标检测神经网络训练的免费工具包](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.04103.pdf) [[笔记](paper_notes\u002Fbag_of_freebies_object_detection.md)]\n- [mixup：超越经验风险最小化](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.09412.pdf) [[笔记](paper_notes\u002Fmixup.md)] \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [用于3D形状识别的多视角卷积神经网络](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_iccv_2015\u002Fpapers\u002FSu_Multi-View_Convolutional_Neural_ICCV_2015_paper.pdf) (MVCNN) [[笔记](paper_notes\u002Fmvcnn.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd> \n- [3D ShapeNets：体素化形状的深度表示](http:\u002F\u002F3dshapenets.cs.princeton.edu\u002Fpaper.pdf) [[笔记](paper_notes\u002F3d_shapenets.md)] \u003Ckbd>CVPR 2015\u003C\u002Fkbd>\n- [用于3D数据上物体分类的体素化与多视角CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03265.pdf) [[笔记](paper_notes\u002Fvol_vs_mvcnn.md)] \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [组归一化](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08494.pdf) [[笔记](paper_notes\u002Fgroupnorm.md)] \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [空间变换网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1506.02025.pdf) [[笔记](paper_notes\u002Fstn.md)] \u003Ckbd>NIPS 2015\u003C\u002Fkbd>\n- [基于RGB-D数据的3D目标检测的Frustum PointNets](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.08488.pdf) (F-PointNet) [[笔记](paper_notes\u002Ffrustum_pointnet.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd> \n- [用于点云学习的空间图卷积网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.07829.pdf) [[笔记](paper_notes\u002Fedgeconv.md)]\n- [PointRCNN：基于点云的3D目标提案生成与检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.04244) (3D目标检测SOTA) [[笔记](paper_notes\u002Fpoint_rcnn.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [MV3D：面向自动驾驶的多视角3D目标检测网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.07759) [[笔记](paper_notes\u002Fmv3d.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd> (百度，传感器融合，BV提案)\n- [AVOD：视图聚合下的联合3D提案生成与目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.02294) [[笔记](paper_notes\u002Favod.md)] \u003Ckbd>IROS 2018\u003C\u002Fkbd> (传感器融合，多视角提案)\n- [MobileNets：适用于移动视觉应用的高效卷积神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.04861) [[笔记](paper_notes\u002Fmobilenets.md)]\n- [基于视觉深度估计的伪LiDAR：弥合自动驾驶3D目标检测中的鸿沟](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.07179) [[笔记](paper_notes\u002Fpseudo_lidar.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [VoxelNet：基于点云的3D目标检测端到端学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.06396.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (苹果，首个端到端点云编码到网格的方法)\n- [SECOND：稀疏嵌入式卷积检测](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F18\u002F10\u002F3337\u002Fpdf) \u003Ckbd>Sensors 2018\u003C\u002Fkbd> (基于VoxelNet)\n- [PointPillars：用于点云目标检测的快速编码器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.05784) [[笔记](paper_notes\u002Fpoint_pillars.md)] \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (基于SECOND)\n- [我们准备好迎接自动驾驶了吗？KITTI视觉基准套件](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FGeiger2012CVPR.pdf) [[笔记](paper_notes\u002Fkitti.md)] \u003Ckbd>CVPR 2012\u003C\u002Fkbd>\n- [视觉遇见机器人：KITTI数据集](http:\u002F\u002Fww.cvlibs.net\u002Fpublications\u002FGeiger2013IJRR.pdf) [[笔记](paper_notes\u002Fkitti.md)] \u003Ckbd>IJRR 2013\u003C\u002Fkbd>\n\n\n## 2019-02 (9)\n- [动作识别何去何从？一种新模型与Kinetics数据集](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.07750.pdf) (I3D) [[笔记](paper_notes\u002Fquo_vadis_i3d.md)]\u003Ckbd>视频\u003C\u002Fkbd> \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- 
[时空卷积神经网络的初始化策略](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1503.07274.pdf) [[笔记](paper_notes\u002Fquo_vadis_i3d.md)] \u003Ckbd>视频\u003C\u002Fkbd>\n- [检测与跟踪：视频中高效的姿态估计](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.09184.pdf) [[笔记](paper_notes\u002Fquo_vadis_i3d.md)] \u003Ckbd>ICCV 2017\u003C\u002Fkbd> \u003Ckbd>视频\u003C\u002Fkbd>\n- [基于深度学习的肋骨中心线提取与标注](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1809.07082) [[笔记](paper_notes\u002Frib_centerline_philips.md)] \u003Ckbd>医学影像\u003C\u002Fkbd> \u003Ckbd>MICCAI 2018\u003C\u002Fkbd>\n- [用于视频识别的SlowFast网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03982.pdf) [[笔记](paper_notes\u002Fslowfast.md)] \u003Ckbd>ICCV 2019口头报告\u003C\u002Fkbd>\n- [深度神经网络的聚合残差变换](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.05431.pdf) (ResNeXt) [[笔记](paper_notes\u002Fresnext.md)] \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [超越像素平面：3D中的感知与学习](https:\u002F\u002Fthegradient.pub\u002Fbeyond-the-pixel-plane-sensing-and-learning-in-3d\u002F) (博客，[中文版](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F44386618))\n- [VoxNet：用于实时目标识别的3D卷积神经网络](https:\u002F\u002Fwww.ri.cmu.edu\u002Fpub_files\u002F2015\u002F9\u002Fvoxnet_maturana_scherer_iros15.pdf) (VoxNet) [[笔记](paper_notes\u002Fvoxnet.md)]\n- [PointNet：面向3D分类与分割的点集深度学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.00593.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [[笔记](paper_notes\u002Fpointnet.md)]\n- [PointNet++：度量空间中点集上的深度层次特征学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.02413.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd> [[笔记](paper_notes\u002Fpointnet++.md)]\n- [几何深度学习前沿综述（来自知乎）](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F36888114) (截至CVPR 2018)\n\n\n## 2019-01 (10)\n- [DQN：通过深度强化学习实现人类水平控制（Nature DQN论文）](https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fdqn\u002FDQNNaturePaper.pdf) [[笔记](paper_notes\u002Fnature_dqn_paper.md)] \u003Ckbd>强化学习\u003C\u002Fkbd>\n- [Retina U-Net：针对医学目标检测的分割监督的简单利用](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1811.08661.pdf) [[笔记](paper_notes\u002Fretina_unet.md)] \u003Ckbd>医学影像\u003C\u002Fkbd>\n- [全景分割](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.00868.pdf) [[笔记](paper_notes\u002Fpanoptic_segmentation.md)] \u003Ckbd>全景分割\u003C\u002Fkbd>\n- [全景特征金字塔网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.02446.pdf) [[笔记](paper_notes\u002Fpanoptic_fpn.md)] \u003Ckbd>全景分割\u003C\u002Fkbd> \n- [注意力引导的全景分割统一网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.03904.pdf) [[笔记](paper_notes\u002FAUNet_panoptic.md)] \u003Ckbd>全景分割\u003C\u002Fkbd>\n- [用于卷积神经网络图像分类的技巧大全](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.01187.pdf) [[笔记](paper_notes\u002Fbag_of_tricks_cnn.md)] \u003Ckbd>图像分类\u003C\u002Fkbd>\n- [用于多模态3D体积中血管中心线追踪的深度强化学习](https:\u002F\u002Flink.springer.com\u002Fchapter\u002F10.1007\u002F978-3-030-00937-3_86) [[笔记](paper_notes\u002Fdrl_vessel_centerline.md)] \u003Ckbd>强化学习\u003C\u002Fkbd> \u003Ckbd>医学影像\u003C\u002Fkbd>\n- [用于Flappy Bird的深度强化学习](http:\u002F\u002Fcs229.stanford.edu\u002Fproj2015\u002F362_report.pdf) [[笔记](paper_notes\u002Fdrl_flappy.md)] \u003Ckbd>强化学习\u003C\u002Fkbd>\n- [用于详细视频理解的长期特征库](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05038.pdf) [[笔记](paper_notes\u002Flong_term_feat_bank.md)] \u003Ckbd>视频\u003C\u002Fkbd> \n- [非局部神经网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.07971.pdf) [[笔记](paper_notes\u002Fnon_local_net.md)] \u003Ckbd>视频\u003C\u002Fkbd> \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n\n## 2018年\n- [Mask R-CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1703.06870.pdf)\n- [Cascade R-CNN：深入高质量目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.00726.pdf)\n- 
[密集目标检测中的焦点损失](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1708.02002.pdf)（RetinaNet）[[笔记](paper_notes\u002Ffocal_loss.md)]\n- [挤压与激励网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1709.01507)（SENet）\n- [用于提升质量、稳定性和多样性的渐进式GAN训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10196.pdf)\n- [可变形卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.06211) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [基于R-FCN]\n- [学习用于目标检测的区域特征](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.07066.pdf)\n\n## 2017年及之前\n- [深度学习学习笔记](Learning_notes.md)\n- [机器学习论文列表](List_of_Machine_Learning_Papers.md)\n- [计算机视觉中CNN文献综述笔记](paper_notes\u002Fcnn_papers.md) 这是推荐列表中所有论文的笔记，该列表位于[这里](papers_and_books_to_start.md)\n- [其他文献综述笔记](misc.md)\n- [搭建深度学习\u002F机器学习环境的笔记](ML_DL_environment_Setup.md)\n- [有用的安装记录](installation_log.md)\n\n## 待读论文\n以下是等待阅读的论文列表。\n### 深度学习一般\n- [SqueezeDet：统一、小型、低功耗的全卷积神经网络，用于自动驾驶的实时目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.01051.pdf)\n- [准确的大批量SGD：1小时内训练ImageNet](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.02677.pdf)\n- [经过ImageNet训练的CNN偏向于纹理；增加形状偏好可提高准确性和鲁棒性](https:\u002F\u002Fopenreview.net\u002Fforum?id=Bygh9j09KX) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [用局部特征袋模型近似CNN，在ImageNet上效果出奇的好](https:\u002F\u002Fopenreview.net\u002Fforum?id=SkfMWhAqYQ)（BagNet）[博客](https:\u002F\u002Fblog.evjang.com\u002F2019\u002F02\u002Fbagnet.html) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [神经网络超参数的规范方法：第一部分——学习率、批量大小、动量和权重衰减](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.09820v2.pdf)\n- [理解深度学习需要重新思考泛化问题](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.03530.pdf)\n- [梯度反转：通过反向传播实现无监督领域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F1409.7495) \u003Ckbd>ICML 2015\u003C\u002Fkbd>\n\n### 自我训练\n- [重新思考预训练与自我训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06882) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [Quoc Le]\n\n### 2D目标检测与分割\n- [Mask Scoring R-CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.00241.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [使用在线难例挖掘训练基于区域的目标检测器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03540.pdf)\n- [滑动顶点在水平边界框上的多方向目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09358)\n- [ONCE：增量式少样本目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04668) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [面向野外场景的目标检测的领域自适应Faster R-CNN](https:\u002F\u002Farxiv.org\u002Fabs\u002F1803.03243) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [雾天Cityscapes：利用合成数据进行语义雾天场景理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.07819) \u003Ckbd>IJCV 2018\u003C\u002Fkbd>\n- [ECCV雾天Cityscapes：结合合成与真实数据的模型适配，用于语义密集雾天场景理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.01265) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [用于开放集条件下稳健目标检测的Dropout采样](https:\u002F\u002Farxiv.org\u002Fabs\u002F1710.06677) \u003Ckbd>ICRA 2018\u003C\u002Fkbd>（Niko Sünderhauf）\n- [实例分割的混合任务级联](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.07518) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>（级联掩码RCNN）\n- [评估基于采样的不确定性技术在目标检测中的合并策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F1809.06006) \u003Ckbd>ICRA 2019\u003C\u002Fkbd>（Niko Sünderhauf）\n- [统一的全景分割网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1901.03784.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> \u003Ckbd>PanSeg\u003C\u002Fkbd>\n- [模型对图像变换集合中分布偏移的脆弱性](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1903.11900.pdf)（CVPR研讨会）[简要说明](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FMachineLearning\u002Fcomments\u002Fb81uwq\u002Fr_model_vulnerability_to_distributional_shifts\u002F)\n- [利用自我训练使目标检测器自动适应新领域](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.07305.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>（寻找边界情况并增强）\n- 
[目标检测中的缺失标签](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FWeakly%20Supervised%20Learning%20for%20Real-World%20Computer%20Vision%20Applications\u002FXu_Missing_Labels_in_Object_Detection_CVPRW_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [DenseBox：将地标定位与端到端目标检测统一起来](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1509.04874.pdf)\n- [针对2D LIDAR数据的极坐标系圆形目标检测](https:\u002F\u002Fwww.researchgate.net\u002Fpublication\u002F309365539_Circular_Object_Detection_in_Polar_Coordinates_for_2D_LIDAR_DataCCPR2016) \u003Ckbd>CCPR 2016\u003C\u002Fkbd>\n- [LFFD：适用于边缘设备的轻量级快速人脸检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10633) [轻量化、人脸检测、车辆检测]\n- [UnitBox：先进的目标检测网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1608.01471) \u003Ckbd>ACM MM 2016\u003C\u002Fkbd> [Ln IoU损失，Thomas Huang]\n\n\n### 鱼眼镜头\n- [自动驾驶中鱼眼相机上的通用目标检测：数据集、表示方法与基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02124) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n\n### 视频理解\n- [利用3D卷积网络学习时空特征](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.0767.pdf) (C3D)  \u003Ckbd>视频 \u003C\u002Fkbd>\u003Ckbd>ICCV 2015 \u003C\u002Fkbd>\n- [AVA：一个时空局部化的原子视觉动作视频数据集](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1705.08421.pdf)\n- [用于视频动作识别的时空残差网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.02155.pdf) (解耦时空) \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [利用伪3D残差网络学习时空表示](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.10305.pdf) (P3D，解耦时空) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [更深入地研究用于动作识别的时空卷积](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.11248.pdf) (解耦时空) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [重新思考时空特征学习：视频分类中的速度-精度权衡](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.04851.pdf) (解耦时空) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [时空3D CNN能否重演2D CNN和ImageNet的历史？](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.09577.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [AGSS-VOS：注意力引导的单次视频目标分割](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FLin_AGSS-VOS_Attention_Guided_Single-Shot_Video_Object_Segmentation_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [单次视频目标分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.05198) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [快看与慢看：记忆引导的移动端视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.10172) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [迈向高性能视频目标检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FZhu_Towards_High_Performance_CVPR_2018_paper.pdf) [[笔记](paper_notes\u002Fhigh_performance_video_od.md)] \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [面向移动端的高性能视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.05830) [[笔记](paper_notes\u002Fhigh_performance_video_od_mobile.md)]\n- [用于快速视频语义分割的时序分布式网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01800) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [高效视频分割]\n- [内存增强的全局-局部聚合用于视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.12063) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [高效视频目标检测]\n- [基于骨骼数据的动作识别与检测中的共现特征学习及层次聚合](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.06055) \u003Ckbd>IJCAI 2018 口头报告\u003C\u002Fkbd> [视频骨骼]\n- [RST-MODNet：自动驾驶中的实时时空移动目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00438) \u003Ckbd>NeurIPS 2019 研讨会\u003C\u002Fkbd>\n- [用于视觉识别与描述的长期循环卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.4389) \u003Ckbd>CVPR 2015 口头报告\u003C\u002Fkbd>\n- [时间片段网络：迈向深度动作识别的良好实践](https:\u002F\u002Farxiv.org\u002Fabs\u002F1608.00859) \u003Ckbd>ECCV 2016\u003C\u002Fkbd>\n- [TRN：视频中的时间关系推理](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.08496) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- 
[X3D：扩展架构以实现高效的视频识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04730) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [FAIR]\n- [强遮挡行人的时间-上下文增强检测](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FWu_Temporal-Context_Enhanced_Detection_of_Heavily_Occluded_Pedestrians_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [行人，视频]\n- [流引导的特征聚合用于视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.10025) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [视频，目标检测]\n- [利用时间卷积和半监督训练进行视频中的人体3D姿态估计](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FPavllo_3D_Human_Pose_Estimation_in_Video_With_Temporal_Convolutions_and_CVPR_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [从视频中进行单目3D姿态估计]\n- [OmegaNet：从视频中提炼语义以实现全面场景理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14030) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [基于管状提议网络的视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1702.06355) \u003Ckbd>CVPR 2017\u003C\u002Fkbd> [视频目标检测]\n- [T-CNN：利用卷积神经网络对管状体进行视频目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1604.02532) [视频目标检测]\n\n\n### 剪枝与压缩\n- [基于模型压缩的高效深度学习推理](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018_workshops\u002Fpapers\u002Fw33\u002FZhang_Efficient_Deep_Learning_CVPR_2018_paper.pdf) (模型压缩)\n- [神经网络蒸馏器](https:\u002F\u002Fintellabs.github.io\u002Fdistiller\u002Falgo_pruning.html) [英特尔]\n\n\n### 架构改进\n- [全卷积网络中的空间与通道并行挤压与激励机制](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.02579.pdf)\n- [CBAM：卷积块注意力模块](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.06521.pdf)\n\n### 强化学习\n- [使用深度强化学习玩Atari游戏](https:\u002F\u002Fwww.cs.toronto.edu\u002F~vmnih\u002Fdocs\u002Fdqn.pdf) \u003Ckbd>NIPS 2013 \u003C\u002Fkbd>\n- [多尺度深度强化学习用于CT扫描中的实时3D地标检测](http:\u002F\u002Fcomaniciu.net\u002FPapers\u002FMultiscaleDeepReinforcementLearning_PAMI18.pdf)\n- [用于鲁棒图像配准的人工智能代理](https:\u002F\u002Fwww.aaai.org\u002Focs\u002Findex.php\u002FAAAI\u002FAAAI17\u002Fpaper\u002Fdownload\u002F14751\u002F14296)\n\n### 3D感知\n- [3D-CNN：用于从激光雷达数据中检测着陆区的三维卷积神经网络](https:\u002F\u002Fwww.ri.cmu.edu\u002Fpub_files\u002F2015\u002F3\u002Fmaturana-root.pdf)\n- [基于卷积神经网络的生成与判别体素建模](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.04236.pdf)\n- [面向3D目标识别的方向增强体素网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.03351.pdf) (ORION) \u003Ckbd>BMVC 2017\u003C\u002Fkbd>\n- [GIFT：一个实时且可扩展的3D形状搜索引擎](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1604.01879.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd>\n- [基于投影卷积网络的3D形状分割](https:\u002F\u002Fpeople.cs.umass.edu\u002F~kalo\u002Fpapers\u002Fshapepfcn\u002F) (ShapePFCN) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [利用多视角卷积网络从部件对应关系中学习局部形状描述子](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.04496.pdf)\n- [Open3D：现代3D数据处理库](http:\u002F\u002Fwww.open3d.org\u002Fwordpress\u002Fwp-content\u002Fpaper.pdf)\n- [用于鲁棒RGB-D目标识别的多模态深度学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1507.06821.pdf) \u003Ckbd>IROS 2015\u003C\u002Fkbd>\n- [FlowNet3D：在三维点云中学习场景光流](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1806.01411.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [通过核相关和图池化挖掘点云局部结构](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1712.06760.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (Neighbors Do Help: Deeply Exploiting Local Structures of Point Clouds)\n- [PU-Net：点云上采样网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.06761.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [用于点云3D分割的循环切片网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.04402.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- 
[SPLATNet：用于点云处理的稀疏格网网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.08275.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [基于快速局部化谱滤波的图卷积神经网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.09375.pdf) \u003Ckbd>NIPS 2016\u003C\u002Fkbd>\n- [基于图卷积网络的半监督分类](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1609.02907.pdf) \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [基于递归多图神经网络的几何矩阵补全](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.06803.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n- [图注意力网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1710.10903.pdf) \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [3D-SSD：从RGB-D图像中学习层次特征，用于无遮挡3D目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.00238.pdf) (3D SSD)\n- [逃离细胞：用于识别3D点云模型的深度Kd网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1704.01222.pdf) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [使用3D编码器-预测器CNN和形状合成进行形状补全](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1612.00101.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [IPOD：基于密集点的点云目标检测器](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.05276.pdf)\n- [3D目标的无遮挡检测：从RGB-深度图像中的2D边界框推断3D边界框](https:\u002F\u002Fcis.temple.edu\u002F~latecki\u002FPapers\u002FDengCVPR2017.pdf) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [RGB-D图像中由2D驱动的3D目标检测](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2017\u002Fpapers\u002FLahoud_2D-Driven_3D_Object_ICCV_2017_paper.pdf)\n- [3D-SSD：从RGB-D图像中学习层次特征，用于无遮挡3D目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1711.00238.pdf)\n- [Associate-3Ddet：用于3D点云目标检测的感知到概念关联](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04356) [对遮挡目标进行分类]\n\n### 立体视觉与光流\n- [PSMNet：金字塔立体匹配网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1803.08669.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [基于立体R-CNN的自动驾驶3D目标检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1902.09738.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [深度刚性实例场景光流](https:\u002F\u002Fpeople.csail.mit.edu\u002Fweichium\u002Fpapers\u002Fcvpr19-dsisf\u002Fpaper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [通过光学膨胀将光流升级为3D场景光流](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FYang_Upgrading_Optical_Flow_to_3D_Scene_Flow_Through_Optical_Expansion_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [从自动标注中学习多目标跟踪与分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02096) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [自动MOTS标注]\n\n\n### 交通信号灯与交通标志\n- [野外环境下的交通标志检测与分类](https:\u002F\u002Fwww.cv-foundation.org\u002Fopenaccess\u002Fcontent_cvpr_2016\u002Fpapers\u002FZhu_Traffic-Sign_Detection_and_CVPR_2016_paper.pdf) \u003Ckbd>CVPR 2016\u003C\u002Fkbd> [清华大学、腾讯、交通标志]\n- [用于联合交通标志与信号灯检测的分层深度架构及小批量选择方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.07987) \u003Ckbd>IEEE CRV 2018\u003C\u002Fkbd> [多伦多大学]\n- [通过单次检测法识别交通信号灯](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.02523) \u003Ckbd>ITSC 2018\u003C\u002Fkbd>\n- [DeepTLR：用于交通信号灯检测与分类的单一深度卷积网络](https:\u002F\u002Fsci-hub.st\u002F10.1109\u002FIVS.2016.7535408) \u003Ckbd>IV 2016\u003C\u002Fkbd>\n- [在具有挑战性的交通信号灯数据集上评估最先进目标检测器](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2017_workshops\u002Fw9\u002Fpapers\u002FJensen_Evaluating_State-Of-The-Art_Object_CVPR_2017_paper.pdf) \u003Ckbd>CVPR 2017研讨会\u003C\u002Fkbd>\n- [利用深度学习和显著性图在不同光照条件下识别交通信号灯](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FVijay_John3\u002Fpublication\u002F265014373_Traffic_Light_Recognition_in_Varying_Illumination_using_Deep_Learning_and_Saliency_Map\u002Flinks\u002F56aac00408ae8f3865666102.pdf) \u003Ckbd>ITSC 2014\u003C\u002Fkbd> [交通信号灯]\n- 
[利用高清地图特征识别交通信号灯](https:\u002F\u002Fsci-hub.st\u002Fhttps:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fabs\u002Fpii\u002FS0921889018301234) \u003Ckbd>RAS 2019\u003C\u002Fkbd>\n- [关于交通信号灯视觉的研究：问题、综述与展望](http:\u002F\u002Fcvrr.ucsd.edu\u002Fpublications\u002F2016\u002FtrafficSignalsITSTrans2016.pdf) \u003Ckbd>TITS 2015\u003C\u002Fkbd>\n\n### 数据集与综述\n- [The DriveU 交通信号灯数据集：介绍及与现有数据集的比较](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8460737) \u003Ckbd>ICRA 2018\u003C\u002Fkbd> \n- [牛津雷达 RobotCar 数据集：牛津 RobotCar 数据集的雷达扩展版](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.01300)\n- [面向交通信号灯的视觉：问题、综述与展望](http:\u002F\u002Fcvrr.ucsd.edu\u002Fpublications\u002F2016\u002FtrafficSignalsITSTrans2016.pdf)（交通信号灯综述，UCSD LISA）\n- [图谱理论综述](paper_notes\u002Fgraph_spectrum.md)（进行中）\n- [CVPR 2017 3D深度学习教程](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=8CenT_4HWyY) [[笔记](paper_notes\u002F3ddl_cvpr2017.md)] - （进行中）\n- [神经架构搜索综述](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.01392.pdf)\n- [网络剪枝教程](https:\u002F\u002Fjacobgil.github.io\u002Fdeeplearning\u002Fpruning-deep-learning)（博客）\n- [CVPR 2019 GNN 教程](https:\u002F\u002Fxiaolonw.github.io\u002Fgraphnn\u002F)\n- [面向自动驾驶的大规模交互式运动预测：Waymo 开放运动数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10133) [Waymo，预测数据集]\n- [PANDA：千兆像素级以人为中心的视频数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04852) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [WoodScape：用于自动驾驶的多任务、多摄像头鱼眼数据集](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FYogamani_WoodScape_A_Multi-Task_Multi-Camera_Fisheye_Dataset_for_Autonomous_Driving_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Valeo]\n\n### 无监督深度估计\n- [CNN 中的稀疏与密集数据：深度补全与语义分割](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1808.00769.pdf) \u003Ckbd>3DV 2018\u003C\u002Fkbd>\n- [基于多尺度深度网络从单张图像预测深度图](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.2283.pdf) \u003Ckbd>NIPS 2014\u003C\u002Fkbd>（Eigen 等人）\n- [使用直接法从单目视频中学习深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.00175) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>（单目深度估计）\n- [Virtual-Normal：通过强制执行虚拟法线的几何约束来预测深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.12209) [[笔记](paper_notes\u002Fvirtual_normal.md)] \u003Ckbd>ICCV 2019\u003C\u002Fkbd>（PL 的生成效果更好）\n- [基于生成对抗网络的空间对应关系：从单目视频中学习深度](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FWu_Spatial_Correspondence_With_Generative_Adversarial_Network_Learning_Depth_From_Monocular_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [面向单目深度 SLAM 的关键帧检测与视觉里程计的无监督协同学习](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCV_2019\u002Fpapers\u002FSheng_Unsupervised_Collaborative_Learning_of_Keyframe_Detection_and_Visual_Odometry_Towards_ICCV_2019_paper.pdf) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [用于单目深度估计的卷积神经网络可视化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03380) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n\n### 室内深度\n- [单目深度估计中遮挡轮廓的快速准确恢复](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.08598) \u003Ckbd>ICCV 2019 工作坊\u003C\u002Fkbd> [室内]\n- [用于单目深度估计的多损失再平衡算法](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F2890_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [室内深度]\n- [利用单次瞬态消除歧义的单目深度估计](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F3668_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [附加激光传感器，室内深度]\n- [使用深度注意力体积引导单目深度估计](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F5491_ECCV_2020_paper.php) \u003Ckbd>ECCV 
2020\u003C\u002Fkbd> [室内深度]\n- [通过利用结构感知和互补数据集改进单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.11256) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [室内深度]\n- [用于单目深度估计的 CLIFFNet，采用层次嵌入损失](https:\u002F\u002Fwww.ecva.net\u002Fpapers\u002Feccv_2020\u002Fpapers_ECCV\u002Fhtml\u002F3365_ECCV_2020_paper.php) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [室内深度]\n\n### 激光雷达\n- [PointSIFT：用于3D点云语义分割的类似SIFT的网络模块](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1807.00652.pdf)（PointNet替代方案，骨干网络）\n- [基于全卷积网络的3D激光雷达车辆检测](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1608.07916.pdf)（VeloFCN）\u003Ckbd>RSS 2016\u003C\u002Fkbd> \n- [KPConv：面向点云的灵活且可变形卷积](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.08889)（来自PointNet作者团队）\n- [PointCNN：在X变换后的点上进行卷积](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1801.07791.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [L3-Net：面向自动驾驶的基于学习的激光雷达定位](https:\u002F\u002Fsongshiyu01.github.io\u002Fpdf\u002FL3Net_W.Lu_Y.Zhou_S.Song_CVPR2019.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [RoarNet：基于区域近似精炼的鲁棒3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.03818)（传感器融合，3D单目提案，在点云中精炼）\n- [DeLS-3D：利用3D语义地图进行深度定位与分割](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1805.04949.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [Frustum ConvNet：滑动截锥体聚合局部逐点特征以实现非遮挡3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.01864) \u003Ckbd>IROS 2019\u003C\u002Fkbd>\n- [PointRNN：用于移动点云处理的点递归神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.08287)\n- [Gated2Depth：从门控图像实时生成稠密激光雷达数据](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.04997) \u003Ckbd>ICCV 2019口头报告\u003C\u002Fkbd>\n- [用于城市驾驶环境中运动目标检测与跟踪的多传感器融合系统](http:\u002F\u002Fwww.cs.cmu.edu\u002F~youngwoo\u002Fdoc\u002Ficra-14-sensor-fusion.pdf) \u003Ckbd>ICRA 2014\u003C\u002Fkbd>\n- [PointFusion：用于3D边界框估计的深度传感器融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.10871) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [传感器融合，Zoox]\n- [点云中3D目标检测的深度霍夫投票法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09664) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [Charles Qi]\n- [StixelNet：用于障碍物检测和道路分割的深度卷积网络](http:\u002F\u002Fwww.bmva.org\u002Fbmvc\u002F2015\u002Fpapers\u002Fpaper109\u002Fpaper109.pdf)\n- [PolarNet：面向在线激光雷达点云语义分割的改进网格表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14032) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [超越激光雷达范围的深度感知](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.03048) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [广基线立体视觉结合三焦点相机]\n- [面向城市自动驾驶应用的概率语义建图](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04894) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [激光雷达建图]\n- [RandLA-Net：大规模点云的高效语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.11236) \u003Ckbd>CVPR 2020口头报告\u003C\u002Fkbd> [激光雷达分割]\n- [PolarNet：面向在线激光雷达点云语义分割的改进网格表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.14032) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [激光雷达分割]\n- [OctSqueeze：面向激光雷达压缩的八叉树结构熵模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07178) \u003Ckbd>CVPR 2020口头报告\u003C\u002Fkbd> [激光雷达压缩]\n- [MuSCLE：利用深度熵模型对激光雷达进行多扫描压缩](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.07590) \u003Ckbd>NeurIPS 2020口头报告\u003C\u002Fkbd> [激光雷达压缩]\n\n### 自我中心边界框预测\n- [交通场景中人在不确定性下的长期车载预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.09026) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [车载边界框预测]\n- [第一人称视频中的无监督交通事故检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.00618) \u003Ckbd>IROS 2019\u003C\u002Fkbd>（本田）\n- [NEMO：利用噪声自我先验进行未来物体定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.08150)（本田）\n- 
[面向未来车辆定位的鲁棒随机性建模](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FPrecognition\u002FHudnell_Robust_Aleatoric_Modeling_for_Future_Vehicle_Localization_CVPRW_2019_paper.pdf)（视角）\n- [多目标预测：在多样化环境中预测未来物体位置](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.11944) \u003Ckbd>WACV 2020\u003C\u002Fkbd>（视角边界框，行人）\n- [使用全景视频在3D全景坐标系中进行多人定位与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10535)\n\n### 车道检测\n- [通过可微分最小二乘拟合实现端到端车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00293) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [Line-CNN：带有线条提案单元的端到端交通线路检测](https:\u002F\u002Fdoi.org\u002F10.1109\u002FTITS.2019.2890870) \u003Ckbd>TITS 2019\u003C\u002Fkbd> [类似目标的提案]\n- [利用透视变换层远距离检测车道和道路标记](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08550) [3D LLD]\n- [超快速结构感知深度车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.11757) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [车道检测]\n- [基于双流融合全卷积网络的道路检测新方法](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F8500551\u002F)（将摄像头转换为BEV）\n- [FastDraw：通过适应序列预测网络解决车道检测的长尾问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.04354)\n\n### 跟踪\n- [RetinaTrack：在线单阶段联合检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13870) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [自动驾驶车辆的计算机视觉：问题、数据集及最新进展](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.05519v2)（2019年12月最新更新）\n- [利用视频和IMU同时识别与跟踪多人](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPRW_2019\u002Fpapers\u002FBMTT\u002FHenschel_Simultaneous_Identification_and_Tracking_of_Multiple_People_Using_Video_and_CVPRW_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Detect-and-Track：视频中高效的姿态估计](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fpapers\u002FGirdhar_Detect-and-Track_Efficient_Pose_CVPR_2018_paper.pdf)\n- [TrackNet：同时进行目标检测与跟踪及其在交通视频分析中的应用](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.01466)\n- [视频动作变换网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.02707) \u003Ckbd>CVPR 2019口头报告\u003C\u002Fkbd>\n- [在线实时多时空动作定位与预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.08563) \u003Ckbd>ICCV 2017\u003C\u002Fkbd>\n- [多目标跟踪近年来论文及开源代码汇总](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F65177442)\n- [GNN3DMOT：用于3D多目标跟踪的多特征学习图神经网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07327) \u003Ckbd>CVPR 2020口头报告\u003C\u002Fkbd> [3DMOT，CMU，Kris Kitani]\n- [Chained-Tracker：将成对注意力回归结果串联起来实现端到端的多目标检测与跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.14557) \u003Ckbd>ECCV 2020亮点展示\u003C\u002Fkbd> [MOT，腾讯]\n- [迈向实时多目标跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.12605) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [MOT]\n- [面向自动驾驶的概率3D多目标跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.05673) [TRI]\n\n### 关键点：姿态与人脸\n- [概率人脸嵌入](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09658) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [人脸识别中的数据不确定性学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11339) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [从无标签视频中自监督学习可解释的关键点](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FJakab_Self-Supervised_Learning_of_Interpretable_Keypoints_From_Unlabelled_Videos_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [VGG, 自监督, 可解释, 判别器]\n\n\n### 通用深度学习\n- [重新审视深度神经网络的小批量训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.07612)\n- [ICML2019研讨会：自适应与多任务学习：算法与系统](https:\u002F\u002Ficml.cc\u002FConferences\u002F2019\u002FScheduleMultitrack?event=3504) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [多任务学习的自适应调度](https:\u002F\u002Fmarcpickett.com\u002Fcl2018\u002FCL-2018_paper_82.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd> (NMT)\n- 
[极坐标变换网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.01889) \u003Ckbd>ICLR 2018\u003C\u002Fkbd>\n- [深度学习中的校准度量](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01685) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [利用近似方差传播进行无采样先验不确定性估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.00598) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> (先验不确定性)\n- [使卷积网络再次具备平移不变性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.11486) \u003Ckbd>ICML\u003C\u002Fkbd>\n- [使用自监督学习可以提高模型的鲁棒性和不确定性估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.12340) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd>\n- [理解深度学习需要重新思考泛化问题](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.03530) \u003Ckbd>ICLR 2017\u003C\u002Fkbd> [ICLR最佳论文]\n- [一种用于检测神经网络中误分类和分布外样本的基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F1610.02136) \u003Ckbd>ICLR 2017\u003C\u002Fkbd> (NLL分数作为异常分数)\n- [基于非参数实例级判别的无监督特征学习](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002FCameraReady\u002F0801.pdf) \u003Ckbd>CVPR 2018 焦点论文\u003C\u002Fkbd> (Stella Yu)\n- [关于过参数化浅层神经网络优化景观的理论见解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.04926) \u003Ckbd>TIP 2018\u003C\u002Fkbd>\n- [插值的力量：理解SGD在现代过参数化学习中的有效性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.06559) \u003Ckbd>ICML 2018\u003C\u002Fkbd>\n- [设计网络设计空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13678) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [Moco2：基于动量对比学习的改进基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.04297)\n- [神经网络上的SGD会学习复杂度逐渐增加的函数](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.11604) \u003Ckbd>NIPS 2019\u003C\u002Fkbd> (SGD首先学习线性分类器)\n- [关注激活值：一种用于细粒度图像识别的模块化注意力机制](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.13075)\n- [一种混合分类-回归框架，用于从2D图像中估计3D姿态](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.03225) \u003Ckbd>BMVC 2018\u003C\u002Fkbd> (多bin，有何新意？)\n- [就地激活的BatchNorm，用于优化内存的DNN训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.02616) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> (优化的BatchNorm + ReLU)\n- [FCNN：傅里叶卷积神经网络](http:\u002F\u002Fecmlpkdd2017.ijs.si\u002Fpapers\u002FpaperID11.pdf) (FFT作为CNN)\n- [可视化神经网络的损失景观](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F7875-visualizing-the-loss-landscape-of-neural-nets.pdf) \u003Ckbd>NIPS 2018\u003C\u002Fkbd>\n- [Xception：使用深度可分离卷积的深度学习](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1610.02357.pdf) (Xception)\n- [利用不确定性权衡损失进行场景几何与语义的多任务学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1705.07115) (不确定性)\n- [无需真实世界标签的模拟环境驾驶学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1812.03823) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> (领域适应，sim2real)\n- [滤波器响应归一化层：消除深度神经网络训练中的批依赖](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.09737) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- [用于深度表示学习的可切换白化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09739) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [领域适应]\n- [视觉手性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.09512) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [最佳论文提名]\n- [广义ODIN：无需从分布外数据中学习即可检测分布外图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.11297) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [带噪声学生的自训练提升ImageNet分类性能](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.04252) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [蒸馏]\n- [保持简单：基于图像统计匹配的领域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.12551) \u003Ckbd>CVPRW 2020\u003C\u002Fkbd> [针对2D模框的领域适应]\n- [对极变换网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04551) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Yihui He]\n- [基于函数变分推断的可扩展计算机视觉不确定性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03396) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [仅需一次前向传播即可获得先验不确定性]\n\n### 单目3D\n- 
[3DOP：用于精确目标类别检测的3D目标提案](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F5644-3d-object-proposals-for-accurate-object-class-detection) \u003Ckbd>NIPS 2015\u003C\u002Fkbd>\n- [DirectShape：用于视觉车辆位姿与形状估计的形状先验光度对齐方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.10097)\n- [消除盲区：将3D目标检测与单目深度估计适配到360°全景图像](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.06253) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>（单目3D目标检测与深度估计）\n- [迈向场景理解：基于语义感知表示的无监督单目深度估计](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2019\u002Fpapers\u002FChen_Towards_Scene_Understanding_Unsupervised_Monocular_Depth_Estimation_With_Semantic-Aware_Representation_CVPR_2019_paper.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [统一条件解码器]\n- [DDP：从单张图像和稀疏范围中推断密集深度后验分布](https:\u002F\u002Farxiv.org\u002Fabs\u002F1901.10034) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [增强现实与计算机视觉的结合：面向城市驾驶场景的高效数据生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F1708.01566) \u003Ckbd>IJCV 2018\u003C\u002Fkbd>（AR数据增强，丰田）\n- [探索3D单目目标检测的能力与局限——基于仿真与真实世界数据的研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.07424) \u003Ckbd>IITS\u003C\u002Fkbd>\n- [借助精细的3D目标表示迈向场景理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.5935) \u003Ckbd>IJCV 2014\u003C\u002Fkbd>（关键点、3D边界框标注）\n- [深度立方体检测：超越2D边界框](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.10010)（Magic Leap）\n- [视角与关键点](https:\u002F\u002Farxiv.org\u002Fabs\u002F1411.6067)（Malik）\n- [将目标检测数据集提升至3D](https:\u002F\u002Farxiv.org\u002Fabs\u002F1503.06465)（PASCAL）\n- [野外环境中的3D目标类别检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1503.05038)（基于关键点）\n- [快速单次检测与位姿估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1609.05590) \u003Ckbd>3DV 2016\u003C\u002Fkbd>（SSD + 位姿，Wei Liu）\n- [虚拟KITTI 2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.10773)\n- [利用形状概念进行深度监督，实现遮挡感知的3D目标解析](https:\u002F\u002Farxiv.org\u002Fabs\u002F1612.02699) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [为CNN渲染：使用基于渲染3D模型视图训练的CNN进行图像中的视角估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1505.05641) \u003Ckbd>ICCV 2015 口头报告\u003C\u002Fkbd>\n- [实时无缝单次6D目标位姿预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1711.08848) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [实用深度立体匹配（PDS）：面向应用的深度立体匹配](https:\u002F\u002Farxiv.org\u002Fabs\u002F1806.01677) \u003Ckbd>NIPS 2018\u003C\u002Fkbd> [视差估计]\n- [自监督稀疏转稠密：基于LiDAR和单目相机的自监督深度补全](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.00275) \u003Ckbd>ICRA 2019\u003C\u002Fkbd>\n- [利用卷积空间传播网络学习深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.02695)（百度，SPN深度）\u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [顺其自然：自监督场景流估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00497) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [场景流，LiDAR]\n- [单目视频中的在线深度学习以对抗遗忘](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FZhang_Online_Depth_Learning_Against_Forgetting_in_Monocular_Videos_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [单目深度]\n- [具有在线适应性的自监督深度视觉里程计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.06136) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [DF-VO、TrianFlow、元学习]\n- [利用自注意力机制和离散视差体积进行自监督单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13951) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [单目视频中的在线深度学习以对抗遗忘](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FZhang_Online_Depth_Learning_Against_Forgetting_in_Monocular_Videos_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [单目深度，在线学习]\n- [SDC-Depth：用于单目深度估计的语义分治网络](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FWang_SDC-Depth_Semantic_Divide-and-Conquer_Network_for_Monocular_Depth_Estimation_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 
2020\u003C\u002Fkbd> [单目深度，语义]\n- [从单张图像推断深度分布](http:\u002F\u002Fwww.contrib.andrew.cmu.edu\u002F~gengshay\u002Fwordpress\u002Fwp-content\u002Fuploads\u002F2018\u002F11\u002Firos_monodepth_uncertainty.pdf) \u003Ckbd>TRO\u003C\u002Fkbd> [深度置信度，将其拼接起来]\n- [具有全局一致深度的动态场景新视角合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.01294) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [深度的边界：分割与深度之间的显式约束](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.00171) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Xiaoming Liu，多模态，深度渗出]\n\n### 雷达感知\n- [MV-RSS：多视角雷达语义分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16214) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [使用77 GHz频段的CNN对极化雷达图像中的目标进行分类](http:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FAPMC.2017.8251453)（雷达，极化）\n- [基于真实世界数据的CNN用于汽车雷达中的干扰抑制与去噪](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers\u002FCNNs%20for%20Interference%20Mitigation%20and%20Denoising%20in%20Automotive%20Radar%20Using%20Real-World%20Data.pdf) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd>（雷达）\n- [通过语义分割从稀疏雷达聚类中学习占用网格来理解道路场景](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.00415) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>（雷达）\n- [RadarNet：利用雷达实现对动态目标的鲁棒感知](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.14366) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n- [从单目图像和稀疏雷达数据中估计深度](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.00058) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [相机+雷达用于单目深度，nuscenes]\n- [RPR：用于自动驾驶车辆中联合目标检测与距离估计的雷达-相机传感器融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08428) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [雷达提案细化]\n- [在汽车应用中将雷达数据映射到相机图像以实现跨模态监督](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12809)\n\n### SLAM\n- [PoseNet: 用于实时6自由度相机重定位的卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1505.07427) [[笔记](paper_notes\u002Fposenet.md)] \u003Ckbd>ICCV 2015\u003C\u002Fkbd>\n- [PoseNet2: 基于深度学习的相机重定位中的不确定性建模](https:\u002F\u002Farxiv.org\u002Fabs\u002F1509.05909) \u003Ckbd>ICRA 2016\u003C\u002Fkbd>\n- [PoseNet3: 基于深度学习的相机位姿回归的几何损失函数](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.00390) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [EssNet: 用于几何匹配的卷积神经网络架构](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.05593) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [NC-EssNet: 邻域一致性网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.10510) \u003Ckbd>NeurIPS 2018\u003C\u002Fkbd>\n- [强化特征点：针对高层任务优化特征检测与描述](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.00623) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [Eric Brachmann, ngransac]\n- [利用3D几何约束从单目视频中无监督学习深度与自运动](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1802.05522.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [DynSLAM: 大规模动态环境下的鲁棒稠密建图](https:\u002F\u002Fsiegedog.com\u002Fdynslam\u002F) [动态SLAM, Andreas Geiger] \u003Ckbd>ICRA 2018\u003C\u002Fkbd>\n- [GCNv2: 用于实时SLAM的高效对应点预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.11046) \u003Ckbd>LRA 2019\u003C\u002Fkbd> [Superpoint + orb slam]\n- [实时可扩展的稠密Surfel建图](Real-time Scalable Dense Surfel Mapping) \u003Ckbd>ICRA 2019\u003C\u002Fkbd> [稠密重建, monodepth]\n- [动态SLAM: 对速度的需求](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.08584)\n- [GSLAM: 通用SLAM框架与基准测试](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.07995) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n\n### 雷达感知\n- [绕过街角的视野：使用多普勒雷达在野外进行非视距检测与跟踪](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FScheiner_Seeing_Around_Street_Corners_Non-Line-of-Sight_Detection_and_Tracking_In-the-Wild_Using_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [戴姆勒]\n- [用于自动驾驶车辆中稳健目标检测的雷达+RGB注意力融合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.13642) \u003Ckbd>ICIP 2020\u003C\u002Fkbd>\n- 
[基于毫米波雷达和视觉传感器的障碍物检测的空间注意力融合](https:\u002F\u002Fwww.mdpi.com\u002F1424-8220\u002F20\u002F4\u002F956) \u003Ckbd>sensors 2020\u003C\u002Fkbd> [雷达, 摄像头, 前端融合]\n\n### 综述与调查\n- [深度学习在定位与建图中的研究综述：迈向空间机器智能时代](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12567)\n- [基于深度学习的单目深度估计：概述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06620)\n\n### 自动驾驶中的感知之外\n- [基于不确定性引导的多尺度残差学习——使用循环旋转CNN进行单幅图像去雨](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.11129) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [学习如何在多模态深度学习中融合多种模态](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.11730) (传感器融合, 通用DL)\n- [利用时空语义走廊为复杂城市环境生成安全轨迹](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.09788) \u003Ckbd>LRA 2019\u003C\u002Fkbd> [运动规划]\n- [DAgger: 基于模块化和抽象的驾驶策略迁移](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.09364) \u003Ckbd>CoRL 2018\u003C\u002Fkbd> [DAgger, 模仿学习]\n- [利用引导分支实现高效且具有不确定性的自动驾驶决策](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) \u003Ckbd>ICRA 2020\u003C\u002Fkbd> [运动规划]\n- [异构传感器系统的标定](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.11445.pdf)\n- [引言：ADAS中的数据融合 (来自知乎)](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F40967227) (截至CVPR 2018)\n- [YUVMultiNet: 用于自动驾驶的实时YUV多任务CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1904.05673.pdf) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> (实时, 低功耗)\n- [为将ADAS提升至自动驾驶水平而进行的异构传感器模态深度融合](http:\u002F\u002Fsci-hub.tw\u002F10.1109\u002FVLSI-DAT.2018.8373245)\n- [视频中主动学习的时间一致性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11757) \u003Ckbd>ICCVW 2019\u003C\u002Fkbd> [主动学习, 时间一致性]\n- [R-TOD: 专为自动驾驶设计的端到端延迟最小化的实时目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.06372) \u003Ckbd>RTSS 2020\u003C\u002Fkbd> [感知系统设计]\n\n### 预测与规划\n- [用于运动预测的道路图表示学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13732) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n- [DSDNet: 深度结构化自动驾驶网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.06041) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Uber ATG]\n\n### 标注与工具\n- [视频中主动学习的时间一致性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.11757) \u003Ckbd>ICCV 2019研讨会\u003C\u002Fkbd>\n- [利用预训练的3D目标检测模型快速生成真值标注](https:\u002F\u002Farxiv.org\u002Fabs\u002F1807.06072) \u003Ckbd>ITSC 2018\u003C\u002Fkbd> [UToronto, 自动标注]\n- [从自动标注中学习多目标跟踪与分割](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fhtml\u002FPorzi_Learning_Multi-Object_Tracking_and_Segmentation_From_Automatic_Annotations_CVPR_2020_paper.html) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [自动标注]\n- [通过几何循环一致性进行规范表面映射](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.10043) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [TIDE: 用于识别目标检测错误的通用工具箱](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08115) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [工具]\n\n### 低层DL\n- [从视频中自监督地进行相机自标定](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.03325) [TRI, 内参标定, 鱼眼\u002F针孔]\n\n### 早期NLP论文\n- [用于句子建模的卷积神经网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1404.2188.pdf) \u003Ckbd>ACL 2014\u003C\u002Fkbd>\n- [FastText: 用于高效文本分类的技巧包](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1607.01759.pdf) \u003Ckbd>ACL 2017\u003C\u002Fkbd>\n- [用于学习句子相似度的暹罗递归架构](https:\u002F\u002Fwww.aaai.org\u002Focs\u002Findex.php\u002FAAAI\u002FAAAI16\u002Fpaper\u002Fdownload\u002F12195\u002F12023) \u003Ckbd>AAAI 2016\u003C\u002Fkbd>\n- [向量空间中词表示的高效估计](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1301.3781.pdf) \u003Ckbd>ICLR 2013\u003C\u002Fkbd>\n- [通过联合学习对齐与翻译实现神经机器翻译](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1409.0473.pdf) \u003Ckbd>ICLR 2015\u003C\u002Fkbd>\n- [Transformer: 注意力就是你所需要的](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.03762.pdf) \u003Ckbd>NIPS 2017\u003C\u002Fkbd>\n\n\n\n\n## 非DL\n- 
[广告推荐系统方向文章汇总](https:\u002F\u002Fgithub.com\u002Fwzhe06\u002FAd-papers)\n- [UMAP: 用于降维的均匀流形近似与投影](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.03426) [[笔记](paper_notes\u002Fumap.md)] (降维, 优于t-SNE)\n\n## 技术债务\n- [经典关键点与描述子回顾笔记](paper_notes\u002Fclassical_keypoints.md)\n- CRF\n- [视觉SLAM与视觉里程计](https:\u002F\u002Flink.springer.com\u002Fcontent\u002Fpdf\u002F10.1007%2Fs40903-015-0032-7.pdf)\n- ORB SLAM\n- 包调整\n- 3D视觉\n- [SLAM\u002FVIO学习总结](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F34995102)\n- [设计模式](https:\u002F\u002Frefactoring.guru\u002Fdesign-patterns\u002Fpython)\n\n## 待整理（CVPR 2021 和 ICCV 2021 待读文献）\n- [捕捉全视角上下文用于全景分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.05687) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [UP-DETR：基于Transformer的无监督目标检测预训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.09094) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [transformers]\n- [DCL：用于无边界不连续性旋转检测的密集标签编码](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.09670) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [4D 全景激光雷达分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.12472) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TUM]\n- [CanonPose：野外自监督单目人体姿态估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14679) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [快速且准确的模型缩放](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.06877) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [FAIR]\n- [Cylinder3D：用于激光雷达语义分割的圆柱形非对称3D卷积网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10033) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [lidar semantic segmentation]\n- [LiDAR R-CNN：高效通用的3D目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.15297) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [TuSimple, Lidar]\n- [PREDATOR：低重叠度3D点云配准](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.13005) \u003Ckbd>CVPR 2021 口头报告\u003C\u002Fkbd>\n- [DBB：多样分支模块——将卷积构建为类似Inception的单元](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13425) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [RepVGG, ACNet, Xiaohan Ding, Megvii] \n- [GrooMeD-NMS：用于单目3D目标检测的分组可微NMS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.17202) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [DDMP：用于单目3D目标检测的深度条件动态消息传播](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16470) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [M3DSSD：单目3D单阶段目标检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13164) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [MonoRUn：通过重建与不确定性传播实现单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.12605) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [mono3D]\n- [HVPR：用于单阶段3D目标检测的混合体素-点云表示](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00902) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Lidar]\n- [PLUME：基于立体图像的高效3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06594) [Yan Wang, Uber ATG]\n- [V2F-Net：遮挡行人检测的显式分解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03106) [crowded, pedestrian, megvii]\n- [IP-basic：捍卫经典图像处理——在CPU上快速完成深度补全](https:\u002F\u002Farxiv.org\u002Fabs\u002F1802.00036) \u003Ckbd>CRV 2018\u003C\u002Fkbd>\n- [重新审视单阶段目标检测中的特征对齐](https:\u002F\u002Farxiv.org\u002Fabs\u002F1908.01570) [cls+reg]\n- [部署过程中持续监控目标检测性能的每帧mAP预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.08650) \u003Ckbd>WACV 2021\u003C\u002Fkbd> [SafetyNet]\n- [TSD：重新思考目标检测器中的兄弟头](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07540) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [sensetime, cls+reg]\n- [OpenImage2019第一梯队解决方案——目标检测与实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.07557) [sensetime, cls+reg, OpenImage2019第一名]\n- [在鸟瞰图车辆估计中实现时空聚合](https:\u002F\u002Fcvssp.org\u002FPersonal\u002FOscarMendez\u002Fpapers\u002Fpdf\u002FSahaICRA2021.pdf) \u003Ckbd>ICRA 
2021\u003C\u002Fkbd>\n- [通过可微最小二乘拟合实现端到端车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.00293) \u003Ckbd>ICCV workshop 2019\u003C\u002Fkbd>\n- [重新审视ResNet：改进的训练与缩放策略](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07579)\n- [多模态剪切粘贴用于3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12741)\n- [LD：用于目标检测的定位蒸馏](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.12252)\n- [PolyTransform：用于实例分割的深度多边形Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.02801) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [single stage instance segmentation]\n- [ROAD：面向自动驾驶的ROAD事件感知数据集](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.11585)\n- [LidarMTL：简单高效的多任务网络，用于3D目标检测和道路理解](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04056) [lidar MTL]\n- [无需归一化的大规模高性能图像识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.06171) \u003Ckbd>ICLR 2021\u003C\u002Fkbd>\n- [面向自动驾驶的地面感知单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.00690) \u003Ckbd>RA-L\u003C\u002Fkbd> [mono3D]\n- [揭秘用于单目3D目标检测的伪激光雷达](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05796) [mono3d]\n- [用于规模化3D目标检测的伪标签](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.02093) [Waymo]\n- [LLA：用于密集行人检测的损失感知标签分配](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.04307) [Megvii]\n- [VectorNet：从矢量化表示中编码高清地图与交通参与者动态](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.04259) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Waymo] \n- [CoverNet：利用轨迹集合进行多模态行为预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.10298) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [prediction, nuScenes]\n- [SplitNet：分工与协同训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14660)\n- [VoVNet：一种节能且GPU计算效率高的实时目标检测骨干网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.09730) \u003Ckbd>CVPR 2019 workshop\u003C\u002Fkbd>\n- [等距神经网络：是数据不具区分性还是模型太弱？关于数据与模型分辨率的相对重要性](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.03205) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd> [spatial2channel]\n- [TResNet](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13630) \u003Ckbd>WACV 2021\u003C\u002Fkbd> [spatial2channel]\n- [距离IoU损失：更快更好的边界框回归学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.08287) \u003Ckbd>AAAI 2020\u003C\u002Fkbd> [DIOU, NMS]\n- [RegNet：设计网络设计空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.13678) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [FAIR]\n- [关于视觉识别的网络设计空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.13214) [FAIR]\n- [高速公路传感器融合车辆定位中车道端点检测与位置精度评估](https:\u002F\u002Fwww.ncbi.nlm.nih.gov\u002Fpmc\u002Farticles\u002FPMC6308985\u002F) \u003Ckbd>Sensors 2018\u003C\u002Fkbd> [lane endpoints]\n- [基于地图匹配的级联地标检测与车辆定位](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?arnumber=8826538) \u003Ckbd>IEEE Access 2019\u003C\u002Fkbd> [lane endpoints]\n- [GCNet：用于深度立体回归的端到端几何与上下文学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1703.04309) \u003Ckbd>ICCV 2017\u003C\u002Fkbd> [disparity estimation, Alex Kendall, cost volume]\n- [自动驾驶车辆的交通管制手势识别](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.16072) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Daimler]\n- [从野外单张图像中感知3D人机空间关系](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15649) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [OrcVIO：基于物体残差约束的视觉惯性里程计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.15107) [dynamic SLAM, very mathematical]\n- [InfoFocus：具有动态信息建模的自动驾驶3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.08556) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [DA4AD：面向自动驾驶的端到端深度注意力视觉定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.03026) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [通过优化空间嵌入迈向轻量级车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08311) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd> 
[LLD]\n- [多帧转单帧：用于3D目标检测的知识蒸馏](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.11859) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd> [lidar]\n- [DeepIM：用于6D位姿估计的深度迭代匹配](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ECCV_2018\u002Fpapers\u002FYi_Li_DeepIM_Deep_Iterative_ECCV_2018_paper.pdf) \u003Ckbd>ECCV 2018\u003C\u002Fkbd> [pose estimation]\n- [通过连续3D损失进行单目深度预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.09763) \u003Ckbd>IROS 2020\u003C\u002Fkbd>\n- [密集预测任务中的多任务学习：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.13379) [MTL, Luc Van Gool]\n- [自动驾驶系统中多任务网络的动态任务权重方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.02223) \u003Ckbd>ITSC 2020 口头报告\u003C\u002Fkbd> [MTL]\n- [NeurAll：迈向自动驾驶中统一的视觉感知模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.03589) \u003Ckbd>ITSC 2019 口头报告\u003C\u002Fkbd> [MTL]\n- [深度证据回归](https:\u002F\u002Fpapers.nips.cc\u002Fpaper\u002F2020\u002Ffile\u002Faab085461de182608ee9f607f3f7d18f-Paper.pdf) \u003Ckbd>NeurIPS 2020\u003C\u002Fkbd> [one-pass aleatoric\u002Fepistemic uncertainty]\n- [从单目视频中估计可行驶无碰撞空间](http:\u002F\u002Fwww.cs.toronto.edu\u002F~yaojian\u002FfreeSpace.pdf) \u003Ckbd>WACV 2015\u003C\u002Fkbd> [Drivable space]\n- [用于单目深度估计的卷积神经网络可视化](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.03380) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> [monodepth]\n- [可微渲染：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.12057) [differentiable rendering, TRI]\n- [SAFENet：具有语义感知特征提取的自监督单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02893) [monodepth, semantics, Naver labs]\n- [迈向视频对象边界框的交互式自标注：基于循环自学习和层次化标注的框架](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fpapers\u002FLe_Toward_Interactive_Self-Annotation_For_Video_Object_Bounding_Box_Recurrent_Self-Learning_WACV_2020_paper.pdf) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [迈向CNN基单目深度估计的良好实践](http:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_WACV_2020\u002Fpapers\u002FFang_Towards_Good_Practice_for_CNN-Based_Monocular_Depth_Estimation_WACV_2020_paper.pdf) \u003Ckbd>WACV 2020\u003C\u002Fkbd>\n- [自监督场景去遮挡](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.02788) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- [TP-LSD：基于三点的线段检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.05505)\n- [数据蒸馏：迈向全监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F1712.04440) \u003Ckbd>CVPR 2018\u003C\u002Fkbd> [Kaiming He, FAIR]\n- [MiDas：迈向鲁棒单目深度估计——混合数据集以实现零样本跨数据集迁移](https:\u002F\u002Farxiv.org\u002Fabs\u002F1907.01341) [monodepth, dynamic object, synthetic dataset]\n- [语义驱动的单目深度与自我运动估计无监督学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04371) [monodepth]\n- [通过优化空间嵌入迈向轻量级车道检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.08311) \u003Ckbd>ECCV 2020 workshop\u003C\u002Fkbd>\n- [车道检测的合成到真实域适应](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.04023) [GM Israel, LLD]\n- [PolyLaneNet：通过深度多项式回归进行车道估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.10924) \u003Ckbd>ICPR 2020\u003C\u002Fkbd> [polynomial, LLD]\n- [为实时实例分割学习通用形状字典](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.01050)\n- [使用Transformer实现端到端视频实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14503) [DETR, transformers]\n- [Score-CAM：卷积神经网络的分数加权可视化解释](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.01279) \u003Ckbd>CVPR 2020 workshop\u003C\u002Fkbd>\n- [测试时增强何时以及为何有效](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.11156)\n- [从单色图像中获取足迹与自由空间](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.06376) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd> [Parking use, footprint]\n- [在Flatmobiles之间行驶：从单目相机获取鸟瞰占用网格，用于整体轨迹规划](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.04047) [BEV, only 
predict footprint]\n- [重新思考目标检测中的分类与定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.06493) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [通过序列特征关联与深度提示增强的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14589) [mono3D]\n- [简单的复制粘贴是实例分割的强大数据增强方法](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.07177)\n- [ViP-DeepLab：通过深度感知视频全景分割学习视觉感知](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.05258)\n- [MVSNet：用于非结构化多视图立体的深度推断](https:\u002F\u002Farxiv.org\u002Fabs\u002F1804.02505) \u003Ckbd>ECCV 2018\u003C\u002Fkbd>\n- [用于高分辨率多视图立体深度推断的递归MVSNet](https:\u002F\u002Farxiv.org\u002Fabs\u002F1902.10556) \u003Ckbd>CVPR 2019\u003C\u002Fkbd> [Deep learning + MVS, Vidar, same author MVSNet]\n- [用于城市数据集增强的人工假人](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.08274) \u003Ckbd>AAAI 2021\u003C\u002Fkbd>\n- [DETR用于行人检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.06785) [transformer, pedestrian detection]\n- [多模态剪切粘贴用于3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12741) [SenseTime]\n- [从序列到序列的角度重新思考语义分割，并结合Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.15840) [transformer, semantic segmenatation]\n- [TransPose：迈向可解释的人体姿态估计的Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.14214) [transformer, pose estimation]\n- [跷跷板损失用于长尾实例分割](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.10032)\n- [SWA目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12645) [Stochastic Weights Averaging (SWA)]\n- [使用Pointformer进行3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.11409)\n- [迈向基于Transformer的目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09958) [DETR-like]\n- [通过轻量级3D点融合提升单目深度估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.10296) [dense SfM]\n- [多模态剪切粘贴用于3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.12741)\n- [结合语义分割和兴趣特征点的全局视觉定位](http:\u002F\u002Fras.papercept.net\u002Fimages\u002Ftemp\u002FIROS\u002Ffiles\u002F1899.pdf)\n- [超越注意力可视化的Transformer可解释性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09838) [transformers]\n- [在单个GPU上将语义分割扩展到1000类以上](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.07489)\n- [DetectoRS：利用递归特征金字塔和可切换空洞卷积进行目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.02334)\n- [目标检测中的经验上限及其他](https:\u002F\u002Farxiv.org\u002Fabs\u002F1911.12451)\n- [自动驾驶鱼眼相机上的广义目标检测：数据集、表示与基线](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02124) [Fisheye, Senthil Yogamani]\n- [通过序列特征关联与深度提示增强的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14589) [mono3D]\n- [SOSD-Net：从单目图像中联合语义对象分割与深度估计](http:\u002F\u002Farxiv.org\u002Fabs\u002F2101.07422) [Jiwen Lu, monodepth]\n- [用于统一单目深度预测与补全的稀疏辅助网络](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.16690) [TRI]\n- [Linformer：具有线性复杂度的自注意力机制](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04768)\n- [Set Transformer：一种基于注意力的排列不变神经网络框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F1810.00825) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [PCT：点云Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.09688) \u003Ckbd>Computational Visual Media 2021\u003C\u002Fkbd>\n- [DDT：通过深度描述符变换实现无监督对象发现与共定位](https:\u002F\u002Farxiv.org\u002Fabs\u002F1707.06397) \u003Ckbd>IJCAI 2017\u003C\u002Fkbd>\n- [面向城市无地图驾驶的分层道路拓扑学习](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00084) [Mercedes]\n- [视频场景理解中的概率未来预测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.06409) \u003Ckbd>ECCV 2020\u003C\u002Fkbd> [Alex Kendall]\n- [为自动驾驶车辆检测32种行人属性](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.02647) [VRU, MTL]\n- [通过进化式训练数据进行级联深度单目3D人体姿态估计](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.07778) \u003Ckbd>CVPR 2020 口头报告\u003C\u002Fkbd>\n- 
[MonoGeo：通过投影建模学习几何引导的深度，用于单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13931) [mono3D]\n- [Aug3D-RPN：利用带有虚拟深度的合成图像改善单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.13269) [mono3D]\n- [邻居投票：通过邻居距离投票改善单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02493) [mono3D]\n- [用于关键点驱动单目3D目标检测的Lite-FPN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.00268) [mono3D]\n- [激光雷达点云引导的单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09035)\n- [用于密集预测的视觉Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13413) [Vladlen Koltun, Intel]\n- [高效Transformer：综述](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.06732)\n- [视觉Transformer是否像卷积神经网络那样看待世界？](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.08810)\n- [单目3D目标检测中的渐进式坐标变换](http:\u002F\u002Farxiv.org\u002Fabs\u002F2108.05793) [mono3D]\n- [AutoShape：实时形状感知单目3D目标检测](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.11127) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D]\n- [BlazePose：设备端实时身体姿态跟踪](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.10204)\n\n\n\n## TODO\n- [Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.00598) [Andy Zeng]\n- [Large Language Models as General Pattern Machines](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.04721) [Embodied AI]\n- RetinaGAN: An Object-aware Approach to Sim-to-Real Transfer\n- [PlaNet: Learning Latent Dynamics for Planning from Pixels](https:\u002F\u002Farxiv.org\u002Fabs\u002F1811.04551) \u003Ckbd>ICML 2019\u003C\u002Fkbd>\n- [Dreamer: Dream to Control: Learning Behaviors by Latent Imagination](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.01603) \u003Ckbd>ICLR 2020 oral\u003C\u002Fkbd>\n- [DreamerV2: Mastering Atari with Discrete World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.02193) \u003Ckbd>ICLR 2021\u003C\u002Fkbd> [World models]\n- [DreamerV3: Mastering Diverse Domains through World Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.04104)\n- [DayDreamer: World Models for Physical Robot Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.14176) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [JEPA: A Path Towards Autonomous Machine Intelligence](https:\u002F\u002Fopenreview.net\u002Fpdf?id=BZ5a1r-kVsf)\n- [I-JEPA: Self-Supervised Learning from Images with a Joint-Embedding Predictive Architecture](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.08243) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Runway Gen-1: Structure and Content-Guided Video Synthesis with Diffusion Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.03011)\n- [IL Difficulty Model: Embedding Synthetic Off-Policy Experience for Autonomous Driving via Zero-Shot Curricula](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01375) \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Waymo]\n- [Decision Transformer: Reinforcement Learning via Sequence Modeling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01345) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [LLM for planning]\n- [LID: Pre-Trained Language Models for Interactive Decision-Making](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01771) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [LLM for planning]\n- [Planning with Large Language Models via Corrective Re-prompting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.09935) \u003Ckbd>NeurIPS 2022 Workshop\u003C\u002Fkbd>\n- [Object as Query: Equipping Any 2D Object Detector with 3D Detection Ability](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.02364) \u003Ckbd>ICCV 2023\u003C\u002Fkbd> [TuSimple]\n- [Speculative Sampling: Accelerating Large Language 
Model Decoding with Speculative Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01318) [Accelerated LLM, DeepMind]\n- [Inference with Reference: Lossless Acceleration of Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04487) [Accelerated LLM, Microsoft]\n- [EPSILON: An Efficient Planning System for Automated Vehicles in Highly Interactive Environments](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.07993) \u003Ckbd>T-RO 2021\u003C\u002Fkbd>\n- [Efficient Uncertainty-aware Decision-making for Automated Driving Using Guided Branching](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.02746) \u003Ckbd>ICRA 2020\u003C\u002Fkbd>\n- [StreamPETR: Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11926)\n- [SSCNet: Semantic Scene Completion from a Single Depth Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.08974) \u003Ckbd>CVPR 2017\u003C\u002Fkbd>\n- [SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.01416) \u003Ckbd>ICCV 2019\u003C\u002Fkbd> \n- [PixPro: Propagate Yourself: Exploring Pixel-Level Consistency for Unsupervised Visual Representation Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.10043) [self-supervised]\n- [Pixel-Wise Contrastive Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.00218) [self-supervised]\n- [VICRegL: Self-Supervised Learning of Local Visual Features](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.01571) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [ImageBind: One Embedding Space To Bind Them All](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05665) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [KEMP: Keyframe-Based Hierarchical End-to-End Deep Model for Long-Term Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.04624) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Planning]\n- [Deep Interactive Motion Prediction and Planning: Playing Games with Motion Prediction Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02392) \u003Ckbd>L4DC\u003C\u002Fkbd> [Planning]\n- [GameFormer: Game-theoretic Modeling and Learning of Transformer-based Interactive Prediction and Planning for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05760) [Planning]\n- [LookOut: Diverse Multi-Future Prediction and Planning for Self-Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.06547) [Planning, Raquel]\n- [DIPP: Differentiable Integrated Motion Prediction and Planning with Learnable Cost Function for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.10422) [Planning]\n- [Imitation Is Not Enough: Robustifying Imitation with Reinforcement Learning for Challenging Driving Scenarios](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.11419) [Planning, Waymo]\n- [Hierarchical Model-Based Imitation Learning for Planning in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.09539) \u003Ckbd>IROS 2022\u003C\u002Fkbd> [Planning, Waymo]\n- [Symphony: Learning Realistic and Diverse Agents for Autonomous Driving Simulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.03195) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [Planning, Waymo]\n- [JFP: Joint Future Prediction with Interactive Multi-Agent Modeling for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08710) [Planning, Waymo]\n- [MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.06278) 
\u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [3D Semantic Scene Completion: a Survey](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.07466) \u003Ckbd>IJCV 2022\u003C\u002Fkbd>\n- [DETIC: Detecting Twenty-thousand Classes using Image-level Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.02605) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [Atlas: End-to-End 3D Scene Reconstruction from Posed Images](https:\u002F\u002Fgithub.com\u002Fmagicleap\u002FAtlas) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [TransformerFusion: Monocular RGB Scene Reconstruction using Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [SimpleOccupancy: A Simple Attempt for 3D Occupancy Estimation in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10076) [Occupancy Network]\n- [OccDepth: A Depth-Aware Method for 3D Semantic Scene Completion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13540) [Occupancy Network, stereo]\n- [Fast-BEV: Towards Real-time On-vehicle Bird's-Eye View Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07870) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [Fast-BEV: A Fast and Strong Bird's-Eye View Perception Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12511) \n- [ProphNet: Efficient Agent-Centric Motion Forecasting with Anchor-Informed Proposals](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12071) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [Qcraft, prediction]\n- [Motion Transformer with Global Intention Localization and Local Movement Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.13508) \u003Ckbd>NeurIPS 2022 Oral\u003C\u002Fkbd>\n- [P4P: Conflict-Aware Motion Prediction for Planning in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.01634)\n- [MultiPath++: Efficient Information Fusion and Trajectory Aggregation for Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.14973)\n- [ViP3D: End-to-end Visual Trajectory Prediction via 3D Agent Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.01582)\n- [SAM: Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643) [FAIR]\n- [GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling for Multi-view 3D Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.11325)\n- [Motion Prediction using Trajectory Sets and Self-Driving Domain Knowledge](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.04767) [Encode Road requirement to prediction]\n- [Transformer Feed-Forward Layers Are Key-Value Memories](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.14913) \u003Ckbd>EMNLP 2021\u003C\u002Fkbd>\n- [BEV-LaneDet: a Simple and Effective 3D Lane Detection Baseline](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.06006) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [BEVNet]\n- [Exploring Recurrent Long-term Temporal Fusion for Multi-view 3D Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.05970) [BEVNet, megvii]\n- [VAD: Vectorized Scene Representation for Efficient Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.12077) [Horizon]\n- [A Simple Attempt for 3D Occupancy Estimation in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.10076)\n- [BEVPoolv2: A Cutting-edge Implementation of BEVDet Toward Deployment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.17111) [BEVDet, PhiGent]\n- [NVRadarNet: Real-Time Radar Obstacle and Free Space Detection for Autonomous 
Driving](http:\u002F\u002Faixpaper.com\u002Fview\u002Fnvradarnet_realtime_radar_obstacle_and_free_space_detection_for_autonomous_driving)\n- [GraspNet-1Billion: A Large-Scale Benchmark for General Object Grasping](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_CVPR_2020\u002Fpapers\u002FFang_GraspNet-1Billion_A_Large-Scale_Benchmark_for_General_Object_Grasping_CVPR_2020_paper.pdf) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Cewu Lu]\n- [AnyGrasp: Robust and Efficient Grasp Perception in Spatial and Temporal Domains](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.08333) [Cewu Lu]\n- [Point Cloud Forecasting as a Proxy for 4D Occupancy Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.13130)\n- [HDGT: Heterogeneous Driving Graph Transformer for Multi-Agent Trajectory Prediction via Scene Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.09753)\n- [MTR: Motion Transformer with Global Intention Localization and Local Movement Refinement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.13508) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [UVTR: Unifying Voxel-based Representation with Transformer for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.00630) [BEVFusion, Megvii, BEVNet, camera + lidar]\n- [Don't Use Large Mini-Batches, Use Local SGD](https:\u002F\u002Farxiv.org\u002Fabs\u002F1808.07217) \u003Ckbd>ICLR 2020\u003C\u002Fkbd>\n- [Grokking: Generalization beyond Overfitting on small algorithmic datasets](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.02177)\n- [Progress measures for grokking via mechanistic interpretability]()\n- [Understanding deep learning requires rethinking generalization](https:\u002F\u002Farxiv.org\u002Fabs\u002F1611.03530) \u003Ckbd>ICLR 2017\u003C\u002Fkbd>\n- [Unifying Grokking and Double Descent](https:\u002F\u002Fopenreview.net\u002Fforum?id=JqtHMZtqWm)\n- [Deep Interactive Motion Prediction and Planning: Playing Games with Motion Prediction Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02392) \u003Ckbd>L4DC 2022\u003C\u002Fkbd>\n- [Interactive Prediction and Planning for Autonomous Driving: from Algorithms to Fundamental Aspects](https:\u002F\u002Fescholarship.org\u002Fuc\u002Fitem\u002F0vf4q2x1) [PhD thesis of Wei Zhan, 2019]\n- [Lyft1001: One Thousand and One Hours: Self-driving Motion Prediction Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.14480) [Lyft Level 5, prediction dataset]\n- [PCAccumulation: Dynamic 3D Scene Analysis by Point Cloud Accumulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.12394) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [UniSim: A Neural Closed-Loop Sensor Simulator](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fpapers\u002FYang_UniSim_A_Neural_Closed-Loop_Sensor_Simulator_CVPR_2023_paper.pdf) \u003Ckbd>CVPR 2023\u003C\u002Fkbd> [simulation, Raquel]\n- [GeoSim: Realistic Video Simulation via Geometry-Aware Composition for\nSelf-Driving](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FChen_GeoSim_Realistic_Video_Simulation_via_Geometry-Aware_Composition_for_Self-Driving_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Accelerating Reinforcement Learning for Autonomous Driving using Task-Agnostic and Ego-Centric Motion Skills](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.12072) [Driving Skill]\n- [Efficient Reinforcement Learning for Autonomous Driving with Parameterized Skills and Priors](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.04412) \u003Ckbd>RSS 2023\u003C\u002Fkbd> [Driving Skill]\n- [IL 
Difficulty Model: Embedding Synthetic Off-Policy Experience for Autonomous Driving via Zero-Shot Curricula](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.01375) \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Waymo]\n- [Neural Map Prior for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08481) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Track Anything: Segment Anything Meets Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11968)\n- [Self-Supervised Camera Self-Calibration from Video](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.03325) \u003Ckbd>ICRA 2022\u003C\u002Fkbd> [TRI, calibration]\n- [Real-time Online Video Detection with Temporal Smoothing Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09236) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [ConvLSTM-style cross-attention]\n- [NeRF-Supervised Deep Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.17603) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [GET3D: A Generative Model of High Quality 3D Textured Shapes Learned from Images](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11163) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [OmniObject3D: Large-Vocabulary 3D Object Dataset for Realistic Perception, Reconstruction and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.07525) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [Ego-Body Pose Estimation via Ego-Head Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.04636) \u003Ckbd>CVPR 2023\u003C\u002Fkbd>\n- [PanoOcc: Unified Occupancy Representation for Camera-based 3D Panoptic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.10013)\n- [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597)\n- [Visual Instruction Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.08485)\n- [VideoChat: Chat-Centric Video Understanding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.06355)\n- [CoBEVT: Cooperative Bird's Eye View Semantic Segmentation with Sparse Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.02202) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [BEVFormer v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition via Perspective Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10439) [BEVNet, Jifeng Dai]\n- [Fast-BEV: Towards Real-time On-vehicle Bird’s-Eye View Perception](https:\u002F\u002Fml4ad.github.io\u002Ffiles\u002Fpapers2022\u002FFast-BEV:%20Towards%20Real-time%20On-vehicle%20Bird's-Eye%20View%20Perception.pdf) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [Traj++: Human Trajectory Forecasting in Crowds: A Deep Learning Perspective](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.03639) \u003Ckbd>TITS 2021\u003C\u002Fkbd>\n- [Data Driven Prediction Architecture for Autonomous Driving and its Application on Apollo Platform](https:\u002F\u002Farxiv.org\u002Fabs\u002F2006.06715) \u003Ckbd>IV 2020\u003C\u002Fkbd> [Baidu]\n- [THOMAS: Trajectory Heatmap Output with learned Multi-Agent Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.06607) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [Learning Lane Graph Representations for Motion Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.13732) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [Identifying Driver Interactions via Conditional Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09959) \u003Ckbd>ICRA 2021\u003C\u002Fkbd> [Waymo]\n- [Trajectron++: Dynamically-Feasible Trajectory Forecasting With Heterogeneous 
Data](https:\u002F\u002Farxiv.org\u002Fabs\u002F2001.03093) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [TPNet: Trajectory Proposal Network for Motion Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.12255) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [GOHOME: Graph-Oriented Heatmap Output for future Motion Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01827)\n- [PECNet: It Is Not the Journey but the Destination: Endpoint Conditioned Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.02025) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [From Goals, Waypoints & Paths To Long Term Human Trajectory Forecasting](https:\u002F\u002Farxiv.org\u002Fabs\u002F2012.01526) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [PRECOG: PREdiction Conditioned On Goals in Visual Multi-Agent Settings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.01296) \u003Ckbd>ICCV 2019\u003C\u002Fkbd>\n- [PiP: Planning-informed Trajectory Prediction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.11476) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- [MultiPath: Multiple Probabilistic Anchor Trajectory Hypotheses for Behavior Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.05449) \u003Ckbd>CoRL 2019\u003C\u002Fkbd>\n- [LaPred: Lane-Aware Prediction of Multi-Modal Future Trajectories of Dynamic Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00249) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [PRIME: Learning to Predict Vehicle Trajectories with Model-based Planning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04027) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n- [A Flexible and Explainable Vehicle Motion Prediction and Inference Framework Combining Semi-Supervised AOG and ST-LSTM](https:\u002F\u002Fdl.acm.org\u002Fdoi\u002Fabs\u002F10.1109\u002FTITS.2020.3016304) \u003Ckbd>TITS 2020\u003C\u002Fkbd>\n- [Multi-Modal Trajectory Prediction of Surrounding Vehicles with Maneuver based LSTMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.05499) \u003Ckbd>IV 2018\u003C\u002Fkbd> [Trivedi]\n- [HYPER: Learned Hybrid Trajectory Prediction via Factored Inference and Adaptive Sampling](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.02344) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [Trajectory Prediction with Linguistic Representations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.09741) \u003Ckbd>ICRA 2022\u003C\u002Fkbd>\n- [What-If Motion Prediction for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.10587)\n- [End-to-end Contextual Perception and Prediction with Interaction Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2008.05927) \u003Ckbd>IROS 2020\u003C\u002Fkbd> [Auxiliary collision loss, scene compliant pred]\n- [SafeCritic: Collision-Aware Trajectory Prediction](https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.06673) \u003Ckbd>BMVC 2019\u003C\u002Fkbd> [IRL, scene compliant pred]\n- [Large Scale Interactive Motion Forecasting for Autonomous Driving: The Waymo Open Motion Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.10133) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [Waymo]\n- [Interaction-Based Trajectory Prediction Over a Hybrid Traffic Graph](https:\u002F\u002Farxiv.org\u002Fabs\u002F2009.12916) \u003Ckbd>IROS 2020\u003C\u002Fkbd>\n- [Joint Interaction and Trajectory Prediction for Autonomous Driving using Graph Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.07882) \u003Ckbd>NeurIPS 2019 workshop\u003C\u002Fkbd>\n- [Fast Risk Assessment for Autonomous Vehicles Using Learned Models of Agent 
Futures](https:\u002F\u002Farxiv.org\u002Fabs\u002F2005.13458) \u003Ckbd>Robotics: science and systems 2020\u003C\u002Fkbd>\n- [Monocular 3D Object Detection: An Extrinsic Parameter Free Approach](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.15796) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [PJLab]\n- [UniFormer: Unified Multi-view Fusion Transformer for Spatial-Temporal Representation in Bird's-Eye-View](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.08536) [BEVFormer, BEVNet, Temporal]\n- [GitNet: geometric prior-based transformation for bird's eye view segmentation]()\n- [WBF: weighted box fusion: ensembling boxes from different object detection modules]()\n- [NNI: auto parameter finding algorithm]()\n- [BEVFormer++: Improving BEVFormer for 3D Camera-only Object Detection](https:\u002F\u002Fstorage.googleapis.com\u002Fwaymo-uploads\u002Ffiles\u002Fresearch\u002F3DCam\u002F3DCam_BEVFormer.pdf) [Waymo open dataset challenge 1st place in mono3d]\n- [LET-3D-AP: Longitudinal Error Tolerant 3D Average Precision for Camera-Only 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07705) [Waymo open dataset challenge official metric]\n- [High-Level Interpretation of Urban Road Maps Fusing Deep Learning-Based Pixelwise Scene Segmentation and Digital Navigation Maps](https:\u002F\u002Fpdfs.semanticscholar.org\u002F44ac\u002F01c0d356f22e7ee883f8e4ac2cccf199f68d.pdf) \u003Ckbd>Journal of Advanced Transportation 2018\u003C\u002Fkbd>\n- [A Hybrid Vision-Map Method for Urban Road Detection](https:\u002F\u002Fdownloads.hindawi.com\u002Fjournals\u002Fjat\u002F2017\u002F7090549.pdf) \u003Ckbd>Journal of Advanced Transportation 2017\u003C\u002Fkbd>\n- [Terminology and Analysis of Map Deviations in Urban Domains: Towards Dependability for HD Maps in Automated Vehicles](https:\u002F\u002Fwww.researchgate.net\u002Fprofile\u002FChristopher-Plachetka\u002Fpublication\u002F348367176_Terminology_and_Analysis_of_Map_Deviations_in_Urban_Domains_Towards_Dependability_for_HD_Maps_in_Automated_Vehicles\u002Flinks\u002F607d523f907dcf667babc06b\u002FTerminology-and-Analysis-of-Map-Deviations-in-Urban-Domains-Towards-Dependability-for-HD-Maps-in-Automated-Vehicles.pdf) \u003Ckbd>IV 2020\u003C\u002Fkbd>\n- [TIME WILL TELL: NEW OUTLOOKS AND A BASELINE FOR TEMPORAL MULTI-VIEW 3D OBJECT DETECTION](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.02443)\n- [Conditional DETR for Fast Training Convergence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12329) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [DN-DETR: Accelerate DETR Training by Introducing Query DeNoising](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.01305) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03605)\n- [Trajectory Forecasting from Detection with Uncertainty-Aware Motion Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.01478) [Ouyang Wanli]\n- [Vision-based Uneven BEV Representation Learning with Polar Rasterization and Surface Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.01878) [BEVNet, polar]\n- [MUTR3D: A Multi-camera Tracking Framework via 3D-to-2D Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.00613) [BEVNet, tracking] \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd> [Hang Zhao]\n- [ST-P3: End-to-end Vision-based Autonomous Driving via Spatial-Temporal 
Feature Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.07601) \u003Ckbd>ECCV 2022\u003C\u002Fkbd> [Hongyang Li]\n- [GKT: Efficient and Robust 2D-to-BEV Representation Learning via Geometry-guided Kernel Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.04584) [BEVNet, Horizon]\n- [SiamRPN: High Performance Visual Tracking with Siamese Region Proposal Network](https:\u002F\u002Fyan-junjie.github.io\u002Fpublication\u002Fdblp-confcvpr-li-ywzh-18\u002Fdblp-confcvpr-li-ywzh-18.pdf) \u003Ckbd>CVPR 2018\u003C\u002Fkbd>\n- [TPLR: Topology Preserving Local Road Network Estimation from Single Onboard Camera Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10155) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [STSU, Luc Van Gool]\n- [LaRa: Latents and Rays for Multi-Camera Bird's-Eye-View Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.13294) [Valeo, BEVNet, polar]\n- [PolarDETR: Polar Parametrization for Vision-based Surround-View 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.10965) [BEVNet]\n- [Exploring Geometric Consistency for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.05858) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01178) \u003Ckbd>WACV 2022\u003C\u002Fkbd> [mono3D]\n- [Learning to Predict 3D Lane Shape and Camera Pose from a Single Image via Geometry Constraints](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.15351) \u003Ckbd>AAAI 2022\u003C\u002Fkbd>\n- [Detecting Lane and Road Markings at A Distance with Perspective Transformer Layers](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F9294383) \u003Ckbd>ICICN 2021\u003C\u002Fkbd> [BEVNet, lane line]\n- [Unsupervised Labeled Lane Markers Using Maps](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_ICCVW_2019\u002Fpapers\u002FCVRSUAD\u002FBehrendt_Unsupervised_Labeled_Lane_Markers_Using_Maps_ICCVW_2019_paper.pdf) \u003Ckbd>ICCV 2019 workshop\u003C\u002Fkbd> [Bosch, 2D lane line]\n- [M3DeTR: Multi-representation, Multi-scale, Mutual-relation 3D Object Detection with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.11896) [Lidar detection, Waymo open dataset] \u003Ckbd>WACV 2022\u003C\u002Fkbd>\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048) [lane line dataset]\n- [Robust Monocular 3D Lane Detection With Dual Attention](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9506296) \u003Ckbd>ICIP 2021\u003C\u002Fkbd>\n- [OcclusionFusion: Occlusion-aware Motion Estimation for Real-time Dynamic 3D Reconstruction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.07977) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.02178) \u003Ckbd>ICLR 2022\u003C\u002Fkbd> [lightweight Transformers]\n- [XFormer: Lightweight Vision Transformer with Cross Feature Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.07268) [Samsung]\n- [CenterFormer: Center-based Transformer for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05588) \u003Ckbd>ECCV 2022 oral\u003C\u002Fkbd> [TuSimple]\n- [LidarMultiNet: Towards a Unified Multi-task Network for LiDAR Perception](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.09385) [2022 Waymo Open Dataset, TuSimple]\n- [MTRA: 1st Place 
Solution for 2022 Waymo Open Dataset Challenge - Motion Prediction](https:\u002F\u002Fstorage.googleapis.com\u002Fwaymo-uploads\u002Ffiles\u002Fresearch\u002FMotionPred\u002FMotionPrediction_MTRA.pdf) [Waymo open dataset challenge 1st place in motion prediction]\n- [BEVSegFormer: Bird's Eye View Semantic Segmentation From Arbitrary Camera Rigs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.04050) [BEVNet]\n- [Panoptic SegFormer: Delving Deeper into Panoptic Segmentation with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.03814) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [nVidia]\n- [Efficiently Identifying Task Groupings for Multi-Task Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.04617) \u003Ckbd>NeurIPS 2021 spotlight\u003C\u002Fkbd> [MTL]\n- [Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.05482) [Google, Golden Backbone]\n- [\"The Pedestrian next to the Lamppost\" Adaptive Object Graphs for Better Instantaneous Mapping](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.02944) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [GitNet: Geometric Prior-based Transformation for Birds-Eye-View Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07733) [BEVNet, Baidu]\n- [FUTR3D: A Unified Sensor Fusion Framework for 3D Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.10642) [Hang Zhao]\n- [MonoFormer: Towards Generalization of self-supervised monocular depth estimation with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.11083) [monodepth]\n- [Time3D: End-to-End Joint Monocular 3D Object Detection and Tracking for Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14882)\n- [cosFormer: Rethinking Softmax in Attention](https:\u002F\u002Farxiv.org\u002Fabs\u002F2202.08791) \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [StretchBEV: Stretching Future Instance Prediction Spatially and Temporally](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.13641) [BEVNet, prediction]\n- [Scene Representation in Bird’s-Eye View from Surrounding Cameras with Transformers](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022W\u002FWAD\u002Fpapers\u002FZhao_Scene_Representation_in_Birds-Eye_View_From_Surrounding_Cameras_With_Transformers_CVPRW_2022_paper.pdf) [BEVNet, LLD] \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd>\n- [Multi-Frame Self-Supervised Depth with Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07616) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [It's About Time: Analog Clock Reading in the Wild](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09162) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Andrew Zisserman]\n- [SurroundDepth: Entangling Surrounding Views for Self-Supervised Multi-Camera Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03636) \u003Ckbd>CoRL 2022\u003C\u002Fkbd> [Jiwen Lu]\n- [ONCE-3DLanes: Building Monocular 3D Lane Detection Benchmark](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.00301) \u003Ckbd>CVPR 2022\u003C\u002Fkbd>\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048) \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd> [3D LLD]\n- [Multi-modal 3D Human Pose Estimation with 2D Weak Supervision in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12141) \u003Ckbd>CVPR 2022 
workshop\u003C\u002Fkbd>\n- [A Simple Baseline for BEV Perception Without LiDAR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.07959) [TRI, BEVNet, vision+radar]\n- [Reconstruct from Top View: A 3D Lane Detection Approach based on Geometry Structure Prior](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022W\u002FWAD\u002Fpapers\u002FLi_Reconstruct_From_Top_View_A_3D_Lane_Detection_Approach_Based_CVPRW_2022_paper.pdf) \u003Ckbd>CVPR 2022 workshop\u003C\u002Fkbd>\n- [RIDDLE: Lidar Data Compression with Range Image Deep Delta Encoding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2206.01738) \u003Ckbd>CVPR 2022\u003C\u002Fkbd> [Waymo, Charles Qi]\n- [Occupancy Flow Fields for Motion Forecasting in Autonomous Driving](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03875) \u003Ckbd>RAL 2022\u003C\u002Fkbd> [Waymo occupancy flow challenge]\n- [Safe Local Motion Planning with Self-Supervised Freespace Forecasting](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FHu_Safe_Local_Motion_Planning_With_Self-Supervised_Freespace_Forecasting_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd>\n- [数据闭环的核心 - Auto-labeling 方案分享](https:\u002F\u002Fzhuanlan.zhihu.com\u002Fp\u002F533907821)\n- [K-Lane: Lidar Lane Dataset and Benchmark for Urban Roads and Highways](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.11048)\n- [LETR: Line Segment Detection Using Transformers without Edges](https:\u002F\u002Farxiv.org\u002Fabs\u002F2101.01909) \u003Ckbd>CVPR 2021 oral\u003C\u002Fkbd>\n- [HDMapGen: A Hierarchical Graph Generative Model of High Definition Maps](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FMi_HDMapGen_A_Hierarchical_Graph_Generative_Model_of_High_Definition_Maps_CVPR_2021_paper.pdf) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [HD mapping]\n- [SketchRNN: A Neural Representation of Sketch Drawings](https:\u002F\u002Farxiv.org\u002Fabs\u002F1704.03477) [David Ha]\n- [PolyGen: An Autoregressive Generative Model of 3D Meshes](https:\u002F\u002Farxiv.org\u002Fabs\u002F2002.10880) \u003Ckbd>ICML 2020\u003C\u002Fkbd>\n- [SOLQ: Segmenting Objects by Learning Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.02351) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Megvii, end-to-end, instance segmentation]\n- [MonoViT: Self-Supervised Monocular Depth Estimation with a Vision Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.03543) \u003Ckbd>3DV 2022\u003C\u002Fkbd>\n- [MVSTER: Epipolar Transformer for Efficient Multi-View Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.07346) \u003Ckbd>ECCV 2022\u003C\u002Fkbd>\n- [MOVEDepth: Crafting Monocular Cues and Velocity Guidance for Self-Supervised Multi-Frame Depth Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.09170) [MVS + monodepth]\n- [SurroundDepth: Entangling Surrounding Views for Self-Supervised Multi-Camera Depth Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.03636)\n- [Scene Transformer: A unified architecture for predicting multiple agent trajectories](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.08417) [prediction, Waymo] \u003Ckbd>ICLR 2022\u003C\u002Fkbd>\n- [SSIA: Monocular Depth Estimation with Self-supervised Instance Adaptation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.05821) [VGG team, TTR, test time refinement, CVD]\n- [CoMoDA: Continuous Monocular Depth Adaptation Using Past 
Experiences](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FWACV2021\u002Fpapers\u002FKuznietsov_CoMoDA_Continuous_Monocular_Depth_Adaptation_Using_Past_Experiences_WACV_2021_paper.pdf) \u003Ckbd>WACV 2021\u003C\u002Fkbd>\n- [MonoRec: Semi-supervised dense reconstruction in dynamic environments from a single moving camera](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.11814) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Daniel Cremers]\n- [Plenoxels: Radiance Fields without Neural Networks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.05131)\n- [Lidar with Velocity: Motion Distortion Correction of Point Clouds from Oscillating Scanning Lidars](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09497) [Livox, ISEE]\n- [NWD: A Normalized Gaussian Wasserstein Distance for Tiny Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.13389)\n- [Towards Optimal Strategies for Training Self-Driving Perception Models in Simulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07971) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Sanja Fidler]\n- [Insta-DM: Learning Monocular Depth in Dynamic Scenes via Instance-Aware Projection Consistency](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.02629) \u003Ckbd>AAAI 2021\u003C\u002Fkbd>\n- [Instance-wise Depth and Motion Learning from Monocular Videos](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.09351) \u003Ckbd>NeurIPS 2020 workshop\u003C\u002Fkbd> [[website](https:\u002F\u002Fsites.google.com\u002Fsite\u002Fseokjucv\u002Fhome\u002Finstadm)]\n- [NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08934) \u003Ckbd>ECCV 2020 oral\u003C\u002Fkbd>\n- [BARF: Bundle-Adjusting Neural Radiance Fields](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06405) \u003Ckbd>ICCV 2021 oral\u003C\u002Fkbd>\n- [NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01129) \u003Ckbd>ICCV 2021 oral\u003C\u002Fkbd>\n- [YOLinO: Generic Single Shot Polyline Detection in Real Time](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14420) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd> [lld]\n- [MonoRCNN: Geometry-based Distance Decomposition for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03775) \u003Ckbd>ICCV 2021\u003C\u002Fkbd>\n- [MonoCInIS: Camera Independent Monocular 3D Object Detection using Instance Segmentation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.00464) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd>\n- [PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1912.13192) \u003Ckbd>CVPR 2020\u003C\u002Fkbd> [Waymo challenge 2nd place]\n- [Geometry-based Distance Decomposition for Monocular 3D Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.03775) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [mono3D]\n- [Offboard 3D Object Detection from Point Cloud Sequences](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.05073) \u003Ckbd>CVPR 2021\u003C\u002Fkbd> [Charles Qi] \n- [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F1909.02466) \u003Ckbd>NeurIPS 2019\u003C\u002Fkbd>\n- [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.03496)\n- [Probabilistic Anchor Assignment with IoU Prediction for Object Detection](https:\u002F\u002Farxiv.org\u002Fabs\u002F2007.08103) \u003Ckbd>ECCV 2020\u003C\u002Fkbd>\n- 
[FOVEA: Foveated Image Magnification for Autonomous Navigation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.12102) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [Argo]\n- [PifPaf: Composite Fields for Human Pose Estimation](https:\u002F\u002Farxiv.org\u002Fabs\u002F1903.06593) \u003Ckbd>CVPR 2019\u003C\u002Fkbd>\n- [Monocular 3D Localization of Vehicles in Road Scenes](https:\u002F\u002Favvision.xyz\u002Ficcv21\u002Fpapers\u002F1\u002FCameraReady\u002F01.pdf) \u003Ckbd>ICCV 2021 workshop\u003C\u002Fkbd> [mono3D, tracking]\n- [TransformerFusion: Monocular RGB Scene Reconstruction using Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.02191)\n- [Conditional DETR for Fast Training Convergence](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.06152)\n- [Anchor DETR: Query Design for Transformer-Based Detector](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.07107) [megvii]\n- [PGD: Probabilistic and Geometric Depth: Detecting Objects in Perspective](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.14160) \u003Ckbd>CoRL 2021\u003C\u002Fkbd>\n- [Adaptive Wing Loss for Robust Face Alignment via Heatmap Regression](https:\u002F\u002Farxiv.org\u002Fabs\u002F1904.07399) \n- [What Makes for End-to-End Object Detection?](https:\u002F\u002Fproceedings.mlr.press\u002Fv139\u002Fsun21b.html) \u003Ckbd>PMLR 2021\u003C\u002Fkbd>\n- [Instances as Queries](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01928) \u003Ckbd>ICCV 2021\u003C\u002Fkbd> [instance segmentation]\n- [One Million Scenes for Autonomous Driving: ONCE Dataset](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11037) [Huawei]\n- [NVS-MonoDepth: Improving Monocular Depth Prediction with Novel View Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.12577) \u003Ckbd>3DV 2021\u003C\u002Fkbd>\n- [Is 2D Heatmap Representation Even Necessary for Human Pose Estimation?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.03332)\n- [Topology Preserving Local Road Network Estimation from Single Onboard Camera Image](https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.10155) [BEVNet, Luc Van Gool]\n- [Can Generalist Foundation Models Outcompete Special-Purpose Tuning? 
Case Study in Medicine](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.16452) [Small LLM prompting, Microsoft]\n- [CoT: Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.11903) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd>\n- [ToT: Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.10601) [[Notes](paper_notes\u002Ftot.md)] \u003Ckbd>NeurIPS 2023 Oral\u003C\u002Fkbd>\n- [Cumulative Reasoning with Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.04371)\n- [A Survey of Techniques for Maximizing LLM Performance](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=ahnGLM-RC1Y&ab_channel=OpenAI) [OpenAI]\n- [Drive AGI](https:\u002F\u002Fgithub.com\u002FOpenDriveLab\u002FDriveAGI)\n- [Harnessing the Power of Multi-Modal LLMs for Autonomy](https:\u002F\u002Fwww.ghostautonomy.com\u002Fblog\u002Fmllms-for-autonomy) [Ghost Autonomy]\n- [Language to Rewards for Robotic Skill Synthesis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.08647)\n- [ALOHA: Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13705)\n- [LLM-Grounder: Open-Vocabulary 3D Visual Grounding with Large Language Model as an Agent](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.12311) [UM]\n- [LM-Nav: Robotic Navigation with Large Pre-Trained Models of Language, Vision, and Action](https:\u002F\u002Farxiv.org\u002Fabs\u002F2207.04429) [Sergey Levine]\n- [A Survey of Embodied AI: From Simulators to Research Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.04918) \u003Ckbd>IEEE TETCI 2021\u003C\u002Fkbd>\n- [Habitat Challenge 2021](https:\u002F\u002Faihabitat.org\u002Fchallenge\u002F2021\u002F)\n- [Video ChatCaptioner: Towards Enriched Spatiotemporal Descriptions](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.04227)\n- [DoReMi: Grounding Language Model by Detecting and Recovering from Plan-Execution Misalignment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.00329) [Jianyu Chen]\n- [The Power of Scale for Parameter-Efficient Prompt Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.08691) \u003Ckbd>EMNLP 2021\u003C\u002Fkbd>\n- [Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.07207) \u003Ckbd>ICML 2022\u003C\u002Fkbd>\n- [ProgPrompt: Generating Situated Robot Task Plans using Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.11302) \u003Ckbd>ICRA 2023\u003C\u002Fkbd>\n- [Perceiver-Actor: A Multi-Task Transformer for Robotic Manipulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.05451) \u003Ckbd>CoRL 2022\u003C\u002Fkbd>\n- [LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2208.07339) \u003Ckbd>NeurIPS 2022\u003C\u002Fkbd> [LLM Quant]\n- [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.00978) [Song Han, LLM Quant]\n- [RoFormer: Enhanced Transformer with Rotary Position Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09864)\n- [CoDi: Any-to-Any Generation via Composable Diffusion](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11846) \u003Ckbd>NeurIPS 2023\u003C\u002Fkbd>\n- [What if a Vacuum Robot has an Arm?](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10202493) \u003Ckbd>UR 2023\u003C\u002Fkbd>\n- [FlashAttention: Fast and 
Memory-Efficient Exact Attention with IO-Awareness](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.14135)\n- [GPT in 60 Lines of NumPy](https:\u002F\u002Fjaykmody.com\u002Fblog\u002Fgpt-from-scratch\u002F)\n- [Speeding up the GPT - KV cache](https:\u002F\u002Fwww.dipkumar.dev\u002Fbecoming-the-unbeatable\u002Fposts\u002Fgpt-kvcache\u002F)\n- [LLM Parameter Counting](https:\u002F\u002Fkipp.ly\u002Ftransformer-param-count\u002F)\n- [Transformer Inference Arithmetic](https:\u002F\u002Fkipp.ly\u002Ftransformer-inference-arithmetic\u002F#kv-cache)\n- [ALBEF: Align before Fuse: Vision and Language Representation Learning with Momentum Distillation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2107.07651) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd> [Junnan Li]\n- [CLIP: Learning Transferable Visual Models From Natural Language Supervision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) \u003Ckbd>ICML 2021\u003C\u002Fkbd> [OpenAI]\n- [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2201.12086) \u003Ckbd>ICML 2022\u003C\u002Fkbd> [Junnan Li]\n- [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2301.12597) [Junnan Li]\n- [MOO: Open-World Object Manipulation using Pre-trained Vision-Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.00905) [Google Robotics, end-to-end visuomotor]\n- [VC-1: Where are we in the search for an Artificial Visual Cortex for Embodied Intelligence?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.18240)\n- [CLIPort: What and Where Pathways for Robotic Manipulation](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.12098) \u003Ckbd>CoRL 2021\u003C\u002Fkbd> [Nvidia, end-to-end visuomotor]\n- [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.17323) \u003Ckbd>ICLR 2023\u003C\u002Fkbd>\n- [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.10438) \u003Ckbd>ICML 2023\u003C\u002Fkbd> [Song Han, LLM Quant]\n- [SAPIEN: A SimulAted Part-based Interactive ENvironment](https:\u002F\u002Farxiv.org\u002Fabs\u002F2003.08515) \u003Ckbd>CVPR 2020\u003C\u002Fkbd>\n- [FiLM: Visual Reasoning with a General Conditioning Layer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1709.07871) \u003Ckbd>AAAI 2018\u003C\u002Fkbd>\n- [TokenLearner: What Can 8 Learned Tokens Do for Images and Videos?](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.11297) \u003Ckbd>NeurIPS 2021\u003C\u002Fkbd>\n- [QLoRA: Efficient Finetuning of Quantized LLMs](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.14314)\n- [OVO: Open-Vocabulary Occupancy](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.16133)\n- [Code Llama: Open Foundation Models for Code](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12950)\n- [Chinchilla: Training Compute-Optimal Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.15556) [DeepMind]\n- [GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13245)\n- [RoFormer: Enhanced Transformer with Rotary Position Embedding](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.09864)\n- [RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.00595)\n- [Perceiver-Actor: A Multi-Task Transformer for 
Robotic Manipulation]()\n- [VIMA: General Robot Manipulation with Multimodal Prompts]()\n- [An Attention Free Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.14103) [Apple]\n- [PDDL Planning with Pretrained Large Language Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.11014) [MIT, Leslie Kaelbling]\n- [Task and Motion Planning with Large Language Models for Object Rearrangement](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.06247) \u003Ckbd>IROS 2023\u003C\u002Fkbd>","# Learning-Deep-Learning 快速上手指南\n\n**项目简介**：\n`Learning-Deep-Learning` 并非一个需要编译运行的软件库，而是一个由 NVIDIA 自动驾驶 AI 总监 Patrick Liu 维护的**深度学习与机器学习论文笔记合集**。该项目专注于计算机视觉、自动驾驶（如 BEV 感知、占据网络、端到端驾驶）以及最新的大模型扩散技术。它适合希望系统阅读前沿论文的中国开发者作为学习路线图和参考资料。\n\n由于本项目本质是文档仓库，无需复杂的安装过程，只需克隆代码库并在本地或浏览器中查看即可。\n\n## 1. 环境准备\n\n本项目无特殊系统要求，仅需基础的 Git 环境和文本编辑器\u002F浏览器。\n\n*   **操作系统**：Windows \u002F macOS \u002F Linux\n*   **前置依赖**：\n    *   `git`：用于克隆仓库\n    *   现代浏览器（Chrome, Edge, Firefox 等）：用于阅读 Markdown 渲染后的内容\n    *   （可选）Markdown 编辑器（如 VS Code + Markdown Preview Enhanced）：用于本地获得更好的阅读体验\n\n## 2. 安装步骤\n\n通过 Git 将仓库克隆到本地。国内用户推荐使用 Gitee 镜像（如有）或使用加速代理，若直接连接 GitHub 较慢，可配置终端代理。\n\n```bash\n# 克隆仓库\ngit clone https:\u002F\u002Fgithub.com\u002Fpatrick-llgc\u002FLearning-Deep-Learning.git\n\n# 进入目录\ncd Learning-Deep-Learning\n```\n\n> **提示**：如果 GitHub 连接缓慢，可在命令前添加代理设置，例如：\n> `export https_proxy=http:\u002F\u002F127.0.0.1:7890 http_proxy=http:\u002F\u002F127.0.0.1:7890` (根据实际代理端口调整)\n\n## 3. 基本使用\n\n### 方式一：在线浏览（推荐）\n作者已生成静态网页，无需下载即可直接阅读整理好的笔记和论文列表。\n*   **主页地址**：[https:\u002F\u002Fpatrick-llgc.github.io\u002FLearning-Deep-Learning\u002F](https:\u002F\u002Fpatrick-llgc.github.io\u002FLearning-Deep-Learning\u002F)\n*   **博客专栏**：[The Thinking Car (Medium)](https:\u002F\u002Fmedium.com\u002Fthe-thinking-car) (部分深度综述文章)\n\n### 方式二：本地阅读\n在本地文件系统中直接打开 `.md` 文件，或使用 VS Code 预览。\n\n1.  **入门路线**：\n    如果你是计算机视觉新手，建议首先阅读 `start` 目录下的入门论文列表及笔记：\n    *   论文列表：`start\u002Ffirst_cnn_papers.md`\n    *   对应笔记：`start\u002Ffirst_cnn_papers_notes.md`\n\n2.  **按主题查阅**：\n    进入 `topics\u002F` 目录查看特定领域的笔记（如 BEV 感知、占据网络、3D 车道线检测等），或查看 `paper_notes\u002F` 目录获取最新论文（如 2025 年的扩散模型、VLA 模型等）的详细解读。\n\n    ```bash\n    # 示例：在终端快速查看某个主题的笔记内容\n    cat topics\u002Ftopic_occupancy_network.md\n    ```\n\n3.  **追踪最新动态**：\n    查看根目录或按月分类的文件夹（如 `2025-01`, `2025-09`），获取关于 Diffusion LLM、端到端自动驾驶（End-to-End Driving）等最新论文的简要笔记和链接。\n\n### 核心资源索引\n*   **可信论文源列表**：`trusty.md`\n*   **AI 播客笔记**：`podcast\u002F` (包含 OpenAI 等专家访谈笔记)\n*   **技术速查表 (Scratchpad)**：`gist\u002F` (包含计算硬件、Attention Mask 等快速笔记)","某自动驾驶初创公司的感知算法工程师正在为量产项目调研最新的单目 3D 车道线检测方案，急需快速掌握学术界前沿进展并复现核心思路。\n\n### 没有 Learning-Deep-Learning 时\n- 在海量论文中盲目搜索，难以辨别哪些是真正经过工业界验证的高质量文章，容易陷入低效阅读。\n- 面对复杂的数学公式和模型架构，缺乏直观的图解和通俗的笔记辅助，理解成本极高，往往读几页就放弃。\n- 无法快速建立知识体系，不知道从哪篇经典论文入手，导致学习路径混乱，浪费数周时间仍在基础概念上打转。\n- 缺少将理论与实际工程（如 BEV 感知、占用网络）结合的案例分析，难以将学术成果转化为落地代码。\n\n### 使用 Learning-Deep-Learning 后\n- 直接参考作者整理的“可信论文源”和“首月必读清单”，迅速锁定如 Monocular 3D Lane Line Detection 等关键文献，精准高效。\n- 借助作者提供的详细阅读笔记和可视化图解，轻松攻克 Transformer 张量重塑、语义占用预测等晦涩难点，理解速度提升数倍。\n- 跟随作者从物理学家转型 AI 专家的实战路径，按主题（如拥挤场景检测、动态物体 SLAM）系统构建知识树，学习路线清晰明确。\n- 结合《The Thinking Car》专栏中关于中国量产挑战与工业界实践的深入分析，直接将学术理论映射到实际开发场景中，缩短研发周期。\n\nLearning-Deep-Learning 不仅是一份论文笔记库，更是连接学术前沿与自动驾驶工程落地的加速器，帮助开发者在复杂技术浪潮中少走弯路。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fpatrick-llgc_Learning-Deep-Learning_2698745c.png","patrick-llgc","Patrick Liu","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fpatrick-llgc_4c1ab3c9.jpg","A physicist-turned AI research engineer. 
Enthusiastic about deep learning, computer vision and embodied AI. \r\n\r\n",null,"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fpatrick-llgc\u002F","https:\u002F\u002Fgithub.com\u002Fpatrick-llgc",[84],{"name":85,"color":86,"percentage":87},"Jupyter Notebook","#DA5B0B",100,1258,180,"2026-04-03T08:58:10",1,"","未说明",{"notes":95,"python":93,"dependencies":96},"该仓库（Learning-Deep-Learning）并非可执行的 AI 软件工具，而是作者 Patrick Langechuan Liu 的个人深度学习与机器学习论文阅读笔记集合。内容主要以 Markdown 文档形式存在，包含论文摘要、解读链接以及作者在 Medium 上的博客文章索引。因此，该项目不需要特定的操作系统、GPU、内存或 Python 环境即可浏览（仅需网页浏览器或文本编辑器）。文中提到的具体算法（如 Diffusion LLM, BEV Perception 等）仅作为笔记主题讨论，仓库本身不包含需要安装依赖库才能运行的源代码或模型权重。",[],[14,54,13],[99,100,101,102,103,104,105,106,107,108,109,110,111,112],"deep-learning","paper","literature-review","machine-learning","computer-vision","cnn","paper-reading","paper-review","reinforcement-learning","medical","medical-imaging","point-cloud","3d-object-detection","3d-object-recognition","2026-03-27T02:49:30.150509","2026-04-06T05:15:17.758552",[],[]]