[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-louisfb01--best_AI_papers_2021":3,"tool-louisfb01--best_AI_papers_2021":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",159267,2,"2026-04-17T11:29:14",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 
# 2021: A Year Full of Amazing AI Papers - A Review 📌
## A curated list of the latest breakthroughs in AI by release date, with a clear video explanation, a link to a more in-depth article, and code.

While the world is still recovering, research hasn't slowed its frenetic pace, especially in the field of artificial intelligence. Moreover, many important aspects were highlighted this year, such as ethics, important biases, governance, transparency, and much more. Artificial intelligence and our understanding of the human brain and its link to AI are constantly evolving, showing promising applications that could improve our quality of life in the near future. Still, we ought to be careful with which technologies we choose to apply.

>"Science cannot tell us what we ought to do, only what we can do."<br/>- Jean-Paul Sartre, Being and Nothingness

Here are the most interesting research papers of the year, in case you missed any of them. In short, it is a curated list of the latest breakthroughs in AI and Data Science by release date, with a clear video explanation, a link to a more in-depth article, and code (if applicable).
Enjoy the read!

**The complete reference to each paper is listed at the end of this repository.** *Star this repository to stay up to date!* ⭐️

Maintainer: [louisfb01](https://github.com/louisfb01)

[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow%20%40whats_ai)](https://twitter.com/Whats_AI)

Subscribe to my [newsletter](https://louisbouchard.substack.com/) - The latest updates in AI explained every week.


*Feel free to [message me](https://www.louisbouchard.ai/contact/) about any interesting paper I may have missed, so I can add it to this repository.*

*Tag me on **Twitter** [@Whats_AI](https://twitter.com/Whats_AI) or **LinkedIn** [@Louis (What's AI) Bouchard](https://www.linkedin.com/in/whats-ai/) if you share the list!*

### Watch a complete 2021 rewind in 15 minutes

[<img src="https://imgur.com/3OoNOg1.png" width="512"/>](https://youtu.be/z5slE_akZmc)

---

### If you are interested in Computer Vision research, here is another great repository for you:
A curated list of the top 10 CV publications in 2021 with a clear video explanation, link to a more in-depth article, and code.

[The Top 10 Computer Vision Papers of 2021](https://github.com/louisfb01/top-10-cv-papers-2021)

----

👀 **If you'd like to support my work** and use W&B (for free) to track your ML experiments and make your work reproducible or collaborate with a team, you can try it out by following [this guide](https://colab.research.google.com/github/louisfb01/examples/blob/master/colabs/pytorch/Simple_PyTorch_Integration.ipynb)! Since most of the code here is PyTorch-based, we thought that a [QuickStart guide](https://colab.research.google.com/github/louisfb01/examples/blob/master/colabs/pytorch/Simple_PyTorch_Integration.ipynb) for using W&B with PyTorch would be the most interesting to share.

👉 Follow [this quick guide](https://colab.research.google.com/github/louisfb01/examples/blob/master/colabs/pytorch/Simple_PyTorch_Integration.ipynb), use the same W&B lines in your code or in any of the repos below, and have all your experiments automatically tracked in your W&B account! It doesn't take more than 5 minutes to set up and will change your life as it did for me! [Here's a more advanced guide](https://colab.research.google.com/github/louisfb01/examples/blob/master/colabs/pytorch/Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W%26B.ipynb) for using Hyperparameter Sweeps if you're interested :)
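If you're curious what those "W&B lines" look like, here is a minimal sketch of the integration in a PyTorch training loop. The project name, config values, and toy model are placeholders of mine, not taken from the guide:

```python
# Minimal sketch of W&B experiment tracking in a PyTorch loop (pip install wandb).
# Project name, config values, and the toy model are illustrative placeholders.
import torch
import torch.nn as nn
import wandb

wandb.init(project="my-experiments", config={"lr": 1e-3, "epochs": 5})
config = wandb.config

model = nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
loss_fn = nn.MSELoss()

for epoch in range(config.epochs):
    x, y = torch.randn(32, 10), torch.randn(32, 1)    # stand-in batch
    loss = loss_fn(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    wandb.log({"epoch": epoch, "loss": loss.item()})  # appears live in your W&B dashboard

wandb.finish()
```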
🙌 Thank you to [Weights & Biases](https://wandb.ai/) for sponsoring this repository and the work I've been doing, and thanks to any of you using this link and trying W&B!

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/louisfb01/examples/blob/master/colabs/pytorch/Simple_PyTorch_Integration.ipynb)

----

## The Full List
- [DALL·E: Zero-Shot Text-to-Image Generation from OpenAI [1]](#1)
- [VOGUE: Try-On by StyleGAN Interpolation Optimization [2]](#2)
- [Taming Transformers for High-Resolution Image Synthesis [3]](#3)
- [Thinking Fast And Slow in AI [4]](#4)
- [Automatic detection and quantification of floating marine macro-litter in aerial images [5]](#5)
- [ShaRF: Shape-conditioned Radiance Fields from a Single View [6]](#6)
- [Generative Adversarial Transformers [7]](#7)
- [We Asked Artificial Intelligence to Create Dating Profiles. Would You Swipe Right? [8]](#8)
- [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows [9]](#9)
- [IMAGE GANS MEET DIFFERENTIABLE RENDERING FOR INVERSE GRAPHICS AND INTERPRETABLE 3D NEURAL RENDERING [10]](#10)
- [Deep nets: What have they ever done for vision? [11]](#11)
- [Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image [12]](#12)
- [Portable, Self-Contained Neuroprosthetic Hand with Deep Learning-Based Finger Control [13]](#13)
- [Total Relighting: Learning to Relight Portraits for Background Replacement [14]](#14)
- [LASR: Learning Articulated Shape Reconstruction from a Monocular Video [15]](#15)
- [Enhancing Photorealism Enhancement [16]](#16)
- [DefakeHop: A Light-Weight High-Performance Deepfake Detector [17]](#17)
- [High-Resolution Photorealistic Image Translation in Real-Time: A Laplacian Pyramid Translation Network [18]](#18)
- [Barbershop: GAN-based Image Compositing using Segmentation Masks [19]](#19)
- [TextStyleBrush: Transfer of text aesthetics from a single example [20]](#20)
- [Animating Pictures with Eulerian Motion Fields [21]](#21)
- [CVPR 2021 Best Paper Award: GIRAFFE - Controllable Image Generation [22]](#22)
- [GitHub Copilot & Codex: Evaluating Large Language Models Trained on Code [23]](#23)
- [Apple: Recognizing People in Photos Through Private On-Device Machine Learning [24]](#24)
- [Image Synthesis and Editing with Stochastic Differential Equations [25]](#25)
- [Sketch Your Own GAN [26]](#26)
- [Tesla's Autopilot Explained [27]](#27)
- [Styleclip: Text-driven manipulation of StyleGAN imagery [28]](#28)
- [TimeLens: Event-based Video Frame Interpolation [29]](#29)
- [Diverse Generation from a Single Video Made Possible [30]](#30)
- [Skillful Precipitation Nowcasting using Deep Generative Models of Radar [31]](#31)
- [The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks [32]](#32)
- [ADOP: Approximate Differentiable One-Pixel Point Rendering [33]](#33)
- [(Style)CLIPDraw: Coupling Content and Style in Text-to-Drawing Synthesis [34]](#34)
- [SwinIR: Image restoration using swin transformer [35]](#35)
- [EditGAN: High-Precision Semantic Image Editing [36]](#36)
- [CityNeRF: Building NeRF at City Scale [37]](#37)
- [ClipCap: CLIP Prefix for Image Captioning [38]](#38)
- [Paper references](#references)

---

## DALL·E: Zero-Shot Text-to-Image Generation from OpenAI [1]<a name="1"></a>
OpenAI successfully trained a network able to generate images from text captions. It is very similar to GPT-3 and Image GPT and produces amazing results.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/Czdyuce.png" width="512"/>](https://youtu.be/DJToDLBPovg)
* Short read: [OpenAI's DALL·E: Text-to-Image Generation Explained](https://www.louisbouchard.ai/openais-dall-e-text-to-image-generation-explained/)
* Paper: [Zero-Shot Text-to-Image Generation](https://arxiv.org/pdf/2102.12092.pdf)
* Code: [Code & more information for the discrete VAE used for DALL·E](https://github.com/openai/DALL-E)
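As a rough intuition for the "discrete" part of that released VAE, here is a generic vector-quantization sketch of my own (not OpenAI's code): the encoder's feature grid is snapped to a finite codebook, giving image "tokens" that a transformer can then model like text. Only the 8192-entry vocabulary echoes the paper; every other name and size is made up for illustration:

```python
# Illustrative vector-quantization bottleneck (my sketch, not OpenAI's actual dVAE).
import torch
import torch.nn as nn

class VectorQuantizer(nn.Module):
    def __init__(self, num_codes=8192, dim=64):  # 8192 matches DALL·E's image vocab; dim is arbitrary
        super().__init__()
        self.codebook = nn.Embedding(num_codes, dim)

    def forward(self, z):                        # z: (B, H, W, dim) continuous encoder output
        flat = z.reshape(-1, z.shape[-1])        # (B*H*W, dim)
        dists = torch.cdist(flat, self.codebook.weight)  # distance to every codebook entry
        idx = dists.argmin(dim=1)                # nearest code index = the image "token"
        quantized = self.codebook(idx).view_as(z)
        return quantized, idx.view(z.shape[:-1])

vq = VectorQuantizer()
z = torch.randn(1, 32, 32, 64)                   # fake encoder features
quantized, tokens = vq(z)
print(tokens.shape)                              # torch.Size([1, 32, 32]): a grid of discrete tokens
```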
name=\"5\">\u003C\u002Fa>\nOdei Garcia-Garin et al. from the University of Barcelona have developed a deep learning-based algorithm able to detect and quantify floating garbage from aerial images. They also made a web-oriented application allowing users to identify these garbages, called floating marine macro-litter, or FMML, within images of the sea surface.\n\n* Short Video Explanation:\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FMmlYblV.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002F2dTSsdW0WYI)\n* Short read: [An AI Software Able To Detect and Count Plastic Waste in the Ocean](https:\u002F\u002Fpub.towardsai.net\u002Fan-ai-software-able-to-detect-and-count-plastic-waste-in-the-ocean-7211aa0baf89)\n* Paper: [Automatic detection and quantification of floating marine macro-litter in aerial images: Introducing a novel deep learning approach connected to a web application in R, Environmental Pollution](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.envpol.2021.116490)\n* [Click here for the code](https:\u002F\u002Fgithub.com\u002Famonleong\u002FMARLIT)\n\n\n## ShaRF: Shape-conditioned Radiance Fields from a Single View [6]\u003Ca name=\"6\">\u003C\u002Fa>\nJust imagine how cool it would be to just take a picture of an object and have it in 3D to insert in the movie or video game you are creating or in a 3D scene for an illustration.\n\n* Short Video Explanation:\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FWV6lq5s.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FgHkkrNMlGNg)\n* Short read: [ShaRF: Take a Picture From a Real-Life Object, and Create a 3D Model of It](https:\u002F\u002Fpub.towardsai.net\u002Fsharf-take-a-picture-from-a-real-life-object-and-create-a-3d-model-of-it-c6809806b32)\n* Paper: [ShaRF: Shape-conditioned Radiance Fields from a Single View](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.08860)\n* [Click here for the code](http:\u002F\u002Fwww.krematas.com\u002Fsharf\u002Findex.html)\n\n\n## Generative Adversarial Transformers [7]\u003Ca name=\"7\">\u003C\u002Fa>\nThey basically leverage transformers’ attention mechanism in the powerful StyleGAN2 architecture to make it even more powerful!\n\n* Short Video Explanation:\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FCJzGHxa.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FHO-_t0UArd4)\n* Short read: [GANsformers: Scene Generation with Generative Adversarial Transformers](https:\u002F\u002Fwhats-ai.medium.com\u002Fgenerative-adversarial-transformers-gansformers-explained-bf1fa76ef58d)\n* Paper: [Generative Adversarial Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.01209.pdf)\n* [Click here for the code](https:\u002F\u002Fgithub.com\u002Fdorarad\u002Fgansformer)\n\n>Subscribe to my weekly [newsletter](http:\u002F\u002Feepurl.com\u002FhuGLT5) and stay up-to-date with new publications in AI for 2022!\n\n\n## We Asked Artificial Intelligence to Create Dating Profiles. Would You Swipe Right? [8]\u003Ca name=\"8\">\u003C\u002Fa>\nWould you swipe right on an AI profile? Can you distinguish an actual human from a machine? 
This is what this study reveals using AI-made-up people on dating apps.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/VKZrTBH.png" width="512"/>](https://youtu.be/IoRH5u13P-4)
* Short read: [Would You Swipe Right on an AI Profile?](https://pub.towardsai.net/would-you-swipe-right-on-an-ai-profile-98dc8a4451ec)
* Paper: [We Asked Artificial Intelligence to Create Dating Profiles. Would You Swipe Right?](https://studyonline.unsw.edu.au/blog/ai-generated-dating-profile)
* [Click here for the code](https://colab.research.google.com/drive/1VLG8e7YSEwypxU-noRNhsv5dW4NfTGce#forceEdit=true&sandboxMode=true&scrollTo=aeXshJM-Cuaf)


## Swin Transformer: Hierarchical Vision Transformer using Shifted Windows [9]<a name="9"></a>
Will Transformers Replace CNNs in Computer Vision? In less than 5 minutes, you will know how the transformer architecture can be applied to computer vision with a new paper called the Swin Transformer.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/r9aL2iU.png" width="512"/>](https://youtu.be/QcCJJOLCeJQ)
* Short read: [Will Transformers Replace CNNs in Computer Vision?](https://pub.towardsai.net/will-transformers-replace-cnns-in-computer-vision-55657a196833)
* Paper: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030v1)
* [Click here for the code](https://github.com/microsoft/Swin-Transformer)
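To make the core trick concrete, here is a tiny sketch of the (shifted) window partitioning that Swin computes attention inside; the shapes and window size are illustrative, not the paper's exact configuration:

```python
# Toy illustration of (shifted) window partitioning from the Swin Transformer.
# Self-attention would be computed inside each (window_size x window_size) block.
import torch

def window_partition(x, window_size):
    # x: (B, H, W, C) feature map -> (num_windows*B, window_size, window_size, C)
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)

x = torch.randn(1, 8, 8, 96)              # fake feature map
windows = window_partition(x, 4)          # regular windows: (4, 4, 4, 96)

# The "shifted" step: cyclically roll the map by half a window before partitioning,
# so the next layer's windows straddle the previous layer's window boundaries.
shifted = torch.roll(x, shifts=(-2, -2), dims=(1, 2))
shifted_windows = window_partition(shifted, 4)
```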
## IMAGE GANS MEET DIFFERENTIABLE RENDERING FOR INVERSE GRAPHICS AND INTERPRETABLE 3D NEURAL RENDERING [10]<a name="10"></a>
This promising model, called GANverse3D, only needs an image to create a 3D figure that can be customized and animated!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/JJ5UAEp.png" width="512"/>](https://youtu.be/dvjwRBZ3Hnw)
* Short read: [Create 3D Models from Images! GANverse3D & NVIDIA Omniverse](https://www.louisbouchard.ai/ganverse3d/)
* Paper: [IMAGE GANS MEET DIFFERENTIABLE RENDERING FOR INVERSE GRAPHICS AND INTERPRETABLE 3D NEURAL RENDERING](https://arxiv.org/pdf/2010.09125.pdf)


## Deep nets: What have they ever done for vision? [11]<a name="11"></a>
"I will openly share everything about deep nets for vision applications, their successes, and the limitations we have to address."

* Short Video Explanation:<br/>
[<img src="https://imgur.com/PQX8Phj.png" width="512"/>](https://youtu.be/GhPDNzAVNDk)
* Short read: [What is the state of AI in computer vision?](https://www.louisbouchard.ai/ai-in-computer-vision/)
* Paper: [Deep nets: What have they ever done for vision?](https://arxiv.org/abs/1805.04025)


## Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image [12]<a name="12"></a>
The next step for view synthesis: Perpetual View Generation, where the goal is to take an image, fly into it, and explore the landscape!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/ezIYce7.png" width="512"/>](https://youtu.be/NIOt1HLV_Mo)
* Short read: [Infinite Nature: Fly into an image and explore the landscape](https://www.louisbouchard.ai/infinite-nature/)
* Paper: [Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image](https://arxiv.org/pdf/2012.09855.pdf)
* [Click here for the code](https://github.com/google-research/google-research/tree/master/infinite_nature)
* [Colab demo](https://colab.research.google.com/github/google-research/google-research/blob/master/infinite_nature/infinite_nature_demo.ipynb#scrollTo=sCuRX1liUEVM)


## Portable, Self-Contained Neuroprosthetic Hand with Deep Learning-Based Finger Control [13]<a name="13"></a>
With this AI-powered nerve interface, an amputee can control a neuroprosthetic hand with life-like dexterity and intuitiveness.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/07soslr.png" width="512"/>](https://youtu.be/wNBrCRzlbVw)
* Short read: [An amputee with an AI-Powered Hand! 🦾](https://www.louisbouchard.ai/an-amputee-with-an-ai-powered-hand/)
* Paper: [Portable, Self-Contained Neuroprosthetic Hand with Deep Learning-Based Finger Control](https://arxiv.org/abs/2103.13452)


## Total Relighting: Learning to Relight Portraits for Background Replacement [14]<a name="14"></a>
Properly relight any portrait based on the lighting of the new background you add. Have you ever wanted to change the background of a picture but have it look realistic? If you've already tried that, you know that it isn't simple. You can't just take a picture of yourself at home and swap the background for a beach. It just looks bad and unrealistic. Anyone will say "that's photoshopped" in a second. For movies and professional videos, you need perfect lighting and artists to reproduce a high-quality image, and that's super expensive. There's no way you can do that with your own pictures.
Or can you?

* Short Video Explanation:<br/>
[<img src="https://imgur.com/a4KCChf.png" width="512"/>](https://youtu.be/rVP2tcF_yRI)
* Short read: [Realistic Lighting on Different Backgrounds](https://www.louisbouchard.ai/backgrounds-with-lighting/)
* Paper: [Total Relighting: Learning to Relight Portraits for Background Replacement](https://augmentedperception.github.io/total_relighting/total_relighting_paper.pdf)


## LASR: Learning Articulated Shape Reconstruction from a Monocular Video [15]<a name="15"></a>
This is a new method for generating 3D models of moving humans or animals from only a short video as input. Indeed, it actually understands that this is an odd shape, that it can move, but that it still needs to stay attached, as it is still one "object" and not just many objects together...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/9PmIb2e.png" width="512"/>](https://youtu.be/lac7wqjS-8E)
* Short read: [Articulated 3D Reconstruction from Videos](https://www.louisbouchard.ai/3d-reconstruction-from-videos/)
* Paper: [LASR: Learning Articulated Shape Reconstruction from a Monocular Video](https://openaccess.thecvf.com/content/CVPR2021/papers/Yang_LASR_Learning_Articulated_Shape_Reconstruction_From_a_Monocular_Video_CVPR_2021_paper.pdf)
* [Click here for the code](https://github.com/google/lasr)


## Enhancing Photorealism Enhancement [16]<a name="16"></a>
This AI can be applied live to a video game and transform every frame to look much more natural. Researchers from Intel Labs just published this paper called Enhancing Photorealism Enhancement. And if you think that this may be "just another GAN," taking a picture of the video game as input and changing it following the style of the natural world, let me change your mind. They worked on this model for two years to make it extremely robust. It can be applied live to the video game and transform every frame to look much more natural. Just imagine the possibilities where you can put a lot less effort into the game's graphics, make it super stable and complete, and then improve the style using this model...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/MNpYCVj.png" width="512"/>](https://youtu.be/3rYosbwXm1w)
* Short read: [Is AI The Future Of Video Game Design? Enhancing Photorealism Enhancement](https://www.louisbouchard.ai/the-future-of-video-game/)
* Paper: [Enhancing Photorealism Enhancement](http://vladlen.info/papers/EPE.pdf)
* [Click here for the code](https://github.com/isl-org/PhotorealismEnhancement)


## DefakeHop: A Light-Weight High-Performance Deepfake Detector [17]<a name="17"></a>
How to spot a deepfake in 2021: breakthrough US Army technology using artificial intelligence to find deepfakes.

While they seem like they've always been there, the very first realistic deepfake didn't appear until 2017.
The technology went from the first rough, automatically generated fake images to today's identical copies of someone in videos, complete with sound.

The reality is that we cannot see the difference between a real video or picture and a deepfake anymore. How can we tell what's real from what isn't? How can audio or video files be used in court as proof if an AI can entirely generate them? Well, this new paper may provide answers to these questions. And the answer here may again be the use of artificial intelligence. The saying "I'll believe it when I see it" may soon change to "I'll believe it when the AI tells me to believe it..."

* Short Video Explanation:<br/>
[<img src="https://imgur.com/ZsyJyeJ.png" width="512"/>](https://youtu.be/YMir8sRWRos)
* Short read: [How to Spot a Deep Fake. Breakthrough US Army technology (2021)](https://www.louisbouchard.ai/spot-deepfakes/)
* Paper: [DefakeHop: A Light-Weight High-Performance Deepfake Detector](https://arxiv.org/abs/2103.06929)


## High-Resolution Photorealistic Image Translation in Real-Time: A Laplacian Pyramid Translation Network [18]<a name="18"></a>
Apply any style to your 4K image in real time using this new machine learning-based approach!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/6Z2OvBm.png" width="512"/>](https://youtu.be/X7WzlAyUGPo)
* Short read: [High-Resolution Photorealistic Image Translation in Real-Time](https://www.louisbouchard.ai/4k-image-translation-in-real-time/)
* Paper: [High-Resolution Photorealistic Image Translation in Real-Time: A Laplacian Pyramid Translation Network](https://arxiv.org/pdf/2105.09188.pdf)
* [Click here for the code](https://github.com/csjliang/LPTN)


## Barbershop: GAN-based Image Compositing using Segmentation Masks [19]<a name="19"></a>
This article is not about a new technology in itself. Instead, it is about a new and exciting application of GANs. Indeed, you saw the title, and it wasn't clickbait. This AI can transfer your hair to see how it would look before committing to the change...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/TYzXcQ0.png" width="512"/>](https://youtu.be/HtqYMvBVJD8)
* Short read: [Barbershop: Try Different Hairstyles and Hair Colors from Pictures (GANs)](https://www.louisbouchard.ai/barbershop/)
* Paper: [Barbershop: GAN-based Image Compositing using Segmentation Masks](https://arxiv.org/pdf/2106.01505.pdf)
* [Click here for the code](https://github.com/ZPdesu/Barbershop)


## TextStyleBrush: Transfer of text aesthetics from a single example [20]<a name="20"></a>
This new Facebook AI model can translate or edit text directly in the image in your own language, following the same style!

Imagine you are on vacation in another country where you do not speak the language. You want to try out a local restaurant, but their menu is in a language you don't speak. I think this won't be too hard to imagine, as most of us have already faced this situation: you see menu items or directions and can't understand what's written. Well, in 2020, you would take out your phone and Google Translate what you see.
In 2021, you don't even need to open Google Translate and type in what you see word by word to translate it. Instead, you can simply use this new model by Facebook AI to translate every text in the image into your own language...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/JDBqrlv.png" width="512"/>](https://youtu.be/hhAri5fl-XI)
* Short read: [Translate or Edit Text from Images Emulating the Style: TextStyleBrush](https://www.louisbouchard.ai/textstylebrush/)
* Paper: [TextStyleBrush: Transfer of text aesthetics from a single example](https://arxiv.org/abs/2106.08385)
* [Click here for the code](https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset)


>If you'd like to read more research papers as well, I recommend you read [my article](https://pub.towardsai.net/how-to-read-more-research-papers-7737e3770d7f), where I share my best tips for finding and reading more research papers.

## Animating Pictures with Eulerian Motion Fields [21]<a name="21"></a>
This model takes a picture, understands which particles are supposed to be moving, and realistically animates them in an infinite loop while keeping the rest of the picture entirely still, creating amazing-looking videos like this one...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/JyarpBv.png" width="512"/>](https://youtu.be/KgTa2r7d0I0)
* Short read: [Create Realistic Animated Looping Videos from Pictures](https://www.louisbouchard.ai/animate-pictures/)
* Paper: [Animating Pictures with Eulerian Motion Fields](https://arxiv.org/abs/2011.15128)
* [Click here for the code](https://eulerian.cs.washington.edu/)
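The looping part is surprisingly simple once you have a per-pixel motion field: you repeatedly warp the frame along that fixed ("Eulerian") field. Here is a heavily simplified sketch, using a made-up constant drift instead of the paper's learned motion field:

```python
# Simplified Eulerian-style looping: warp an image along a fixed flow field.
# The flow here is a made-up constant drift, not the paper's learned motion field.
import torch
import torch.nn.functional as F

def warp(img, flow):
    # img: (1, C, H, W); flow: (1, 2, H, W) in pixels, channel 0 = x, channel 1 = y
    B, C, H, W = img.shape
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    grid = torch.stack((xs, ys), dim=0).float().unsqueeze(0) + flow
    grid[:, 0] = 2 * grid[:, 0] / (W - 1) - 1   # normalize x to [-1, 1] for grid_sample
    grid[:, 1] = 2 * grid[:, 1] / (H - 1) - 1   # normalize y to [-1, 1]
    return F.grid_sample(img, grid.permute(0, 2, 3, 1), align_corners=True)

frame = torch.rand(1, 3, 64, 64)
flow = torch.zeros(1, 2, 64, 64)
flow[:, 0] = 0.5                                # constant rightward drift, like flowing water
frames = [frame]
for _ in range(10):                             # each step advects pixels along the same field
    frames.append(warp(frames[-1], flow))
```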
## CVPR 2021 Best Paper Award: GIRAFFE - Controllable Image Generation [22]<a name="22"></a>
Using a modified GAN architecture, they can move objects in the image without affecting the background or the other objects!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/ZP6F9SF.png" width="512"/>](https://youtu.be/JIJkURAkCxM)
* Short read: [CVPR 2021 Best Paper Award: GIRAFFE - Controllable Image Generation](https://www.louisbouchard.ai/cvpr-2021-best-paper/)
* Paper: [GIRAFFE: Representing Scenes as Compositional Generative Neural Feature Fields](http://www.cvlibs.net/publications/Niemeyer2021CVPR.pdf)
* [Click here for the code](https://github.com/autonomousvision/giraffe)


## GitHub Copilot & Codex: Evaluating Large Language Models Trained on Code [23]<a name="23"></a>
Find out how this new model from OpenAI generates code from words!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/gFOnhmV.png" width="512"/>](https://youtu.be/az3oVVkTFB8)
* Short read: [OpenAI's New Code Generator: GitHub Copilot (and Codex)](https://www.louisbouchard.ai/github-copilot/)
* Paper: [Evaluating Large Language Models Trained on Code](https://arxiv.org/pdf/2107.03374.pdf)
* [Click here for the code](https://copilot.github.com/)


## Apple: Recognizing People in Photos Through Private On-Device Machine Learning [24]<a name="24"></a>
Using multiple machine learning-based algorithms running privately on your device, Apple allows you to accurately curate and organize your images and videos on iOS 15.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/HBuOzrG.png" width="512"/>](https://youtu.be/LIV-M-gFRFA)
* Short read: [How Apple Photos Recognizes People in Private Photos Using Machine Learning](https://www.louisbouchard.ai/how-apple-photos-recognizes-people/)
* Paper: [Recognizing People in Photos Through Private On-Device Machine Learning](https://machinelearning.apple.com/research/recognizing-people-photos)


## Image Synthesis and Editing with Stochastic Differential Equations [25]<a name="25"></a>
Say goodbye to complex GAN and transformer architectures for image generation!
This new method by Chenlin Meng et al. from Stanford University and Carnegie Mellon University can generate new images from any user-based input. Even people like me with zero artistic skills can now generate beautiful images or modifications out of quick sketches...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/IdrRxix.png" width="512"/>](https://youtu.be/xoEkSWJSm1k)
* Short read: [Image Synthesis and Editing from Sketches: SDEdit. No more tedious training needed!](https://www.louisbouchard.ai/image-synthesis-from-sketches/)
* Paper: [Image Synthesis and Editing with Stochastic Differential Equations](https://arxiv.org/pdf/2108.01073.pdf)
* [Click here for the code](https://github.com/ermongroup/SDEdit)
* [Colab demo](https://colab.research.google.com/drive/1KkLS53PndXKQpPlS1iK-k1nRQYmlb4aO?usp=sharing)
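The core idea fits in a few lines: noise the user's sketch partway along a diffusion process, then run the learned reverse process from there, so the output stays faithful to the input's layout while looking realistic. In this rough sketch of the idea, `denoiser` is a hypothetical stand-in for a pretrained diffusion model, not something provided here:

```python
# SDEdit-style editing sketch: perturb an input partway with noise, then denoise.
# `denoiser` is a hypothetical pretrained diffusion model; shapes and schedule are illustrative.
import torch

def sdedit(sketch, denoiser, t0=0.5, steps=50):
    # sketch: (1, 3, H, W) user input in [-1, 1]. t0 picks the noise level:
    # higher t0 = more freedom and realism; lower t0 = more faithful to the sketch.
    t_start = int(t0 * steps)
    sigma = t_start / steps
    x = sketch + sigma * torch.randn_like(sketch)   # jump into the diffusion at time t0
    for t in reversed(range(t_start)):              # reverse process back to t = 0
        x = denoiser(x, t)                          # one learned denoising step
    return x.clamp(-1, 1)
```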
## Sketch Your Own GAN [26]<a name="26"></a>
Make GAN training easier for everyone by generating images following a sketch! Indeed, with this new method, you can control your GAN's outputs based on the simplest type of knowledge you could provide it: hand-drawn sketches.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/zbD9T8e.png" width="512"/>](https://youtu.be/vz_wEQkTLk0)
* Short read: [Make GANs Training Easier for Everyone: Generate Images Following a Sketch](https://www.louisbouchard.ai/make-gans-training-easier/)
* Paper: [Sketch Your Own GAN](https://arxiv.org/abs/2108.02774)
* [Click here for the code](https://github.com/PeterWang512/GANSketching)


## Tesla's Autopilot Explained [27]<a name="27"></a>
If you wonder how a Tesla car can not only see but also navigate the roads with other vehicles, this is the video you were waiting for. A couple of days ago, Tesla held its first AI Day, where Andrej Karpathy, the Director of AI at Tesla, and others presented how Tesla's Autopilot works, from image acquisition through their eight cameras to the navigation process on the roads.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/Lo7s7db.png" width="512"/>](https://youtu.be/DTHqgDqkIRw)
* Short read: [Tesla's Autopilot Explained](https://www.louisbouchard.ai/tesla-autopilot-explained-tesla-ai-day/)


## Styleclip: Text-driven manipulation of StyleGAN imagery [28]<a name="28"></a>
AI could already generate images; then, using a lot of brainpower and trial and error, researchers could control the results following specific styles. Now, with this new model, you can do that using only text!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/KJIpmys.png" width="512"/>](https://youtu.be/RAXrwPskNso)
* Short read: [Manipulate Real Images With Text - An AI For Creative Artists! StyleCLIP Explained](https://www.louisbouchard.ai/styleclip/)
* Paper: [Styleclip: Text-driven manipulation of StyleGAN imagery.](https://arxiv.org/abs/2103.17249)
* [Click here for the code](https://github.com/orpatashnik/StyleCLIP)
* [Colab demo](https://colab.research.google.com/github/orpatashnik/StyleCLIP/blob/main/notebooks/StyleCLIP_global.ipynb)


## TimeLens: Event-based Video Frame Interpolation [29]<a name="29"></a>
TimeLens can understand the movement of the particles in between the frames of a video to reconstruct what really happened at a speed even our eyes cannot see. In fact, it achieves results that neither our smartphones nor any other model could reach before!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/ZF4fK31.png" width="512"/>](https://youtu.be/HWA0yVXYRlk)
* Short read: [How to Make Slow Motion Videos With AI!](https://www.louisbouchard.ai/timelens/)
* Paper: [TimeLens: Event-based Video Frame Interpolation](http://rpg.ifi.uzh.ch/docs/CVPR21_Gehrig.pdf)
* [Click here for the code](https://github.com/uzh-rpg/rpg_timelens)

>Subscribe to my weekly [newsletter](http://eepurl.com/huGLT5) and stay up-to-date with new publications in AI for 2022!

## Diverse Generation from a Single Video Made Possible [30]<a name="30"></a>
Have you ever wanted to edit a video?

Remove or add someone, change the background, make it last a bit longer, or change the resolution to fit a specific aspect ratio without compressing or stretching it. For those of you who have already run advertisement campaigns, you certainly wanted to have variations of your videos for A/B testing to see what works best. Well, this new research by Niv Haim et al. can help you do all of this from a single video, and in HD!

Indeed, using a simple video, you can perform any of the tasks I just mentioned in seconds or a few minutes for high-quality videos. You can basically use it for any video manipulation or video generation application you have in mind. It even outperforms GANs in every way, doesn't rely on any fancy deep learning research, and doesn't require a huge and impractical dataset!
And the best thing is that this technique is scalable to high-resolution videos.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/4OE71WI.png" width="512"/>](https://youtu.be/Uy8yKPEi1dg)
* Short read: [Generate Video Variations - No dataset or deep learning required!](https://www.louisbouchard.ai/vgpnn-generate-video-variations/)
* Paper: [Diverse Generation from a Single Video Made Possible](https://arxiv.org/abs/2109.08591)
* [Click here for the code](https://nivha.github.io/vgpnn/)


## Skillful Precipitation Nowcasting using Deep Generative Models of Radar [31]<a name="31"></a>
DeepMind just released a generative model able to outperform widely-used nowcasting methods in 89% of situations for accuracy and usefulness, as assessed by more than 50 expert meteorologists! Their model focuses on predicting precipitation over the next 2 hours and achieves that surprisingly well. It is a generative model, which means that it will generate the forecasts instead of simply predicting them. It basically takes radar data from the past to create future radar data. So, using both the temporal and spatial components of past data, it can generate what the radar will look like in the near future.

You can see this as being much like Snapchat filters, which take your face and generate a new face with modifications on it. To train such a generative model, you need a bunch of data from both the human faces and the kind of face you want to generate. Then, using a very similar model trained for many hours, you will have a powerful generative model. This kind of model often uses a GAN architecture for training purposes and then uses the generator model independently.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/dE7MQ6E.png" width="512"/>](https://youtu.be/dlSIq64psEY)
* Short read: [DeepMind uses AI to Predict More Accurate Weather Forecasts](https://www.louisbouchard.ai/deepmind-rain-nowcasting/)
* Paper: [Skillful Precipitation Nowcasting using Deep Generative Models of Radar](https://www.nature.com/articles/s41586-021-03854-z)
* [Click here for the code](https://github.com/deepmind/deepmind-research/tree/master/nowcasting)
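That last point, training as a GAN but sampling from the generator alone, reduces to something like the sketch below. The model class, shapes, and frame counts are toy stand-ins of mine, not DeepMind's architecture:

```python
# Illustrative conditional-generator sampling: past radar frames in, future frames out.
# `Generator` is a toy stand-in, not DeepMind's nowcasting architecture.
import torch
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, past=4, future=8):
        super().__init__()
        # toy: maps stacked past frames plus one noise channel to future frames
        self.net = nn.Conv2d(past + 1, future, kernel_size=3, padding=1)

    def forward(self, past_frames, noise):
        return self.net(torch.cat([past_frames, noise], dim=1))

g = Generator()
past_frames = torch.randn(1, 4, 64, 64)          # last 4 observed radar frames (toy shape)
# Different noise draws give different plausible "future radar" sequences,
# which is what makes the forecast generative rather than a single prediction.
samples = [g(past_frames, torch.randn(1, 1, 64, 64)) for _ in range(3)]
```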
## The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks [32]<a name="32"></a>
Have you ever tuned in to a video or a TV show where the actors were completely inaudible, or the music was way too loud? Well, this problem, also called the cocktail party problem, may never happen again. Mitsubishi and Indiana University just published a new model as well as a new dataset tackling this task of separating a soundtrack into its components. For example, if we take the same audio clip we just played with the music way too loud, you can simply turn the individual tracks up or down to give more importance to the speech than to the music.

The problem here is isolating any independent sound source from a complex acoustic scene, like a movie scene or a YouTube video where some sounds are not well balanced. Sometimes you simply cannot hear some actors because of the music playing or explosions or other ambient sounds in the background. Well, if you successfully isolate the different categories in a soundtrack, it means that you can also turn up or down only one of them, like turning down the music a bit to hear all the other actors correctly. This is exactly what the researchers achieved.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/3hQeWiG.png" width="512"/>](https://youtu.be/Rpxufqt5r6I)
* Short read: [Isolate Voice, Music, and Sound Effects With AI](https://www.louisbouchard.ai/isolate-voice-music-and-sound-effects-with-ai/)
* Paper: [The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks](https://arxiv.org/pdf/2110.09958.pdf)
* [Click here for the code](https://cocktail-fork.github.io/)


## ADOP: Approximate Differentiable One-Pixel Point Rendering [33]<a name="33"></a>
Imagine you want to generate a 3D model or simply a fluid video out of a bunch of pictures you took. Well, it is now possible! I don't want to give away too much, but the results are simply amazing, and you need to check them out for yourself!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/aMAWzJU.png" width="512"/>](https://youtu.be/Jfph7Vld_Nw)
* Short read: [AI Synthesizes Smooth Videos from a Couple of Images!](https://www.louisbouchard.ai/ai-synthesizes-smooth-videos-from-a-couple-of-images/)
* Paper: [ADOP: Approximate Differentiable One-Pixel Point Rendering](https://arxiv.org/pdf/2110.06635.pdf)
* [Click here for the code](https://github.com/darglein/ADOP)


## (Style)CLIPDraw: Coupling Content and Style in Text-to-Drawing Synthesis [34]<a name="34"></a>
Have you ever dreamed of taking the style of a picture, like this cool TikTok drawing style on the left, and applying it to a new picture of your choice? Well, I did, and it has never been easier to do. In fact, you can even achieve that from only text, and you can try it right now with this new method and their Google Colab notebook, available for everyone (see references). Simply take a picture of the style you want to copy, enter the text you want to generate, and this algorithm will generate a new picture out of it! Just look back at the results above, such a big step forward!
The results are extremely impressive, especially if you consider that they were made from a single line of text!

* Short Video Explanation:<br/>
[<img src="https://imgur.com/WIZYx0d.png" width="512"/>](https://youtu.be/5xzcIzHm8Wo)
* Short read: [Text-to-Drawing Synthesis With Artistic Control | CLIPDraw & StyleCLIPDraw](https://www.louisbouchard.ai/clipdraw/)
* Paper (CLIPDraw): [CLIPDraw: exploring text-to-drawing synthesis through language-image encoders](https://arxiv.org/abs/2106.14843)
* Paper (StyleCLIPDraw): [StyleCLIPDraw: Coupling Content and Style in Text-to-Drawing Synthesis](https://arxiv.org/abs/2111.03133)
* [CLIPDraw Colab demo](https://colab.research.google.com/github/kvfrans/clipdraw/blob/main/clipdraw.ipynb)
* [StyleCLIPDraw Colab demo](https://colab.research.google.com/github/pschaldenbrand/StyleCLIPDraw/blob/master/Style_ClipDraw.ipynb)


## SwinIR: Image restoration using swin transformer [35]<a name="35"></a>
Have you ever had an image you really liked and could only manage to find a small version of it, like the image below on the left? How cool would it be if you could take this image and make it look twice as good? It's great, but what if you could make it even four or eight times higher definition? Now we're talking, just look at that.

Here we enhanced the resolution of the image by a factor of four, meaning that we have four times more height and width pixels for more details, making it look a lot smoother. The best thing is that this is done within a few seconds, completely automatically, and works with pretty much any image. Oh, and you can even use it yourself with a demo they made available...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/qDyvbkv.png" width="512"/>](https://youtu.be/GFm3RfrtDoU)
* Short read: [SwinIR: Image restoration using swin transformer](https://www.louisbouchard.ai/swinir/)
* Paper: [SwinIR: Image restoration using swin transformer](https://arxiv.org/abs/2108.10257)
* [Click here for the code](https://github.com/JingyunLiang/SwinIR)
* [Demo](https://replicate.ai/jingyunliang/swinir)
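On the output side, a ×4 upscaler like this typically ends in a learned upsampling head. Here is a minimal sketch using sub-pixel convolution (PixelShuffle), a common choice in super-resolution networks; the channel counts are toy values and SwinIR's actual reconstruction module differs:

```python
# Minimal x4 super-resolution head using sub-pixel convolution (PixelShuffle).
# Toy channel counts; illustrative of SR upsampling in general, not SwinIR exactly.
import torch
import torch.nn as nn

upsampler = nn.Sequential(
    nn.Conv2d(64, 64 * 4, 3, padding=1),  # expand channels by r^2 = 4 for r = 2
    nn.PixelShuffle(2),                   # (B, 64*4, H, W) -> (B, 64, 2H, 2W)
    nn.Conv2d(64, 64 * 4, 3, padding=1),
    nn.PixelShuffle(2),                   # second x2 stage -> x4 total
    nn.Conv2d(64, 3, 3, padding=1),       # project features back to RGB
)

features = torch.randn(1, 64, 32, 32)     # fake deep features from a 32x32 input
sr_image = upsampler(features)            # (1, 3, 128, 128): 4x more pixels in each dimension
```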
## EditGAN: High-Precision Semantic Image Editing [36]<a name="36"></a>
Control any feature from quick drafts, and it will only edit what you want, keeping the rest of the image the same! A SOTA sketch-based image-editing model built on GANs by NVIDIA, MIT, and UofT.

* Short Video Explanation:<br/>
[<img src="https://imgur.com/EM68uUJ.png" width="512"/>](https://youtu.be/bus4OGyMQec)
* Short read: [NVIDIA EditGAN: Image Editing with Full Control From Sketches](https://www.louisbouchard.ai/editgan/)
* Paper: [EditGAN: High-Precision Semantic Image Editing](https://arxiv.org/abs/2111.03186)
* [Click here for the code (will be released soon)](https://nv-tlabs.github.io/editGAN/)


## CityNeRF: Building NeRF at City Scale [37]<a name="37"></a>
The model is called CityNeRF and grows from NeRF, which I previously covered on my channel. NeRF is one of the first models using radiance fields and machine learning to construct 3D models out of images. But NeRF is not that efficient and works for a single scale. Here, CityNeRF is applied to satellite and ground-level images at the same time to produce various 3D model scales for any viewpoint. In simple words, they bring NeRF to city scale. But how?

* Short Video Explanation:<br/>
[<img src="https://imgur.com/tvr0LY9.png" width="512"/>](https://youtu.be/swfx0bJMIlY)
* Short read: [CityNeRF: 3D Modelling at City Scale!](https://www.louisbouchard.ai/citynerf/)
* Paper: [CityNeRF: Building NeRF at City Scale](https://arxiv.org/pdf/2112.05504.pdf)
* [Click here for the code (will be released soon)](https://city-super.github.io/citynerf/)


## ClipCap: CLIP Prefix for Image Captioning [38]<a name="38"></a>
We've seen AI generate images from other images using GANs. Then, there were models able to generate questionable images using text. In early 2021, DALL·E was published, beating all previous attempts to generate images from text input using CLIP, a model that links images with text as a guide. A very similar task, called image captioning, may sound really simple but is, in fact, just as complex. It is the ability of a machine to generate a natural description of an image.
It's easy to simply tag the objects you see in the image, but it is quite another challenge to understand what's happening in a single two-dimensional picture, and this new model does it extremely well...

* Short Video Explanation:<br/>
[<img src="https://imgur.com/Qtz6hPA.png" width="512"/>](https://youtu.be/VQDrmuccWDo)
* Short read: [New SOTA Image Captioning: ClipCap](https://www.louisbouchard.ai/clipcap/)
* Paper: [ClipCap: CLIP Prefix for Image Captioning](https://arxiv.org/abs/2111.09734)
* [Click here for the code](https://github.com/rmokady/CLIP_prefix_caption)
* [Click here for the Colab Demo](https://colab.research.google.com/drive/1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing)
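The "prefix" mechanism is compact enough to sketch: a small mapping network turns the CLIP image embedding into a few token embeddings that are simply prepended to a language model's input. The dimensions and the two-layer MLP below are illustrative, not the paper's exact configuration:

```python
# ClipCap-style prefix sketch: map a CLIP image embedding to k "prefix" token
# embeddings for a language model. Sizes and the MLP are illustrative.
import torch
import torch.nn as nn

class PrefixMapper(nn.Module):
    def __init__(self, clip_dim=512, lm_dim=768, prefix_len=10):
        super().__init__()
        self.prefix_len, self.lm_dim = prefix_len, lm_dim
        self.mlp = nn.Sequential(
            nn.Linear(clip_dim, lm_dim * prefix_len),
            nn.Tanh(),
            nn.Linear(lm_dim * prefix_len, lm_dim * prefix_len),
        )

    def forward(self, clip_embedding):                  # (B, clip_dim)
        prefix = self.mlp(clip_embedding)
        return prefix.view(-1, self.prefix_len, self.lm_dim)

mapper = PrefixMapper()
image_embedding = torch.randn(1, 512)                   # stand-in for a CLIP image embedding
prefix_tokens = mapper(image_embedding)                 # (1, 10, 768): prepend to GPT-2's word embeddings
```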
---


>If you would like to read more papers and have a broader view, here is another great repository for you covering 2020:
[2020: A Year Full of Amazing AI Papers - A Review](https://github.com/louisfb01/Best_AI_paper_2020). And feel free to subscribe to my weekly [newsletter](https://louisbouchard.substack.com/) to stay up-to-date with new publications in AI for 2022!


*Tag me on **Twitter** [@Whats_AI](https://twitter.com/Whats_AI) or **LinkedIn** [@Louis (What's AI) Bouchard](https://www.linkedin.com/in/whats-ai/) if you share the list!*

---

## Paper references<a name="references"></a>

[1] A. Ramesh et al., Zero-shot text-to-image generation, 2021. arXiv:2102.12092

[2] Lewis, Kathleen M. et al., (2021), VOGUE: Try-On by StyleGAN Interpolation Optimization.

[3] Esser et al., (2020), Taming Transformers for High-Resolution Image Synthesis.

[4] Booch et al., (2020), Thinking Fast And Slow in AI, https://arxiv.org/abs/2010.06002.

[5] Odei Garcia-Garin et al., Automatic detection and quantification of floating marine macro-litter in aerial images: Introducing a novel deep learning approach connected to a web application in R, Environmental Pollution, https://doi.org/10.1016/j.envpol.2021.116490.

[6] Rematas, K., Martin-Brualla, R., and Ferrari, V., "ShaRF: Shape-conditioned Radiance Fields from a Single View", (2021), https://arxiv.org/abs/2102.08860

[7] Drew A. Hudson and C. Lawrence Zitnick, Generative Adversarial Transformers, (2021)

[8] Sandra Bryant et al., "We Asked Artificial Intelligence to Create Dating Profiles. Would You Swipe Right?", (2021), UNSW Sydney blog.

[9] Liu, Z. et al., 2021, "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows", arXiv preprint https://arxiv.org/abs/2103.14030v1

[10] Zhang, Y., Chen, W., Ling, H., Gao, J., Zhang, Y., Torralba, A. and Fidler, S., 2020. Image GANs meet differentiable rendering for inverse graphics and interpretable 3D neural rendering. arXiv preprint arXiv:2010.09125.

[11] Yuille, A.L., and Liu, C., 2021. Deep nets: What have they ever done for vision?. International Journal of Computer Vision, 129(3), pp. 781–802, https://arxiv.org/abs/1805.04025.

[12] Liu, A., Tucker, R., Jampani, V., Makadia, A., Snavely, N. and Kanazawa, A., 2020. Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image, https://arxiv.org/pdf/2012.09855.pdf

[13] Nguyen & Drealan et al. (2021) A Portable, Self-Contained Neuroprosthetic Hand with Deep Learning-Based Finger Control, https://arxiv.org/abs/2103.13452

[14] Pandey et al., 2021, Total Relighting: Learning to Relight Portraits for Background Replacement, doi: 10.1145/3450626.3459872, https://augmentedperception.github.io/total_relighting/total_relighting_paper.pdf.

[15] Gengshan Yang et al., (2021), LASR: Learning Articulated Shape Reconstruction from a Monocular Video, CVPR, https://lasr-google.github.io/.

[16] Richter, Abu AlHaija, Koltun, (2021), "Enhancing Photorealism Enhancement", https://intel-isl.github.io/PhotorealismEnhancement/.

[17] Chen, Hong-Shuo, et al., (2021), "DefakeHop: A Light-Weight High-Performance Deepfake Detector", arXiv abs/2103.06929.

[18] Liang, Jie and Zeng, Hui and Zhang, Lei, (2021), "High-Resolution Photorealistic Image Translation in Real-Time: A Laplacian Pyramid Translation Network", https://export.arxiv.org/pdf/2105.09188.pdf.

[19] Peihao Zhu et al., (2021), Barbershop, https://arxiv.org/pdf/2106.01505.pdf.

[20] Praveen Krishnan, Rama Kovvuri, Guan Pang, Boris Vassilev, and Tal Hassner, Facebook AI, (2021), "TextStyleBrush: Transfer of text aesthetics from a single example".

[21] Holynski, Aleksander, et al. "Animating Pictures with Eulerian Motion Fields." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2021.
2021.\n\n[22] Michael Niemeyer and Andreas Geiger, (2021), \"GIRAFFE: Representing Scenes as Compositional Generative Neural Feature Fields\", Published in CVPR 2021.\n\n[23] Chen, M., Tworek, J., Jun, H., Yuan, Q., Pinto, H.P.D.O., Kaplan, J., Edwards, H., Burda, Y., Joseph, N., Brockman, G. and Ray, A., 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374.\n\n[24] Apple, “Recognizing People in Photos Through Private On-Device Machine Learning”, (2021), https:\u002F\u002Fmachinelearning.apple.com\u002Fresearch\u002Frecognizing-people-photos\n\n[25] Meng, C., Song, Y., Song, J., Wu, J., Zhu, J.Y. and Ermon, S., 2021. Sdedit: Image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073.\n\n[26] Wang, S.Y., Bau, D. and Zhu, J.Y., 2021. Sketch Your Own GAN. In Proceedings of the IEEE\u002FCVF International Conference on Computer Vision (pp. 14050-14060).\n\n[27] “Tesla AI Day”, Tesla, August 19th 2021, https:\u002F\u002Fyoutu.be\u002Fj0z4FweCy4M\n\n[28] Patashnik, Or, et al., (2021), “Styleclip: Text-driven manipulation of StyleGAN imagery.”, https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.17249\n\n[29] Stepan Tulyakov*, Daniel Gehrig*, Stamatios Georgoulis, Julius Erbach, Mathias Gehrig, Yuanyou Li, Davide Scaramuzza, TimeLens: Event-based Video Frame Interpolation, IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Nashville, 2021, http:\u002F\u002Frpg.ifi.uzh.ch\u002Fdocs\u002FCVPR21_Gehrig.pdf\n\n[30] Haim, N., Feinstein, B., Granot, N., Shocher, A., Bagon, S., Dekel, T., & Irani, M. (2021). Diverse Generation from a Single Video Made Possible, https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.08591.\n\n[31] Ravuri, S., Lenc, K., Willson, M., Kangin, D., Lam, R., Mirowski, P., Fitzsimons, M., Athanassiadou, M., Kashem, S., Madge, S. and Prudden, R., 2021. Skillful Precipitation Nowcasting using Deep Generative Models of Radar, https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-021-03854-z\n\n[32] Petermann, D., Wichern, G., Wang, Z., & Roux, J.L. (2021). The Cocktail Fork Problem: Three-Stem Audio Separation for Real-World Soundtracks. https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.09958.pdf.\n\n[33] Rückert, D., Franke, L. and Stamminger, M., 2021. ADOP: Approximate Differentiable One-Pixel Point Rendering, https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.06635.pdf.\n\n[34] a) CLIPDraw: exploring text-to-drawing synthesis through language-image encoders \u003Cbr\u002F>\nb) StyleCLIPDraw: Schaldenbrand, P., Liu, Z. and Oh, J., 2021. StyleCLIPDraw: Coupling Content and Style in Text-to-Drawing Synthesis.\n\n[35] Liang, J., Cao, J., Sun, G., Zhang, K., Van Gool, L. and Timofte, R., 2021. SwinIR: Image restoration using swin transformer. In Proceedings of the IEEE\u002FCVF International Conference on Computer Vision (pp. 1833–1844).\n\n[36] Ling, H., Kreis, K., Li, D., Kim, S.W., Torralba, A. and Fidler, S., 2021, May. EditGAN: High-Precision Semantic Image Editing. In Thirty-Fifth Conference on Neural Information Processing Systems.\n\n[37] Xiangli, Y., Xu, L., Pan, X., Zhao, N., Rao, A., Theobalt, C., Dai, B. and Lin, D., 2021. CityNeRF: Building NeRF at City Scale.\n\n[38] Mokady, R., Hertz, A. and Bermano, A.H., 2021. ClipCap: CLIP Prefix for Image Captioning. 
https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09734\n","# 2021：充满惊人AI论文的一年——回顾 📌\n## 按发布日期整理的最新AI突破精选列表，附清晰视频讲解、深度文章链接及代码。\n\n尽管全球仍在复苏中，科研工作却并未放缓其迅猛步伐，尤其是在人工智能领域。今年更是凸显了许多重要议题，如伦理考量、关键偏见、治理机制、透明度等。人工智能与我们对人脑的理解及其与AI的关联正不断演进，展现出在不久的将来提升生活质量的广阔前景。然而，我们也必须谨慎选择所应用的技术。\n\n>“科学不能告诉我们应该做什么，它只能告诉我们能够做什么。”\u003Cbr\u002F>——让-保罗·萨特，《存在与虚无》\n\n以下列出了今年最引人注目的研究论文，以防你错过了任何一篇。简而言之，这是一份按发表日期排序的最新AI与数据科学突破精选清单，每篇都配有清晰的视频解读、深入阅读链接以及相关代码（如适用）。祝你阅读愉快！\n\n**本仓库末尾列出了每篇论文的完整引用信息。** *请给本仓库加星标，以便及时获取更新！* ⭐️\n\n维护者：[louisfb01](https:\u002F\u002Fgithub.com\u002Flouisfb01)\n\n[![Twitter](https:\u002F\u002Fimg.shields.io\u002Ftwitter\u002Furl\u002Fhttps\u002Ftwitter.com\u002Fcloudposse.svg?style=social&label=Follow%20%40whats_ai)](https:\u002F\u002Ftwitter.com\u002FWhats_AI)\n\n订阅我的[新闻通讯](https:\u002F\u002Flouisbouchard.substack.com\u002F)——每周为您解读最新的AI动态。\n\n*如果您发现有我遗漏但值得关注的论文，欢迎随时[联系我](https:\u002F\u002Fwww.louisbouchard.ai\u002Fcontact\u002F)，我会将其加入本仓库。*\n\n*如果您分享此列表，请在**Twitter**上@我 [@Whats_AI](https:\u002F\u002Ftwitter.com\u002FWhats_AI) 或在**LinkedIn**上@我 [@Louis (What's AI) Bouchard](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fwhats-ai\u002F)！*\n\n### 观看15分钟的2021年全回顾\n\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F3OoNOg1.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Fz5slE_akZmc)\n\n---\n\n### 如果您对计算机视觉研究感兴趣，这里还有另一份精彩的资源：\n2021年顶级CV论文精选列表，附清晰视频讲解、深度文章链接及代码。\n\n[2021年十大计算机视觉论文](https:\u002F\u002Fgithub.com\u002Flouisfb01\u002Ftop-10-cv-papers-2021)\n\n----\n\n👀 **如果您想支持我的工作**，并希望免费使用Weights & Biases来追踪您的机器学习实验、提高实验可重复性或与团队协作，您可以按照[这份指南](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Flouisfb01\u002Fexamples\u002Fblob\u002Fmaster\u002Fcolabs\u002Fpytorch\u002FSimple_PyTorch_Integration.ipynb)试用！由于这里的大部分代码基于PyTorch，我们认为分享一份关于如何在PyTorch中使用W&B的[快速入门指南](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Flouisfb01\u002Fexamples\u002Fblob\u002Fmaster\u002Fcolabs\u002Fpytorch\u002FSimple_PyTorch_Integration.ipynb)会非常有意义。\n\n👉请跟随[这份快速指南](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Flouisfb01\u002Fexamples\u002Fblob\u002Fmaster\u002Fcolabs\u002Fpytorch\u002FSimple_PyTorch_Integration.ipynb)，将相同的W&B代码片段融入您的项目或下方任一仓库中，您的所有实验都将自动记录到您的W&B账户！设置过程不超过5分钟，它将彻底改变您的工作方式，就像对我一样！如果您有兴趣，还可以参考[这份进阶指南](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Flouisfb01\u002Fexamples\u002Fblob\u002Fmaster\u002Fcolabs\u002Fpytorch\u002FOrganizing_Hyperparameter_Sweeps_in_PyTorch_with_W%26B.ipynb)，了解如何使用超参数搜索功能 :)\n\n🙌 感谢[Weights & Biases](https:\u002F\u002Fwandb.ai\u002F)对本仓库及我工作的赞助，也感谢每一位通过此链接尝试W&B的朋友们！\n\n[![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Flouisfb01\u002Fexamples\u002Fblob\u002Fmaster\u002Fcolabs\u002Fpytorch\u002FSimple_PyTorch_Integration.ipynb)
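\n\n下面给出一个极简的 W&B 集成示意，展示指南中所说的“W&B 代码片段”大致是什么样子。其中项目名与超参数均为假设，实际用法请以上述指南为准：\n\n```python\n# 极简 W&B 实验追踪示意（假设已 pip install wandb 并完成 wandb login）\nimport wandb\n\n# 项目名与 config 中的超参数均为示意\nwandb.init(project='best-ai-papers-2021', config={'lr': 1e-3, 'epochs': 5})\nfor epoch in range(wandb.config.epochs):\n    loss = 0.9 ** epoch  # 此处用模拟的递减损失代替真实训练损失\n    wandb.log({'epoch': epoch, 'loss': loss})  # 记录到 W&B 面板\nwandb.finish()\n```\n\n----\n\n## 完整列表\n- [DALL·E：来自OpenAI的零样本文本到图像生成 [1]](#1)\n- [VOGUE：基于StyleGAN插值优化的试穿技术 [2]](#2)\n- [驯服Transformer用于高分辨率图像合成 [3]](#3)\n- [AI中的快思考与慢思考 [4]](#4)\n- [航空影像中漂浮海洋大型垃圾的自动检测与量化 [5]](#5)\n- [ShaRF：单视角条件下的形状辐射场 [6]](#6)\n- [生成对抗Transformer [7]](#7)\n- [我们请人工智能创建了约会资料。你会右滑吗？ [8]](#8)\n- [Swin Transformer：基于移位窗口的层次化视觉Transformer [9]](#9)\n- [图像GANs邂逅可微渲染：逆向图形与可解释的3D神经渲染 [10]](#10)\n- [深度网络：它们究竟为视觉带来了什么？ [11]](#11)\n- [无限自然：从单张图像持续生成自然场景视图 [12]](#12)\n- [便携式、自成体系的神经假肢手，采用基于深度学习的手指控制 [13]](#13)\n- [全面重打光：学习为肖像重新打光以替换背景 [14]](#14)\n- [LASR：从单目视频中学习关节式形状重建 [15]](#15)\n- [增强照片真实感 [16]](#16)\n- [DefakeHop：轻量级高性能深度伪造检测器 [17]](#17)\n- [实时高分辨率写实图像翻译：拉普拉斯金字塔翻译网络 [18]](#18)\n- [Barbershop：基于GAN和分割掩码的图像合成 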
[19]](#19)\n- [TextStyleBrush：从单一示例迁移文本美学 [20]](#20)\n- [用欧拉运动场为图片添加动画 [21]](#21)\n- [CVPR 2021最佳论文奖：GIRAFFE——可控图像生成 [22]](#22)\n- [GitHub Copilot & Codex：评估基于代码训练的大语言模型 [23]](#23)\n- [Apple：通过设备端私密机器学习识别照片中的人 [24]](#24)\n- [利用随机微分方程进行图像合成与编辑 [25]](#25)\n- [亲手绘制你的GAN [26]](#26)\n- [特斯拉自动驾驶详解 [27]](#27)\n- [Styleclip：基于文本操控StyleGAN图像 [28]](#28)\n- [TimeLens：基于事件的视频帧插值 [29]](#29)\n- [从单个视频实现多样化生成 [30]](#30)\n- [利用雷达的深度生成模型进行熟练的降水临近预报 [31]](#31)\n- [鸡尾酒叉子问题：针对现实世界音轨的三声道音频分离 [32]](#32)\n- [ADOP：近似可微的一像素点渲染 [33]](#33)\n- [(Style)CLIPDraw：在文本到绘画合成中耦合内容与风格 [34]](#34)\n- [SwinIR：使用Swin Transformer进行图像修复 [35]](#35)\n- [EditGAN：高精度语义图像编辑 [36]](#36)\n- [CityNeRF：城市尺度的NeRF构建 [37]](#37)\n- [ClipCap：用于图像字幕的CLIP前缀 [38]](#38)\n- [论文参考文献](#references)\n\n---\n\n## DALL·E：来自 OpenAI 的零样本文本到图像生成 [1]\u003Ca name=\"1\">\u003C\u002Fa>\nOpenAI 成功训练了一种能够根据文本描述生成图像的网络。它与 GPT-3 和 Image GPT 非常相似，并能产生令人惊叹的效果。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FCzdyuce.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FDJToDLBPovg)\n* 简短阅读：[OpenAI 的 DALL·E：文本到图像生成详解](https:\u002F\u002Fwww.louisbouchard.ai\u002Fopenais-dall-e-text-to-image-generation-explained\u002F)\n* 论文：[零样本文本到图像生成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.12092.pdf)\n* 代码：[用于 DALL·E 的离散 VAE 的代码及其他信息](https:\u002F\u002Fgithub.com\u002Fopenai\u002FDALL-E)\n\n\n## VOGUE：基于 StyleGAN 插值优化的试穿功能 [2]\u003Ca name=\"2\">\u003C\u002Fa>\n谷歌使用修改后的 StyleGAN2 架构，创建了一个在线试衣间，用户只需上传一张自己的照片，即可自动试穿任意裤子或衬衫。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FFQL9bwU.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Fi4MnLJGZbaM)\n* 简短阅读：[人工智能驱动的在线试衣间：VOGUE](https:\u002F\u002Fmedium.com\u002Ftowards-artificial-intelligence\u002Fthe-ai-powered-online-fitting-room-vogue-5f77c599832)\n* 论文：[VOGUE：基于 StyleGAN 插值优化的试穿功能](https:\u002F\u002Fvogue-try-on.github.io\u002Fstatic_files\u002Fresources\u002FVOGUE-virtual-try-on.pdf)\n\n\n## 用于高分辨率图像合成的 Transformer 改良 [3]\u003Ca name=\"3\">\u003C\u002Fa>\n简而言之：他们将 GAN 和卷积方法的高效性与 Transformer 的强大表达能力相结合，从而提出了一种高效且语义引导的高质量图像合成方法。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F0zUY1tm.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FJfUTd8fjtX8)\n* 简短阅读：[结合 Transformer 的表达能力和 CNN 的效率进行高分辨率图像合成](https:\u002F\u002Fmedium.com\u002Ftowards-artificial-intelligence\u002Fcombining-the-transformers-expressivity-with-the-cnns-efficiency-for-high-resolution-image-synthesis-31c6767547da)\n* 论文：[用于高分辨率图像合成的 Transformer 改良](https:\u002F\u002Fcompvis.github.io\u002Ftaming-transformers\u002F)\n* 代码：[Transformer 改良](https:\u002F\u002Fgithub.com\u002FCompVis\u002Ftaming-transformers)\n\n\n## 人工智能中的快思考与慢思考 [4]\u003Ca name=\"4\">\u003C\u002Fa>\n以人类认知能力为灵感，迈向更通用、更可信的人工智能，以及对人工智能研究社区的十个问题。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FH8X58lb.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002F3nvAaVSQxs4)\n* 简短阅读：[人工智能的第三次浪潮 | 快思考与慢思考](https:\u002F\u002Fwww.louisbouchard.ai\u002Fthird-wave-of-ai-thinking-fast-and-slow\u002F)\n* 论文：[人工智能中的快思考与慢思考](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06002)\n\n\n## 航拍图像中漂浮海洋大型垃圾的自动检测与量化 [5]\u003Ca name=\"5\">\u003C\u002Fa>\n巴塞罗那大学的 Odei Garcia-Garin 等人开发了一种基于深度学习的算法，能够从航拍图像中检测并量化漂浮垃圾。他们还制作了一个面向Web的应用程序，允许用户识别这些被称为“漂浮海洋大型垃圾”（FMML）的垃圾。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FMmlYblV.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002F2dTSsdW0WYI)\n* 
简短阅读：[一款能够检测并计数海洋塑料垃圾的人工智能软件](https:\u002F\u002Fpub.towardsai.net\u002Fan-ai-software-able-to-detect-and-count-plastic-waste-in-the-ocean-7211aa0baf89)\n* 论文：[航拍图像中漂浮海洋大型垃圾的自动检测与量化：介绍一种结合R语言Web应用的新深度学习方法，《环境污染》](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.envpol.2021.116490)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Famonleong\u002FMARLIT)\n\n\n## ShaRF：单视角下的形状条件辐射场 [6]\u003Ca name=\"6\">\u003C\u002Fa>\n想象一下，只需拍摄一件物体的照片，就能将其转换为3D模型，然后插入到你正在制作的电影或游戏中，或者用于插画的3D场景中，该有多酷！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FWV6lq5s.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FgHkkrNMlGNg)\n* 简短阅读：[ShaRF：拍摄现实物体的照片，即可创建其3D模型](https:\u002F\u002Fpub.towardsai.net\u002Fsharf-take-a-picture-from-a-real-life-object-and-create-a-3d-model-of-it-c6809806b32)\n* 论文：[单视角下的形状条件辐射场](https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.08860)\n* [点击此处获取代码](http:\u002F\u002Fwww.krematas.com\u002Fsharf\u002Findex.html)\n\n\n## 生成对抗式 Transformer [7]\u003Ca name=\"7\">\u003C\u002Fa>\n他们基本上是在强大的 StyleGAN2 架构中利用 Transformer 的注意力机制，使其功能更加强大！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FCJzGHxa.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FHO-_t0UArd4)\n* 简短阅读：[GANsformer：使用生成对抗式 Transformer 进行场景生成](https:\u002F\u002Fwhats-ai.medium.com\u002Fgenerative-adversarial-transformers-gansformers-explained-bf1fa76ef58d)\n* 论文：[生成对抗式 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.01209.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fdorarad\u002Fgansformer)\n\n> 订阅我的每周[通讯](http:\u002F\u002Feepurl.com\u002FhuGLT5)，及时了解2022年人工智能领域的最新动态！\n\n\n## 我们请人工智能创建了约会资料。你会右滑吗？ [8]\u003Ca name=\"8\">\u003C\u002Fa>\n你会对一个人工智能生成的资料右滑吗？你能分辨出真人和机器的区别吗？这项研究通过在约会应用上使用人工智能虚构的人物来揭示答案。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FVKZrTBH.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FIoRH5u13P-4)\n* 简短阅读：[你会对人工智能生成的资料右滑吗？](https:\u002F\u002Fpub.towardsai.net\u002Fwould-you-swipe-right-on-an-ai-profile-98dc8a4451ec)\n* 论文：[我们请人工智能创建了约会资料。你会右滑吗？](https:\u002F\u002Fstudyonline.unsw.edu.au\u002Fblog\u002Fai-generated-dating-profile)\n* [点击此处获取代码](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1VLG8e7YSEwypxU-noRNhsv5dW4NfTGce#forceEdit=true&sandboxMode=true&scrollTo=aeXshJM-Cuaf)\n\n\n## Swin Transformer：基于移位窗口的层次化视觉 Transformer [9]\u003Ca name=\"9\">\u003C\u002Fa>\nTransformer 是否会取代 CNN 在计算机视觉中的地位？只需不到5分钟，你就能了解 Transformer 架构如何通过一篇名为 Swin Transformer 的新论文应用于计算机视觉。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002Fr9aL2iU.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FQcCJJOLCeJQ)\n* 简短阅读：[Transformer 会取代 CNN 在计算机视觉中的地位吗？](https:\u002F\u002Fpub.towardsai.net\u002Fwill-transformers-replace-cnns-in-computer-vision-55657a196833)\n* 论文：[Swin Transformer：基于移位窗口的层次化视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14030v1)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FSwin-Transformer)\n\n## 图像生成对抗网络与可微分渲染的结合：用于逆向图形学和可解释的3D神经渲染 [10]\u003Ca name=\"10\">\u003C\u002Fa>\n这款极具前景的模型名为GANverse3D，仅需一张图像即可生成可定制且可动画化的3D模型！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FJJ5UAEp.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FdvjwRBZ3Hnw)\n* 简短阅读：[从图像创建3D模型！GANverse3D与NVIDIA Omniverse](https:\u002F\u002Fwww.louisbouchard.ai\u002Fganverse3d\u002F)\n* 论文：[图像生成对抗网络与可微分渲染的结合：用于逆向图形学和可解释的3D神经渲染](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.09125.pdf)\n\n\n## 深度神经网络：它们为计算机视觉做了什么？ [11]\u003Ca 
name=\"11\">\u003C\u002Fa>\n“我将公开分享关于视觉应用中深度神经网络的一切——它们的成功之处，以及我们仍需解决的局限性。”\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FPQX8Phj.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FGhPDNzAVNDk)\n* 简短阅读：[计算机视觉领域的人工智能现状如何？](https:\u002F\u002Fwww.louisbouchard.ai\u002Fai-in-computer-vision\u002F)\n* 论文：[深度神经网络：它们为计算机视觉做了什么？](https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.04025)\n\n\n## 无限自然：基于单张图像的自然场景持续视图生成 [12]\u003Ca name=\"12\">\u003C\u002Fa>\n视图合成的下一步：持续视图生成，其目标是让人仿佛飞入图像之中，尽情探索其中的风景！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FezIYce7.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FNIOt1HLV_Mo)\n* 简短阅读：[无限自然：飞入图像，探索风景](https:\u002F\u002Fwww.louisbouchard.ai\u002Finfinite-nature\u002F)\n* 论文：[无限自然：基于单张图像的自然场景持续视图生成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.09855.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fgoogle-research\u002Ftree\u002Fmaster\u002Finfinite_nature)\n* [Colab演示](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fgoogle-research\u002Fblob\u002Fmaster\u002Finfinite_nature\u002Finfinite_nature_demo.ipynb#scrollTo=sCuRX1liUEVM)\n\n\n## 基于深度学习手指控制的便携式、自成体系的神经假肢手 [13]\u003Ca name=\"13\">\u003C\u002Fa>\n借助这一由人工智能驱动的神经接口，截肢者能够以近乎真实的灵巧性和直观性来控制神经假肢手。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F07soslr.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FwNBrCRzlbVw)\n* 简短阅读：[一位拥有AI赋能假肢手的截肢者！🦾](https:\u002F\u002Fwww.louisbouchard.ai\u002Fan-amputee-with-an-ai-powered-hand\u002F)\n* 论文：[基于深度学习手指控制的便携式、自成体系的神经假肢手](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13452)\n\n\n## 全局重光照：学习对人像进行重光照以实现背景替换 [14]\u003Ca name=\"14\">\u003C\u002Fa>\n根据你添加的新背景的光线条件，对任意人像进行恰当的重光照。你是否曾希望更换照片的背景，同时让效果看起来非常逼真？如果你试过这样做，就会明白这并不简单。你不能只是在家中拍一张自己的照片，然后把背景换成海滩——那样只会显得很假，不真实。任何人一眼就能看出这是PS过的。对于电影和专业视频来说，需要完美的灯光和专业的美术师才能还原高质量的画面，而这成本极高。你自己拍摄的照片根本做不到这一点。难道真的做不到吗？\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002Fa4KCChf.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FrVP2tcF_yRI)\n* 简短阅读：[不同背景下的逼真光照](https:\u002F\u002Fwww.louisbouchard.ai\u002Fbackgrounds-with-lighting\u002F)\n* 论文：[全局重光照：学习对人像进行重光照以实现背景替换](https:\u002F\u002Faugmentedperception.github.io\u002Ftotal_relighting\u002Ftotal_relighting_paper.pdf)\n\n\n## LASR：从单目视频中学习关节式形状重建 [15]\u003Ca name=\"15\">\u003C\u002Fa>\n仅需一段短视频作为输入，即可生成正在运动的人或动物的3D模型。这是一种全新的方法，只需一段短视频就能生成运动中的人或动物的3D模型。它不仅能理解这是一个具有复杂形状的物体，并且可以运动，还能确保这些部分始终连接在一起，因为它们仍然是一个整体，而非多个独立的物体……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F9PmIb2e.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Flac7wqjS-8E)\n* 简短阅读：[从视频中进行关节式3D重建](https:\u002F\u002Fwww.louisbouchard.ai\u002F3d-reconstruction-from-videos\u002F)\n* 论文：[LASR：从单目视频中学习关节式形状重建](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021\u002Fpapers\u002FYang_LASR_Learning_Articulated_Shape_Reconstruction_From_a_Monocular_Video_CVPR_2021_paper.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Flasr)\n\n\n## 增强照片级写实效果 [16]\u003Ca name=\"16\">\u003C\u002Fa>\n该AI可实时应用于视频游戏，将每一帧画面处理得更加自然逼真。英特尔实验室的研究人员刚刚发表了一篇题为“增强照片级写实效果”的论文。如果你认为这不过是“又一个GAN”，即以游戏画面作为输入，再按照自然世界的风格进行修改，那我要改变你的想法。他们为此模型投入了两年时间，使其极为稳健。它可以实时应用于视频游戏，将每一帧画面处理得更加自然逼真。想象一下，你可以大大减少在游戏画面制作上的精力投入，使游戏内容更加稳定和完善，然后再用这个模型来提升画面风格……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FMNpYCVj.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002F3rYosbwXm1w)\n* 
简短阅读：[人工智能是视频游戏设计的未来吗？增强照片级写实效果](https:\u002F\u002Fwww.louisbouchard.ai\u002Fthe-future-of-video-game\u002F)\n* 论文：[增强照片级写实效果](http:\u002F\u002Fvladlen.info\u002Fpapers\u002FEPE.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fisl-org\u002FPhotorealismEnhancement)\n\n## DefakeHop：轻量级高性能深度伪造检测器 [17]\u003Ca name=\"17\">\u003C\u002Fa>\n2021年如何识破深度伪造？美国陆军突破性人工智能技术助力发现深度伪造。\n\n尽管它们似乎一直存在，但首个逼真的深度伪造直到2017年才出现。从最初自动生成的逼真假图像，发展到如今视频中与真人无异、连声音都高度仿真的深度伪造内容。\n\n如今，我们已经很难分辨一段视频或一张图片是真实的还是深度伪造的。那么，我们该如何辨别真伪呢？如果人工智能能够完全生成音频和视频文件，它们又如何作为法庭证据使用呢？这篇新论文或许能为我们解答这些问题。而答案可能再次指向人工智能的应用。“眼见为实”这句老话，也许很快就会变成“只有AI告诉我它是真的，我才相信……”\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FZsyJyeJ.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FYMir8sRWRos)\n* 简短阅读：[如何识破深度伪造？美国陆军突破性技术（2021）](https:\u002F\u002Fwww.louisbouchard.ai\u002Fspot-deepfakes\u002F)\n* 论文：[DefakeHop：轻量级高性能深度伪造检测器](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.06929)\n\n\n## 高分辨率照片级实时图像风格迁移：拉普拉斯金字塔风格迁移网络 [18]\u003Ca name=\"18\">\u003C\u002Fa>\n利用这一基于机器学习的新方法，你可以实时将任意风格应用到4K图像上！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F6Z2OvBm.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FX7WzlAyUGPo)\n* 简短阅读：[高分辨率照片级实时图像风格迁移](https:\u002F\u002Fwww.louisbouchard.ai\u002F4k-image-translation-in-real-time\u002F)\n* 论文：[高分辨率照片级实时图像风格迁移：拉普拉斯金字塔风格迁移网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.09188.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fcsjliang\u002FLPTN)\n\n\n## Barbershop：基于GAN与分割掩码的图像合成 [19]\u003Ca name=\"19\">\u003C\u002Fa>\n本文并非介绍一项全新技术，而是探讨GAN的一种令人兴奋的新应用。正如标题所示，这绝非噱头——借助这款AI，你可以在真正改变发型之前，先试一试不同发型的效果……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FTYzXcQ0.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FHtqYMvBVJD8)\n* 简短阅读：[Barbershop：通过图片尝试不同发型和发色（GAN）](https:\u002F\u002Fwww.louisbouchard.ai\u002Fbarbershop\u002F)\n* 论文：[Barbershop：基于GAN与分割掩码的图像合成](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.01505.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002FZPdesu\u002FBarbershop)\n\n\n## TextStyleBrush：从单个示例迁移文本美学 [20]\u003Ca name=\"20\">\u003C\u002Fa>\n这款全新的Facebook AI模型能够按照相同的风格，直接在图像中翻译或编辑文本，且支持你的母语！\n\n想象一下，你在国外度假，却不会说当地语言。你想去一家当地餐厅用餐，但菜单全是陌生的文字。这种情景并不难想象——我们大多数人都曾遇到过类似的情况：看到菜单或路标上的文字却无法理解其含义。过去，在2020年，你可能会拿出手机用谷歌翻译逐字逐句地翻译。而到了2021年，你甚至无需再打开谷歌翻译，也不必费力地逐字输入来翻译了。相反，只需使用Facebook AI推出的这款新模型，就能将图像中的所有文字一键翻译成你的母语……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FJDBqrlv.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FhhAri5fl-XI)\n* 简短阅读：[模仿风格翻译或编辑图片中的文字：TextStyleBrush](https:\u002F\u002Fwww.louisbouchard.ai\u002Ftextstylebrush\u002F)\n* 论文：[TextStyleBrush：从单个示例迁移文本美学](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.08385)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FIMGUR5K-Handwriting-Dataset?fbclid=IwAR0pRAxhf8Vg-5H3fA0BEaRrMeD21HfoCJ-so8V0qmWK7Ub21dvy_jqgiVo)\n\n\n> 如果你也想阅读更多研究论文，我推荐你阅读我的文章[如何阅读更多研究论文](https:\u002F\u002Fpub.towardsai.net\u002Fhow-to-read-more-research-papers-7737e3770d7f)，其中分享了我寻找和阅读更多研究论文的最佳技巧。\n\n## 利用欧拉运动场为图片添加动画 [21]\u003Ca name=\"21\">\u003C\u002Fa>\n该模型可以分析图片中哪些区域的像素应该运动，并以逼真的方式将其循环动画化，同时保持图片其余部分完全静止，从而生成如图所示的惊艳视频效果……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FJyarpBv.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FKgTa2r7d0I0)\n* 简短阅读：[从图片创建逼真循环动画视频](https:\u002F\u002Fwww.louisbouchard.ai\u002Fanimate-pictures\u002F)\n* 
论文：[利用欧拉运动场为图片添加动画](https:\u002F\u002Farxiv.org\u002Fabs\u002F2011.15128)\n* [点击此处获取代码](https:\u002F\u002Feulerian.cs.washington.edu\u002F)\n\n\n## CVPR 2021最佳论文奖：GIRAFFE——可控图像生成 [22]\u003Ca name=\"22\">\u003C\u002Fa>\n通过改进的GAN架构，他们能够在不影响背景和其他物体的情况下移动图像中的对象！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FZP6F9SF.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FJIJkURAkCxM)\n* 简短阅读：[CVPR 2021最佳论文奖：GIRAFFE——可控图像生成](https:\u002F\u002Fwww.louisbouchard.ai\u002Fcvpr-2021-best-paper\u002F)\n* 论文：[GIRAFFE：将场景表示为组合式生成神经特征场](http:\u002F\u002Fwww.cvlibs.net\u002Fpublications\u002FNiemeyer2021CVPR.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fautonomousvision\u002Fgiraffe)\n\n\n## GitHub Copilot & Codex：评估基于代码训练的大规模语言模型 [23]\u003Ca name=\"23\">\u003C\u002Fa>\n了解OpenAI这款新模型如何根据文字生成代码！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FgFOnhmV.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Faz3oVVkTFB8)\n* 简短阅读：[OpenAI全新代码生成器：GitHub Copilot（及Codex）](https:\u002F\u002Fwww.louisbouchard.ai\u002Fgithub-copilot\u002F)\n* 论文：[评估基于代码训练的大规模语言模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.03374.pdf)\n* [点击此处获取代码](https:\u002F\u002Fcopilot.github.com\u002F)\n\n## 苹果：通过私密的设备端机器学习识别照片中的人物 [24]\u003Ca name=\"24\">\u003C\u002Fa>\n苹果利用在您的设备上私密运行的多种基于机器学习的算法，让您能够在 iOS 15 上准确地整理和组织您的照片与视频。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FHBuOzrG.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FLIV-M-gFRFA)\n* 简短文章：[苹果照片如何利用机器学习在私密照片中识别人物](https:\u002F\u002Fwww.louisbouchard.ai\u002Fhow-apple-photos-recognizes-people\u002F)\n* 论文：[通过私密的设备端机器学习识别照片中的人物](https:\u002F\u002Fmachinelearning.apple.com\u002Fresearch\u002Frecognizing-people-photos)\n\n\n## 基于随机微分方程的图像合成与编辑 [25]\u003Ca name=\"25\">\u003C\u002Fa>\n告别复杂的 GAN 和 Transformer 架构来进行图像生成吧！\n斯坦福大学和卡内基梅隆大学的 Chenlin Meng 等人提出的这一新方法，能够根据用户的任意输入生成新图像。即便是像我这样毫无艺术功底的人，现在也能仅凭草图就生成精美的图片或对其进行修改……\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FIdrRxix.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FxoEkSWJSm1k)\n* 简短文章：[基于草图的图像合成与编辑：SDEdit。无需再进行繁琐的训练！](https:\u002F\u002Fwww.louisbouchard.ai\u002Fimage-synthesis-from-sketches\u002F)\n* 论文：[基于随机微分方程的图像合成与编辑](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2108.01073.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fermongroup\u002FSDEdit)\n* [Colab 演示](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1KkLS53PndXKQpPlS1iK-k1nRQYmlb4aO?usp=sharing)\n\n\n## 根据草图生成你的 GAN [26]\u003Ca name=\"26\">\u003C\u002Fa>\n通过根据草图生成图像，让每个人都能更轻松地训练 GAN！事实上，借助这一新方法，你可以仅凭最简单的知识——手绘草图——来控制 GAN 的输出。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FzbD9T8e.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Fvz_wEQkTLk0)\n* 简短文章：[让 GAN 训练对所有人更简单：根据草图生成图像](https:\u002F\u002Fwww.louisbouchard.ai\u002Fmake-gans-training-easier\u002F)\n* 论文：[根据草图生成你的 GAN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.02774)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002FPeterWang512\u002FGANSketching)\n\n\n## 特斯拉自动驾驶系统详解 [27]\u003Ca name=\"27\">\u003C\u002Fa>\n如果您好奇特斯拉汽车是如何不仅“看”到周围环境，还能与其他车辆一起安全行驶的，那么这段视频正是您一直在等待的。几天前，特斯拉举办了首届 AI 日活动，特斯拉 AI 部门负责人 Andrej Karpathy 等人详细介绍了特斯拉自动驾驶系统的运作方式，从八路摄像头采集图像到最终的道路导航过程。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FLo7s7db.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FDTHqgDqkIRw)\n* 
简短文章：[特斯拉自动驾驶系统详解](https:\u002F\u002Fwww.louisbouchard.ai\u002Ftesla-autopilot-explained-tesla-ai-day\u002F)\n\n\n## StyleCLIP：文本驱动的 StyleGAN 图像操控 [28]\u003Ca name=\"28\">\u003C\u002Fa>\n过去，AI 可以生成图像，但研究人员需要耗费大量精力并通过反复试验来控制结果的风格。而现在，借助这个新模型，你只需使用文本就能实现这一点！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FKJIpmys.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FRAXrwPskNso)\n* 简短文章：[用文本操控真实图像——专为创意艺术家打造的 AI！StyleCLIP 解析](https:\u002F\u002Fwww.louisbouchard.ai\u002Fstyleclip\u002F)\n* 论文：[StyleCLIP：文本驱动的 StyleGAN 图像操控](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.17249)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Forpatashnik\u002FStyleCLIP)\n* [Colab 演示](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Forpatashnik\u002FStyleCLIP\u002Fblob\u002Fmain\u002Fnotebooks\u002FStyleCLIP_global.ipynb)\n\n\n## TimeLens：基于事件的视频帧插值 [29]\u003Ca name=\"29\">\u003C\u002Fa>\nTimeLens 能够理解视频帧之间物体的运动，从而重建出连我们肉眼都无法捕捉到的细微变化。实际上，它所达到的效果是目前智能手机及其他现有模型都无法企及的！\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FZF4fK31.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FHWA0yVXYRlk)\n* 简短文章：[如何用 AI 制作慢动作视频！](https:\u002F\u002Fwww.louisbouchard.ai\u002Ftimelens\u002F)\n* 论文：[TimeLens：基于事件的视频帧插值](http:\u002F\u002Frpg.ifi.uzh.ch\u002Fdocs\u002FCVPR21_Gehrig.pdf)\n* [点击此处获取代码](https:\u002F\u002Fgithub.com\u002Fuzh-rpg\u002Frpg_timelens)\n\n> 订阅我的每周[通讯](http:\u002F\u002Feepurl.com\u002FhuGLT5)，及时了解 2022 年人工智能领域的最新动态！\n\n## 单一视频即可实现多样化生成 [30]\u003Ca name=\"30\">\u003C\u002Fa>\n您是否曾想过编辑一段视频？\n\n比如移除或添加某个人物、更换背景、延长时长，或者调整分辨率以适配特定的宽高比而不压缩或拉伸画面。对于那些已经开展过广告投放的人来说，一定希望为 A\u002FB 测试准备多版本视频，以便找出最佳效果。而 Niv Haim 等人的这项新研究，恰好可以帮助您仅凭一段视频就在高清画质下完成所有这些操作！\n\n只需一段简单的视频，您便能在几秒甚至几分钟内完成上述任务，且生成的视频质量极高。基本上，您可以将其应用于任何视频处理或视频生成场景。它在各方面都优于 GAN，并且无需复杂的深度学习技术或庞大的数据集！更棒的是，该技术还可以扩展到高分辨率视频。\n\n* 短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F4OE71WI.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FUy8yKPEi1dg)\n* 简短文章：[生成视频变体——无需数据集或深度学习！](https:\u002F\u002Fwww.louisbouchard.ai\u002Fvgpnn-generate-video-variations\u002F)\n* 论文：[单一视频即可实现多样化生成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.08591)\n* [点击此处获取代码](https:\u002F\u002Fnivha.github.io\u002Fvgpnn\u002F)\n\n## 基于雷达深度生成模型的精准降水临近预报 [31]\u003Ca name=\"31\">\u003C\u002Fa>\nDeepMind 刚刚发布了一款生成式模型，其准确性和实用性在超过 50 位气象专家的评估中，在 89% 的情况下都优于目前广泛使用的临近预报方法！该模型专注于预测未来 2 小时内的降水情况，并且表现得非常出色。它是一种生成式模型，这意味着它不是简单地预测天气，而是直接生成预报结果。具体来说，它会利用过去的雷达数据来生成未来的雷达图像。通过结合过去的时间和空间信息，模型能够预测出不久后的天气状况。\n\n你可以把它想象成 Snapchat 的滤镜：输入一张人脸，然后生成经过修饰的新脸。要训练这样的生成模型，你需要大量的人脸数据以及你希望生成的特定类型的人脸数据。接着，使用经过长时间训练的类似模型，就能得到一个功能强大的生成模型。这类模型通常采用 GAN 架构进行训练，随后独立使用生成器部分。\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FdE7MQ6E.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FdlSIq64psEY)\n* 简短阅读：[DeepMind 利用 AI 预测更精准的天气预报](https:\u002F\u002Fwww.louisbouchard.ai\u002Fdeepmind-rain-nowcasting\u002F)\n* 论文：[基于雷达深度生成模型的精准降水临近预报](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-021-03854-z)\n* [点击此处查看代码](https:\u002F\u002Fgithub.com\u002Fdeepmind\u002Fdeepmind-research\u002Ftree\u002Fmaster\u002Fnowcasting)\n\n\n## 鸡尾酒叉问题：面向真实场景音轨的三路音频分离 [32]\u003Ca name=\"32\">\u003C\u002Fa>\n你是否曾经看过视频或电视剧时，演员的声音完全听不清，或者背景音乐声太大？这种被称为“鸡尾酒会问题”的现象，或许将不再出现。三菱公司和印第安纳大学最近联合发布了一个新模型及配套数据集，专门解决如何从复杂音轨中准确分离出所需声音的问题。例如，如果一段音频中音乐声过大而人声被掩盖，只需调整相应音轨的音量，就能让语音更加清晰，同时降低音乐的干扰。\n\n问题的关键在于从复杂的声学场景中分离出各个独立的声音源——比如电影片段或 YouTube 
视频中，各种声音往往混杂在一起、难以平衡。有时由于背景音乐、爆炸声或其他环境音的影响，演员的台词根本听不清楚。而一旦成功分离出不同类别的音轨，就可以单独调节它们的音量，比如适当降低音乐音量，以便更好地听到演员的对话。这正是研究人员所实现的目标。\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002F3hQeWiG.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FRpxufqt5r6I)\n* 简短阅读：[利用 AI 分离语音、音乐和音效](https:\u002F\u002Fwww.louisbouchard.ai\u002Fisolate-voice-music-and-sound-effects-with-ai\u002F)\n* 论文：[鸡尾酒叉问题：面向真实场景音轨的三路音频分离](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.09958.pdf)\n* [点击此处查看代码](https:\u002F\u002Fcocktail-fork.github.io\u002F)\n\n\n## ADOP：近似可微分的一像素点渲染 [33]\u003Ca name=\"33\">\u003C\u002Fa>\n想象一下，你想要根据拍摄的一组照片生成一个 3D 模型，或者制作一段流畅的视频。现在，这一切都成为可能了！具体内容不便多说，但效果真的令人惊叹，建议大家亲自体验一下！\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FaMAWzJU.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FJfph7Vld_Nw)\n* 简短阅读：[AI 可以仅凭几张图片合成流畅视频！](https:\u002F\u002Fwww.louisbouchard.ai\u002Fai-synthesizes-smooth-videos-from-a-couple-of-images\u002F)\n* 论文：[ADOP：近似可微分的一像素点渲染](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.06635.pdf)\n* [点击此处查看代码](https:\u002F\u002Fgithub.com\u002Fdarglein\u002FADOP)\n\n\n## (Style)CLIPDraw：文本到绘画合成中的内容与风格耦合 [34]\u003Ca name=\"34\">\u003C\u002Fa>\n你有没有想过，能否把一张图片的风格（比如左侧这张酷炫的 TikTok 绘画风格）应用到自己选择的新图片上？我曾经有过这样的想法，而现在实现起来比以往任何时候都更容易。事实上，你甚至只需要输入一段文字，就能借助这项新技术及其公开的 Google Colab 笔记本完成操作（详见参考资料）。只需上传一张你想要模仿的风格图片，再输入你希望生成的文字内容，算法就会自动生成一幅符合要求的新图！看看上面的效果吧，这真是巨大的进步！尤其考虑到这些作品仅仅基于一行文字就完成了，成果令人印象深刻。\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FWIZYx0d.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002F5xzcIzHm8Wo)\n* 简短阅读：[具有艺术控制能力的文本到绘画合成 | CLIPDraw & StyleCLIPDraw](https:\u002F\u002Fwww.louisbouchard.ai\u002Fclipdraw\u002F)\n* 论文（CLIPDraw）：[CLIPDraw：通过语言-图像编码器探索文本到绘画合成](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.14843)\n* 论文（StyleCLIPDraw）：[StyleCLIPDraw：文本到绘画合成中的内容与风格耦合](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.03133)\n* [CLIPDraw Colab 示例](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fkvfrans\u002Fclipdraw\u002Fblob\u002Fmain\u002Fclipdraw.ipynb)\n* [StyleCLIPDraw Colab 示例](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fpschaldenbrand\u002FStyleCLIPDraw\u002Fblob\u002Fmaster\u002FStyle_ClipDraw.ipynb)\n\n\n## SwinIR：基于 Swin Transformer 的图像修复 [35]\u003Ca name=\"35\">\u003C\u002Fa>\n你是否曾经遇到过特别喜欢的一张图片，却只能找到像左图那样质量很差的小尺寸版本？如果能将这张图片放大两倍，让它看起来同样清晰该有多好？这已经很不错了，但如果能将其分辨率提升到原来的四倍甚至八倍呢？那才叫真正厉害，看看下面的效果就知道了。\n\n在这里，我们把图像的分辨率提升了四倍，也就是说高度和宽度上的像素数量增加了四倍，从而呈现出更加细腻的画面效果，整体看起来也更加平滑。最棒的是，整个过程只需几秒钟即可自动完成，而且几乎适用于任何类型的图像。此外，他们还提供了一个演示页面，大家可以亲自试一试……\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FqDyvbkv.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FGFm3RfrtDoU)\n* 简短阅读：[SwinIR：基于 Swin Transformer 的图像修复](https:\u002F\u002Fwww.louisbouchard.ai\u002Fswinir\u002F)\n* 论文：[SwinIR：基于 Swin Transformer 的图像修复](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.10257)\n* [点击此处查看代码](https:\u002F\u002Fgithub.com\u002FJingyunLiang\u002FSwinIR)\n* [演示链接](https:\u002F\u002Freplicate.ai\u002Fjingyunliang\u002Fswinir)\n\n## EditGAN：高精度语义图像编辑 [36]\u003Ca name=\"36\">\u003C\u002Fa>\n只需通过快速草图即可控制图像中的任意特征，且仅对您指定的部分进行编辑，其余部分保持不变！这是由英伟达、麻省理工学院和多伦多大学联合推出的基于生成对抗网络的最先进草图驱动图像编辑模型。\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FEM68uUJ.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Fbus4OGyMQec)\n* 简短阅读：[英伟达 EditGAN：从草图实现完全可控的图像编辑](https:\u002F\u002Fwww.louisbouchard.ai\u002Feditgan\u002F)\n* 
论文：[EditGAN：高精度语义图像编辑](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.03186)\n* [点击此处查看代码（即将发布）](https:\u002F\u002Fnv-tlabs.github.io\u002FeditGAN\u002F)\n\n\n## CityNeRF：城市尺度的 NeRF 构建 [37]\u003Ca name=\"37\">\u003C\u002Fa>\n该模型名为 CityNeRF，源自我此前在频道中介绍过的 NeRF。NeRF 是最早利用辐射场和机器学习技术，从图像重建 3D 模型的模型之一。然而，NeRF 效率较低，且仅适用于单一尺度。而 CityNeRF 则同时结合卫星影像与地面视角图像，能够为任意视角生成不同尺度的 3D 模型。简而言之，它将 NeRF 技术扩展到了城市级别。那么，它是如何做到的呢？\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002Ftvr0LY9.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002Fswfx0bJMIlY)\n* 简短阅读：[CityNeRF：城市尺度的 3D 建模！](https:\u002F\u002Fwww.louisbouchard.ai\u002Fcitynerf\u002F)\n* 论文：[CityNeRF：城市尺度的 NeRF 构建](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.05504.pdf)\n* [点击此处查看代码（即将发布）](https:\u002F\u002Fcity-super.github.io\u002Fcitynerf\u002F)\n\n\n## ClipCap：用于图像描述的 CLIP 前缀 [38]\u003Ca name=\"38\">\u003C\u002Fa>\n我们曾见过 AI 利用 GAN 从其他图像生成新图像；随后又出现了能根据文本生成质量参差不齐图像的模型。2021 年初，DALL-E 正式发布，凭借 CLIP 模型——一个以文本为指导将图像与文本关联起来的工具——在文本到图像生成任务上超越了所有先前尝试。与此非常相似的任务——图像描述——听起来似乎很简单，但实际上同样复杂。其核心在于让机器能够为一张图像生成自然流畅的描述文字。\n\n简单地标注出图像中的物体并不难，但要理解一幅二维图像中究竟发生了什么，则是一项巨大的挑战。而这款新模型在这方面表现得极为出色……\n\n* 简短视频讲解：\u003Cbr\u002F>\n[\u003Cimg src=\"https:\u002F\u002Fimgur.com\u002FQtz6hPA.png\" width=\"512\"\u002F>](https:\u002F\u002Fyoutu.be\u002FVQDrmuccWDo)\n* 简短阅读：[全新 SOTA 图像描述模型：ClipCap](https:\u002F\u002Fwww.louisbouchard.ai\u002Fclipcap\u002F)\n* 论文：[ClipCap：用于图像描述的 CLIP 前缀](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09734)\n* [点击此处查看代码](https:\u002F\u002Fgithub.com\u002Frmokady\u002FCLIP_prefix_caption)\n* [点击此处查看 Colab 示例](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing)\n\n\n---\n\n\n>如果您想阅读更多论文并获得更广阔的视野，这里还有一个很棒的资源库，涵盖了 2020 年的相关工作：\n[2020：充满惊人 AI 论文的一年——综述](https:\u002F\u002Fgithub.com\u002Flouisfb01\u002FBest_AI_paper_2020)，同时欢迎订阅我的每周[通讯](https:\u002F\u002Flouisbouchard.substack.com\u002F)，及时了解 2022 年的最新 AI 研究成果！\n\n* 如果您分享此列表，请在 **Twitter** 上标记我 [@Whats_AI](https:\u002F\u002Ftwitter.com\u002FWhats_AI) 或在 **LinkedIn** 上标记我 [@Louis (What's AI) Bouchard](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fwhats-ai\u002F)！\n\n---\n\n## 论文参考文献\u003Ca name=\"references\">\u003C\u002Fa>\n\n[1] A. Ramesh 等，零样本文本到图像生成，2021年。arXiv:2102.12092\n\n[2] Lewis, Kathleen M 等，（2021），VOGUE：基于StyleGAN插值优化的试穿技术。\n\n[3] 用于高分辨率图像合成的Transformer驯化，Esser 等，2020年。\n\n[4] AI中的快思考与慢思考，Booch 等，（2020），https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.06002。\n\n[5] Odei Garcia-Garin 等，航空影像中漂浮海洋大型垃圾的自动检测与量化：介绍一种结合R语言Web应用的新型深度学习方法，环境污染，https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.envpol.2021.116490。\n\n[6] Rematas, K., Martin-Brualla, R. 和 Ferrari, V.，“ShaRF：单视图形状条件辐射场”，（2021），https:\u002F\u002Farxiv.org\u002Fabs\u002F2102.08860\n\n[7] Drew A. Hudson 和 C. Lawrence Zitnick，生成对抗Transformer，（2021）\n\n[8] Sandra Bryant 等，“我们让人工智能创建了约会资料。你会右滑吗？”，（2021），UNSW悉尼博客。\n\n[9] 刘Z. 等，2021年，“Swin Transformer：使用移位窗口的层次化视觉Transformer”，arXiv预印本 https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.14030v1\n\n[10] 张Y.、陈W.、凌H.、高J.、张Y.、Torralba A. 和 Fidler S.，2020年。图像GANs与可微渲染结合，用于逆向图形和可解释的3D神经渲染。arXiv预印本 arXiv:2010.09125。\n\n[11] Yuille, A.L. 和 Liu, C.，2021年。深度网络：它们为视觉领域做了什么？国际计算机视觉杂志，129(3)，页781–802，https:\u002F\u002Farxiv.org\u002Fabs\u002F1805.04025。\n\n[12] 刘A.、塔克R.、詹帕尼V.、马卡迪A.、斯纳维利N. 
和金泽A.，2020年。无限自然：从单张图像生成自然场景的永恒视图，https:\u002F\u002Farxiv.org\u002Fpdf\u002F2012.09855.pdf\n\n[13] Nguyen & Drealan 等（2021）基于深度学习手指控制的便携式、自成一体的神经假手：https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.13452\n\n[14] Pandey 等，2021年，全面重光：学习为背景替换重新打光肖像，doi: 10.1145\u002F3450626.3459872，https:\u002F\u002Faugmentedperception.github.io\u002Ftotal_relighting\u002Ftotal_relighting_paper.pdf。\n\n[15] 杨耿山 等，（2021），LASR：从单目视频中学习关节形状重建，CVPR，https:\u002F\u002Flasr-google.github.io\u002F。\n\n[16] Richter、Abu AlHaija、Koltun，（2021），“增强照片真实感”，https:\u002F\u002Fintel-isl.github.io\u002FPhotorealismEnhancement\u002F。\n\n[17] 陈洪硕等，（2021），“DefakeHop：轻量级高性能Deepfake检测器”。ArXiv abs\u002F2103.06929。\n\n[18] 梁杰、曾辉和张磊，（2021），“实时高分辨率照片级图像翻译：拉普拉斯金字塔翻译网络”，https:\u002F\u002Fexport.arxiv.org\u002Fpdf\u002F2105.09188.pdf。\n\n[19] Peihao Zhu 等，（2021），Barbershop，https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.01505.pdf。\n\n[20] Praveen Krishnan、Rama Kovvuri、Guan Pang、Boris Vassilev 和 Tal Hassner，Facebook AI，（2021），“TextStyleBrush：从单个示例转移文本美学”。\n\n[21] Holynski, Aleksander 等。“用欧拉运动场为图片动画化”。IEEE\u002FCVF计算机视觉与模式识别会议论文集。2021年。\n\n[22] Michael Niemeyer 和 Andreas Geiger，（2021），“GIRAFFE：将场景表示为组合式生成神经特征场”，发表于CVPR 2021。\n\n[23] 陈M.、特沃雷克J.、俊H.、袁Q.、平托H.P.D.O.、卡普兰J.、爱德华兹H.、伯克曼Y.、约瑟夫N.、布罗克曼G. 和雷A.，2021年。评估基于代码训练的大规模语言模型。arXiv预印本 arXiv:2107.03374。\n\n[24] 苹果公司，“通过设备端私有机器学习在照片中识别人物”，（2021），https:\u002F\u002Fmachinelearning.apple.com\u002Fresearch\u002Frecognizing-people-photos\n\n[25] 孟C.、宋Y.、宋J.、吴J.、朱J.Y. 和 Ermon S.，2021年。Sdedit：利用随机微分方程进行图像合成与编辑。arXiv预印本 arXiv:2108.01073。\n\n[26] 王S.Y.、鲍D. 和朱J.Y.，2021年。亲手绘制GAN。IEEE\u002FCVF国际计算机视觉会议论文集（第14050–14060页）。\n\n[27] “特斯拉AI日”，特斯拉，2021年8月19日，https:\u002F\u002Fyoutu.be\u002Fj0z4FweCy4M\n\n[28] Patashnik, Or 等，（2021），“Styleclip：基于文本操控StyleGAN图像”。https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.17249\n\n[29] 斯捷潘·图利亚科夫*、丹尼尔·格里格*、斯塔马蒂奥斯·乔治乌利斯、尤利乌斯·埃尔巴赫、马蒂亚斯·格里格、袁友李、达维德·斯卡拉穆扎，TimeLens：基于事件的视频帧插值，IEEE计算机视觉与模式识别会议（CVPR），纳什维尔，2021年，http:\u002F\u002Frpg.ifi.uzh.ch\u002Fdocs\u002FCVPR21_Gehrig.pdf\n\n[30] 海姆N.、费因斯坦B.、格拉诺特N.、肖彻A.、巴贡S.、德凯尔T. 和伊拉尼M.（2021）。从单个视频实现多样化生成，https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.08591。\n\n[31] 拉武里S.、伦克K.、威尔逊M.、康金D.、拉姆R.、米罗夫斯基P.、菲茨西蒙斯M.、阿萨尼亚杜M.、卡谢姆S.、马奇S. 和普鲁登R.，2021年。利用雷达深度生成模型进行熟练的降水临近预报，https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs41586-021-03854-z\n\n[32] 彼得曼D.、维希恩G.、王Z. 和鲁克斯J.L.（2021）。鸡尾酒叉问题：针对现实世界音轨的三声道音频分离。https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.09958.pdf。\n\n[33] 鲁克特D.、弗兰克L. 和施塔明格M.，2021年。ADOP：近似可微的一像素点渲染，https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.06635.pdf。\n\n[34] a) CLIPDraw：通过语言-图像编码器探索文本到绘画的合成\u003Cbr\u002F>\nb) StyleCLIPDraw：施尔登布兰德P.、刘Z. 和欧J.，2021年。StyleCLIPDraw：在文本到绘画合成中耦合内容与风格。\n\n[35] 梁J.、曹J.、孙G.、张K.、范古尔L. 和季莫夫特R.，2021年。SwinIR：使用Swin Transformer进行图像修复。IEEE\u002FCVF国际计算机视觉会议论文集（第1833–1844页）。\n\n[36] 凌H.、克莱斯K.、李D.、金S.W.、托拉尔巴A. 和菲德勒S.，2021年5月。EditGAN：高精度语义图像编辑。第三十五届神经信息处理系统大会。\n\n[37] 向立Y.、徐L.、潘X.、赵N.、饶A.、泰奥巴尔特C.、戴B. 和林D.，2021年。CityNeRF：城市尺度的NeRF构建。\n\n[38] 莫卡迪R.、赫兹A. 
和贝尔马诺A.H.，2021年。ClipCap：用于图像字幕的CLIP前缀。https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.09734","# best_AI_papers_2021 快速上手指南\n\n`best_AI_papers_2021` 并非一个可直接安装的软件包或库，而是一个**精选的 2021 年人工智能突破性论文清单**。该仓库由社区维护，为每篇论文提供了视频讲解、深度文章链接以及对应的代码实现地址（通常指向原始作者的 GitHub 仓库）。\n\n本指南将指导你如何浏览该清单，并获取其中感兴趣论文的代码进行运行。\n\n## 环境准备\n\n由于本仓库包含的是指向不同项目的链接，因此没有统一的系统要求。你需要根据具体想运行的论文代码来准备环境。但大多数 2021 年的 AI 论文代码基于 **PyTorch**，建议提前准备以下通用开发环境：\n\n*   **操作系统**: Linux (推荐 Ubuntu 18.04+), macOS, 或 Windows (配合 WSL2)\n*   **Python**: 3.7 或更高版本\n*   **包管理工具**: `pip` 或 `conda`\n*   **深度学习框架**: PyTorch (大多数项目依赖)\n*   **硬件**: 推荐配备 NVIDIA GPU 以加速模型推理和训练（显存需求视具体论文而定）\n\n**前置依赖安装示例 (通用):**\n```bash\n# 创建虚拟环境 (推荐)\npython -m venv ai_papers_env\nsource ai_papers_env\u002Fbin\u002Factivate  # Linux\u002FmacOS\n# ai_papers_env\\Scripts\\activate   # Windows\n\n# 安装基础依赖\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\npip install jupyterlab matplotlib numpy pandas\n```\n\n> **提示**: 国内开发者可使用清华源加速 pip 安装：\n> `pip install \u003Cpackage_name> -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n## 安装步骤\n\n本仓库本身无需“安装”，只需克隆到本地即可浏览清单和文档。\n\n1.  **克隆仓库**:\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Flouisfb01\u002Fbest_AI_papers_2021.git\n    cd best_AI_papers_2021\n    ```\n\n2.  **获取具体论文代码**:\n    *   在本地打开 `README.md` 文件，浏览 \"The Full List\" 部分。\n    *   找到感兴趣的论文（例如 `DALL·E` 或 `Swin Transformer`）。\n    *   点击该条目下的 **Code** 链接，这将跳转到该论文官方或复现版的 GitHub 仓库。\n    *   在新的终端窗口中，克隆该具体项目的代码。\n\n    *示例：获取 \"Taming Transformers\" 的代码*\n    ```bash\n    # 从 README 中找到对应链接后执行\n    git clone https:\u002F\u002Fgithub.com\u002FCompVis\u002Ftaming-transformers.git\n    cd taming-transformers\n    ```\n\n3.  **安装具体项目依赖**:\n    进入具体项目目录后，通常需执行该项目特有的安装命令（请查阅该项目自身的 README）：\n    ```bash\n    # 常见安装命令示例\n    pip install -e .\n    # 或\n    pip install -r requirements.txt\n    ```\n\n## 基本使用\n\n使用流程分为两步：**查阅综述** 和 **运行代码**。\n\n### 1. 查阅论文综述\n在 `best_AI_papers_2021` 目录下打开 `README.md`，你可以看到每篇论文的简介、短视频讲解链接（YouTube）和深度解读文章。\n\n*   **操作**: 使用 Markdown 阅读器（如 VS Code, Typora）或直接在 GitHub 网页上浏览。\n*   **目的**: 快速了解论文核心思想，判断是否值得深入研读代码。\n\n### 2. 
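环境自检（可选，不在上述两步之内）\n在克隆并运行某篇论文的代码之前，可以先快速确认“环境准备”中安装的 PyTorch 是否可用。下面是一个最小的自检示意：\n\n```python\nimport torch\n\n# 确认 PyTorch 安装成功\nprint('PyTorch 版本:', torch.__version__)\n# 检查 CUDA 是否可用（返回 False 时多数演示仍可在 CPU 上运行，只是更慢）\nprint('CUDA 可用:', torch.cuda.is_available())\nif torch.cuda.is_available():\n    print('GPU 型号:', torch.cuda.get_device_name(0))\n```\n\n若确有 NVIDIA GPU 却显示 `CUDA 可用: False`，请核对显卡驱动与安装命令中的 CUDA 版本（如 cu118）是否匹配。\n\n### 3. 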
运行示例代码\n以清单中的 **[3] Taming Transformers** 为例，假设你已按“安装步骤”克隆了其代码仓库：\n\n*   **下载预训练模型** (参考该项目说明):\n    ```bash\n    bash scripts\u002Fdownload_first_stages.sh\n    bash scripts\u002Fdownload_models.sh\n    ```\n\n*   **运行图像生成示例**:\n    ```bash\n    python scripts\u002Fknn_generate_from_img.py --inpath data\u002Fexample_images\u002F --outdir outputs\n    ```\n\n*   **使用 Colab 快速体验**:\n    许多项目在原作者仓库中提供了 Google Colab 笔记本。你可以在对应项目的 GitHub 页面寻找 \"Open in Colab\" 按钮，无需本地配置 GPU 即可直接运行演示。\n\n> **注意**: 每个论文项目的具体运行命令差异较大，请务必以跳转后**具体代码仓库的 README 说明**为准。本仓库主要作为索引和导航使用。","某计算机视觉团队的算法工程师正急需为新一代图像生成项目寻找 2021 年的前沿技术基线，以突破现有模型的性能瓶颈。\n\n### 没有 best_AI_papers_2021 时\n- **信息检索低效**：需要在 arXiv、Twitter 和各类博客中手动筛选海量论文，难以快速锁定真正具有突破性的 2021 年度成果。\n- **理解门槛过高**：面对复杂的数学公式和专业术语，缺乏直观的视频讲解，导致非该细分领域的工程师难以快速掌握核心思想。\n- **复现成本巨大**：找到论文后往往找不到官方代码或高质量的非官方实现，需要从零开始编写代码，浪费数周时间验证可行性。\n- **忽视伦理风险**：容易忽略论文中关于偏见、治理和透明度等关键伦理讨论，给后续产品落地埋下合规隐患。\n\n### 使用 best_AI_papers_2021 后\n- **精准获取前沿**：直接按发布日期查阅经过策展的 2021 年突破性列表，几分钟内即可定位到如 DALL·E 等关键论文及其深度解读文章。\n- **视频辅助理解**：利用清单中配套的清晰视频解释，团队能迅速理解模型架构与创新点，将技术调研周期从数天缩短至几小时。\n- **代码即刻复用**：每个条目均附带可用代码链接，工程师可直接基于现有实现进行微调实验，大幅降低复现难度并加速原型开发。\n- **全面评估技术**：通过清单中对伦理、偏见等维度的标注，团队在选型阶段就能规避潜在风险，确保技术方案既先进又负责任。\n\nbest_AI_papers_2021 将原本分散且高门槛的科研信息转化为结构化的工程资产，帮助开发者在纷繁的技术浪潮中高效决策并快速落地。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flouisfb01_best_AI_papers_2021_9e0e9fed.png","louisfb01","Louis-François Bouchard","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Flouisfb01_d681bed6.png","Making AI accessible on YouTube, Newsletter, Spotify, Apple podcasts.\r\n\r\nCo-Founder at Towards AI.\r\nex-PhD student at Mila, Polytechnique Montréal","Mila\u002FPolytechnique Montréal & @towardsai","montreal",null,"Whats_AI","https:\u002F\u002Fwww.louisbouchard.ai\u002F","https:\u002F\u002Fgithub.com\u002Flouisfb01",2907,238,"2026-04-16T14:03:44","MIT","","未说明",{"notes":90,"python":88,"dependencies":91},"该仓库并非单一可运行的软件工具，而是 2021 年优秀 AI 论文的精选列表。每个论文条目通常链接到其独立的原始代码仓库、论文 PDF 和解释视频。因此，没有统一的运行环境要求。具体依赖需参考列表中各个项目（如 DALL·E, VOGUE, Taming Transformers 等）的独立文档。文中提到大部分代码基于 PyTorch，并提供了使用 Weights & Biases 追踪实验的指南。",[92,93],"PyTorch (提及基于 PyTorch)","Weights & Biases (可选，用于实验追踪)",[15,95,14,13],"其他",[97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116],"ai","artificial-intelligence","machine-learning","deep-learning","research","research-paper","paper","papers","computer-vision","computer-science","innovation","sota","state-of-the-art","state-of-art","2021","technology","sota-technique","python","artificialintelligence","machinelearning","2026-03-27T02:49:30.150509","2026-04-18T02:20:45.349460",[],[]]