[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-google-research--vision_transformer":3,"tool-google-research--vision_transformer":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",158594,2,"2026-04-16T23:34:05",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":65,"owner_location":65,"owner_email":65,"owner_twitter":65,"owner_website":76,"owner_url":77,"languages":78,"stars":87,"forks":88,"last_commit_at":89,"license":90,"difficulty_score":10,"env_os":91,"env_gpu":92,"env_ram":93,"env_deps":94,"category_tags":102,"github_topics":65,"view_count":32,"oss_zip_url":65,"oss_zip_packed_at":65,"status":17,"created_at":103,"updated_at":104,"faqs":105,"releases":136},8230,"google-research\u002Fvision_transformer","vision_transformer",null,"vision_transformer 是谷歌开源的一套基于 JAX\u002FFlax 框架的视觉模型代码库，核心提供了 Vision Transformer (ViT) 和 MLP-Mixer 等前沿架构的实现。它主要解决了传统卷积神经网络在图像识别任务中的局限性，展示了 Transformer 架构如何在不依赖强数据增强或预训练的情况下，依然能超越 ResNet 等经典模型，实现高精度的图像分类与零样本迁移学习。\n\n这套工具特别适合 AI 研究人员和深度学习开发者使用。如果你希望复现顶级论文成果、探索大规模视觉模型的训练技巧，或者需要在 ImageNet 等数据集上进行模型微调与推理，vision_transformer 提供了经过验证的预训练权重和完整的实验代码。其独特亮点在于不仅收录了多篇开创性论文的核心算法，还配套了详细的 Colab 交互式教程，支持直接在云端 GPU 或 TPU 上运行。此外，项目生成的数万个检查点甚至能被流行的 PyTorch 库直接加载，极大地降低了跨框架使用的门槛，是深入理解现代计算机视觉架构不可多得的实践资源。","# Vision Transformer and MLP-Mixer Architectures\n\nIn this repository we release models from the papers\n\n- [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929)\n- [MLP-Mixer: An all-MLP Architecture for Vision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01601)\n- [How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10270)\n- [When Vision Transformers Outperform ResNets without Pretraining or Strong Data Augmentations](https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01548)\n- [LiT: Zero-Shot Transfer with Locked-image text Tuning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07991)\n- [Surrogate Gap Minimization Improves Sharpness-Aware Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08065)\n\nThe models were pre-trained on the [ImageNet](http:\u002F\u002Fwww.image-net.org\u002F) and\n[ImageNet-21k](http:\u002F\u002Fwww.image-net.org\u002F) datasets. We provide the code for\nfine-tuning the released models in\n[JAX](https:\u002F\u002Fjax.readthedocs.io)\u002F[Flax](http:\u002F\u002Fflax.readthedocs.io).\n\nThe models from this codebase were originally trained in\nhttps:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002F\nwhere you can find more advanced code (e.g. multi-host training), as well as\nsome of the original training scripts (e.g.\n[configs\u002Fvit_i21k.py](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Fblob\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Fvit_i21k.py)\nfor pre-training a ViT, or\n[configs\u002Ftransfer.py](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Fblob\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Ftransfer.py)\nfor transfering a model).\n\nTable of contents:\n\n- [Vision Transformer and MLP-Mixer Architectures](#vision-transformer-and-mlp-mixer-architectures)\n\t- [Colab](#colab)\n\t- [Installation](#installation)\n\t- [Fine-tuning a model](#fine-tuning-a-model)\n\t- [Vision Transformer](#vision-transformer)\n\t\t- [Available ViT models](#available-vit-models)\n\t\t- [Expected ViT results](#expected-vit-results)\n\t- [MLP-Mixer](#mlp-mixer)\n\t\t- [Available Mixer models](#available-mixer-models)\n\t\t- [Expected Mixer results](#expected-mixer-results)\n\t- [LiT models](#lit-models)\n\t- [Running on cloud](#running-on-cloud)\n\t\t- [Create a VM](#create-a-vm)\n\t\t- [Setup VM](#setup-vm)\n\t- [Bibtex](#bibtex)\n\t- [Disclaimers](#disclaimers)\n\t- [Changelog](#changelog)\n\n\n## Colab\n\nBelow Colabs run both with GPUs, and TPUs (8 cores, data parallelism).\n\nThe first Colab demonstrates the JAX code of Vision Transformers and MLP Mixers.\nThis Colab allows you to edit the files from the repository directly in the\nColab UI and has annotated Colab cells that walk you through the code step by\nstep, and lets you interact with the data.\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax.ipynb\n\nThe second Colab allows you to explore the >50k Vision Transformer and hybrid\ncheckpoints that were used to generate the data of the third paper \"How to train\nyour ViT? ...\". The Colab includes code to explore and select checkpoints, and\nto do inference both using the JAX code from this repo, and also using the\npopular [`timm`] PyTorch library that can directly load these checkpoints as\nwell. Note that a handful of models are also available directly from TF-Hub:\n[sayakpaul\u002Fcollections\u002Fvision_transformer] (external contribution by [Sayak\nPaul]).\n\nThe second Colab also lets you fine-tune the checkpoints on any tfds dataset\nand your own dataset with examples in individual JPEG files (optionally directly\nreading from Google Drive).\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax_augreg.ipynb\n\n**Note**: As for now (6\u002F20\u002F21) Google Colab only supports a single GPU (Nvidia\nTesla T4), and TPUs (currently TPUv2-8) are attached indirectly to the Colab VM\nand communicate over slow network, which leads to pretty bad training speed. You\nwould usually want to set up a dedicated machine if you have a non-trivial\namount of data to fine-tune on. For details see the\n[Running on cloud](#running-on-cloud) section.\n\n\n[`timm`]: https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\n[sayakpaul\u002Fcollections\u002Fvision_transformer]: https:\u002F\u002Ftfhub.dev\u002Fsayakpaul\u002Fcollections\u002Fvision_transformer\n[Sayak Paul]: https:\u002F\u002Fgithub.com\u002Fsayakpaul\n\n\n\n## Installation\n\nMake sure you have `Python>=3.10` installed on your machine.\n\nInstall JAX and python dependencies by running:\n\n```\n# If using GPU:\npip install -r vit_jax\u002Frequirements.txt\n\n# If using TPU:\npip install -r vit_jax\u002Frequirements-tpu.txt\n```\n\nFor newer versions of [JAX](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fjax), follow the instructions\nprovided in the corresponding repository linked here. Note that installation\ninstructions for CPU, GPU and TPU differs slightly.\n\nInstall [Flaxformer](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fflaxformer), follow the instructions\nprovided in the corresponding repository linked here.\n\nFor more details refer to the section [Running on cloud](#running-on-cloud)\nbelow.\n\n\n## Fine-tuning a model\n\nYou can run fine-tuning of the downloaded model on your dataset of interest. All\nmodels share the same command line interface.\n\nFor example for fine-tuning a ViT-B\u002F16 (pre-trained on imagenet21k) on CIFAR10\n(note how we specify `b16,cifar10` as arguments to the config, and how we\ninstruct the code to access the models directly from a GCS bucket instead of\nfirst downloading them into the local directory):\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fvit.py:b16,cifar10 \\\n    --config.pretrained_dir='gs:\u002F\u002Fvit_models\u002Fimagenet21k'\n```\n\nIn order to fine-tune a Mixer-B\u002F16 (pre-trained on imagenet21k) on CIFAR10:\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fmixer_base16_cifar10.py \\\n    --config.pretrained_dir='gs:\u002F\u002Fmixer_models\u002Fimagenet21k'\n```\n\nThe \"How to train your ViT? ...\" paper added >50k checkpoints that you can\nfine-tune with the [`configs\u002Faugreg.py`] config. When you only specify the model\nname (the `config.name` value from [`configs\u002Fmodel.py`]), then the best i21k\ncheckpoint by upstream validation accuracy (\"recommended\" checkpoint, see\nsection 4.5 of the paper) is chosen. To make up your mind which model you want\nto use, have a look at Figure 3 in the paper. It's also possible to choose a\ndifferent checkpoint (see Colab [`vit_jax_augreg.ipynb`]) and then specify the\nvalue from the `filename` or `adapt_filename` column, which correspond to the\nfilenames without `.npz` from the [`gs:\u002F\u002Fvit_models\u002Faugreg`] directory.\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Faugreg.py:R_Ti_16 \\\n    --config.dataset=oxford_iiit_pet \\\n    --config.base_lr=0.01\n```\n\nCurrently, the code will automatically download CIFAR-10 and CIFAR-100 datasets.\nOther public or custom datasets can be easily integrated, using [tensorflow\ndatasets library](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fdatasets\u002F). Note that you will\nalso need to update `vit_jax\u002Finput_pipeline.py` to specify some parameters about\nany added dataset.\n\nNote that our code uses all available GPUs\u002FTPUs for fine-tuning.\n\nTo see a detailed list of all available flags, run `python3 -m vit_jax.train\n--help`.\n\nNotes on memory:\n\n- Different models require different amount of memory. Available memory also\n  depends on the accelerator configuration (both type and count). If you\n  encounter an out-of-memory error you can increase the value of\n  `--config.accum_steps=8` -- alternatively, you could also decrease the\n  `--config.batch=512` (and decrease `--config.base_lr` accordingly).\n- The host keeps a shuffle buffer in memory. If you encounter a host OOM (as\n  opposed to an accelerator OOM), you can decrease the default\n  `--config.shuffle_buffer=50000`.\n\n\n## Vision Transformer\n\nby Alexey Dosovitskiy\\*†, Lucas Beyer\\*, Alexander Kolesnikov\\*, Dirk\nWeissenborn\\*, Xiaohua Zhai\\*, Thomas Unterthiner, Mostafa Dehghani, Matthias\nMinderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit and Neil Houlsby\\*†.\n\n(\\*) equal technical contribution, (†) equal advising.\n\n![Figure 1 from paper](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle-research_vision_transformer_readme_02813f99e72c.png)\n\nOverview of the model: we split an image into fixed-size patches, linearly embed\neach of them, add position embeddings, and feed the resulting sequence of\nvectors to a standard Transformer encoder. In order to perform classification,\nwe use the standard approach of adding an extra learnable \"classification token\"\nto the sequence.\n\n### Available ViT models\n\nWe provide a variety of ViT models in different GCS buckets. The models can be\ndownloaded with e.g.:\n\n```\nwget https:\u002F\u002Fstorage.googleapis.com\u002Fvit_models\u002Fimagenet21k\u002FViT-B_16.npz\n```\n\nThe model filenames (without the `.npz` extension) correspond to the\n`config.model_name` in [`vit_jax\u002Fconfigs\u002Fmodels.py`]\n\n- [`gs:\u002F\u002Fvit_models\u002Fimagenet21k`] - Models pre-trained on ImageNet-21k.\n- [`gs:\u002F\u002Fvit_models\u002Fimagenet21k+imagenet2012`] - Models pre-trained on\n  ImageNet-21k and fine-tuned on ImageNet.\n- [`gs:\u002F\u002Fvit_models\u002Faugreg`] - Models pre-trained on ImageNet-21k,\n  applying varying amounts of [AugReg]. Improved performance.\n- [`gs:\u002F\u002Fvit_models\u002Fsam`] - Models pre-trained on ImageNet with [SAM].\n- [`gs:\u002F\u002Fvit_models\u002Fgsam`] - Models pre-trained on ImageNet with [GSAM].\n\nWe recommend using the following checkpoints, trained with [AugReg] that have\nthe best pre-training metrics:\n\n|  Model   |                                   Pre-trained checkpoint                                   |   Size   |                                                       Fine-tuned checkpoint                                                        | Resolution | Img\u002Fsec | Imagenet accuracy |\n| :------- | :----------------------------------------------------------------------------------------- | -------: | :--------------------------------------------------------------------------------------------------------------------------------- | ---------: | ------: | ----------------: |\n| L\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FL_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0.npz`     | 1243 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FL_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`     |        384 |      50 |            85.59% |\n| B\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz`     |  391 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`     |        384 |     138 |            85.49% |\n| S\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz`     |  115 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`     |        384 |     300 |            83.73% |\n| R50+L\u002F32 | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz` | 1337 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz` |        384 |     327 |            85.99% |\n| R26+S\u002F32 | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR26_S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0.npz`  |  170 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR26_S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`  |        384 |     560 |            83.85% |\n| Ti\u002F16    | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FTi_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz`      |   37 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FTi_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`      |        384 |     610 |            78.22% |\n| B\u002F32     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0.npz`      |  398 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`      |        384 |     955 |            83.59% |\n| S\u002F32     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_32-i21k-300ep-lr_0.001-aug_none-wd_0.1-do_0.0-sd_0.0.npz`        |  118 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_32-i21k-300ep-lr_0.001-aug_none-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`        |        384 |    2154 |            79.58% |\n| R+Ti\u002F16  | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz`    |   40 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`    |        384 |    2426 |            75.40% |\n\nThe results from the original ViT paper (https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929) have\nbeen replicated using the models from [`gs:\u002F\u002Fvit_models\u002Fimagenet21k`]:\n\n| model        | dataset      | dropout=0.0                                                                                                                                                         | dropout=0.1                                                                                                                                                          |\n|:-------------|:-------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| R50+ViT-B_16 | cifar10      | 98.72%, 3.9h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)      | 98.94%, 10.1h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)      |\n| R50+ViT-B_16 | cifar100     | 90.88%, 4.1h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)     | 92.30%, 10.1h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)     |\n| R50+ViT-B_16 | imagenet2012 | 83.72%, 9.9h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0) | 85.08%, 24.2h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0) |\n| ViT-B_16     | cifar10      | 99.02%, 2.2h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 98.76%, 7.8h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-B_16     | cifar100     | 92.06%, 2.2h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 91.92%, 7.8h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-B_16     | imagenet2012 | 84.53%, 6.5h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 84.12%, 19.3h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n| ViT-B_32     | cifar10      | 98.88%, 0.8h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 98.75%, 1.8h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-B_32     | cifar100     | 92.31%, 0.8h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 92.05%, 1.8h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-B_32     | imagenet2012 | 81.66%, 3.3h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 81.31%, 4.9h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)      |\n| ViT-L_16     | cifar10      | 99.13%, 6.9h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 99.14%, 24.7h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-L_16     | cifar100     | 92.91%, 7.1h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 93.22%, 24.4h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)         |\n| ViT-L_16     | imagenet2012 | 84.47%, 16.8h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)    | 85.05%, 59.7h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n| ViT-L_32     | cifar10      | 99.06%, 1.9h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 99.09%, 6.1h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-L_32     | cifar100     | 93.29%, 1.9h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 93.34%, 6.2h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-L_32     | imagenet2012 | 81.89%, 7.5h (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 81.13%, 15.0h (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n\nWe also would like to emphasize that high-quality results can be achieved with\nshorter training schedules and encourage users of our code to play with\nhyper-parameters to trade-off accuracy and computational budget.\nSome examples for CIFAR-10\u002F100 datasets are presented in the table below.\n\n| upstream    | model    | dataset      | total_steps \u002F warmup_steps  | accuracy | wall-clock time |                                                                         link |\n| ----------- | -------- | ------------ | --------------------------- | -------- | --------------- | ---------------------------------------------------------------------------- |\n| imagenet21k | ViT-B_16 | cifar10      | 500 \u002F 50                    |   98.59% |             17m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQgkpiW53RPmjkabe1ME31g\u002F) |\n| imagenet21k | ViT-B_16 | cifar10      | 1000 \u002F 100                  |   98.86% |             39m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002Fw8DQkDeJTOqJW5js80gOQg\u002F) |\n| imagenet21k | ViT-B_16 | cifar100     | 500 \u002F 50                    |   89.17% |             17m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002F5hM4GrnAR0KEZg725Ewnqg\u002F) |\n| imagenet21k | ViT-B_16 | cifar100     | 1000 \u002F 100                  |   91.15% |             39m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQLQTaaIoT9uEcAjtA0eRwg\u002F) |\n\n\n## MLP-Mixer\n\nby Ilya Tolstikhin\\*, Neil Houlsby\\*, Alexander Kolesnikov\\*, Lucas Beyer\\*,\nXiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner, Daniel Keysers,\nJakob Uszkoreit, Mario Lucic, Alexey Dosovitskiy.\n\n(\\*) equal contribution.\n\n![Figure 1 from paper](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle-research_vision_transformer_readme_f6f0d60e263b.png)\n\nMLP-Mixer (*Mixer* for short) consists of per-patch linear embeddings, Mixer\nlayers, and a classifier head. Mixer layers contain one token-mixing MLP and one\nchannel-mixing MLP, each consisting of two fully-connected layers and a GELU\nnonlinearity. Other components include: skip-connections, dropout, and linear\nclassifier head.\n\nFor installation follow [the same steps](#installation) as above.\n\n### Available Mixer models\n\nWe provide the Mixer-B\u002F16 and Mixer-L\u002F16 models pre-trained on the ImageNet and\nImageNet-21k datasets. Details can be found in Table 3 of the Mixer paper. All\nthe models can be found at:\n\nhttps:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002F\n\nNote that these models are also available directly from TF-Hub:\n[sayakpaul\u002Fcollections\u002Fmlp-mixer] (external contribution by [Sayak\nPaul]).\n\n[sayakpaul\u002Fcollections\u002Fmlp-mixer]: https:\u002F\u002Ftfhub.dev\u002Fsayakpaul\u002Fcollections\u002Fmlp-mixer\n\n### Expected Mixer results\n\nWe ran the fine-tuning code on Google Cloud machine with four V100 GPUs with the\ndefault adaption parameters from this repository. Here are the results:\n\nupstream     | model      | dataset | accuracy | wall_clock_time | link\n:----------- | :--------- | :------ | -------: | :-------------- | :---\nImageNet     | Mixer-B\u002F16 | cifar10 | 96.72%   | 3.0h            | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002Fj9zCYt9yQVm93nqnsDZayA\u002F)\nImageNet     | Mixer-L\u002F16 | cifar10 | 96.59%   | 3.0h            | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQ4feeErzRGGop5XzAvYj2g\u002F)\nImageNet-21k | Mixer-B\u002F16 | cifar10 | 96.82%   | 9.6h            | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FmvP4McV2SEGFeIww20ie5Q\u002F)\nImageNet-21k | Mixer-L\u002F16 | cifar10 | 98.34%   | 10.0h           | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FdolAJyQYTYmudytjalF6Jg\u002F)\n\n\n## LiT models\n\nFor details, refer to the Google AI blog post\n[LiT: adding language understanding to image models](http:\u002F\u002Fai.googleblog.com\u002F2022\u002F04\u002Flocked-image-tuning-adding-language.html),\nor read the CVPR paper \"LiT: Zero-Shot Transfer with Locked-image text Tuning\"\n(https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07991).\n\nWe published a Transformer B\u002F16-base model with an ImageNet zeroshot accuracy of\n72.1%, and a L\u002F16-large model with an ImageNet zeroshot accuracy of 75.7%. For\nmore details about these models, please refer to the\n[LiT model card](model_cards\u002Flit.md).\n\nWe provide a in-browser demo with small text encoders for interactive use (the\nsmallest models should even run on a modern cell phone):\n\nhttps:\u002F\u002Fgoogle-research.github.io\u002Fvision_transformer\u002Flit\u002F\n\nAnd finally a Colab to use the JAX models with both image and text encoders:\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Flit.ipynb\n\nNote that none of above models support multi-lingual inputs yet, but we're\nworking on publishing such models and will update this repository once they\nbecome available.\n\nThis repository only contains evaluation code for LiT models. You can find the\ntraining code in the `big_vision` repository:\n\nhttps:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Ftree\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Fproj\u002Fimage_text\n\nExpected zeroshot results from [`model_cards\u002Flit.md`] (note that the zeroshot\nevaluation is slightly different from the simplified evaluation in the Colab):\n\n| Model | B16B_2 | L16L |\n| :--- | ---: | ---: |\n| ImageNet zero-shot | 73.9% | 75.7% |\n| ImageNet v2 zero-shot | 65.1% | 66.6% |\n| CIFAR100 zero-shot | 79.0% | 80.5% |\n| Pets37 zero-shot | 83.3% | 83.3% |\n| Resisc45 zero-shot | 25.3% | 25.6% |\n| MS-COCO Captions image-to-text retrieval | 51.6% | 48.5% |\n| MS-COCO Captions text-to-image retrieval | 31.8% | 31.1% |\n\n## Running on cloud\n\nWhile above [colabs](#colab) are pretty useful to get started, you would usually\nwant to train on a larger machine with more powerful accelerators.\n\n### Create a VM\n\nYou can use the following commands to setup a VM with GPUs on Google Cloud:\n\n```bash\n# Set variables used by all commands below.\n# Note that project must have accounting set up.\n# For a list of zones with GPUs refer to\n# https:\u002F\u002Fcloud.google.com\u002Fcompute\u002Fdocs\u002Fgpus\u002Fgpu-regions-zones\nPROJECT=my-awesome-gcp-project  # Project must have billing enabled.\nVM_NAME=vit-jax-vm-gpu\nZONE=europe-west4-b\n\n# Below settings have been tested with this repository. You can choose other\n# combinations of images & machines (e.g.), refer to the corresponding gcloud commands:\n# gcloud compute images list --project ml-images\n# gcloud compute machine-types list\n# etc.\ngcloud compute instances create $VM_NAME \\\n    --project=$PROJECT --zone=$ZONE \\\n    --image=c1-deeplearning-tf-2-5-cu110-v20210527-debian-10 \\\n    --image-project=ml-images --machine-type=n1-standard-96 \\\n    --scopes=cloud-platform,storage-full --boot-disk-size=256GB \\\n    --boot-disk-type=pd-ssd --metadata=install-nvidia-driver=True \\\n    --maintenance-policy=TERMINATE \\\n    --accelerator=type=nvidia-tesla-v100,count=8\n\n# Connect to VM (after some minutes needed to setup & start the machine).\ngcloud compute ssh --project $PROJECT --zone $ZONE $VM_NAME\n\n# Stop the VM after use (only storage is billed for a stopped VM).\ngcloud compute instances stop --project $PROJECT --zone $ZONE $VM_NAME\n\n# Delete VM after use (this will also remove all data stored on VM).\ngcloud compute instances delete --project $PROJECT --zone $ZONE $VM_NAME\n```\n\nAlternatively, you can use the following similar commands to set up a Cloud VM\nwith TPUs attached to them (below commands copied from the [TPU tutorial]):\n\n[TPU tutorial]: https:\u002F\u002Fcloud.google.com\u002Ftpu\u002Fdocs\u002Fjax-quickstart-tpu-vm\n\n```bash\nPROJECT=my-awesome-gcp-project  # Project must have billing enabled.\nVM_NAME=vit-jax-vm-tpu\nZONE=europe-west4-a\n\n# Required to set up service identity initially.\ngcloud beta services identity create --service tpu.googleapis.com\n\n# Create a VM with TPUs directly attached to it.\ngcloud alpha compute tpus tpu-vm create $VM_NAME \\\n    --project=$PROJECT --zone=$ZONE \\\n    --accelerator-type v3-8 \\\n    --version tpu-vm-base\n\n# Connect to VM (after some minutes needed to setup & start the machine).\ngcloud alpha compute tpus tpu-vm ssh --project $PROJECT --zone $ZONE $VM_NAME\n\n# Stop the VM after use (only storage is billed for a stopped VM).\ngcloud alpha compute tpus tpu-vm stop --project $PROJECT --zone $ZONE $VM_NAME\n\n# Delete VM after use (this will also remove all data stored on VM).\ngcloud alpha compute tpus tpu-vm delete --project $PROJECT --zone $ZONE $VM_NAME\n```\n\n### Setup VM\n\nAnd then fetch the repository and the install dependencies (including `jaxlib`\nwith TPU support) as usual:\n\n```bash\ngit clone --depth=1 --branch=master https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\ncd vision_transformer\n\n# optional: install virtualenv\npip3 install virtualenv\npython3 -m virtualenv env\n. env\u002Fbin\u002Factivate\n```\n\nIf you're connected to a VM with GPUs attached, install JAX and other dependencies with the following\ncommand:\n\n```bash\npip install -r vit_jax\u002Frequirements.txt\n```\n\nIf you're connected to a VM with TPUs attached, install JAX and other dependencies with the following\ncommand:\n\n```bash\npip install -r vit_jax\u002Frequirements-tpu.txt\n```\n\nInstall [Flaxformer](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fflaxformer), follow the instructions\nprovided in the corresponding repository linked here.\n\nFor both GPUs and TPUs, Check that JAX can connect to attached accelerators with the command:\n```bash\npython -c 'import jax; print(jax.devices())'\n```\n\nAnd finally execute one of the commands mentioned in the section\n[fine-tuning a model](#fine-tuning-a-model).\n\n\n## Bibtex\n\n```\n@article{dosovitskiy2020vit,\n  title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},\n  author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and  Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},\n  journal={ICLR},\n  year={2021}\n}\n\n@article{tolstikhin2021mixer,\n  title={MLP-Mixer: An all-MLP Architecture for Vision},\n  author={Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Steiner, Andreas and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey},\n  journal={arXiv preprint arXiv:2105.01601},\n  year={2021}\n}\n\n@article{steiner2021augreg,\n  title={How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers},\n  author={Steiner, Andreas and Kolesnikov, Alexander and and Zhai, Xiaohua and Wightman, Ross and Uszkoreit, Jakob and Beyer, Lucas},\n  journal={arXiv preprint arXiv:2106.10270},\n  year={2021}\n}\n\n@article{chen2021outperform,\n  title={When Vision Transformers Outperform ResNets without Pretraining or Strong Data Augmentations},\n  author={Chen, Xiangning and Hsieh, Cho-Jui and Gong, Boqing},\n  journal={arXiv preprint arXiv:2106.01548},\n  year={2021},\n}\n\n@article{zhuang2022gsam,\n  title={Surrogate Gap Minimization Improves Sharpness-Aware Training},\n  author={Zhuang, Juntang and Gong, Boqing and Yuan, Liangzhe and Cui, Yin and Adam, Hartwig and Dvornek, Nicha and Tatikonda, Sekhar and Duncan, James and Liu, Ting},\n  journal={ICLR},\n  year={2022},\n}\n\n@article{zhai2022lit,\n  title={LiT: Zero-Shot Transfer with Locked-image Text Tuning},\n  author={Zhai, Xiaohua and Wang, Xiao and Mustafa, Basil and Steiner, Andreas and Keysers, Daniel and Kolesnikov, Alexander and Beyer, Lucas},\n  journal={CVPR},\n  year={2022}\n}\n```\n\n\n## Changelog\n\nIn reverse chronological order:\n\n- 2022-08-18: Added LiT-B16B_2 model that was trained for 60k steps\n  (LiT_B16B: 30k) without linear head on the image side (LiT_B16B: 768) and has\n  better performance.\n\n- 2022-06-09: Added the ViT and Mixer models trained from scratch using\n  [GSAM] on ImageNet without strong data augmentations. The resultant ViTs\n  outperform those of similar sizes trained using AdamW optimizer or the\n  original [SAM] algorithm, or with strong data augmentations.\n\n- 2022-04-14: Added models and Colab for [LiT models](#lit-models).\n\n- 2021-07-29: Added ViT-B\u002F8 AugReg models (3 upstream checkpoints and adaptations\n  with resolution=224).\n\n- 2021-07-02: Added the \"When Vision Transformers Outperform\n  ResNets...\" paper\n\n- 2021-07-02: Added [SAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.01412)\n  (Sharpness-Aware Minimization) optimized ViT and MLP-Mixer checkpoints.\n\n- 2021-06-20: Added the \"How to train your ViT? ...\" paper, and a new\n  Colab to explore the >50k pre-trained and fine-tuned checkpoints mentioned in\n  the paper.\n\n- 2021-06-18: This repository was rewritten to use Flax Linen API and\n  `ml_collections.ConfigDict` for configuration.\n\n- 2021-05-19: With publication of the \"How to train your ViT? ...\"\n  paper, we added more than 50k ViT and hybrid models pre-trained on ImageNet and\n  ImageNet-21k with various degrees of data augmentation and model regularization,\n  and fine-tuned on ImageNet, Pets37, Kitti-distance, CIFAR-100, and Resisc45.\n  Check out [`vit_jax_augreg.ipynb`] to navigate this treasure trove of models!\n  For example, you can use that Colab to fetch the filenames of recommended\n  pre-trained and fine-tuned checkpoints from the `i21k_300` column of Table 3 in\n  the paper.\n\n- 2020-12-01: Added the R50+ViT-B\u002F16 hybrid model (ViT-B\u002F16 on\n  top of a Resnet-50 backbone). When pretrained on imagenet21k, this model\n  achieves almost the performance of the L\u002F16 model with less than half the\n  computational finetuning cost. Note that \"R50\" is somewhat modified for the\n  B\u002F16 variant: The original ResNet-50 has [3,4,6,3] blocks, each reducing the\n  resolution of the image by a factor of two. In combination with the ResNet\n  stem this would result in a reduction of 32x so even with a patch size of\n  (1,1) the ViT-B\u002F16 variant cannot be realized anymore. For this reason we\n  instead use [3,4,9] blocks for the R50+B\u002F16 variant.\n\n- 2020-11-09: Added the ViT-L\u002F16 model.\n\n- 2020-10-29: Added ViT-B\u002F16 and ViT-L\u002F16 models pretrained\n  on ImageNet-21k and then fine-tuned on ImageNet at 224x224 resolution (instead\n  of default 384x384). These models have the suffix \"-224\" in their name.\n  They are expected to achieve 81.2% and 82.7% top-1 accuracies respectively.\n\n\n## Disclaimers\n\nOpen source release prepared by Andreas Steiner.\n\nNote: This repository was forked and modified from\n[google-research\u002Fbig_transfer](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_transfer).\n\n**This is not an official Google product.**\n\n\n[GSAM]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08065\n[SAM]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.01412\n[AugReg]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10270\n\n[`vit_jax\u002Fconfigs\u002Fmodels.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Fmodels.py\n[`model_cards\u002Flit.md`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fmodel_cards\u002Flit.md\n\n[`configs\u002Faugreg.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Faugreg.py\n[`configs\u002Fmodel.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Fmodels.py\n[`vit_jax_augreg.ipynb`]: https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax_augreg.ipynb\n[`vit_jax.ipynb`]: https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax.ipynb\n\n[`gs:\u002F\u002Fvit_models\u002Fimagenet21k`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fimagenet21k\u002F\n[`gs:\u002F\u002Fvit_models\u002Fimagenet21k+imagenet2012`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fimagenet21k+imagenet2012\u002F\n[`gs:\u002F\u002Fvit_models\u002Faugreg`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Faugreg\u002F\n[`gs:\u002F\u002Fvit_models\u002Fsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fsam\u002F\n[`gs:\u002F\u002Fmixer_models\u002Fsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002Fsam\u002F\n[`gs:\u002F\u002Fvit_models\u002Fgsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fgsam\u002F\n[`gs:\u002F\u002Fmixer_models\u002Fgsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002Fgsam\u002F\n","# 视觉Transformer与MLP-Mixer架构\n\n在本仓库中，我们发布了来自以下论文的模型：\n\n- 《一张图像胜过16×16个词：大规模图像识别中的Transformer》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929）\n- 《MLP-Mixer：一种全MLP的视觉架构》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2105.01601）\n- 《如何训练你的ViT？视觉Transformer中的数据、增强与正则化》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10270）\n- 《当视觉Transformer无需预训练或强数据增强即可超越ResNet时》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.01548）\n- 《LiT：基于锁定图像文本微调的零样本迁移》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07991）\n- 《代理差距最小化改进了尖锐度感知训练》（https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08065）\n\n这些模型已在[ImageNet](http:\u002F\u002Fwww.image-net.org\u002F)和[ImageNet-21k](http:\u002F\u002Fwww.image-net.org\u002F)数据集上进行了预训练。我们提供了使用[JAX](https:\u002F\u002Fjax.readthedocs.io)\u002F[Flax](http:\u002F\u002Fflax.readthedocs.io)对已发布模型进行微调的代码。\n\n本代码库中的模型最初是在https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002F中训练的，在那里您可以找到更高级的代码（例如多主机训练），以及一些原始的训练脚本（例如用于预训练ViT的[configs\u002Fvit_i21k.py](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Fblob\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Fvit_i21k.py)，或用于模型迁移的[configs\u002Ftransfer.py](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Fblob\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Ftransfer.py)）。\n\n目录：\n\n- [视觉Transformer与MLP-Mixer架构](#vision-transformer-and-mlp-mixer-architectures)\n\t- [Colab](#colab)\n\t- [安装](#installation)\n\t- [微调模型](#fine-tuning-a-model)\n\t- [视觉Transformer](#vision-transformer)\n\t\t- [可用的ViT模型](#available-vit-models)\n\t\t- [预期的ViT结果](#expected-vit-results)\n\t- [MLP-Mixer](#mlp-mixer)\n\t\t- [可用的Mixer模型](#available-mixer-models)\n\t\t- [预期的Mixer结果](#expected-mixer-results)\n\t- [LiT模型](#lit-models)\n\t- [云端运行](#running-on-cloud)\n\t\t- [创建虚拟机](#create-a-vm)\n\t\t- [设置虚拟机](#setup-vm)\n\t- [BibTeX](#bibtex)\n\t- [免责声明](#disclaimers)\n\t- [更新日志](#changelog)\n\n\n## Colab\n\n以下Colab既可以使用GPU，也可以使用TPU（8核，数据并行）运行。\n\n第一个Colab展示了视觉Transformer和MLP Mixer的JAX代码。该Colab允许您直接在Colab界面中编辑仓库中的文件，并配有注释过的单元格，逐步引导您理解代码，同时让您与数据进行交互。\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax.ipynb\n\n第二个Colab可以让您探索超过5万个用于生成第三篇论文“如何训练你的ViT？…”数据的视觉Transformer及混合模型检查点。该Colab包含用于探索和选择检查点的代码，以及使用本仓库的JAX代码和流行的[`timm`] PyTorch库进行推理的功能——后者可以直接加载这些检查点。需要注意的是，少数模型也可直接从TF-Hub获取：[sayakpaul\u002Fcollections\u002Fvision_transformer]（由[Sayak Paul]贡献）。\n\n第二个Colab还允许您在任何tfds数据集以及您自己的数据集上对检查点进行微调，这些数据集可以是单独的JPEG文件形式（可选地直接从Google Drive读取）。\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax_augreg.ipynb\n\n**注意**：截至2021年6月20日，Google Colab仅支持单个GPU（Nvidia Tesla T4），而TPU（目前为TPUv2-8）则是间接连接到Colab虚拟机并通过较慢的网络通信，这导致训练速度非常慢。如果您有大量数据需要微调，通常建议搭建一台专用机器。有关详细信息，请参阅[云端运行](#running-on-cloud)部分。\n\n\n[`timm`]: https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\n[sayakpaul\u002Fcollections\u002Fvision_transformer]: https:\u002F\u002Ftfhub.dev\u002Fsayakpaul\u002Fcollections\u002Fvision_transformer\n[Sayak Paul]: https:\u002F\u002Fgithub.com\u002Fsayakpaul\n\n\n\n## 安装\n\n请确保您的机器上已安装`Python>=3.10`。\n\n通过运行以下命令安装JAX和Python依赖项：\n\n```\n# 如果使用GPU：\npip install -r vit_jax\u002Frequirements.txt\n\n# 如果使用TPU：\npip install -r vit_jax\u002Frequirements-tpu.txt\n```\n\n对于较新版本的[JAX](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fjax)，请遵循此处链接的相应仓库提供的说明。请注意，CPU、GPU和TPU的安装说明略有不同。\n\n安装[Flaxformer](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fflaxformer)，并按照此处链接的相应仓库提供的说明进行操作。\n\n更多详情请参阅下方的[云端运行](#running-on-cloud)部分。\n\n## 微调模型\n\n您可以在感兴趣的自己的数据集上对下载的模型进行微调。所有模型都使用相同的命令行界面。\n\n例如，要在 CIFAR10 数据集上微调一个在 ImageNet21k 上预训练的 ViT-B\u002F16 模型（注意我们如何将 `b16,cifar10` 作为配置参数传入，以及如何指示代码直接从 GCS 存储桶中加载模型，而不是先将其下载到本地目录）：\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fvit.py:b16,cifar10 \\\n    --config.pretrained_dir='gs:\u002F\u002Fvit_models\u002Fimagenet21k'\n```\n\n如果要对在 ImageNet21k 上预训练的 Mixer-B\u002F16 模型进行 CIFAR10 数据集上的微调：\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fmixer_base16_cifar10.py \\\n    --config.pretrained_dir='gs:\u002F\u002Fmixer_models\u002Fimagenet21k'\n```\n\n论文《如何训练你的 ViT？》添加了超过 5 万个检查点，您可以使用 [`configs\u002Faugreg.py`] 配置对其进行微调。当您仅指定模型名称（即 [`configs\u002Fmodel.py`] 中的 `config.name` 值）时，系统会自动选择上游验证准确率最高的 i21k 检查点（“推荐”检查点，详见论文第 4.5 节）。要决定使用哪个模型，可以参考论文中的图 3。此外，您也可以选择其他检查点（参见 Colab 笔记本 [`vit_jax_augreg.ipynb`]），并指定来自 `filename` 或 `adapt_filename` 列的值，这些值对应于 [`gs:\u002F\u002Fvit_models\u002Faugreg`] 目录中不带 `.npz` 后缀的文件名。\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Faugreg.py:R_Ti_16 \\\n    --config.dataset=oxford_iiit_pet \\\n    --config.base_lr=0.01\n```\n\n目前，代码会自动下载 CIFAR-10 和 CIFAR-100 数据集。其他公开或自定义数据集也可以通过 [TensorFlow Datasets 库](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Fdatasets\u002F) 轻松集成。请注意，您还需要更新 `vit_jax\u002Finput_pipeline.py` 文件，以指定新增数据集的相关参数。\n\n需要注意的是，我们的代码会利用所有可用的 GPU\u002FTPU 进行微调。\n\n如需查看所有可用标志的详细列表，请运行 `python3 -m vit_jax.train --help`。\n\n内存注意事项：\n\n- 不同模型所需的内存不同。可用内存还取决于加速器的配置（类型和数量）。如果遇到内存不足错误，您可以将 `--config.accum_steps=8` 的值调高；或者相应地降低 `--config.batch=512`（并按比例调整 `--config.base_lr`）。\n- 主机端会在内存中维护一个打乱缓冲区。如果您遇到主机端 OOM 错误（而非加速器端 OOM），可以将默认的 `--config.shuffle_buffer=50000` 值调低。\n\n## 视觉 Transformer\n\n作者：Alexey Dosovitskiy\\*†、Lucas Beyer\\*、Alexander Kolesnikov\\*、Dirk Weissenborn\\*、Xiaohua Zhai\\*、Thomas Unterthiner、Mostafa Dehghani、Matthias Minderer、Georg Heigold、Sylvain Gelly、Jakob Uszkoreit 和 Neil Houlsby\\*†。\n\n(\\*) 共同技术贡献，(†) 共同指导。\n\n![论文中的图 1](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle-research_vision_transformer_readme_02813f99e72c.png)\n\n模型概述：我们将图像分割成固定大小的补丁，对每个补丁进行线性嵌入，加入位置嵌入，然后将得到的向量序列输入标准的 Transformer 编码器。为了进行分类，我们采用在序列中添加一个额外的可学习“分类标记”的标准方法。\n\n### 可用的 ViT 模型\n\n我们在不同的 GCS 存储桶中提供了多种 ViT 模型。例如，可以通过以下命令下载模型：\n\n```\nwget https:\u002F\u002Fstorage.googleapis.com\u002Fvit_models\u002Fimagenet21k\u002FViT-B_16.npz\n```\n\n模型文件名（不含 `.npz` 后缀）与 [`vit_jax\u002Fconfigs\u002Fmodels.py`] 中的 `config.model_name` 对应。\n\n- [`gs:\u002F\u002Fvit_models\u002Fimagenet21k`]：在 ImageNet-21k 上预训练的模型。\n- [`gs:\u002F\u002Fvit_models\u002Fimagenet21k+imagenet2012`]：在 ImageNet-21k 上预训练并在 ImageNet 上微调的模型。\n- [`gs:\u002F\u002Fvit_models\u002Faugreg`]：在 ImageNet-21k 上预训练并应用不同程度 [AugReg] 技术的模型，性能有所提升。\n- [`gs:\u002F\u002Fvit_models\u002Fsam`]：使用 [SAM] 技术在 ImageNet 上预训练的模型。\n- [`gs:\u002F\u002Fvit_models\u002Fgsam`]：使用 [GSAM] 技术在 ImageNet 上预训练的模型。\n\n我们推荐使用以下经过 [AugReg] 训练且预训练指标最优的检查点：\n\n|  模型   |                                   预训练检查点                                   |   大小   |                                                       微调后检查点                                                        | 分辨率 | 图像\u002F秒 | ImageNet 准确率 |\n| :------- | :----------------------------------------------------------------------------------------- | -------: | :--------------------------------------------------------------------------------------------------------------------------------- | ---------: | ------: | ----------------: |\n| L\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FL_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0.npz`     | 1243 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FL_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`     |        384 |      50 |            85.59% |\n| B\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz`     |  391 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`     |        384 |     138 |            85.49% |\n| S\u002F16     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz`     |  115 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`     |        384 |     300 |            83.73% |\n| R50+L\u002F32 | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz` | 1337 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz` |        384 |     327 |            85.99% |\n| R26+S\u002F32 | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR26_S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0.npz`  |  170 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR26_S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`  |        384 |     560 |            83.85% |\n| Ti\u002F16    | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FTi_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz`      |   37 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FTi_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`      |        384 |     610 |            78.22% |\n| B\u002F32     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0.npz`      |  398 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FB_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`      |        384 |     955 |            83.59% |\n| S\u002F32     | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_32-i21k-300ep-lr_0.001-aug_none-wd_0.1-do_0.0-sd_0.0.npz`        |  118 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FS_32-i21k-300ep-lr_0.001-aug_none-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz`        |        384 |    2154 |            79.58% |\n| R+Ti\u002F16  | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz`    |   40 MiB | `gs:\u002F\u002Fvit_models\u002Faugreg\u002FR_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz`    |        384 |    2426 |            75.40% |\n\n使用 [`gs:\u002F\u002Fvit_models\u002Fimagenet21k`] 中的模型，已复现了原始 ViT 论文（https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929）中的结果：\n\n| 模型         | 数据集       | dropout=0.0                                                                                                                                                         | dropout=0.1                                                                                                                                                          |\n|:-------------|:-------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| R50+ViT-B_16 | cifar10      | 98.72%, 3.9小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)      | 98.94%, 10.1小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)      |\n| R50+ViT-B_16 | cifar100     | 90.88%, 4.1小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)     | 92.30%, 10.1小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)     |\n| R50+ViT-B_16 | imagenet2012 | 83.72%, 9.9小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0) | 85.08%, 24.2小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5ER50.ViT-B_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0) |\n| ViT-B_16     | cifar10      | 99.02%, 2.2小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 98.76%, 7.8小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-B_16     | cifar100     | 92.06%, 2.2小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 91.92%, 7.8小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-B_16     | imagenet2012 | 84.53%, 6.5小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 84.12%, 19.3小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n| ViT-B_32     | cifar10      | 98.88%, 0.8小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 98.75%, 1.8小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-B_32     | cifar100     | 92.31%, 0.8小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 92.05%, 1.8小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-B_32     | imagenet2012 | 81.66%, 3.3小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 81.31%, 4.9小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-B_32\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)      |\n| ViT-L_16     | cifar10      | 99.13%, 6.9小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 99.14%, 24.7小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-L_16     | cifar100     | 92.91%, 7.1小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 93.22%, 24.4小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)         |\n| ViT-L_16     | imagenet2012 | 84.47%, 16.8小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)    | 85.05%, 59.7小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_16\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n| ViT-L_32     | cifar10      | 99.06%, 1.9小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar10\u002Fdo_0.0&_smoothingWeight=0)          | 99.09%, 6.1小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar10\u002Fdo_0.1&_smoothingWeight=0)           |\n| ViT-L_32     | cifar100     | 93.29%, 1.9小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar100\u002Fdo_0.0&_smoothingWeight=0)         | 93.34%, 6.2小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fcifar100\u002Fdo_0.1&_smoothingWeight=0)          |\n| ViT-L_32     | imagenet2012 | 81.89%, 7.5小时 (A100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fimagenet2012\u002Fdo_0.0&_smoothingWeight=0)     | 81.13%, 15.0小时 (V100), [tb.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FnwXQNjudRJW3dtQzhPZwwA\u002F#scalars&regexInput=%5EViT-L_32\u002Fimagenet2012\u002Fdo_0.1&_smoothingWeight=0)     |\n\n我们还想强调，通过较短的训练周期也能获得高质量的结果，并鼓励使用我们代码的用户尝试调整超参数，以在准确性和计算资源之间取得平衡。\n下表展示了一些针对 CIFAR-10\u002F100 数据集的例子。\n\n| 上游        | 模型       | 数据集      | 总步数 \u002F 预热步数  | 准确率   | 实际耗时    |                                                                         链接 |\n| ----------- | -------- | ------------ | --------------------------- | -------- | --------------- | ---------------------------------------------------------------------------- |\n| imagenet21k | ViT-B_16 | cifar10      | 500 \u002F 50                    |   98.59% |             17m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQgkpiW53RPmjkabe1ME31g\u002F) |\n| imagenet21k | ViT-B_16 | cifar10      | 1000 \u002F 100                  |   98.86% |             39m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002Fw8DQkDeJTOqJW5js80gOQg\u002F) |\n| imagenet21k | ViT-B_16 | cifar100     | 500 \u002F 50                    |   89.17% |             17m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002F5hM4GrnAR0KEZg725Ewnqg\u002F) |\n| imagenet21k | ViT-B_16 | cifar100     | 1000 \u002F 100                  |   91.15% |             39m | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQLQTaaIoT9uEcAjtA0eRwg\u002F) |\n\n\n\n\n## MLP-Mixer\n\n作者：Ilya Tolstikhin\\*、Neil Houlsby\\*、Alexander Kolesnikov\\*、Lucas Beyer\\*，\nXiaohua Zhai、Thomas Unterthiner、Jessica Yung、Andreas Steiner、Daniel Keysers、\nJakob Uszkoreit、Mario Lucic、Alexey Dosovitskiy。\n\n(\\*) 共同第一作者。\n\n![论文中的图1](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle-research_vision_transformer_readme_f6f0d60e263b.png)\n\nMLP-Mixer（简称*Mixer*）由逐块线性嵌入、Mixer层和分类头组成。Mixer层包含一个通道混合的MLP和一个特征混合的MLP，每个MLP由两个全连接层和一个GELU非线性激活函数构成。其他组件包括：残差连接、Dropout以及线性分类头。\n\n安装步骤与上述相同，请参阅[安装说明](#installation)。\n\n### 可用的Mixer模型\n\n我们提供了在ImageNet和ImageNet-21k数据集上预训练的Mixer-B\u002F16和Mixer-L\u002F16模型。详细信息请参见Mixer论文中的表3。所有模型均可在以下地址找到：\n\nhttps:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002F\n\n请注意，这些模型也可直接从TF-Hub获取：\n[sayakpaul\u002Fcollections\u002Fmlp-mixer]（由[Sayak Paul]贡献的外部资源）。\n\n[sayakpaul\u002Fcollections\u002Fmlp-mixer]: https:\u002F\u002Ftfhub.dev\u002Fsayakpaul\u002Fcollections\u002Fmlp-mixer\n\n### 预期的Mixer结果\n\n我们在Google Cloud的四张V100 GPU机器上运行了微调代码，并使用了本仓库中的默认适配参数。以下是结果：\n\n上游         | 模型        | 数据集 | 准确率   | 实际耗时    | 链接\n:----------- | :---------- | :----- | -------: | :---------- | :---\nImageNet     | Mixer-B\u002F16  | cifar10 | 96.72%   | 3.0小时     | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002Fj9zCYt9yQVm93nqnsDZayA\u002F)\nImageNet     | Mixer-L\u002F16  | cifar10 | 96.59%   | 3.0小时     | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FQ4feeErzRGGop5XzAvYj2g\u002F)\nImageNet-21k | Mixer-B\u002F16  | cifar10 | 96.82%   | 9.6小时     | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FmvP4McV2SEGFeIww20ie5Q\u002F)\nImageNet-21k | Mixer-L\u002F16  | cifar10 | 98.34%   | 10.0小时    | [tensorboard.dev](https:\u002F\u002Ftensorboard.dev\u002Fexperiment\u002FdolAJyQYTYmudytjalF6Jg\u002F)\n\n\n## LiT模型\n\n有关详情，请参阅Google AI博客文章\n[LiT：为图像模型添加语言理解能力](http:\u002F\u002Fai.googleblog.com\u002F2022\u002F04\u002Flocked-image-tuning-adding-language.html)，\n或阅读CVPR论文“LiT：通过锁定图像文本微调实现零样本迁移”\n(https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.07991)。\n\n我们发布了一个Transformer B\u002F16-base模型，其在ImageNet上的零样本准确率为72.1%，以及一个L\u002F16-large模型，其在ImageNet上的零样本准确率为75.7%。更多关于这些模型的详细信息，请参阅\n[LiT模型卡片](model_cards\u002Flit.md)。\n\n我们提供了一个基于浏览器的演示，其中包含小型文本编码器，可供交互使用（最小的模型甚至可以在现代手机上运行）：\n\nhttps:\u002F\u002Fgoogle-research.github.io\u002Fvision_transformer\u002Flit\u002F\n\n最后，还有一个Colab笔记本，用于使用带有图像和文本编码器的JAX模型：\n\nhttps:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Flit.ipynb\n\n需要注意的是，目前上述任何模型都不支持多语言输入，但我们正在开发此类模型，并将在它们可用时更新此仓库。\n\n该仓库仅包含LiT模型的评估代码。训练代码可在`big_vision`仓库中找到：\n\nhttps:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_vision\u002Ftree\u002Fmain\u002Fbig_vision\u002Fconfigs\u002Fproj\u002Fimage_text\n\n预期的零样本结果来自[`model_cards\u002Flit.md`]（请注意，零样本评估与Colab中的简化评估略有不同）：\n\n| 模型 | B16B_2 | L16L |\n| :--- | ---: | ---: |\n| ImageNet 零样本 | 73.9% | 75.7% |\n| ImageNet v2 零样本 | 65.1% | 66.6% |\n| CIFAR100 零样本 | 79.0% | 80.5% |\n| Pets37 零样本 | 83.3% | 83.3% |\n| Resisc45 零样本 | 25.3% | 25.6% |\n| MS-COCO 文字描述到图像检索 | 51.6% | 48.5% |\n| MS-COCO 图像到文字描述检索 | 31.8% | 31.1% |\n\n## 在云端运行\n\n虽然上述[Colab](#colab)对于入门非常有用，但通常您会希望在配备更强大加速器的大型机器上进行训练。\n\n### 创建虚拟机\n\n您可以通过以下命令在Google Cloud上设置一台带有GPU的虚拟机：\n\n```bash\n# 设置以下所有命令中使用的变量。\n# 注意，项目必须已设置账单。\n# 关于拥有GPU的区域列表，请参考\n# https:\u002F\u002Fcloud.google.com\u002Fcompute\u002Fdocs\u002Fgpus\u002Fgpu-regions-zones\nPROJECT=my-awesome-gcp-project  # 项目必须启用计费。\nVM_NAME=vit-jax-vm-gpu\nZONE=europe-west4-b\n\n# 以下配置已在本仓库中测试过。您可以选择其他镜像和机器组合（例如），请参考相应的gcloud命令：\n# gcloud compute images list --project ml-images\n# gcloud compute machine-types list\n# 等等。\ngcloud compute instances create $VM_NAME \\\n    --project=$PROJECT --zone=$ZONE \\\n    --image=c1-deeplearning-tf-2-5-cu110-v20210527-debian-10 \\\n    --image-project=ml-images --machine-type=n1-standard-96 \\\n    --scopes=cloud-platform,storage-full --boot-disk-size=256GB \\\n    --boot-disk-type=pd-ssd --metadata=install-nvidia-driver=True \\\n    --maintenance-policy=TERMINATE \\\n    --accelerator=type=nvidia-tesla-v100,count=8\n\n# 连接到虚拟机（在机器设置并启动后的几分钟内）。\ngcloud compute ssh --project $PROJECT --zone $ZONE $VM_NAME\n\n# 使用完毕后停止虚拟机（停止后的虚拟机仅收取存储费用）。\ngcloud compute instances stop --project $PROJECT --zone $ZONE $VM_NAME\n\n# 使用完毕后删除虚拟机（这也将删除虚拟机上存储的所有数据）。\ngcloud compute instances delete --project $PROJECT --zone $ZONE $VM_NAME\n```\n\n或者，您也可以使用以下类似的命令来设置带有 TPU 挂载的 Cloud 虚拟机（以下命令摘自 [TPU 教程]）：\n\n[TPU 教程]: https:\u002F\u002Fcloud.google.com\u002Ftpu\u002Fdocs\u002Fjax-quickstart-tpu-vm\n\n```bash\nPROJECT=my-awesome-gcp-project  # 项目必须启用结算功能。\nVM_NAME=vit-jax-vm-tpu\nZONE=europe-west4-a\n\n# 初始设置服务身份时需要执行此操作。\ngcloud beta services identity create --service tpu.googleapis.com\n\n# 创建直接挂载 TPU 的虚拟机。\ngcloud alpha compute tpus tpu-vm create $VM_NAME \\\n    --project=$PROJECT --zone=$ZONE \\\n    --accelerator-type v3-8 \\\n    --version tpu-vm-base\n\n# 连接到虚拟机（在机器完成设置并启动后的几分钟内）。\ngcloud alpha compute tpus tpu-vm ssh --project $PROJECT --zone $ZONE $VM_NAME\n\n# 使用完毕后停止虚拟机（已停止的虚拟机仅按存储收费）。\ngcloud alpha compute tpus tpu-vm stop --project $PROJECT --zone $ZONE $VM_NAME\n\n# 使用完毕后删除虚拟机（这也将删除虚拟机上存储的所有数据）。\ngcloud alpha compute tpus tpu-vm delete --project $PROJECT --zone $ZONE $VM_NAME\n```\n\n### 设置虚拟机\n\n然后按照常规方式克隆仓库并安装依赖项（包括支持 TPU 的 `jaxlib`）：\n\n```bash\ngit clone --depth=1 --branch=master https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\ncd vision_transformer\n\n# 可选：安装 virtualenv\npip3 install virtualenv\npython3 -m virtualenv env\n. env\u002Fbin\u002Factivate\n```\n\n如果您连接的是带有 GPU 的虚拟机，请使用以下命令安装 JAX 和其他依赖项：\n\n```bash\npip install -r vit_jax\u002Frequirements.txt\n```\n\n如果您连接的是带有 TPU 的虚拟机，请使用以下命令安装 JAX 和其他依赖项：\n\n```bash\npip install -r vit_jax\u002Frequirements-tpu.txt\n```\n\n安装 [Flaxformer](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fflaxformer)，并按照此处链接的相应仓库中的说明进行操作。\n\n无论是 GPU 还是 TPU，都请通过以下命令检查 JAX 是否能够连接到已挂载的加速器：\n\n```bash\npython -c 'import jax; print(jax.devices())'\n```\n\n最后，执行“[微调模型](#fine-tuning-a-model)”一节中提到的其中一个命令。\n\n\n## Bibtex\n\n```\n@article{dosovitskiy2020vit,\n  title={一张图胜过 16×16 个词：大规模图像识别中的 Transformer},\n  author={多索维茨基、卢卡斯·贝耶尔、亚历山大·科列斯尼科夫、迪尔克·魏森博恩、夏晓华·翟、托马斯·昂特尔蒂纳、莫斯塔法·德赫加尼、马蒂亚斯·明德勒、格奥尔格·海戈尔德、西尔万·盖利、雅各布·乌斯科雷特、尼尔·豪尔斯比},\n  journal={ICLR},\n  year={2021}\n}\n\n@article{tolstikhin2021mixer,\n  title={MLP-Mixer：一种用于视觉任务的全 MLP 架构},\n  author={伊利亚·托尔斯提欣、尼尔·豪尔斯比、亚历山大·科列斯尼科夫、卢卡斯·贝耶尔、夏晓华·翟、托马斯·昂特尔蒂纳、杰西卡·容、安德烈亚斯·施泰纳、丹尼尔·凯瑟斯、雅各布·乌斯科雷特、马里奥·卢奇奇、阿列克谢·多索维茨基},\n  journal={arXiv 预印本 arXiv:2105.01601},\n  year={2021}\n}\n\n@article{steiner2021augreg,\n  title={如何训练你的 ViT？视觉 Transformer 中的数据、增强与正则化},\n  author={安德烈亚斯·施泰纳、亚历山大·科列斯尼科夫、夏晓华·翟、罗斯·怀特曼、雅各布·乌斯科雷特、卢卡斯·贝耶尔},\n  journal={arXiv 预印本 arXiv:2106.10270},\n  year={2021}\n}\n\n@article{chen2021outperform,\n  title={当 Vision Transformer 在无需预训练或强大数据增强的情况下超越 ResNet 时},\n  author={陈翔宁、许祖辉、龚博清},\n  journal={arXiv 预印本 arXiv:2106.01548},\n  year={2021},\n}\n\n@article{zhuang2022gsam,\n  title={代理差距最小化改进了尖锐度感知训练},\n  author={庄俊堂、龚博清、袁良哲、崔寅、哈特维格·亚当、妮查·德沃内克、塞卡尔·塔蒂孔达、詹姆斯·邓肯、刘婷},\n  journal={ICLR},\n  year={2022},\n}\n\n@article{zhai2022lit,\n  title={LiT：基于锁定图像文本调优的零样本迁移学习},\n  author={夏晓华·翟、王肖、巴西尔·穆斯塔法、安德烈亚斯·施泰纳、丹尼尔·凯瑟斯、亚历山大·科列斯尼科夫、卢卡斯·贝耶尔},\n  journal={CVPR},\n  year={2022}\n}\n```\n\n## 更改记录\n\n按时间倒序排列：\n\n- 2022-08-18：新增了在图像侧未使用线性头（LiT_B16B：768）且训练了6万步的LiT-B16B_2模型，其性能优于LiT_B16B。\n\n- 2022-06-09：新增了在ImageNet数据集上使用[GSAM]从头开始训练的ViT和Mixer模型，未采用强数据增强。这些ViT模型的性能优于使用AdamW优化器、原始[SAM]算法或强数据增强训练的同规模模型。\n\n- 2022-04-14：新增了[LiT模型](#lit-models)及其Colab笔记本。\n\n- 2021-07-29：新增了ViT-B\u002F8 AugReg模型（3个上游检查点及分辨率=224的适配版本）。\n\n- 2021-07-02：新增了论文《当视觉Transformer超越ResNet时...》。\n\n- 2021-07-02：新增了使用[SAM]（Sharpness-Aware Minimization，尖锐度感知最小化）优化的ViT和MLP-Mixer检查点。\n\n- 2021-06-20：新增了论文《如何训练你的ViT？...》，并提供了一个新的Colab笔记本，用于探索论文中提到的5万多份预训练和微调检查点。\n\n- 2021-06-18：本仓库已重写，采用Flax Linen API和`ml_collections.ConfigDict`进行配置管理。\n\n- 2021-05-19：随着《如何训练你的ViT？...》论文的发表，我们新增了5万多份在ImageNet和ImageNet-21k数据集上预训练、并经过不同程度数据增强和模型正则化的ViT及混合模型，这些模型还在ImageNet、Pets37、Kitti-distance、CIFAR-100和Resisc45数据集上进行了微调。请查看[`vit_jax_augreg.ipynb`]来浏览这一丰富的模型资源！例如，您可以通过该Colab获取论文中表3 `i21k_300`列中推荐的预训练和微调检查点文件名。\n\n- 2020-12-01：新增了R50+ViT-B\u002F16混合模型（在ResNet-50骨干网络之上叠加ViT-B\u002F16）。当在imagenet21k数据集上预训练时，该模型仅需不到一半的计算资源即可达到与L\u002F16模型相近的性能。需要注意的是，“R50”在B\u002F16变体中有所调整：原始ResNet-50包含[3,4,6,3]层，每层都会使图像分辨率降低两倍。结合ResNet的stem部分，最终分辨率将被缩小32倍，即使使用(1,1)的patch size，也无法实现ViT-B\u002F16变体。因此，我们在R50+B\u002F16变体中采用了[3,4,9]层的配置。\n\n- 2020-11-09：新增了ViT-L\u002F16模型。\n\n- 2020-10-29：新增了在ImageNet-21k数据集上预训练、随后在ImageNet数据集上以224x224分辨率（而非默认的384x384）微调的ViT-B\u002F16和ViT-L\u002F16模型。这些模型名称后均带有“-224”后缀。预计它们的top-1准确率分别为81.2%和82.7%。\n\n\n## 免责声明\n\n开源发布由Andreas Steiner整理。\n\n注：本仓库基于[google-research\u002Fbig_transfer](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbig_transfer)分叉并修改而来。\n\n**这并非谷歌官方产品。**\n\n\n[GSAM]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.08065\n[SAM]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.01412\n[AugReg]: https:\u002F\u002Farxiv.org\u002Fabs\u002F2106.10270\n\n[`vit_jax\u002Fconfigs\u002Fmodels.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Fmodels.py\n[`model_cards\u002Flit.md`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fmodel_cards\u002Flit.md\n\n[`configs\u002Faugreg.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Faugreg.py\n[`configs\u002Fmodel.py`]: https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax\u002Fconfigs\u002Fmodels.py\n[`vit_jax_augreg.ipynb`]: https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax_augreg.ipynb\n[`vit_jax.ipynb`]: https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fgoogle-research\u002Fvision_transformer\u002Fblob\u002Fmain\u002Fvit_jax.ipynb\n\n[`gs:\u002F\u002Fvit_models\u002Fimagenet21k`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fimagenet21k\u002F\n[`gs:\u002F\u002Fvit_models\u002Fimagenet21k+imagenet2012`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fimagenet21k+imagenet2012\u002F\n[`gs:\u002F\u002Fvit_models\u002Faugreg`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Faugreg\u002F\n[`gs:\u002F\u002Fvit_models\u002Fsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fsam\u002F\n[`gs:\u002F\u002Fmixer_models\u002Fsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002Fsam\u002F\n[`gs:\u002F\u002Fvit_models\u002Fgsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fbrowser\u002Fvit_models\u002Fgsam\u002F\n[`gs:\u002F\u002Fmixer_models\u002Fgsam`]: https:\u002F\u002Fconsole.cloud.google.com\u002Fstorage\u002Fmixer_models\u002Fgsam\u002F","# Vision Transformer (ViT) 快速上手指南\n\n本指南基于 Google Research 开源的 `vision_transformer` 项目，帮助开发者快速在 JAX\u002FFlax 环境下部署和微调 ViT 及 MLP-Mixer 模型。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐) 或 macOS。Windows 用户建议使用 WSL2 或 Docker。\n*   **Python 版本**: >= 3.10\n*   **硬件加速**:\n    *   **GPU**: 支持 CUDA 的 NVIDIA 显卡（需安装对应的 CUDA 驱动）。\n    *   **TPU**: 可选，需配置 TPU 环境（如 Google Cloud TPU 或 Colab TPU）。\n*   **依赖框架**: 本项目基于 [JAX](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fjax) 和 [Flax](https:\u002F\u002Fflax.readthedocs.io\u002F) 构建。\n\n> **注意**：国内开发者若遇到 PyPI 下载缓慢问题，建议在 pip 命令后添加 `-i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple` 使用清华镜像源。\n\n## 安装步骤\n\n### 1. 安装基础依赖\n\n根据您使用的硬件类型（GPU 或 TPU），选择对应的依赖文件进行安装。\n\n**如果您使用 GPU：**\n```bash\npip install -r vit_jax\u002Frequirements.txt\n```\n\n**如果您使用 TPU：**\n```bash\npip install -r vit_jax\u002Frequirements-tpu.txt\n```\n\n> **提示**：如果上述命令因网络原因失败，请结合国内镜像源使用，例如：\n> `pip install -r vit_jax\u002Frequirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n### 2. 安装 Flaxformer\n\n该项目依赖 `Flaxformer` 库，请参照其官方仓库说明进行安装：\n```bash\npip install flaxformer\n```\n*(注：如遇版本兼容问题，请参考 [Flaxformer GitHub](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fflaxformer) 获取最新安装指令)*\n\n## 基本使用\n\n安装完成后，您可以直接加载预训练模型并在自定义数据集上进行微调（Fine-tuning）。所有模型共享统一的命令行接口。\n\n### 场景一：微调 ViT 模型\n\n以下示例展示如何将预训练在 `ImageNet-21k` 上的 **ViT-B\u002F16** 模型微调到 **CIFAR-10** 数据集。代码会自动从 Google Cloud Storage (GCS) 拉取模型权重，无需手动下载。\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fvit.py:b16,cifar10 \\\n    --config.pretrained_dir='gs:\u002F\u002Fvit_models\u002Fimagenet21k'\n```\n\n### 场景二：微调 MLP-Mixer 模型\n\n以下示例展示如何微调 **Mixer-B\u002F16** 模型到 **CIFAR-10** 数据集：\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Fmixer_base16_cifar10.py \\\n    --config.pretrained_dir='gs:\u002F\u002Fmixer_models\u002Fimagenet21k'\n```\n\n### 场景三：使用增强正则化（AugReg）模型\n\n如果您希望使用论文《How to train your ViT?》中提到的性能更强的 AugReg 预训练模型（例如在 Oxford-IIIT Pet 数据集上微调）：\n\n```bash\npython -m vit_jax.main --workdir=\u002Ftmp\u002Fvit-$(date +%s) \\\n    --config=$(pwd)\u002Fvit_jax\u002Fconfigs\u002Faugreg.py:R_Ti_16 \\\n    --config.dataset=oxford_iiit_pet \\\n    --config.base_lr=0.01\n```\n\n### 显存优化提示\n\n如果在运行过程中遇到 **Out Of Memory (OOM)** 错误，可以通过调整以下参数来降低显存占用：\n\n1.  **增加梯度累积步数**（减少每次反向传播的频率）：\n    添加参数 `--config.accum_steps=8`\n2.  **减小 Batch Size**（需同步降低学习率）：\n    添加参数 `--config.batch=256` 并相应调整 `--config.base_lr`\n3.  **减小 Shuffle Buffer**（解决主机内存不足）：\n    添加参数 `--config.shuffle_buffer=10000`\n\n### 查看可用数据集与参数\n\n要查看所有支持的命令行标志和数据集配置，可运行：\n```bash\npython3 -m vit_jax.train --help\n```","某医疗影像初创团队正致力于开发一套辅助诊断系统，需要从有限的肺部 CT 扫描数据中高精度识别早期结节病变。\n\n### 没有 vision_transformer 时\n- 传统卷积神经网络（CNN）在提取全局上下文信息时能力有限，容易漏诊位置隐蔽或形态微小的病灶。\n- 由于缺乏大规模预训练模型支持，团队必须从头训练深度模型，导致在数据量不足时严重过拟合，泛化能力差。\n- 调整网络结构以平衡精度与速度极为耗时，往往需要数周反复试验不同的 ResNet 变体才能勉强达标。\n- 迁移学习过程复杂，难以直接利用业界最先进的 ImageNet-21k 预训练权重来加速特定医疗场景的收敛。\n\n### 使用 vision_transformer 后\n- 借助 ViT 的自注意力机制，模型能精准捕捉图像长距离依赖关系，显著提升了微小结节的检出率和分类准确度。\n- 直接加载官方提供的 ImageNet-21k 预训练检查点进行微调，仅需少量标注数据即可快速收敛，有效解决了小样本难题。\n- 通过 JAX\u002FFlax 代码库灵活调用不同规模的 ViT 或 MLP-Mixer 架构，几天内即可完成从选型到部署的全流程验证。\n- 利用 Colab 笔记本直接探索超过 5 万个预训练模型权重，快速锁定最适合当前医疗数据分布的最优解，大幅降低试错成本。\n\nvision_transformer 通过将前沿的 Transformer 架构与高质量预训练权重相结合，让资源有限的团队也能轻松构建出超越传统 CNN 的高性能视觉诊断系统。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle-research_vision_transformer_8ff79012.png","google-research","Google Research","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fgoogle-research_c23b2adf.png","","https:\u002F\u002Fresearch.google","https:\u002F\u002Fgithub.com\u002Fgoogle-research",[79,83],{"name":80,"color":81,"percentage":82},"Jupyter Notebook","#DA5B0B",95.8,{"name":84,"color":85,"percentage":86},"Python","#3572A5",4.2,12457,1461,"2026-04-16T13:07:32","Apache-2.0","Linux","可选但推荐（支持 NVIDIA GPU 和 Google TPU）。Colab 默认使用 Tesla T4。具体显存需求取决于模型大小和 batch size，若遇显存不足需调整 accum_steps 或 batch 大小。","未说明（主机端需内存存储 shuffle buffer，默认 50000，若遇 OOM 需减小该值）",{"notes":95,"python":96,"dependencies":97},"该工具基于 JAX\u002FFlax 框架，非 PyTorch。支持在 Google Cloud (GCP) 上运行多主机训练。模型权重存储在 Google Cloud Storage (GCS)，可直接通过 gs:\u002F\u002F 路径加载无需本地下载。不同模型对显存要求差异较大，可通过增加梯度累积步数 (--config.accum_steps) 或减小批次大小 (--config.batch) 来适应低显存环境。",">=3.10",[98,99,100,101],"jax","flax","flaxformer","tensorflow-datasets",[15,14],"2026-03-27T02:49:30.150509","2026-04-17T09:53:26.527723",[106,111,116,121,126,131],{"id":107,"question_zh":108,"answer_zh":109,"source_url":110},36835,"如何复现论文中仅在 ImageNet2012 上训练的模型？需要哪些超参数和训练命令？","建议使用以下配置进行训练：300 个 epoch，学习率 3e-3，前 10k 步进行 warmup 然后使用余弦退火策略，批量大小（batch size）设为 4096。如果在损失函数中使用 `sigmoid_xent` 而不是 `softmax_xent` 并配合标签平滑（label smoothing），请注意不要将标签除以类别数，而应设置为 `labels = labels * (1 - label_smoothing) + label_smoothing`。在此配置下，验证集准确率最终可达约 0.7479。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F89",{"id":112,"question_zh":113,"answer_zh":114,"source_url":115},36836,"为什么未微调的 ViT-B_16 模型权重文件中，最终层（head）的权重和偏置全为 0？","这是预期行为。对于在 ImageNet-21k 上预训练但未在 ImageNet-1k 上微调的模型，其最终分类层（head）的权重通常被初始化为 0，因为这些权重尚未针对特定任务进行训练。如果您需要使用已微调的模型或更多预训练变体，建议参考 `rwightman\u002Fpytorch-image-models` 仓库或 Google Cloud Storage 上的 `vit_models` 存储桶获取其他检查点。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F10",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},36837,"在 Colab 中运行 Vision Transformer 代码时遇到内存不足（OOM）导致会话崩溃，如何解决？","如果遇到内存不足问题，请尝试以下解决方案：1. 不要选择 \"High-RAM\" 实例选项，改用标准的运行时环境（Regular option），有用户反馈高内存选项反而会导致兼容性问题；2. 确保使用 TPU 内核作为运行时类型（Runtime type = TPU kernel）；3. 在开始运行前重置并启动一个新的内核（Fresh kernel）；4. 确保直接运行官方提供的 notebook（vit_jax.ipynb）且未做任何修改。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F54",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},36838,"是否提供混合架构（Hybrid models，如 ResNet+ViT）的预训练模型？","是的，项目已发布基于 ImageNet-21k 预训练并在 ImageNet-2012 上微调的混合模型（例如 R50+ViT-B_16）。您可以在 Google Cloud Storage 的 `vit_models\u002Fimagenet21k+imagenet2012\u002F` 目录下找到名为 `R50+ViT-B_16.npz` 的文件。相关代码已合并到主仓库，并可与 README 中链接的 Colab 笔记本配合使用。注意：基于私有数据集 JFT-300M 预训练的模型权重无法公开。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F22",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},36839,"如何使用 Rollout 方法可视化 Vision Transformer 的注意力图（Attention Maps）？","注意力图是模型参数和输入图像的函数，与标签无关，因此无法直接获取针对特定标签的注意力图（如需此类功能可尝试适配 CAM 等技术）。要计算注意力权重并执行 Rollout，通常需要平均所有注意力头（heads）的结果。核心实现涉及计算点积注意力权重，可参考 Flax 库中的 `get_dot_product_attention_weights` 函数逻辑：输入 query, key, value，计算注意力矩阵，然后在多层之间进行矩阵乘法累积（Rollout）以得到最终的注意力分布。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F18",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},36840,"在 ImageNet-21k 上预训练 ViT-L\u002F16 时，应使用什么超参数、数据增强策略和损失函数？","1. 优化器：使用 `flax.optim.Adam`，权重衰减（weight_decay）设为 0.1。仅初始化最终分类头的偏置项。2. 数据集特性：ImageNet-21k 和 JFT-300M 是多标签数据集，而 ImageNet-1k 是单标签。3. 损失函数：尽管 ImageNet-1k 是单标签，但在该实现中统一使用了 Sigmoid Cross Entropy 损失（而非 Softmax），这也适用于多标签的 ImageNet-21k 预训练任务。","https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fvision_transformer\u002Fissues\u002F34",[]]