[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-leondgarse--keras_cv_attention_models":3,"tool-leondgarse--keras_cv_attention_models":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",143909,2,"2026-04-07T11:33:18",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 
恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":74,"owner_company":74,"owner_location":74,"owner_email":76,"owner_twitter":74,"owner_website":74,"owner_url":77,"languages":78,"stars":83,"forks":84,"last_commit_at":85,"license":86,"difficulty_score":10,"env_os":87,"env_gpu":88,"env_ram":87,"env_deps":89,"category_tags":100,"github_topics":102,"view_count":32,"oss_zip_url":74,"oss_zip_packed_at":74,"status":17,"created_at":118,"updated_at":119,"faqs":120,"releases":151},5031,"leondgarse\u002Fkeras_cv_attention_models","keras_cv_attention_models","Keras beit,caformer,CMT,CoAtNet,convnext,davit,dino,efficientdet,edgenext,efficientformer,efficientnet,eva,fasternet,fastervit,fastvit,flexivit,gcvit,ghostnet,gpvit,hornet,hiera,iformer,inceptionnext,lcnet,levit,maxvit,mobilevit,moganet,nat,nfnets,pvt,swin,tinynet,tinyvit,uniformer,volo,vanillanet,yolor,yolov7,yolov8,yolox,gpt2,llama2, alias kecam","keras_cv_attention_models 是一个专为 Keras 和 TensorFlow 生态打造的开源模型库，旨在让开发者能够轻松调用并复现当前最前沿的计算机视觉与语言模型。它解决了在 Keras 环境中难以直接获取最新学术成果（如 ConvNeXt、Swin Transformer、YOLOv8、LLaMA2 等）的痛点，将原本分散且实现复杂的论文代码统一封装为简洁易用的接口。\n\n无论是需要快速验证想法的算法研究人员，还是致力于将先进模型部署到生产环境的工程师，都能从中受益。用户只需几行代码即可实例化包括识别、检测、分割乃至大语言模型在内的上百种架构，并支持加载预训练权重进行推理或微调。\n\n其独特亮点在于极高的模型覆盖度，不仅囊括了 EfficientNet、MobileViT 等经典网络，更紧跟前沿收录了 Hiera、VanillaNet 等新锐模型。此外，它还提供了从 ImageNet\u002FCOCO 训练脚本到 TFLite 端侧转换的一站式工作流支持。需要注意的是，目前该库主要兼容传统 Keras 环境，若使用新版 TensorFlow 需进行特定配置。总体而言，它是","keras_cv_attention_models 是一个专为 Keras 和 TensorFlow 生态打造的开源模型库，旨在让开发者能够轻松调用并复现当前最前沿的计算机视觉与语言模型。它解决了在 Keras 环境中难以直接获取最新学术成果（如 ConvNeXt、Swin Transformer、YOLOv8、LLaMA2 等）的痛点，将原本分散且实现复杂的论文代码统一封装为简洁易用的接口。\n\n无论是需要快速验证想法的算法研究人员，还是致力于将先进模型部署到生产环境的工程师，都能从中受益。用户只需几行代码即可实例化包括识别、检测、分割乃至大语言模型在内的上百种架构，并支持加载预训练权重进行推理或微调。\n\n其独特亮点在于极高的模型覆盖度，不仅囊括了 EfficientNet、MobileViT 等经典网络，更紧跟前沿收录了 Hiera、VanillaNet 等新锐模型。此外，它还提供了从 ImageNet\u002FCOCO 训练脚本到 TFLite 端侧转换的一站式工作流支持。需要注意的是，目前该库主要兼容传统 Keras 环境，若使用新版 TensorFlow 需进行特定配置。总体而言，它是连接学术创新与工程落地的高效桥梁。","# ___Keras_cv_attention_models___\n***\n- **WARNING: currently NOT compatible with `keras 3.x`, if using `tensorflow>=2.16.0`, needs to install `pip install tf-keras~=$(pip show tensorflow | awk -F ': ' '\u002FVersion\u002F{print $2}')` manually. 
While importing, import this package ahead of Tensorflow, or set `export TF_USE_LEGACY_KERAS=1`.**\n- **It's not recommended downloading and loading model from h5 file directly, better building model and loading weights like `import kecam; mm = kecam.models.LCNet050()`.**\n- **coco_train_script.py for TF is still under testing...**\n\u003C!-- TOC depthFrom:1 depthTo:6 withLinks:1 updateOnSave:1 orderedList:0 -->\n\n- [___>>>> Roadmap and todo list \u003C\u003C\u003C\u003C___](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fwiki\u002FRoadmap)\n- [General Usage](#general-usage)\n  - [Basic](#basic)\n  - [T4 Inference](#t4-inference)\n  - [Layers](#layers)\n  - [Model surgery](#model-surgery)\n  - [ImageNet training and evaluating](#imagenet-training-and-evaluating)\n  - [COCO training and evaluating](#coco-training-and-evaluating)\n  - [CLIP training and evaluating](#clip-training-and-evaluating)\n  - [Text training](#text-training)\n  - [DDPM training](#ddpm-training)\n  - [Visualizing](#visualizing)\n  - [TFLite Conversion](#tflite-conversion)\n  - [Using PyTorch as backend](#using-pytorch-as-backend)\n  - [Using keras core as backend](#using-keras-core-as-backend)\n- [Recognition Models](#recognition-models)\n  - [AotNet](#aotnet)\n  - [BEiT](#beit)\n  - [BEiTV2](#beitv2)\n  - [BotNet](#botnet)\n  - [CAFormer](#caformer)\n  - [CMT](#cmt)\n  - [CoaT](#coat)\n  - [CoAtNet](#coatnet)\n  - [ConvNeXt](#convnext)\n  - [ConvNeXtV2](#convnextv2)\n  - [CoTNet](#cotnet)\n  - [CSPNeXt](#cspnext)\n  - [DaViT](#davit)\n  - [DiNAT](#dinat)\n  - [DINOv2](#dinov2)\n  - [EdgeNeXt](#edgenext)\n  - [EfficientFormer](#efficientformer)\n  - [EfficientFormerV2](#efficientformerv2)\n  - [EfficientNet](#efficientnet)\n  - [EfficientNetEdgeTPU](#efficientnetedgetpu)\n  - [EfficientNetV2](#efficientnetv2)\n  - [EfficientViT_B](#efficientvit_b)\n  - [EfficientViT_M](#efficientvit_m)\n  - [EVA](#eva)\n  - [EVA02](#eva02)\n  - [FasterNet](#fasternet)\n  - [FasterViT](#fastervit)\n  - [FastViT](#fastvit)\n  - [FBNetV3](#fbnetv3)\n  - [FlexiViT](#flexivit)\n  - [GCViT](#gcvit)\n  - [GhostNet](#ghostnet)\n  - [GhostNetV2](#ghostnetv2)\n  - [GMLP](#gmlp)\n  - [GPViT](#gpvit)\n  - [HaloNet](#halonet)\n  - [Hiera](#hiera)\n  - [HorNet](#hornet)\n  - [IFormer](#iformer)\n  - [InceptionNeXt](#inceptionnext)\n  - [LCNet](#lcnet)\n  - [LeViT](#levit)\n  - [MaxViT](#maxvit)\n  - [MetaTransFormer](#metatransformer)\n  - [MLP mixer](#mlp-mixer)\n  - [MobileNetV3](#mobilenetv3)\n  - [MobileViT](#mobilevit)\n  - [MobileViT_V2](#mobilevit_v2)\n  - [MogaNet](#moganet)\n  - [NAT](#nat)\n  - [NFNets](#nfnets)\n  - [PVT_V2](#pvt_v2)\n  - [RegNetY](#regnety)\n  - [RegNetZ](#regnetz)\n  - [RepViT](#repvit)\n  - [ResMLP](#resmlp)\n  - [ResNeSt](#resnest)\n  - [ResNetD](#resnetd)\n  - [ResNetQ](#resnetq)\n  - [ResNeXt](#resnext)\n  - [SwinTransformerV2](#swintransformerv2)\n  - [TinyNet](#tinynet)\n  - [TinyViT](#tinyvit)\n  - [UniFormer](#uniformer)\n  - [VanillaNet](#vanillanet)\n  - [ViT-5](#vit-5)\n  - [VOLO](#volo)\n  - [WaveMLP](#wavemlp)\n- [Detection Models](#detection-models)\n  - [EfficientDet](#efficientdet)\n  - [YOLO_NAS](#yolo_nas)\n  - [YOLOR](#yolor)\n  - [YOLOV7](#yolov7)\n  - [YOLOV8](#yolov8)\n  - [YOLOX](#yolox)\n- [Language Models](#language-models)\n  - [GPT2](#gpt2)\n  - [LLaMA2](#llama2)\n- [Stable Diffusion](#stable-diffusion)\n- [Segmentation Models](#segmentation-models)\n  - [YOLOV8 Segmentation](#yolov8-segmentation)\n  - [Segment Anything](#segment-anything)\n- 
[Licenses](#licenses)\n- [Citing](#citing)\n\n\u003C!-- \u002FTOC -->\n***\n\n# General Usage\n## Basic\n  - **Default imports**: the READMEs assume these and will not specify them again.\n    ```py\n    import os\n    import sys\n    import tensorflow as tf\n    import numpy as np\n    import pandas as pd\n    import matplotlib.pyplot as plt\n    from tensorflow import keras\n    ```\n  - Install as a pip package. `kecam` is a short alias name of this package. **Note**: the pip package `kecam` doesn't set any backend requirement; make sure either Tensorflow or PyTorch is installed beforehand. For PyTorch backend usage, refer [Keras PyTorch Backend](keras_cv_attention_models\u002Fpytorch_backend).\n    ```sh\n    pip install -U kecam\n    # Or\n    pip install -U keras-cv-attention-models\n    # Or\n    pip install -U git+https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\n    ```\n    Refer to each sub directory for detailed usage.\n  - **Basic model prediction**\n    ```py\n    from keras_cv_attention_models import volo\n    mm = volo.VOLO_d1(pretrained=\"imagenet\")\n\n    \"\"\" Run predict \"\"\"\n    import tensorflow as tf\n    from tensorflow import keras\n    from keras_cv_attention_models.test_images import cat\n    img = cat()\n    imm = keras.applications.imagenet_utils.preprocess_input(img, mode='torch')\n    pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy()\n    pred = tf.nn.softmax(pred).numpy()  # If classifier activation is not softmax\n    print(keras.applications.imagenet_utils.decode_predictions(pred)[0])\n    # [('n02124075', 'Egyptian_cat', 0.99664897),\n    #  ('n02123045', 'tabby', 0.0007249644),\n    #  ('n02123159', 'tiger_cat', 0.00020345),\n    #  ('n02127052', 'lynx', 5.4973923e-05),\n    #  ('n02123597', 'Siamese_cat', 2.675306e-05)]\n    ```\n    Or just use the model presets `preprocess_input` and `decode_predictions`.\n    ```py\n    from keras_cv_attention_models import coatnet\n    mm = coatnet.CoAtNet0()\n\n    from keras_cv_attention_models.test_images import cat\n    preds = mm(mm.preprocess_input(cat()))\n    print(mm.decode_predictions(preds))\n    # [[('n02124075', 'Egyptian_cat', 0.9999875), ('n02123045', 'tabby', 5.194884e-06), ...]]\n    ```\n    The preset `preprocess_input` and `decode_predictions` are also compatible with the PyTorch backend.\n    ```py\n    os.environ['KECAM_BACKEND'] = 'torch'\n\n    from keras_cv_attention_models import caformer\n    mm = caformer.CAFormerS18()\n    # >>>> Using PyTorch backend\n    # >>>> Aligned input_shape: [3, 224, 224]\n    # >>>> Load pretrained from: ~\u002F.keras\u002Fmodels\u002Fcaformer_s18_224_imagenet.h5\n\n    from keras_cv_attention_models.test_images import cat\n    preds = mm(mm.preprocess_input(cat()))\n    print(preds.shape)\n    # torch.Size([1, 1000])\n    print(mm.decode_predictions(preds))\n    # [[('n02124075', 'Egyptian_cat', 0.8817097), ('n02123045', 'tabby', 0.009335292), ...]]\n    ```\n  - **`num_classes=0`** excludes the model top `GlobalAveragePooling2D + Dense` layers.\n    ```py\n    from keras_cv_attention_models import resnest\n    mm = resnest.ResNest50(num_classes=0)\n    print(mm.output_shape)\n    # (None, 7, 7, 2048)\n    ```\n  - **`num_classes={custom output classes}`** other than `1000` or `0` will just skip loading the header Dense layer weights. 
As `model.load_weights(weight_file, by_name=True, skip_mismatch=True)` is used for loading weights.\n    ```py\n    from keras_cv_attention_models import swin_transformer_v2\n\n    mm = swin_transformer_v2.SwinTransformerV2Tiny_window8(num_classes=64)\n    # >>>> Load pretrained from: ~\u002F.keras\u002Fmodels\u002Fswin_transformer_v2_tiny_window8_256_imagenet.h5\n    # WARNING:tensorflow:Skipping loading weights for layer #601 (named predictions) due to mismatch in shape for weight predictions\u002Fkernel:0. Weight expects shape (768, 64). Received saved weight with shape (768, 1000)\n    # WARNING:tensorflow:Skipping loading weights for layer #601 (named predictions) due to mismatch in shape for weight predictions\u002Fbias:0. Weight expects shape (64,). Received saved weight with shape (1000,)\n    ```\n  - **Reload own model weights by set `pretrained=\"xxx.h5\"`**. Better than calling `model.load_weights` directly, if reloading model with different `input_shape` and with weights shape not matching.\n    ```py\n    import os\n    from keras_cv_attention_models import coatnet\n    pretrained = os.path.expanduser('~\u002F.keras\u002Fmodels\u002Fcoatnet0_224_imagenet.h5')\n    mm = coatnet.CoAtNet1(input_shape=(384, 384, 3), pretrained=pretrained)  # No sense, just showing usage\n    ```\n  - **Alias name `kecam`** can be used instead of `keras_cv_attention_models`. It's `__init__.py` only with `from keras_cv_attention_models import *`.\n    ```py\n    import kecam\n    mm = kecam.yolor.YOLOR_CSP()\n    imm = kecam.test_images.dog_cat()\n    preds = mm(mm.preprocess_input(imm))\n    bboxs, lables, confidences = mm.decode_predictions(preds)[0]\n    kecam.coco.show_image_with_bboxes(imm, bboxs, lables, confidences)\n    ```\n  - **Calculate flops** method from [TF 2.0 Feature: Flops calculation #32809](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftensorflow\u002Fissues\u002F32809#issuecomment-849439287). For PyTorch backend, needs `thop` `pip install thop`.\n    ```py\n    from keras_cv_attention_models import coatnet, resnest, model_surgery\n\n    model_surgery.get_flops(coatnet.CoAtNet0())\n    # >>>> FLOPs: 4,221,908,559, GFLOPs: 4.2219G\n    model_surgery.get_flops(resnest.ResNest50())\n    # >>>> FLOPs: 5,378,399,992, GFLOPs: 5.3784G\n    ```\n  - **[Deprecated] `tensorflow_addons`** is not imported by default. While reloading model depending on `GroupNormalization` like `MobileViTV2` from `h5` directly, needs to import `tensorflow_addons` manually first.\n    ```py\n    import tensorflow_addons as tfa\n\n    model_path = os.path.expanduser('~\u002F.keras\u002Fmodels\u002Fmobilevit_v2_050_256_imagenet.h5')\n    mm = keras.models.load_model(model_path)\n    ```\n  - **Export TF model to onnx**. Needs `tf2onnx` for TF, `pip install onnx tf2onnx onnxsim onnxruntime`. 
For using PyTorch backend, exporting onnx is supported by PyTorch.\n    ```py\n    from keras_cv_attention_models import volo, nat, model_surgery\n    mm = nat.DiNAT_Small(pretrained=True)\n    model_surgery.export_onnx(mm, fuse_conv_bn=True, batch_size=1, simplify=True)\n    # Exported simplified onnx: dinat_small.onnx\n\n    # Run test\n    from keras_cv_attention_models.imagenet import eval_func\n    aa = eval_func.ONNXModelInterf(mm.name + '.onnx')\n    inputs = np.random.uniform(size=[1, *mm.input_shape[1:]]).astype('float32')\n    print(f\"{np.allclose(aa(inputs), mm(inputs), atol=1e-5) = }\")\n    # np.allclose(aa(inputs), mm(inputs), atol=1e-5) = True\n    ```\n  - **Model summary** `model_summary.csv` contains gathered model info.\n    - `params` for model params count in `M`\n    - `flops` for FLOPs in `G`\n    - `input` for model input shape\n    - `acc_metrics` means `Imagenet Top1 Accuracy` for recognition models, `COCO val AP` for detection models\n    - `inference_qps` for `T4 inference query per second` with `batch_size=1 + trtexec`\n    - `extra` means if any extra training info.\n    ```py\n    from keras_cv_attention_models import plot_func\n    plot_series = [\n        \"efficientnetv2\", 'tinynet', 'lcnet', 'mobilenetv3', 'fasternet', 'fastervit', 'ghostnet',\n        'inceptionnext', 'efficientvit_b', 'mobilevit', 'convnextv2', 'efficientvit_m', 'hiera',\n    ]\n    plot_func.plot_model_summary(\n        plot_series, model_table=\"model_summary.csv\", log_scale_x=True, allow_extras=['mae_in1k_ft1k']\n    )\n    ```\n    ![model_summary](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_ffe507351a3b.png)\n  - **Code format** is using `line-length=160`:\n    ```sh\n    find .\u002F* -name \"*.py\" | grep -v __init__ | xargs -I {} black -l 160 {}\n    ```\n## T4 Inference\n  - **T4 Inference** in the model tables are tested using `trtexec` on `Tesla T4` with `CUDA=12.0.1-1, Driver=525.60.13`. All models are exported as ONNX using PyTorch backend, using `batch_szie=1` only. **Note: this data is for reference only, and vary in different batch sizes or benchmark tools or platforms or implementations**.\n  - All results are tested using colab [trtexec.ipynb](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1xLwfvbZNqadkdAZu9b0UzOrETLo657oc?usp=drive_link). Thus reproducible by any others.\n  ```py\n  os.environ[\"KECAM_BACKEND\"] = \"torch\"\n\n  from keras_cv_attention_models import convnext, test_images, imagenet\n  # >>>> Using PyTorch backend\n  mm = convnext.ConvNeXtTiny()\n  mm.export_onnx(simplify=True)\n  # Exported onnx: convnext_tiny.onnx\n  # Running onnxsim.simplify...\n  # Exported simplified onnx: convnext_tiny.onnx\n\n  # Onnx run test\n  tt = imagenet.eval_func.ONNXModelInterf('convnext_tiny.onnx')\n  print(mm.decode_predictions(tt(mm.preprocess_input(test_images.cat()))))\n  # [[('n02124075', 'Egyptian_cat', 0.880507), ('n02123045', 'tabby', 0.0047998047), ...]]\n\n  \"\"\" Run trtexec benchmark \"\"\"\n  !trtexec --onnx=convnext_tiny.onnx --fp16 --allowGPUFallback --useSpinWait --useCudaGraph\n  ```\n## Layers\n  - [attention_layers](keras_cv_attention_models\u002Fattention_layers) is `__init__.py` only, which imports core layers defined in model architectures. 
Like `RelativePositionalEmbedding` from `botnet`, `outlook_attention` from `volo`, and many other `Positional Embedding Layers` \u002F `Attention Blocks`.\n  ```py\n  from keras_cv_attention_models import attention_layers\n  aa = attention_layers.RelativePositionalEmbedding()\n  print(f\"{aa(tf.ones([1, 4, 14, 16, 256])).shape = }\")\n  # aa(tf.ones([1, 4, 14, 16, 256])).shape = TensorShape([1, 4, 14, 16, 14, 16])\n  ```\n## Model surgery\n  - [model_surgery](keras_cv_attention_models\u002Fmodel_surgery) includes functions used to change model parameters after the model is built.\n  ```py\n  from keras_cv_attention_models import model_surgery\n  mm = keras.applications.ResNet50()  # Trainable params: 25,583,592\n\n  # Replace all ReLU with PReLU. Trainable params: 25,606,312\n  mm = model_surgery.replace_ReLU(mm, target_activation='PReLU')\n\n  # Fuse conv and batch_norm layers. Trainable params: 25,553,192\n  mm = model_surgery.convert_to_fused_conv_bn_model(mm)\n  ```\n## ImageNet training and evaluating\n  - [ImageNet](keras_cv_attention_models\u002Fimagenet) contains more detailed usage and some comparison results.\n  - [Init Imagenet dataset using tensorflow_datasets #9](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F9).\n  - For a custom dataset, `custom_dataset_script.py` can be used to create a `json` format file, which can be used as `--data_name xxx.json` for training; detailed usage can be found in [Custom recognition dataset](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussion-3971513).\n  - Another method for creating a custom dataset is using `tfds.load`, refer to [Writing custom datasets](https:\u002F\u002Fwww.tensorflow.org\u002Fdatasets\u002Fadd_dataset) and [Creating private tensorflow_datasets from tfds #48](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F48) by @Medicmind.\n  - Running an AWS Sagemaker estimator job using `keras_cv_attention_models` can be found in [AWS Sagemaker script example](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F107) by @Medicmind.\n  - The `aotnet.AotNet50` default parameter set is a typical `ResNet50` architecture with `Conv2D use_bias=False` and `padding` like `PyTorch`.\n  - Default parameters for `train_script.py` are like the `A3` configuration from [ResNet strikes back: An improved training procedure in timm](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.00476.pdf) with `batch_size=256, input_shape=(160, 160)`.\n    ```sh\n    # `antialias` is enabled by default for resize, can be turned off by setting `--disable_antialias`.\n    CUDA_VISIBLE_DEVICES='0' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 train_script.py --seed 0 -s aotnet50\n    ```\n    ```sh\n    # Evaluation using input_shape (224, 224).\n    # `antialias` usage should be the same as in training.\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m aotnet50_epoch_103_val_acc_0.7674.h5 -i 224 --central_crop 0.95\n    # >>>> Accuracy top1: 0.78466 top5: 0.94088\n    ```\n    ![aotnet50_imagenet](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_441ce7e7b0f5.png)\n  - **Restore from a break point** by setting `--restore_path` and `--initial_epoch`, and keep other parameters the same. `restore_path` has higher priority than `model` and `additional_model_kwargs`, and also restores `optimizer` and `loss`. `initial_epoch` is mainly for the learning rate scheduler. 
If not sure where it stopped, check `checkpoints\u002F{save_name}_hist.json`.\n    ```py\n    import json\n    with open(\"checkpoints\u002Faotnet50_hist.json\", \"r\") as ff:\n        aa = json.load(ff)\n    len(aa['lr'])\n    # 41 ==> 41 epochs are finished, initial_epoch is 41 then, restart from epoch 42\n    ```\n    ```sh\n    CUDA_VISIBLE_DEVICES='0' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 train_script.py --seed 0 -r checkpoints\u002Faotnet50_latest.h5 -I 41\n    # >>>> Restore model from: checkpoints\u002Faotnet50_latest.h5\n    # Epoch 42\u002F105\n    ```\n  - **`eval_script.py`** is used for evaluating model accuracy. [EfficientNetV2 self tested imagenet accuracy #19](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F19) just showing how different parameters affecting model accuracy.\n    ```sh\n    # evaluating pretrained builtin model\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m regnet.RegNetZD8\n    # evaluating pretrained timm model\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m timm.models.resmlp_12_224 --input_shape 224\n\n    # evaluating specific h5 model\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m checkpoints\u002Fxxx.h5\n    # evaluating specific tflite model\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m xxx.tflite\n    ```\n  - **Progressive training** refer to [PDF 2104.00298 EfficientNetV2: Smaller Models and Faster Training](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.00298.pdf). AotNet50 A3 progressive input shapes `96 128 160`:\n    ```sh\n    CUDA_VISIBLE_DEVICES='1' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 progressive_train_script.py \\\n    --progressive_epochs 33 66 -1 \\\n    --progressive_input_shapes 96 128 160 \\\n    --progressive_magnitudes 2 4 6 \\\n    -s aotnet50_progressive_3_lr_steps_100 --seed 0\n    ```\n    ![aotnet50_progressive_160](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_0ac3b49a6c69.png)\n  - Transfer learning with `freeze_backbone` or `freeze_norm_layers`: [EfficientNetV2B0 transfer learning on cifar10 testing freezing backbone #55](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F55).\n  - [Token label train test on CIFAR10 #57](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F57). **Currently not working as well as expected**. `Token label` is implementation of [Github zihangJiang\u002FTokenLabeling](https:\u002F\u002Fgithub.com\u002FzihangJiang\u002FTokenLabeling), paper [PDF 2104.10858 All Tokens Matter: Token Labeling for Training Better Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.10858.pdf).\n## COCO training and evaluating\n  - **Currently still under testing**.\n  - [COCO](keras_cv_attention_models\u002Fcoco) contains more detail usage.\n  - `custom_dataset_script.py` can be used creating a `json` format file, which can be used as `--data_name xxx.json` for training, detail usage can be found in [Custom detection dataset](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussioncomment-2460664).\n  - Default parameters for `coco_train_script.py` is `EfficientDetD0` with `input_shape=(256, 256, 3), batch_size=64, mosaic_mix_prob=0.5, freeze_backbone_epochs=32, total_epochs=105`. 
Technically, it's any `pyramid structure backbone` + `EfficientDet \u002F YOLOX header \u002F YOLOR header` + `anchor_free \u002F yolor \u002F efficientdet anchors` combination supported.\n  - Currently 4 types anchors supported, parameter **`anchors_mode`** controls which anchor to use, value in `[\"efficientdet\", \"anchor_free\", \"yolor\", \"yolov8\"]`. Default `None` for `det_header` presets.\n  - **NOTE: `YOLOV8` has a default `regression_len=64` for bbox output length. Typically it's `4` for other detection models, for yolov8 it's `reg_max=16 -> regression_len = 16 * 4 == 64`.**\n\n    | anchors_mode | use_object_scores | num_anchors | anchor_scale | aspect_ratios | num_scales | grid_zero_start |\n    | ------------ | ----------------- | ----------- | ------------ | ------------- | ---------- | --------------- |\n    | efficientdet | False             | 9           | 4            | [1, 2, 0.5]   | 3          | False           |\n    | anchor_free  | True              | 1           | 1            | [1]           | 1          | True            |\n    | yolor        | True              | 3           | None         | presets       | None       | offset=0.5      |\n    | yolov8       | False             | 1           | 1            | [1]           | 1          | False           |\n\n    ```sh\n    # Default EfficientDetD0\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py\n    # Default EfficientDetD0 using input_shape 512, optimizer adamw, freezing backbone 16 epochs, total 50 + 5 epochs\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py -i 512 -p adamw --freeze_backbone_epochs 16 --lr_decay_steps 50\n\n    # EfficientNetV2B0 backbone + EfficientDetD0 detection header\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone efficientnet.EfficientNetV2B0 --det_header efficientdet.EfficientDetD0\n    # ResNest50 backbone + EfficientDetD0 header using yolox like anchor_free anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone resnest.ResNest50 --anchors_mode anchor_free\n    # UniformerSmall32 backbone + EfficientDetD0 header using yolor anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone uniformer.UniformerSmall32 --anchors_mode yolor\n\n    # Typical YOLOXS with anchor_free anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolox.YOLOXS --freeze_backbone_epochs 0\n    # YOLOXS with efficientdet anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolox.YOLOXS --anchors_mode efficientdet --freeze_backbone_epochs 0\n    # CoAtNet0 backbone + YOLOX header with yolor anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone coatnet.CoAtNet0 --det_header yolox.YOLOX --anchors_mode yolor\n\n    # Typical YOLOR_P6 with yolor anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolor.YOLOR_P6 --freeze_backbone_epochs 0\n    # YOLOR_P6 with anchor_free anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolor.YOLOR_P6 --anchors_mode anchor_free  --freeze_backbone_epochs 0\n    # ConvNeXtTiny backbone + YOLOR header with efficientdet anchors\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone convnext.ConvNeXtTiny --det_header yolor.YOLOR --anchors_mode yolor\n    ```\n    **Note: COCO training still under testing, may change parameters and default behaviors. 
Take the risk if would like help developing.**\n  - **`coco_eval_script.py`** is used for evaluating model AP \u002F AR on COCO validation set. It has a dependency `pip install pycocotools` which is not in package requirements. More usage can be found in [COCO Evaluation](keras_cv_attention_models\u002Fcoco#evaluation).\n    ```sh\n    # EfficientDetD0 using resize method bilinear w\u002Fo antialias\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m efficientdet.EfficientDetD0 --resize_method bilinear --disable_antialias\n    # >>>> [COCOEvalCallback] input_shape: (512, 512), pyramid_levels: [3, 7], anchors_mode: efficientdet\n\n    # YOLOX using BGR input format\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m yolox.YOLOXTiny --use_bgr_input --nms_method hard --nms_iou_or_sigma 0.65\n    # >>>> [COCOEvalCallback] input_shape: (416, 416), pyramid_levels: [3, 5], anchors_mode: anchor_free\n\n    # YOLOR \u002F YOLOV7 using letterbox_pad and other tricks.\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m yolor.YOLOR_CSP --nms_method hard --nms_iou_or_sigma 0.65 \\\n    --nms_max_output_size 300 --nms_topk -1 --letterbox_pad 64 --input_shape 704\n    # >>>> [COCOEvalCallback] input_shape: (704, 704), pyramid_levels: [3, 5], anchors_mode: yolor\n\n    # Specify h5 model\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m checkpoints\u002Fyoloxtiny_yolor_anchor.h5\n    # >>>> [COCOEvalCallback] input_shape: (416, 416), pyramid_levels: [3, 5], anchors_mode: yolor\n    ```\n  - **[Experimental] Training using PyTorch backend**\n    ```py\n    import os, sys, torch\n    os.environ[\"KECAM_BACKEND\"] = \"torch\"\n\n    from keras_cv_attention_models.yolov8 import train, yolov8\n    from keras_cv_attention_models import efficientnet\n\n    global_device = torch.device(\"cuda:0\") if torch.cuda.is_available() and int(os.environ.get(\"CUDA_VISIBLE_DEVICES\", \"0\")) >= 0 else torch.device(\"cpu\")\n    # model Trainable params: 7,023,904, GFLOPs: 8.1815G\n    bb = efficientnet.EfficientNetV2B0(input_shape=(3, 640, 640), num_classes=0)\n    model = yolov8.YOLOV8_N(backbone=bb, classifier_activation=None, pretrained=None).to(global_device)  # Note: classifier_activation=None\n    # model = yolov8.YOLOV8_N(input_shape=(3, None, None), classifier_activation=None, pretrained=None).to(global_device)\n    ema = train.train(model, dataset_path=\"coco.json\", initial_epoch=0)\n    ```\n    ![yolov8_training](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_9ca0b522c34d.png)\n## CLIP training and evaluating\n  - [CLIP](keras_cv_attention_models\u002Fclip) contains more detail usage.\n  - `custom_dataset_script.py` can be used creating a `tsv` \u002F `json` format file, which can be used as `--data_name xxx.tsv` for training, detail usage can be found in [Custom caption dataset](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussioncomment-6516154).\n  - **Train using `clip_train_script.py on COCO captions`** Default `--data_path` is a testing one `datasets\u002Fcoco_dog_cat\u002Fcaptions.tsv`.\n    ```sh\n    CUDA_VISIBLE_DEVICES=1 TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python clip_train_script.py -i 160 -b 128 \\\n    --text_model_pretrained None --data_path coco_captions.tsv\n    ```\n    **Train Using PyTorch backend by setting `KECAM_BACKEND='torch'`**\n    ```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python clip_train_script.py -i 160 -b 128 \\\n    
--text_model_pretrained None --data_path coco_captions.tsv\n    ```\n    ![clip_torch_tf](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_884ee09338b0.png)\n## Text training\n  - Currently it's only a simple implementation modified from [Github karpathy\u002FnanoGPT](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002FnanoGPT).\n  - **Train using `text_train_script.py`.** As the dataset is randomly sampled, `steps_per_epoch` needs to be specified.\n    ```sh\n    CUDA_VISIBLE_DEVICES=1 TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python text_train_script.py -m LLaMA2_15M \\\n    --steps_per_epoch 8000 --batch_size 8 --tokenizer SentencePieceTokenizer\n    ```\n    **Train Using PyTorch backend by setting `KECAM_BACKEND='torch'`**\n    ```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python text_train_script.py -m LLaMA2_15M \\\n    --steps_per_epoch 8000 --batch_size 8 --tokenizer SentencePieceTokenizer\n    ```\n    **Plotting**\n    ```py\n    from keras_cv_attention_models import plot_func\n    hists = ['checkpoints\u002Ftext_llama2_15m_tensorflow_hist.json', 'checkpoints\u002Ftext_llama2_15m_torch_hist.json']\n    plot_func.plot_hists(hists, addition_plots=['val_loss', 'lr'], skip_first=3)\n    ```\n    ![text_tf_torch](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_169d0748adf2.png)\n## DDPM training\n  - [Stable Diffusion](keras_cv_attention_models\u002Fstable_diffusion) contains more detailed usage.\n  - **Note: Works better with the PyTorch backend; the Tensorflow one seems overfitted if training longer, like `--epochs 200`, and evaluation runs ~5 times slower. [???]**\n  - **Dataset** can be a directory containing images for basic DDPM training using images only, or a recognition json file created following [Custom recognition dataset](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussion-3971513), which will train using labels as the instruction.\n    ```sh\n    python custom_dataset_script.py --train_images cifar10\u002Ftrain\u002F --test_images cifar10\u002Ftest\u002F\n    # >>>> total_train_samples: 50000, total_test_samples: 10000, num_classes: 10\n    # >>>> Saved to: cifar10.json\n    ```\n  - **Train using `ddpm_train_script.py` on cifar10 with labels.** Default `--data_path` is the builtin `cifar10`.\n    ```sh\n    # Set --eval_interval 50 as TF evaluation is rather slow [???]\n    TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" CUDA_VISIBLE_DEVICES=1 python ddpm_train_script.py --eval_interval 50\n    ```\n    **Train Using PyTorch backend by setting `KECAM_BACKEND='torch'`**\n    ```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python ddpm_train_script.py\n    ```\n    ![ddpm_unet_test_E100](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_aae031eaab57.png)\n## Visualizing\n  - [Visualizing](keras_cv_attention_models\u002Fvisualizing) is for visualizing convnet filters or attention map scores.\n  - **make_and_apply_gradcam_heatmap** is for Grad-CAM class activation visualization.\n    ```py\n    from keras_cv_attention_models import visualizing, test_images, resnest\n    mm = resnest.ResNest50()\n    img = test_images.dog()\n    superimposed_img, heatmap, preds = visualizing.make_and_apply_gradcam_heatmap(mm, img, layer_name=\"auto\")\n    ```\n    ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_b9beb662002f.png)\n  - **plot_attention_score_maps** is for model 
attention score maps visualization.\n    ```py\n    from keras_cv_attention_models import visualizing, test_images, botnet\n    img = test_images.dog()\n    _ = visualizing.plot_attention_score_maps(botnet.BotNetSE33T(), img)\n    ```\n    ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_086c1a2556dc.png)\n## TFLite Conversion\n  - Currently `TFLite` not supporting `tf.image.extract_patches` \u002F `tf.transpose with len(perm) > 4`. Some operations could be supported in latest or `tf-nightly` version, like previously not supported `gelu` \u002F `Conv2D with groups>1` are working now. May try if encountering issue.\n  - More discussion can be found [Converting a trained keras CV attention model to TFLite #17](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F17). Some speed testing results can be found [How to speed up inference on a quantized model #44](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F44#discussioncomment-2348910).\n  - Functions like `model_surgery.convert_groups_conv2d_2_split_conv2d` and `model_surgery.convert_gelu_to_approximate` are not needed using up-to-date TF version.\n  - Not supporting `VOLO` \u002F `HaloNet` models converting, cause they need a longer `tf.transpose` `perm`.\n  - **model_surgery.convert_dense_to_conv** converts all `Dense` layer with 3D \u002F 4D inputs to `Conv1D` \u002F `Conv2D`, as currently TFLite xnnpack not supporting it.\n    ```py\n    from keras_cv_attention_models import beit, model_surgery, efficientformer, mobilevit\n\n    mm = efficientformer.EfficientFormerL1()\n    mm = model_surgery.convert_dense_to_conv(mm)  # Convert all Dense layers\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", \"wb\").write(converter.convert())\n    ```\n    | Model             | Dense, use_xnnpack=false  | Conv, use_xnnpack=false   | Conv, use_xnnpack=true    |\n    | ----------------- | ------------------------- | ------------------------- | ------------------------- |\n    | MobileViT_S       | Inference (avg) 215371 us | Inference (avg) 163836 us | Inference (avg) 163817 us |\n    | EfficientFormerL1 | Inference (avg) 126829 us | Inference (avg) 107053 us | Inference (avg) 107132 us |\n  - **model_surgery.convert_extract_patches_to_conv** converts `tf.image.extract_patches` to a `Conv2D` version:\n    ```py\n    from keras_cv_attention_models import cotnet, model_surgery\n    from keras_cv_attention_models.imagenet import eval_func\n\n    mm = cotnet.CotNetSE50D()\n    mm = model_surgery.convert_groups_conv2d_2_split_conv2d(mm)\n    # mm = model_surgery.convert_gelu_to_approximate(mm)  # Not required if using up-to-date TFLite\n    mm = model_surgery.convert_extract_patches_to_conv(mm)\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", \"wb\").write(converter.convert())\n    test_inputs = np.random.uniform(size=[1, *mm.input_shape[1:]])\n    print(np.allclose(mm(test_inputs), eval_func.TFLiteModelInterf(mm.name + '.tflite')(test_inputs), atol=1e-7))\n    # True\n    ```\n  - **model_surgery.prepare_for_tflite** is just a combination of above functions:\n    ```py\n    from keras_cv_attention_models import beit, model_surgery\n\n    mm = beit.BeitBasePatch16()\n    mm = model_surgery.prepare_for_tflite(mm)\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", 
\"wb\").write(converter.convert())\n    ```\n  - **Detection models** including `efficinetdet` \u002F `yolox` \u002F `yolor`, model can be converted a TFLite format directly. If need [DecodePredictions](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fblob\u002Fmain\u002Fkeras_cv_attention_models\u002Fcoco\u002Feval_func.py#L8) also included in TFLite model, need to set `use_static_output=True` for `DecodePredictions`, as TFLite requires a more static output shape. Model output shape will be fixed as `[batch, max_output_size, 6]`. The last dimension `6` means `[bbox_top, bbox_left, bbox_bottom, bbox_right, label_index, confidence]`, and those valid ones are where `confidence > 0`.\n    ```py\n    \"\"\" Init model \"\"\"\n    from keras_cv_attention_models import efficientdet\n    model = efficientdet.EfficientDetD0(pretrained=\"coco\")\n\n    \"\"\" Create a model with DecodePredictions using `use_static_output=True` \"\"\"\n    model.decode_predictions.use_static_output = True\n    # parameters like score_threshold \u002F iou_or_sigma can be set another value if needed.\n    nn = model.decode_predictions(model.outputs[0], score_threshold=0.5)\n    bb = keras.models.Model(model.inputs[0], nn)\n\n    \"\"\" Convert TFLite \"\"\"\n    converter = tf.lite.TFLiteConverter.from_keras_model(bb)\n    open(bb.name + \".tflite\", \"wb\").write(converter.convert())\n\n    \"\"\" Inference test \"\"\"\n    from keras_cv_attention_models.imagenet import eval_func\n    from keras_cv_attention_models import test_images\n\n    dd = eval_func.TFLiteModelInterf(bb.name + \".tflite\")\n    imm = test_images.cat()\n    inputs = tf.expand_dims(tf.image.resize(imm, dd.input_shape[1:-1]), 0)\n    inputs = keras.applications.imagenet_utils.preprocess_input(inputs, mode='torch')\n    preds = dd(inputs)[0]\n    print(f\"{preds.shape = }\")\n    # preds.shape = (100, 6)\n\n    pred = preds[preds[:, -1] > 0]\n    bboxes, labels, confidences = pred[:, :4], pred[:, 4], pred[:, -1]\n    print(f\"{bboxes = }, {labels = }, {confidences = }\")\n    # bboxes = array([[0.22825494, 0.47238672, 0.816262  , 0.8700745 ]], dtype=float32),\n    # labels = array([16.], dtype=float32),\n    # confidences = array([0.8309707], dtype=float32)\n\n    \"\"\" Show result \"\"\"\n    from keras_cv_attention_models.coco import data\n    data.show_image_with_bboxes(imm, bboxes, labels, confidences, num_classes=90)\n    ```\n## Using PyTorch as backend\n  - **Experimental** [Keras PyTorch Backend](keras_cv_attention_models\u002Fpytorch_backend).\n  - **Set os environment `export KECAM_BACKEND='torch'` to enable this PyTorch backend.**\n  - Currently supports most recognition and detection models except hornet*gf \u002F nfnets \u002F volo. For detection models, using `torchvision.ops.nms` while running prediction.\n  - **Basic model build and prediction**.\n    - Will load same `h5` weights as TF one if available.\n    - Note: `input_shape` will auto fit image data format. 
Given `input_shape=(224, 224, 3)` or `input_shape=(3, 224, 224)`, will both set to `(3, 224, 224)` if `channels_first`.\n    - Note: model is default set to `eval` mode.\n    ```py\n    os.environ['KECAM_BACKEND'] = 'torch'\n    from keras_cv_attention_models import res_mlp\n    mm = res_mlp.ResMLP12()\n    # >>>> Load pretrained from: ~\u002F.keras\u002Fmodels\u002Fresmlp12_imagenet.h5\n    print(f\"{mm.input_shape = }\")\n    # mm.input_shape = [None, 3, 224, 224]\n\n    import torch\n    print(f\"{isinstance(mm, torch.nn.Module) = }\")\n    # isinstance(mm, torch.nn.Module) = True\n\n    # Run prediction\n    from keras_cv_attention_models.test_images import cat\n    print(mm.decode_predictions(mm(mm.preprocess_input(cat())))[0])\n    # [('n02124075', 'Egyptian_cat', 0.9597896), ('n02123045', 'tabby', 0.012809471), ...]\n    ```\n  - **Export typical PyTorch onnx \u002F pth**.\n    ```py\n    import torch\n    torch.onnx.export(mm, torch.randn(1, 3, *mm.input_shape[2:]), mm.name + \".onnx\")\n\n    # Or by export_onnx\n    mm.export_onnx()\n    # Exported onnx: resmlp12.onnx\n\n    mm.export_pth()\n    # Exported pth: resmlp12.pth\n    ```\n  - **Save weights as h5**. This `h5` can also be loaded in typical TF backend model. Currently it's only weights without model structure supported.\n    ```py\n    mm.save_weights(\"foo.h5\")\n    ```\n  - **Training with compile and fit** Note: loss function arguments should be `y_true, y_pred`, while typical torch loss functions using `y_pred, y_true`.\n    ```py\n    import torch\n    from keras_cv_attention_models.backend import models, layers\n    mm = models.Sequential([layers.Input([3, 32, 32]), layers.Conv2D(32, 3), layers.GlobalAveragePooling2D(), layers.Dense(10)])\n    if torch.cuda.is_available():\n        _ = mm.to(\"cuda\")\n    xx = torch.rand([64, *mm.input_shape[1:]])\n    yy = torch.functional.F.one_hot(torch.randint(0, mm.output_shape[-1], size=[64]), mm.output_shape[-1]).float()\n    loss = lambda y_true, y_pred: (y_true - y_pred.float()).abs().mean()\n    # Will check kwargs for calling `self.train_compile` or `torch.nn.Module.compile`\n    mm.compile(optimizer=\"AdamW\", loss=loss, metrics='acc', grad_accumulate=4)\n    mm.fit(xx, yy, epochs=2, batch_size=4)\n    ```\n## Using keras core as backend\n  - **[Experimental] Set os environment `export KECAM_BACKEND='keras_core'` to enable this `keras_core` backend. Not using `keras>3.0`, as still not compiling with TensorFlow==2.15.0**\n  - `keras-core` has its own backends, supporting tensorflow \u002F torch \u002F jax, by editting `~\u002F.keras\u002Fkeras.json` `\"backend\"` value.\n  - Currently most recognition models except `HaloNet` \u002F `BotNet` supported, also `GPT2` \u002F `LLaMA2` supported.\n  - **Basic model build and prediction**.\n    ```py\n    !pip install sentencepiece  # required for llama2 tokenizer\n    os.environ['KECAM_BACKEND'] = 'keras_core'\n    os.environ['KERAS_BACKEND'] = 'jax'\n    import kecam\n    print(f\"{kecam.backend.backend() = }\")\n    # kecam.backend.backend() = 'jax'\n    mm = kecam.llama2.LLaMA2_42M()\n    # >>>> Load pretrained from: ~\u002F.keras\u002Fmodels\u002Fllama2_42m_tiny_stories.h5\n    mm.run_prediction('As evening fell, a maiden stood at the edge of a wood. In her hands,')\n    # >>>> Load tokenizer from file: ~\u002F.keras\u002Fdatasets\u002Fllama_tokenizer.model\n    # \u003Cs>\n    # As evening fell, a maiden stood at the edge of a wood. In her hands, she held a beautiful diamond. 
Everyone was surprised to see it.\n    # \"What is it?\" one of the kids asked.\n    # \"It's a diamond,\" the maiden said.\n    # ...\n    ```\n***\n\n# Recognition Models\n## AotNet\n  - [Keras AotNet](keras_cv_attention_models\u002Faotnet) is just a `ResNet` \u002F `ResNetV2` like framework, that set parameters like `attn_types` and `se_ratio` and others, which is used to apply different types attention layer. Works like `byoanet` \u002F `byobnet` from `timm`.\n  - Default parameters set is a typical `ResNet` architecture with `Conv2D use_bias=False` and `padding` like `PyTorch`.\n  ```py\n  from keras_cv_attention_models import aotnet\n  # Mixing se and outlook and halo and mhsa and cot_attention, 21M parameters.\n  # 50 is just a picked number that larger than the relative `num_block`.\n  attn_types = [None, \"outlook\", [\"bot\", \"halo\"] * 50, \"cot\"],\n  se_ratio = [0.25, 0, 0, 0],\n  model = aotnet.AotNet50V2(attn_types=attn_types, se_ratio=se_ratio, stem_type=\"deep\", strides=1)\n  model.summary()\n  ```\n## BEiT\n  - [Keras BEiT](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2106.08254 BEiT: BERT Pre-Training of Image Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.08254.pdf).\n\n  | Model                      | Params  | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | -------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [BeitBasePatch16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_base_patch16_224_imagenet21k-ft1k.h5)  | 86.53M  | 17.61G  | 224   | 85.240   | 321.226 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_base_patch16_384_imagenet21k-ft1k.h5)            | 86.74M  | 55.70G  | 384   | 86.808   | 164.705 qps  |\n  | [BeitLargePatch16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_224_imagenet21k-ft1k.h5) | 304.43M | 61.68G  | 224   | 87.476   | 105.998 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_384_imagenet21k-ft1k.h5)            | 305.00M | 191.65G | 384   | 88.382   | 45.7307 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_512_imagenet21k-ft1k.h5)            | 305.67M | 363.46G | 512   | 88.584   | 21.3097 qps  |\n## BEiTV2\n  - [Keras BEiT](keras_cv_attention_models\u002Fbeit) includes models from BeitV2 Paper [PDF 2208.06366 BEiT v2: Masked Image Modeling with Vector-Quantized Visual Tokenizers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.06366.pdf).\n\n  | Model              | Params  | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------- | ------ | ----- | -------- | ------------ |\n  | BeitV2BasePatch16  | 86.53M  | 17.61G | 224   | 85.5     | 322.52 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_v2_base_patch16_224_imagenet21k-ft1k.h5) | 86.53M          | 17.61G | 224   | 86.5     | 322.52 qps   |\n  | BeitV2LargePatch16 | 304.43M | 61.68G | 224   | 87.3     | 105.734 qps  |\n  | - 
[21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_v2_large_patch16_224_imagenet21k-ft1k.h5)         | 304.43M | 61.68G | 224   | 88.4     | 105.734 qps  |\n## BotNet\n  - [Keras BotNet](keras_cv_attention_models\u002Fbotnet) is for [PDF 2101.11605 Bottleneck Transformers for Visual Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.11605.pdf).\n\n  | Model         | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | BotNet50      | 21M    | 5.42G  | 224   |          | 746.454 qps  |\n  | BotNet101     | 41M    | 9.13G  | 224   |          | 448.102 qps  |\n  | BotNet152     | 56M    | 12.84G | 224   |          | 316.671 qps  |\n  | [BotNet26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnet26t_256_imagenet.h5)     | 12.5M  | 3.30G  | 256   | 79.246   | 1188.84 qps  |\n  | [BotNextECA26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnext_eca26t_256_imagenet.h5) | 10.59M | 2.45G  | 256   | 79.270   | 1038.19 qps  |\n  | [BotNetSE33T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnet_se33t_256_imagenet.h5)   | 13.7M  | 3.89G  | 256   | 81.2     | 610.429 qps  |\n## CAFormer\n  - [Keras CAFormer](keras_cv_attention_models\u002Fcaformer) is for [PDF 2210.13452 MetaFormer Baselines for Vision](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.13452.pdf). `CAFormer` is using 2 transformer stacks, while `ConvFormer` is all conv blocks.\n\n  | Model                   | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ----------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [CAFormerS18](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_224_imagenet.h5)             | 26M    | 4.1G  | 224   | 83.6     | 399.127 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_384_imagenet.h5)                   | 26M    | 13.4G | 384   | 85.0     | 181.993 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_224_imagenet21k-ft1k.h5)      | 26M    | 4.1G  | 224   | 84.1     | 399.127 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_384_imagenet21k-ft1k.h5) | 26M    | 13.4G | 384   | 85.4     | 181.993 qps  |\n  | [CAFormerS36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_224_imagenet.h5)             | 39M    | 8.0G  | 224   | 84.5     | 204.328 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_384_imagenet.h5)                   | 39M    | 26.0G | 384   | 85.7     | 102.04 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_224_imagenet21k-ft1k.h5)      | 39M    | 8.0G  | 224   | 
85.8     | 204.328 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_384_imagenet21k-ft1k.h5) | 39M    | 26.0G | 384   | 86.9     | 102.04 qps   |\n  | [CAFormerM36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_224_imagenet.h5)             | 56M    | 13.2G | 224   | 85.2     | 162.257 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_384_imagenet.h5)                   | 56M    | 42.0G | 384   | 86.2     | 65.6188 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_224_imagenet21k-ft1k.h5)      | 56M    | 13.2G | 224   | 86.6     | 162.257 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_384_imagenet21k-ft1k.h5) | 56M    | 42.0G | 384   | 87.5     | 65.6188 qps  |\n  | [CAFormerB36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_224_imagenet.h5)             | 99M    | 23.2G | 224   | 85.5     | 116.865 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_384_imagenet.h5)                   | 99M    | 72.2G | 384   | 86.4     | 50.0244 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_224_imagenet21k-ft1k.h5)      | 99M    | 23.2G | 224   | 87.4     | 116.865 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_384_imagenet21k-ft1k.h5) | 99M    | 72.2G | 384   | 88.1     | 50.0244 qps  |\n\n  | Model                   | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ----------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [ConvFormerS18](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_224_imagenet.h5)           | 27M    | 3.9G  | 224   | 83.0     | 295.114 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_384_imagenet.h5)                   | 27M    | 11.6G | 384   | 84.4     | 145.923 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_224_imagenet21k-ft1k.h5)      | 27M    | 3.9G  | 224   | 83.7     | 295.114 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet21k-ft1k.h5) | 27M    | 11.6G | 384   | 85.0     | 145.923 qps  |\n  | [ConvFormerS36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_224_imagenet.h5)           | 40M    | 7.6G  | 224   | 84.1     | 161.609 qps  |\n  | - 
[384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet.h5)                   | 40M    | 22.4G | 384   | 85.4     | 80.2101 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_224_imagenet21k-ft1k.h5)      | 40M    | 7.6G  | 224   | 85.4     | 161.609 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet21k-ft1k.h5) | 40M    | 22.4G | 384   | 86.4     | 80.2101 qps  |\n  | [ConvFormerM36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_224_imagenet.h5)           | 57M    | 12.8G | 224   | 84.5     | 130.161 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_384_imagenet.h5)                   | 57M    | 37.7G | 384   | 85.6     | 63.9712 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_224_imagenet21k-ft1k.h5)      | 57M    | 12.8G | 224   | 86.1     | 130.161 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_384_imagenet21k-ft1k.h5) | 57M    | 37.7G | 384   | 86.9     | 63.9712 qps  |\n  | [ConvFormerB36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_224_imagenet.h5)           | 100M   | 22.6G | 224   | 84.8     | 98.0751 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_384_imagenet.h5)                   | 100M   | 66.5G | 384   | 85.7     | 48.5897 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_224_imagenet21k-ft1k.h5)      | 100M   | 22.6G | 224   | 87.0     | 98.0751 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_384_imagenet21k-ft1k.h5) | 100M   | 66.5G | 384   | 87.6     | 48.5897 qps  |\n## CMT\n  - [Keras CMT](keras_cv_attention_models\u002Fcmt) is for [PDF 2107.06263 CMT: Convolutional Neural Networks Meet Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.06263.pdf).\n\n  | Model                              | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ---------------------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | CMTTiny, (Self trained 105 epochs) | 9.5M   | 0.65G | 160   | 77.4     | 315.566 qps  |\n  | - [(305 epochs)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_160_imagenet.h5)                     | 9.5M   | 0.65G | 160   | 78.94    | 315.566 qps  |\n  | - [224, (fine-tuned 69 epochs)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_224_imagenet.h5)      | 9.5M   | 1.32G | 224 
  | 80.73    | 254.87 qps   |\n  | [CMTTiny_torch, (1000 epochs)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_torch_160_imagenet.h5)       | 9.5M   | 0.65G | 160   | 79.2     | 338.207 qps  |\n  | [CMTXS_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_xs_torch_192_imagenet.h5)                        | 15.2M  | 1.58G | 192   | 81.8     | 241.288 qps  |\n  | [CMTSmall_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_small_torch_224_imagenet.h5)                     | 25.1M  | 4.09G | 224   | 83.5     | 171.109 qps  |\n  | [CMTBase_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_base_torch_256_imagenet.h5)                      | 45.7M  | 9.42G | 256   | 84.5     | 103.34 qps   |\n## CoaT\n  - [Keras CoaT](keras_cv_attention_models\u002Fcoat) is for [PDF 2104.06399 CoaT: Co-Scale Conv-Attentional Image Transformers](http:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06399).\n\n  | Model         | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [CoaTLiteTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_tiny_imagenet.h5)  | 5.7M   | 1.60G | 224   | 77.5     | 450.27 qps   |\n  | [CoaTLiteMini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_mini_imagenet.h5)  | 11M    | 2.00G | 224   | 79.1     | 452.884 qps  |\n  | [CoaTLiteSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_small_imagenet.h5) | 20M    | 3.97G | 224   | 81.9     | 248.846 qps  |\n  | [CoaTTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_tiny_imagenet.h5)      | 5.5M   | 4.33G | 224   | 78.3     | 152.495 qps  |\n  | [CoaTMini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_mini_imagenet.h5)      | 10M    | 6.78G | 224   | 81.0     | 124.845 qps  |\n## CoAtNet\n  - [Keras CoAtNet](keras_cv_attention_models\u002Fcoatnet) is for [PDF 2106.04803 CoAtNet: Marrying Convolution and Attention for All Data Sizes](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.04803.pdf).\n\n  | Model                               | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ----------------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [CoAtNet0, 160, (105 epochs)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoatnet\u002Fcoatnet0_160_imagenet.h5) | 23.3M  | 2.09G  | 160   | 80.48    | 584.059 qps  |\n  | [CoAtNet0, (305 epochs)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoatnet\u002Fcoatnet0_224_imagenet.h5) | 23.8M  | 4.22G  | 224   | 82.79    | 400.333 qps  |\n  | CoAtNet0                            | 25M    | 4.6G   | 224   | 82.0     | 400.333 qps  |\n  | - use_dw_strides=False              | 25M    | 4.2G   | 224   | 81.6     | 461.197 qps  |\n  | CoAtNet1                            
| 42M    | 8.8G   | 224   | 83.5     | 206.954 qps  |\n  | - use_dw_strides=False              | 42M    | 8.4G   | 224   | 83.3     | 228.938 qps  |\n  | CoAtNet2                            | 75M    | 16.6G  | 224   | 84.1     | 156.359 qps  |\n  | - use_dw_strides=False              | 75M    | 15.7G  | 224   | 84.1     | 165.846 qps  |\n  | CoAtNet2, 21k_ft1k                  | 75M    | 16.6G  | 224   | 87.1     | 156.359 qps  |\n  | CoAtNet3                            | 168M   | 34.7G  | 224   | 84.5     | 95.0703 qps  |\n  | CoAtNet3, 21k_ft1k                  | 168M   | 34.7G  | 224   | 87.6     | 95.0703 qps  |\n  | CoAtNet3, 21k_ft1k                  | 168M   | 203.1G | 512   | 87.9     | 95.0703 qps  |\n  | CoAtNet4, 21k_ft1k                  | 275M   | 360.9G | 512   | 88.1     | 74.6022 qps  |\n  | CoAtNet4, 21k_ft1k, PT-RA-E150      | 275M   | 360.9G | 512   | 88.56    | 74.6022 qps  |\n## ConvNeXt\n  - [Keras ConvNeXt](keras_cv_attention_models\u002Fconvnext) is for [PDF 2201.03545 A ConvNet for the 2020s](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.03545.pdf).\n\n  | Model                   | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ----------------------- | ------ | ------- | ----- | -------- | ------------ |\n  | [ConvNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_imagenet.h5)            | 28M    | 4.49G   | 224   | 82.1     | 361.58 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_224_imagenet21k-ft1k.h5)      | 28M    | 4.49G   | 224   | 82.9     | 361.58 qps   |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_384_imagenet21k-ft1k.h5) | 28M    | 13.19G  | 384   | 84.1     | 182.134 qps  |\n  | [ConvNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_imagenet.h5)           | 50M    | 8.73G   | 224   | 83.1     | 202.007 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_224_imagenet21k-ft1k.h5)      | 50M    | 8.73G   | 224   | 84.6     | 202.007 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_384_imagenet21k-ft1k.h5) | 50M    | 25.67G  | 384   | 85.8     | 108.125 qps  |\n  | [ConvNeXtBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_224_imagenet.h5)            | 89M    | 15.42G  | 224   | 83.8     | 160.036 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_384_imagenet.h5)                   | 89M    | 45.32G  | 384   | 85.1     | 83.3095 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_224_imagenet21k-ft1k.h5)      | 89M    | 15.42G  | 224   | 85.8     | 160.036 qps  |\n  | - [21k_ft1k, 
384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_384_imagenet21k-ft1k.h5) | 89M    | 45.32G  | 384   | 86.8     | 83.3095 qps  |\n  | [ConvNeXtLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_224_imagenet.h5)           | 198M   | 34.46G  | 224   | 84.3     | 102.27 qps   |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_384_imagenet.h5)                   | 198M   | 101.28G | 384   | 85.5     | 47.2086 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_224_imagenet21k-ft1k.h5)      | 198M   | 34.46G  | 224   | 86.6     | 102.27 qps   |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_384_imagenet21k-ft1k.h5) | 198M   | 101.28G | 384   | 87.5     | 47.2086 qps  |\n  | [ConvNeXtXlarge, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xlarge_224_imagenet21k-ft1k.h5)     | 350M   | 61.06G  | 224   | 87.0     | 40.5776 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xlarge_384_imagenet21k-ft1k.h5)              | 350M   | 179.43G | 384   | 87.8     | 21.797 qps   |\n  | [ConvNeXtXXLarge, clip](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xxlarge_clip-ft1k.h5)   | 846M   | 198.09G | 256   | 88.6     |              |\n## ConvNeXtV2\n  - [Keras ConvNeXt](keras_cv_attention_models\u002Fconvnext) includes implementation of [PDF 2301.00808 ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.00808.pdf). 
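Like the rest of the families in this collection, the variants below are exposed as plain constructors under their sub-module; a minimal loading sketch (the model name and `pretrained` tag are illustrative, assuming the library's usual constructor interface):\n\n  ```py\n  from keras_cv_attention_models import convnext\n\n  # Instantiate a ConvNeXtV2 variant from the table below; known `pretrained` tags\n  # trigger an automatic weight download (assumption: 'imagenet' maps to the 224 checkpoint)\n  mm = convnext.ConvNeXtV2Tiny(pretrained='imagenet')\n  print(mm.input_shape, mm.output_shape)  # expect (None, 224, 224, 3) and (None, 1000)\n  ```\n\n  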
**Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only**.\n\n  | Model                   | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ----------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [ConvNeXtV2Atto](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_atto_imagenet.h5)          | 3.7M   | 0.55G  | 224   | 76.7     | 705.822 qps  |\n  | [ConvNeXtV2Femto](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_femto_imagenet.h5)         | 5.2M   | 0.78G  | 224   | 78.5     | 728.02 qps   |\n  | [ConvNeXtV2Pico](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_pico_imagenet.h5)          | 9.1M   | 1.37G  | 224   | 80.3     | 591.502 qps  |\n  | [ConvNeXtV2Nano](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_imagenet.h5)          | 15.6M  | 2.45G  | 224   | 81.9     | 471.918 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_224_imagenet21k-ft1k.h5)      | 15.6M  | 2.45G  | 224   | 82.1     | 471.918 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_384_imagenet21k-ft1k.h5) | 15.6M  | 7.21G  | 384   | 83.4     | 213.802 qps  |\n  | [ConvNeXtV2Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_imagenet.h5)          | 28.6M  | 4.47G  | 224   | 83.0     | 301.982 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_224_imagenet21k-ft1k.h5)      | 28.6M  | 4.47G  | 224   | 83.9     | 301.982 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_384_imagenet21k-ft1k.h5) | 28.6M  | 13.1G  | 384   | 85.1     | 139.578 qps  |\n  | [ConvNeXtV2Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_imagenet.h5)          | 89M    | 15.4G  | 224   | 84.9     | 132.575 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_224_imagenet21k-ft1k.h5)      | 89M    | 15.4G  | 224   | 86.8     | 132.575 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_384_imagenet21k-ft1k.h5) | 89M    | 45.2G  | 384   | 87.7     | 66.5729 qps  |\n  | [ConvNeXtV2Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_imagenet.h5)         | 198M   | 34.4G  | 224   | 85.8     | 86.8846 qps  |\n  | - 
[21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_224_imagenet21k-ft1k.h5)      | 198M   | 34.4G  | 224   | 87.3     | 86.8846 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_384_imagenet21k-ft1k.h5) | 198M   | 101.1G | 384   | 88.2     | 24.4542 qps  |\n  | [ConvNeXtV2Huge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_imagenet.h5)          | 660M   | 115G   | 224   | 86.3     |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_384_imagenet21k-ft1k.h5) | 660M   | 337.9G | 384   | 88.7     |              |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_512_imagenet21k-ft1k.h5) | 660M   | 600.8G | 512   | 88.9     |              |\n## CoTNet\n  - [Keras CoTNet](keras_cv_attention_models\u002Fcotnet) is for [PDF 2107.12292 Contextual Transformer Networks for Visual Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.12292.pdf).\n\n  | Model        | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------ |:------:| ------ | ----- |:--------:| ------------ |\n  | [CotNet50](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet50_224_imagenet.h5)     | 22.2M  | 3.25G  | 224   |   81.3   | 324.913 qps  |\n  | [CotNetSE50D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se50d_224_imagenet.h5)  | 23.1M  | 4.05G  | 224   |   81.6   | 513.077 qps  |\n  | [CotNet101](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet101_224_imagenet.h5)    | 38.3M  | 6.07G  | 224   |   82.8   | 183.824 qps  |\n  | [CotNetSE101D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se101d_224_imagenet.h5) | 40.9M  | 8.44G  | 224   |   83.2   | 251.487 qps  |\n  | [CotNetSE152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se152d_224_imagenet.h5) | 55.8M  | 12.22G | 224   |   84.0   | 175.469 qps  |\n  | [CotNetSE152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se152d_320_imagenet.h5) | 55.8M  | 24.92G | 320   |   84.6   | 175.469 qps  |\n## CSPNeXt\n  - [Keras CSPNeXt](keras_cv_attention_models\u002Fcspnext) is for the backbone of [PDF 2212.07784 RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07784).\n\n  | Model         | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ----- | ----- | -------- | -------- |\n  | [CSPNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_tiny_imagenet.h5)   | 2.73M  | 0.34G | 224   | 69.44    |  |\n  | 
[CSPNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_small_imagenet.h5)  | 4.89M  | 0.66G | 224   | 74.41    |  |\n  | [CSPNeXtMedium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_medium_imagenet.h5) | 13.05M | 1.92G | 224   | 79.27    |  |\n  | [CSPNeXtLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_large_imagenet.h5)  | 27.16M | 4.19G | 224   | 81.30    |  |\n  | [CSPNeXtXLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_xlarge_imagenet.h5) | 48.85M | 7.75G | 224   | 82.10    |  |\n## DaViT\n  - [Keras DaViT](keras_cv_attention_models\u002Fdavit) is for [PDF 2204.03645 DaViT: Dual Attention Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.03645.pdf).\n\n  | Model              | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [DaViT_T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_t_imagenet.h5)            | 28.36M | 4.56G  | 224   | 82.8     | 224.563 qps  |\n  | [DaViT_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_s_imagenet.h5)            | 49.75M | 8.83G  | 224   | 84.2     | 145.838 qps  |\n  | [DaViT_B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_b_imagenet.h5)            | 87.95M | 15.55G | 224   | 84.6     | 114.527 qps  |\n  | DaViT_L, 21k_ft1k  | 196.8M | 103.2G | 384   | 87.5     | 34.7015 qps  |\n  | DaViT_H, 1.5B      | 348.9M | 327.3G | 512   | 90.2     | 12.363 qps   |\n  | DaViT_G, 1.5B      | 1.406B | 1.022T | 512   | 90.4     |              |\n## DiNAT\n  - [Keras DiNAT](keras_cv_attention_models\u002Fnat) is for [PDF 2209.15001 Dilated Neighborhood Attention Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.15001.pdf).\n\n  | Model                     | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [DiNAT_Mini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_mini_imagenet.h5)                | 20.0M  | 2.73G  | 224   | 81.8     | 83.9943 qps  |\n  | [DiNAT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_tiny_imagenet.h5)                | 27.9M  | 4.34G  | 224   | 82.7     | 61.1902 qps  |\n  | [DiNAT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_small_imagenet.h5)               | 50.7M  | 7.84G  | 224   | 83.8     | 41.0343 qps  |\n  | [DiNAT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_base_imagenet.h5)                | 89.8M  | 13.76G | 224   | 84.4     | 30.1332 qps  |\n  | [DiNAT_Large, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_224_imagenet21k-ft1k.h5)    
 | 200.9M | 30.58G | 224   | 86.6     | 18.4936 qps  |\n  | - [21k, (num_classes=21841)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_imagenet21k.h5)   | 200.9M | 30.58G | 224   |          |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_384_imagenet21k-ft1k.h5)           | 200.9M | 89.86G | 384   | 87.4     |              |\n  | [DiNAT_Large_K11, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_k11_imagenet21k-ft1k.h5) | 201.1M | 92.57G | 384   | 87.5     |              |\n## DINOv2\n  - [Keras DINOv2](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2304.07193 DINOv2: Learning Robust Visual Features without Supervision](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.07193.pdf).\n\n  | Model              | Params  | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------- | ------- | ----- | -------- | ------------ |\n  | [DINOv2_ViT_Small14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_small14_518_imagenet.h5) | 22.83M  | 47.23G  | 518   | 81.1     | 165.271 qps  |\n  | [DINOv2_ViT_Base14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_base14_518_imagenet.h5)  | 88.12M  | 152.6G  | 518   | 84.5     | 54.9769 qps  |\n  | [DINOv2_ViT_Large14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_large14_518_imagenet.h5) | 306.4M  | 509.6G  | 518   | 86.3     | 17.4108 qps  |\n  | [DINOv2_ViT_Giant14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_giant14_518_imagenet.h5) | 1139.6M | 1790.3G | 518   | 86.5     |              |\n## EdgeNeXt\n  - [Keras EdgeNeXt](keras_cv_attention_models\u002Fedgenext) is for [PDF 2206.10589 EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.10589.pdf).\n\n  | Model             | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ----------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [EdgeNeXt_XX_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_xx_small_256_imagenet.h5) | 1.33M  | 266M   | 256   | 71.23    | 902.957 qps  |\n  | [EdgeNeXt_X_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_x_small_256_imagenet.h5)  | 2.34M  | 547M   | 256   | 74.96    | 638.346 qps  |\n  | [EdgeNeXt_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_small_256_imagenet.h5)    | 5.59M  | 1.27G  | 256   | 79.41    | 536.762 qps  |\n  | - [usi](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_small_256_usi.h5)             | 5.59M  | 1.27G  | 256   | 81.07    | 536.762 qps  |\n  | 
[EdgeNeXt_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_imagenet.h5)     | 18.5M  | 3.86G  | 256   | 82.47    | 383.461 qps  |\n  | - [usi](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_usi.h5)             | 18.5M  | 3.86G  | 256   | 83.31    | 383.461 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_imagenet-ft1k.h5)        | 18.5M  | 3.86G  | 256   | 83.68    | 383.461 qps  |\n## EfficientFormer\n  - [Keras EfficientFormer](keras_cv_attention_models\u002Fefficientformer) is for [PDF 2206.01191 EfficientFormer: Vision Transformers at MobileNet Speed](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.01191.pdf).\n\n  | Model                      | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | -------------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [EfficientFormerL1, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l1_224_imagenet.h5) | 12.3M  | 1.31G | 224   | 79.2     | 1214.22 qps  |\n  | [EfficientFormerL3, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l3_224_imagenet.h5) | 31.4M  | 3.95G | 224   | 82.4     | 596.705 qps  |\n  | [EfficientFormerL7, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l7_224_imagenet.h5) | 74.4M  | 9.79G | 224   | 83.3     | 298.434 qps  |\n## EfficientFormerV2\n  - [Keras EfficientFormer](keras_cv_attention_models\u002Fefficientformer) includes implementation of [PDF 2212.08059 Rethinking Vision Transformers for MobileNet Size and Speed](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.08059.pdf).\n\n  | Model                        | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ---------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [EfficientFormerV2S0, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s0_224_imagenet.h5) | 3.60M  | 405.2M | 224   | 76.2     | 1114.38 qps  |\n  | [EfficientFormerV2S1, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s1_224_imagenet.h5) | 6.19M  | 665.6M | 224   | 79.7     | 841.186 qps  |\n  | [EfficientFormerV2S2, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s2_224_imagenet.h5) | 12.7M  | 1.27G  | 224   | 82.0     | 573.9 qps    |\n  | [EfficientFormerV2L, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_l_224_imagenet.h5)  | 26.3M  | 2.59G  | 224   | 83.5     | 377.224 qps  |\n## EfficientNet\n  - [Keras EfficientNet](keras_cv_attention_models\u002Fefficientnet) includes implementation of [PDF 1911.04252 Self-training with Noisy Student improves ImageNet 
classification](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04252.pdf).\n\n  | Model                          | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------------------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [EfficientNetV1B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b0-imagenet.h5)               | 5.3M   | 0.39G   | 224   | 77.6     | 1129.93 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b0-noisy_student.h5)                 | 5.3M   | 0.39G   | 224   | 78.8     | 1129.93 qps  |\n  | [EfficientNetV1B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b1-imagenet.h5)               | 7.8M   | 0.70G   | 240   | 79.6     | 758.639 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b1-noisy_student.h5)                 | 7.8M   | 0.70G   | 240   | 81.5     | 758.639 qps  |\n  | [EfficientNetV1B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b2-imagenet.h5)               | 9.1M   | 1.01G   | 260   | 80.5     | 668.959 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b2-noisy_student.h5)                 | 9.1M   | 1.01G   | 260   | 82.4     | 668.959 qps  |\n  | [EfficientNetV1B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b3-imagenet.h5)               | 12.2M  | 1.86G   | 300   | 81.9     | 473.607 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b3-noisy_student.h5)                 | 12.2M  | 1.86G   | 300   | 84.1     | 473.607 qps  |\n  | [EfficientNetV1B4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b4-imagenet.h5)               | 19.3M  | 4.46G   | 380   | 83.3     | 265.244 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b4-noisy_student.h5)                 | 19.3M  | 4.46G   | 380   | 85.3     | 265.244 qps  |\n  | [EfficientNetV1B5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b5-imagenet.h5)               | 30.4M  | 10.40G  | 456   | 84.3     | 146.758 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b5-noisy_student.h5)                 | 30.4M  | 10.40G  | 456   | 86.1     | 146.758 qps  |\n  | [EfficientNetV1B6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b6-imagenet.h5)               | 43.0M  | 19.29G  
| 528   | 84.8     | 88.0369 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b6-noisy_student.h5)                 | 43.0M  | 19.29G  | 528   | 86.4     | 88.0369 qps  |\n  | [EfficientNetV1B7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b7-imagenet.h5)               | 66.3M  | 38.13G  | 600   | 85.2     | 52.6616 qps  |\n  | - [NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b7-noisy_student.h5)                 | 66.3M  | 38.13G  | 600   | 86.9     | 52.6616 qps  |\n  | [EfficientNetV1L2, NoisyStudent](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-l2-noisy_student.h5) | 480.3M | 477.98G | 800   | 88.4     |              |\n## EfficientNetEdgeTPU\n  - [Keras EfficientNetEdgeTPU](keras_cv_attention_models\u002Fefficientnet) includes implementation of [PDF 1911.04252 Self-training with Noisy Student improves ImageNet classification](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04252.pdf).\n\n  | Model                          | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------------------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [EfficientNetEdgeTPUSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-small-imagenet.h5)       | 5.49M  | 1.79G   | 224   | 78.07    | 1459.38 qps  |\n  | [EfficientNetEdgeTPUMedium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-medium-imagenet.h5)      | 6.90M  | 3.01G   | 240   | 79.25    | 1028.95 qps  |\n  | [EfficientNetEdgeTPULarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-large-imagenet.h5)       | 10.59M | 7.94G   | 300   | 81.32    | 527.034 qps  |\n## EfficientNetV2\n  - [Keras EfficientNet](keras_cv_attention_models\u002Fefficientnet) includes implementation of [PDF 2104.00298 EfficientNetV2: Smaller Models and Faster Training](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00298).\n\n  | Model                      | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [EfficientNetV2B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b0-imagenet.h5)           | 7.1M   | 0.72G  | 224   | 78.7     | 1109.84 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b0-21k-ft1k.h5)         | 7.1M   | 0.72G  | 224   | 77.55?   
| 1109.84 qps  |\n  | [EfficientNetV2B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b1-imagenet.h5)           | 8.1M   | 1.21G  | 240   | 79.8     | 842.372 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b1-21k-ft1k.h5)         | 8.1M   | 1.21G  | 240   | 79.03?   | 842.372 qps  |\n  | [EfficientNetV2B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b2-imagenet.h5)           | 10.1M  | 1.71G  | 260   | 80.5     | 762.865 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b2-21k-ft1k.h5)         | 10.1M  | 1.71G  | 260   | 79.48?   | 762.865 qps  |\n  | [EfficientNetV2B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b3-imagenet.h5)           | 14.4M  | 3.03G  | 300   | 82.1     | 548.501 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b3-21k-ft1k.h5)         | 14.4M  | 3.03G  | 300   | 82.46?   | 548.501 qps  |\n  | [EfficientNetV2T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-t-imagenet.h5)            | 13.6M  | 3.18G  | 288   | 82.34    | 496.483 qps  |\n  | [EfficientNetV2T_GC](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-t-gc-imagenet.h5)         | 13.7M  | 3.19G  | 288   | 82.46    | 368.763 qps  |\n  | [EfficientNetV2S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-s-imagenet.h5)            | 21.5M  | 8.41G  | 384   | 83.9     | 344.109 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-s-21k-ft1k.h5)         | 21.5M  | 8.41G  | 384   | 84.9     | 344.109 qps  |\n  | [EfficientNetV2M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-m-imagenet.h5)            | 54.1M  | 24.69G | 480   | 85.2     | 145.346 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-m-21k-ft1k.h5)         | 54.1M  | 24.69G | 480   | 86.2     | 145.346 qps  |\n  | [EfficientNetV2L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-l-imagenet.h5)            | 119.5M | 56.27G | 480   | 85.7     | 85.6514 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-l-21k-ft1k.h5)         | 119.5M | 56.27G | 480   | 86.9     | 85.6514 qps  |\n  | [EfficientNetV2XL, 
21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-xl-21k-ft1k.h5) | 206.8M | 93.66G | 512   | 87.2     | 55.141 qps   |\n## EfficientViT_B\n  - [Keras EfficientViT_B](keras_cv_attention_models\u002Fefficientvit) is for Paper [PDF 2205.14756 EfficientViT: Lightweight Multi-Scale Attention for On-Device Semantic Segmentation](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.14756.pdf).\n\n  | Model           | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | --------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [EfficientViT_B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b0_224_imagenet.h5) | 3.41M  | 0.12G | 224   | 71.6 ?   | 1581.76 qps  |\n  | [EfficientViT_B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_224_imagenet.h5) | 9.10M  | 0.58G | 224   | 79.4     | 943.587 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_256_imagenet.h5)           | 9.10M  | 0.78G | 256   | 79.9     | 840.844 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_288_imagenet.h5)            | 9.10M  | 1.03G | 288   | 80.4     | 680.088 qps  |\n  | [EfficientViT_B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_224_imagenet.h5) | 24.33M | 1.68G | 224   | 82.1     | 583.295 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_256_imagenet.h5)            | 24.33M | 2.25G | 256   | 82.7     | 507.187 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_288_imagenet.h5)            | 24.33M | 2.92G | 288   | 83.1     | 419.93 qps   |\n  | [EfficientViT_B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_224_imagenet.h5) | 48.65M | 4.14G | 224   | 83.5     | 329.764 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_256_imagenet.h5)            | 48.65M | 5.51G | 256   | 83.8     | 288.605 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_288_imagenet.h5)            | 48.65M | 7.14G | 288   | 84.2     | 229.992 qps  |\n  | [EfficientViT_L1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l1_224_imagenet.h5) | 52.65M | 5.28G | 224   | 84.48    | 503.068 qps |\n  | [EfficientViT_L2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l2_224_imagenet.h5) | 63.71M | 6.98G | 224   | 85.05    | 396.255 qps |\n  | - 
[384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l2_384_imagenet.h5)            | 63.71M | 20.7G | 384   | 85.98    | 207.322 qps |\n  | [EfficientViT_L3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l3_224_imagenet.h5) | 246.0M | 27.6G | 224   | 85.814   | 174.926 qps |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l3_384_imagenet.h5)            | 246.0M | 81.6G | 384   | 86.408   | 86.895 qps  |\n## EfficientViT_M\n  - [Keras EfficientViT_M](keras_cv_attention_models\u002Fefficientvit) is for Paper [PDF 2305.07027 EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.07027.pdf).\n\n  | Model           | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | --------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [EfficientViT_M0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m0_224_imagenet.h5) | 2.35M  | 79.4M | 224   | 63.2     | 814.522 qps  |\n  | [EfficientViT_M1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m1_224_imagenet.h5) | 2.98M  | 167M  | 224   | 68.4     | 948.041 qps  |\n  | [EfficientViT_M2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m2_224_imagenet.h5) | 4.19M  | 201M  | 224   | 70.8     | 906.286 qps  |\n  | [EfficientViT_M3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m3_224_imagenet.h5) | 6.90M  | 263M  | 224   | 73.4     | 758.086 qps  |\n  | [EfficientViT_M4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m4_224_imagenet.h5) | 8.80M  | 299M  | 224   | 74.3     | 672.891 qps  |\n  | [EfficientViT_M5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m5_224_imagenet.h5) | 12.47M | 522M  | 224   | 77.1     | 577.254 qps  |\n## EVA\n  - [Keras EVA](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2211.07636 EVA: Exploring the Limits of Masked Visual Representation Learning at Scale](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.07636.pdf).\n\n  | Model                 | Params  | FLOPs    | Input | Top1 Acc | T4 Inference |\n  | --------------------- | ------- | -------- | ----- | -------- | ------------ |\n  | [EvaLargePatch14, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_large_patch14_196_imagenet21k-ft1k.h5)  | 304.14M | 61.65G   | 196   | 88.59    | 115.532 qps  |\n  | - [21k_ft1k, 336](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_large_patch14_336_imagenet21k-ft1k.h5)            | 304.53M | 191.55G  | 336   | 89.20    | 53.3467 qps  |\n  | [EvaGiantPatch14, 
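clip](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_224_imagenet21k-ft1k.h5) | 1012.6M | 267.40G  | 224   | 89.10    |              |\n  | - [m30m](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_336_imagenet21k-ft1k.h5)                | 1013.0M | 621.45G  | 336   | 89.57    |              |\n  | - [m30m](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_560_imagenet21k-ft1k.h5)                | 1014.4M | 1911.61G | 560   | 89.80    |              |\n\n  The EVA checkpoints above live in the same `beit` sub-module as the other BEiT-family models linked from this section; a minimal prediction sketch (the constructor name follows the table, while the `pretrained` tag and the dummy input are illustrative assumptions):\n\n  ```py\n  import tensorflow as tf\n  from tensorflow import keras\n  from keras_cv_attention_models import beit\n\n  # Assumed weight tag; known tags trigger an automatic download of the matching .h5 release\n  mm = beit.EvaLargePatch14(pretrained='imagenet21k-ft1k')\n\n  # Dummy batch just to check shapes end to end; replace with a real, preprocessed image\n  dummy = tf.random.uniform([1, *mm.input_shape[1:]])\n  preds = mm(dummy)\n  print(preds.shape)  # expect (1, 1000)\n  print(keras.applications.imagenet_utils.decode_predictions(preds.numpy())[0])\n  ```\n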
## EVA02\n  - [Keras EVA02](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2303.11331 EVA-02: A Visual Representation for Neon Genesis](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.11331.pdf).\n\n  | Model                                  | Params  | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | -------------------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [EVA02TinyPatch14, mim_in22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_tiny_patch14_336_mim_in22k_ft1k.h5)       | 5.76M   | 4.72G   | 336   | 80.658   | 320.123 qps  |\n  | [EVA02SmallPatch14, mim_in22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_small_patch14_336_mim_in22k_ft1k.h5)      | 22.13M  | 15.57G  | 336   | 85.74    | 161.774 qps  |\n  | [EVA02BasePatch14, mim_in22k_ft22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_base_patch14_448_mim_in22k_ft22k_ft1k.h5) | 87.12M  | 107.6G  | 448   | 88.692   | 34.3962 qps  |\n  | [EVA02LargePatch14, mim_m38m_ft22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_large_patch14_448_mim_m38m_ft22k_ft1k.h5) | 305.08M | 363.68G | 448   | 90.054   |              |\n## FasterNet\n  - [Keras FasterNet](keras_cv_attention_models\u002Ffasternet) includes implementation of [PDF 2303.03667 Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.03667.pdf).\n\n  | Model       | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ----------- | ------ | ------ | ----- | -------- | ------------ |\n  | [FasterNetT0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t0_imagenet.h5) | 3.9M   | 0.34G  | 224   | 71.9     | 1890.83 qps  |\n  | [FasterNetT1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t1_imagenet.h5) | 7.6M   | 0.85G  | 224   | 76.2     | 1788.16 qps  |\n  | [FasterNetT2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t2_imagenet.h5) | 15.0M  | 1.90G  | 224   | 78.9     | 1353.12 qps  |\n  | [FasterNetS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_s_imagenet.h5)  | 31.1M  | 4.55G  | 224   | 81.3     | 818.814 qps  |\n  | 
[FasterNetM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_m_imagenet.h5)  | 53.5M  | 8.72G  | 224   | 83.0     | 436.383 qps  |\n  | [FasterNetL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_l_imagenet.h5)  | 93.4M  | 15.49G | 224   | 83.5     | 319.809 qps  |\n## FasterViT\n  - [Keras FasterViT](keras_cv_attention_models\u002Ffastervit) includes implementation of [PDF 2306.06189 FasterViT: Fast Vision Transformers with Hierarchical Attention](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.06189.pdf).\n\n  | Model      | Params   | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ---------- | -------- | ------- | ----- | -------- | ------------ |\n  | [FasterViT0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_0_224_imagenet.h5) | 31.40M   | 3.51G   | 224   | 82.1     | 716.809 qps  |\n  | [FasterViT1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_1_224_imagenet.h5) | 53.37M   | 5.52G   | 224   | 83.2     | 491.971 qps  |\n  | [FasterViT2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_2_224_imagenet.h5) | 75.92M   | 9.00G   | 224   | 84.2     | 377.006 qps  |\n  | [FasterViT3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_3_224_imagenet.h5) | 159.55M  | 18.75G  | 224   | 84.9     | 216.481 qps  |\n  | [FasterViT4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_4_224_imagenet.h5) | 351.12M  | 41.57G  | 224   | 85.4     | 71.6303 qps  |\n  | [FasterViT5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_5_224_imagenet.h5) | 957.52M  | 114.08G | 224   | 85.6     |              |\n  | [FasterViT6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_6_224_imagenet.1.h5), [+.2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_6_224_imagenet.2.h5) | 1360.33M | 144.13G | 224   | 85.8     |              |\n## FastViT\n  - [Keras FastViT](keras_cv_attention_models\u002Ffastvit) includes implementation of [PDF 2303.14189 FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.14189.pdf).\n\n  | Model         | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [FastViT_T8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t8_imagenet.h5)     | 4.03M  | 0.65G | 256   | 76.2     | 1020.29 qps  |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t8_distill.h5)       | 4.03M  | 0.65G | 256   | 77.2     | 1020.29 qps  |\n  | - deploy=True | 3.99M  | 0.64G | 256   | 76.2     | 1323.14 qps  |\n  | 
[FastViT_T12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t12_imagenet.h5)   | 7.55M  | 1.34G | 256   | 79.3     | 734.867 qps  |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t12_distill.h5)      | 7.55M  | 1.34G | 256   | 80.3     | 734.867 qps  |\n  | - deploy=True | 7.50M  | 1.33G | 256   | 79.3     | 956.332 qps  |\n  | [FastViT_S12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_s12_imagenet.h5)   | 9.47M  | 1.74G | 256   | 79.9     | 666.669 qps  |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_s12_distill.h5)      | 9.47M  | 1.74G | 256   | 81.1     | 666.669 qps  |\n  | - deploy=True | 9.42M  | 1.74G | 256   | 79.9     | 881.429 qps  |\n  | [FastViT_SA12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa12_imagenet.h5) | 11.58M | 1.88G | 256   | 80.9     | 656.95 qps   |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa12_distill.h5)     | 11.58M | 1.88G | 256   | 81.9     | 656.95 qps   |\n  | - deploy=True | 11.54M | 1.88G | 256   | 80.9     | 833.011 qps  |\n  | [FastViT_SA24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa24_imagenet.h5) | 21.55M | 3.66G | 256   | 82.7     | 371.84 qps   |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa24_distill.h5)     | 21.55M | 3.66G | 256   | 83.4     | 371.84 qps   |\n  | - deploy=True | 21.49M | 3.66G | 256   | 82.7     | 444.055 qps  |\n  | [FastViT_SA36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa36_imagenet.h5) | 31.53M | 5.44G | 256   | 83.6     | 267.986 qps  |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa36_distill.h5)     | 31.53M | 5.44G | 256   | 84.2     | 267.986 qps  |\n  | - deploy=True | 31.44M | 5.43G | 256   | 83.6     | 325.967 qps  |\n  | [FastViT_MA36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_ma36_imagenet.h5) | 44.07M | 7.64G | 256   | 83.9     | 211.928 qps  |\n  | - [distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_ma36_distill.h5)     | 44.07M | 7.64G | 256   | 84.6     | 211.928 qps  |\n  | - deploy=True | 43.96M | 7.63G | 256   | 83.9     | 274.559 qps  |\n## FBNetV3\n  - [Keras FBNetV3](keras_cv_attention_models\u002Fmobilenetv3_family#fbnetv3) includes implementation of [PDF 2006.02049 FBNetV3: Joint Architecture-Recipe Search using Predictor Pretraining](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.02049.pdf).\n\n  | Model    | Params | FLOPs    | Input | Top1 Acc | T4 Inference |\n  | -------- | ------ | -------- | ----- | -------- | ------------ |\n  | 
[FBNetV3B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_b_imagenet.h5) | 5.57M  | 539.82M  | 256   | 79.15    | 713.882 qps  |\n  | [FBNetV3D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_d_imagenet.h5) | 10.31M | 665.02M  | 256   | 79.68    | 635.963 qps  |\n  | [FBNetV3G](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_g_imagenet.h5) | 16.62M | 1379.30M | 256   | 82.05    | 478.835 qps  |\n## FlexiViT\n  - [Keras FlexiViT](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2212.08013 FlexiViT: One Model for All Patch Sizes](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.08013.pdf).\n\n  | Model         | Params  | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------- | ------ | ----- | -------- | ------------ |\n  | [FlexiViTSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_small_240_imagenet.h5) | 22.06M  | 5.36G  | 240   | 82.53    | 744.578 qps  |\n  | [FlexiViTBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_base_240_imagenet.h5)  | 86.59M  | 20.33G | 240   | 84.66    | 301.948 qps  |\n  | [FlexiViTLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_large_240_imagenet.h5) | 304.47M | 71.09G | 240   | 85.64    | 105.187 qps  |\n## GCViT\n  - [Keras GCViT](keras_cv_attention_models\u002Fgcvit) includes implementation of [PDF 2206.09959 Global Context Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.09959.pdf).\n\n  | Model           | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | --------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [GCViT_XXTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_xx_tiny_224_imagenet.h5)    | 12.0M  | 2.15G  | 224   | 79.9     | 337.7 qps   |\n  | [GCViT_XTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_x_tiny_224_imagenet.h5)     | 20.0M  | 2.96G  | 224   | 82.0     | 255.625 qps   |\n  | [GCViT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_tiny_224_imagenet.h5)      | 28.2M  | 4.83G  | 224   | 83.5     | 174.553 qps   |\n  | [GCViT_Tiny2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_tiny2_224_imagenet.h5)     | 34.5M  | 6.28G  | 224   | 83.7     |  |\n  | [GCViT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_small_224_imagenet.h5)     | 51.1M  | 8.63G  | 224   | 84.3     | 131.577 qps   |\n  | [GCViT_Small2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_small2_224_imagenet.h5)    | 68.6M  | 11.7G  | 224   | 84.8     |  |\n  | 
[GCViT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_base_224_imagenet.h5)      | 90.3M  | 14.9G  | 224   | 85.0     | 105.845 qps   |\n  | [GCViT_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_224_imagenet.h5)     | 202.1M | 32.8G  | 224   | 85.7     |  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_224_imagenet21k-ft1k.h5)      | 202.1M | 32.8G  | 224   | 86.6     |  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_384_imagenet21k-ft1k.h5) | 202.9M | 105.1G | 384   | 87.4     |  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_512_imagenet21k-ft1k.h5) | 203.8M | 205.1G | 512   | 87.6     |  |\n## GhostNet\n  - [Keras GhostNet](keras_cv_attention_models\u002Fghostnet) includes implementation of [PDF 1911.11907 GhostNet: More Features from Cheap Operations](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.11907.pdf).\n\n  | Model        | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [GhostNet_050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_050_imagenet.h5) | 2.59M  | 42.6M  | 224   | 66.88    | 1272.25 qps  |\n  | [GhostNet_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_100_imagenet.h5) | 5.18M  | 141.7M | 224   | 74.16    | 1167.4 qps   |\n  | [GhostNet_130](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_130_imagenet.h5) | 7.36M  | 227.7M | 224   | 75.79    | 1024.49 qps  |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_130_ssld.h5)       | 7.36M  | 227.7M | 224   | 79.38    | 1024.49 qps  |\n## GhostNetV2\n  - [Keras GhostNet](keras_cv_attention_models\u002Fghostnet) includes implementation of [PDF GhostNetV2: Enhance Cheap Operation with Long-Range Attention](https:\u002F\u002Fopenreview.net\u002Fpdf\u002F6db544c65bbd0fa7d7349508454a433c112470e2.pdf).\n\n  | Model          | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [GhostNetV2_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_100_imagenet.h5)    | 6.12M  | 168.5M | 224   | 75.3     | 797.088 qps  |\n  | [GhostNetV2_130](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_130_imagenet.h5)    | 8.96M  | 271.1M | 224   | 76.9     | 722.668 qps  |\n  | [GhostNetV2_160](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_160_imagenet.h5)    | 12.39M | 400.9M | 224   | 77.8     | 572.268 qps  |\n## GMLP\n  - [Keras 
GMLP](keras_cv_attention_models\u002Fmlp_family#gmlp) includes implementation of [PDF 2105.08050 Pay Attention to MLPs](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.08050.pdf).\n\n  | Model      | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ---------- | ------ | ------ | ----- | -------- | ------------ |\n  | GMLPTiny16 | 6M     | 1.35G  | 224   | 72.3     | 234.187 qps  |\n  | [GMLPS16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fgmlp_s16_imagenet.h5)    | 20M    | 4.44G  | 224   | 79.6     | 138.363 qps  |\n  | GMLPB16    | 73M    | 15.82G | 224   | 81.6     | 77.816 qps   |\n## GPViT\n  - [Keras GPViT](keras_cv_attention_models\u002Fgpvit) includes implementation of [PDF 2212.06795 GPVIT: A HIGH RESOLUTION NON-HIERARCHICAL VISION TRANSFORMER WITH GROUP PROPAGATION](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.06795.pdf).\n\n  | Model    | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------- | ------ | ------ | ----- | -------- | ------------ |\n  | [GPViT_L1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l1_224_imagenet.h5) | 9.59M  | 6.15G  | 224   | 80.5     | 210.166 qps  |\n  | [GPViT_L2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l2_224_imagenet.h5) | 24.2M  | 15.74G | 224   | 83.4     | 139.656 qps  |\n  | [GPViT_L3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l3_224_imagenet.h5) | 36.7M  | 23.54G | 224   | 84.1     | 131.284 qps  |\n  | [GPViT_L4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l4_224_imagenet.h5) | 75.5M  | 48.29G | 224   | 84.3     | 94.1899 qps  |\n## HaloNet\n  - [Keras HaloNet](keras_cv_attention_models\u002Fhalonet) is for [PDF 2103.12731 Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.12731.pdf).\n\n  | Model          | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | -------------- | ------ | ------- | ----- | -------- | ------------ |\n  | [HaloNextECA26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonext_eca26t_256_imagenet.h5) | 10.7M  | 2.43G   | 256   | 79.50    | 1028.93 qps  |\n  | [HaloNet26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet26t_256_imagenet.h5)     | 12.5M  | 3.18G   | 256   | 79.13    | 1096.79 qps  |\n  | [HaloNetSE33T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet_se33t_256_imagenet.h5)   | 13.7M  | 3.55G   | 256   | 80.99    | 582.008 qps  |\n  | [HaloRegNetZB](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhaloregnetz_b_224_imagenet.h5)   | 11.68M | 1.97G   | 224   | 81.042   | 575.961 qps  |\n  | [HaloNet50T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet50t_256_imagenet.h5)     | 22.7M  | 5.29G   | 256   | 81.70    | 512.677 qps  |\n  | 
[HaloBotNet50T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalobotnet50t_256_imagenet.h5)  | 22.6M  | 5.02G   | 256   | 82.0     | 431.616 qps  |\n## Hiera\n  - [Keras Hiera](keras_cv_attention_models\u002Fhiera) is for [PDF 2306.00989 Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.00989.pdf).\n\n  | Model                        | Params  | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ---------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [HieraTiny, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_tiny_224_mae_in1k_ft1k.h5)     | 27.91M  | 4.93G   | 224   | 82.8     | 644.356 qps  |\n  | [HieraSmall, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_small_224_mae_in1k_ft1k.h5)    | 35.01M  | 6.44G   | 224   | 83.8     | 491.669 qps  |\n  | [HieraBase, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_base_224_mae_in1k_ft1k.h5)     | 51.52M  | 9.43G   | 224   | 84.5     | 351.542 qps  |\n  | [HieraBasePlus, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_base_plus_224_mae_in1k_ft1k.h5) | 69.90M  | 12.71G  | 224   | 85.2     | 291.446 qps  |\n  | [HieraLarge, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_large_224_mae_in1k_ft1k.h5)    | 213.74M | 40.43G  | 224   | 86.1     | 111.042 qps  |\n  | [HieraHuge, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_huge_224_mae_in1k_ft1k.h5)     | 672.78M | 125.03G | 224   | 86.9     |              |\n## HorNet\n  - [Keras HorNet](keras_cv_attention_models\u002Fhornet) is for [PDF 2207.14284 HorNet: Efficient High-Order Spatial Interactions with Recursive Gated Convolutions](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.14284.pdf).\n\n  | Model         | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [HorNetTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_tiny_224_imagenet.h5)    | 22.4M  | 4.01G  | 224   | 82.8     | 222.665 qps  |\n  | [HorNetTinyGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_tiny_gf_224_imagenet.h5)  | 23.0M  | 3.94G  | 224   | 83.0     |              |\n  | [HorNetSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_small_224_imagenet.h5)   | 49.5M  | 8.87G  | 224   | 83.8     | 166.998 qps  |\n  | [HorNetSmallGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_small_gf_224_imagenet.h5) | 50.4M  | 8.77G  | 224   | 84.0     |              |\n  | 
[HorNetBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_base_224_imagenet.h5)    | 87.3M  | 15.65G | 224   | 84.2     | 133.842 qps  |\n  | [HorNetBaseGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_base_gf_224_imagenet.h5)  | 88.4M  | 15.51G | 224   | 84.3     |              |\n  | [HorNetLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_224_imagenet22k.h5)   | 194.5M | 34.91G | 224   | 86.8     | 89.8254 qps  |\n  | [HorNetLargeGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_gf_224_imagenet22k.h5) | 196.3M | 34.72G | 224   | 87.0     |              |\n  | [HorNetLargeGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_gf_384_imagenet22k.h5) | 201.8M | 102.0G | 384   | 87.7     |              |\n## IFormer\n  - [Keras IFormer](keras_cv_attention_models\u002Fiformer) is for [PDF 2205.12956 Inception Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.12956.pdf).\n\n  | Model        | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [IFormerSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_small_224_imagenet.h5) | 19.9M  | 4.88G  | 224   | 83.4     | 254.392 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_small_384_imagenet.h5)        | 20.9M  | 16.29G | 384   | 84.6     | 128.98 qps   |\n  | [IFormerBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_base_224_imagenet.h5)  | 47.9M  | 9.44G  | 224   | 84.6     | 147.868 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_base_384_imagenet.h5)        | 48.9M  | 30.86G | 384   | 85.7     | 77.8391 qps  |\n  | [IFormerLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_largel_224_imagenet.h5) | 86.6M  | 14.12G | 224   | 84.6     | 113.434 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_largel_384_imagenet.h5)        | 87.7M  | 45.74G | 384   | 85.8     | 60.0292 qps  |\n## InceptionNeXt\n  - [Keras InceptionNeXt](keras_cv_attention_models\u002Finceptionnext) is for [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.16900.pdf).\n\n  | Model              | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [InceptionNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_tiny_imagenet.h5)  | 28.05M | 4.21G  | 224   | 82.3     | 606.527 qps  |\n  | 
[InceptionNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_small_imagenet.h5) | 49.37M | 8.39G  | 224   | 83.5     | 329.01 qps   |\n  | [InceptionNeXtBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_base_224_imagenet.h5)  | 86.67M | 14.88G | 224   | 84.0     | 260.639 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_base_384_imagenet.h5)              | 86.67M | 43.73G | 384   | 85.2     | 142.888 qps  |\n## LCNet\n  - [Keras LCNet](keras_cv_attention_models\u002Fmobilenetv3_family#lcnet) includes implementation of [PDF 2109.15099 PP-LCNet: A Lightweight CPU Convolutional Neural Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.15099.pdf).\n\n  | Model    | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | -------- | ------ | ------- | ----- | -------- | ------------ |\n  | [LCNet050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_050_imagenet.h5) | 1.88M  | 46.02M  | 224   | 63.10    | 3107.89 qps  |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_050_ssld.h5)   | 1.88M  | 46.02M  | 224   | 66.10    | 3107.89 qps  |\n  | [LCNet075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_075_imagenet.h5) | 2.36M  | 96.82M  | 224   | 68.82    | 3083.55 qps  |\n  | [LCNet100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_100_imagenet.h5) | 2.95M  | 158.28M | 224   | 72.10    | 2752.6 qps   |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_100_ssld.h5)   | 2.95M  | 158.28M | 224   | 74.39    | 2752.6 qps   |\n  | [LCNet150](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_150_imagenet.h5) | 4.52M  | 338.05M | 224   | 73.71    | 2250.69 qps  |\n  | [LCNet200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_200_imagenet.h5) | 6.54M  | 585.35M | 224   | 75.18    | 2028.31 qps  |\n  | [LCNet250](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_250_imagenet.h5) | 9.04M  | 900.16M | 224   | 76.60    | 1686.7 qps   |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_250_ssld.h5)   | 9.04M  | 900.16M | 224   | 80.82    | 1686.7 qps   |\n## LeViT\n  - [Keras LeViT](keras_cv_attention_models\u002Flevit) is for [PDF 2104.01136 LeViT: a Vision Transformer in ConvNet’s Clothing for Faster Inference](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.01136.pdf).\n\n  | Model              | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------ | ----- | ----- | -------- | ------------ |\n  | [LeViT128S, 
distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit128s_imagenet.h5) | 7.8M   | 0.31G | 224   | 76.6     | 800.53 qps   |\n  | [LeViT128, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit128_imagenet.h5)  | 9.2M   | 0.41G | 224   | 78.6     | 628.714 qps  |\n  | [LeViT192, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit192_imagenet.h5)  | 11M    | 0.66G | 224   | 80.0     | 597.299 qps  |\n  | [LeViT256, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit256_imagenet.h5)  | 19M    | 1.13G | 224   | 81.6     | 538.885 qps  |\n  | [LeViT384, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit384_imagenet.h5)  | 39M    | 2.36G | 224   | 82.6     | 460.139 qps  |\n## MaxViT\n  - [Keras MaxViT](keras_cv_attention_models\u002Fmaxvit) is for [PDF 2204.01697 MaxViT: Multi-Axis Vision Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.01697.pdf).\n\n  | Model                      | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [MaxViT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_224_imagenet.h5)                | 31M    | 5.6G   | 224   | 83.62    | 195.283 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_384_imagenet.h5)                      | 31M    | 17.7G  | 384   | 85.24    | 92.5725 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_512_imagenet.h5)                      | 31M    | 33.7G  | 512   | 85.72    | 52.6485 qps  |\n  | [MaxViT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_224_imagenet.h5)               | 69M    | 11.7G  | 224   | 84.45    | 149.286 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_384_imagenet.h5)                    | 69M    | 36.1G  | 384   | 85.74    | 61.5757 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_512_imagenet.h5)                    | 69M    | 67.6G  | 512   | 86.19    | 34.7002 qps  |\n  | [MaxViT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_224_imagenet.h5)                | 119M   | 24.2G  | 224   | 84.95    | 74.7351 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_384_imagenet.h5)                      | 119M   | 74.2G  | 384   | 86.34    | 31.9028 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_512_imagenet.h5)                      | 119M   | 
138.5G | 512   | 86.66    | 17.8139 qps  |\n  | - [imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_224_imagenet21k.h5)              | 135M   | 24.2G  | 224   |          | 74.7351 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_384_imagenet21k-ft1k.h5)     | 119M   | 74.2G  | 384   | 88.24    | 31.9028 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_512_imagenet21k-ft1k.h5)     | 119M   | 138.5G | 512   | 88.38    | 17.8139 qps  |\n  | [MaxViT_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_224_imagenet.h5)               | 212M   | 43.9G  | 224   | 85.17    | 58.0967 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_384_imagenet.h5)                    | 212M   | 133.1G | 384   | 86.40    | 24.1388 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_512_imagenet.h5)                    | 212M   | 245.4G | 512   | 86.70    | 13.063 qps   |\n  | - [imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_224_imagenet21k.h5)              | 233M   | 43.9G  | 224   |          | 58.0967 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_384_imagenet21k-ft1k.h5)     | 212M   | 133.1G | 384   | 88.32    | 24.1388 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_512_imagenet21k-ft1k.h5)     | 212M   | 245.4G | 512   | 88.46    | 13.063 qps   |\n  | [MaxViT_XLarge, imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_224_imagenet21k.h5) | 507M   | 97.7G  | 224   |          |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_384_imagenet21k-ft1k.h5)    | 475M   | 293.7G | 384   | 88.51    |              |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_512_imagenet21k-ft1k.h5)    | 475M   | 535.2G | 512   | 88.70    |              |\n## MetaTransFormer\n  - [Keras MetaTransFormer](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2307.10802 Meta-Transformer: A Unified Framework for Multimodal Learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.10802).\n\n  | Model                                 | Params  | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------------------------------- | ------- | ------ | ----- | -------- | ------------ |\n  | [MetaTransformerBasePatch16, 
laion_2b](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fmeta_transformer_base_patch16_384_laion_2b.h5)  | 86.86M  | 55.73G | 384   | 85.4     | 150.731 qps  |\n  | [MetaTransformerLargePatch14, laion_2b](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fmeta_transformer_large_patch14_336_laion_2b.h5) | 304.53M | 191.6G | 336   | 88.1     | 50.1536 qps |\n## MLP mixer\n  - [Keras MLP mixer](keras_cv_attention_models\u002Fmlp_family#mlp-mixer) includes implementation of [PDF 2105.01601 MLP-Mixer: An all-MLP Architecture for Vision](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.01601.pdf).\n\n  | Model            | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ---------------- | ------ | ------- | ----- | -------- | ------------ |\n  | MLPMixerS32, JFT | 19.1M  | 1.01G   | 224   | 68.70    | 488.839 qps  |\n  | MLPMixerS16, JFT | 18.5M  | 3.79G   | 224   | 73.83    | 451.962 qps  |\n  | MLPMixerB32, JFT | 60.3M  | 3.25G   | 224   | 75.53    | 247.629 qps  |\n  | - [sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b32_imagenet_sam.h5)   | 60.3M  | 3.25G   | 224   | 72.47    | 247.629 qps  |\n  | [MLPMixerB16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet.h5)      | 59.9M  | 12.64G  | 224   | 76.44    | 207.423 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet21k.h5)    | 59.9M  | 12.64G  | 224   | 80.64    | 207.423 qps  |\n  | - [sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet_sam.h5)   | 59.9M  | 12.64G  | 224   | 77.36    | 207.423 qps  |\n  | - JFT            | 59.9M  | 12.64G  | 224   | 80.00    | 207.423 qps  |\n  | MLPMixerL32, JFT | 206.9M | 11.30G  | 224   | 80.67    | 95.1865 qps  |\n  | [MLPMixerL16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_l16_imagenet.h5)      | 208.2M | 44.66G  | 224   | 71.76    | 77.9928 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_l16_imagenet21k.h5)    | 208.2M | 44.66G  | 224   | 82.89    | 77.9928 qps  |\n  | - JFT            | 208.2M | 44.66G  | 224   | 84.82    | 77.9928 qps  |\n  | - 448            | 208.2M | 178.54G | 448   | 83.91    |              |\n  | - 448, JFT       | 208.2M | 178.54G | 448   | 86.78    |              |\n  | MLPMixerH14, JFT | 432.3M | 121.22G | 224   | 86.32    |              |\n  | - 448, JFT       | 432.3M | 484.73G | 448   | 87.94    |              |\n## MobileNetV3\n  - [Keras MobileNetV3](keras_cv_attention_models\u002Fmobilenetv3_family#mobilenetv3) includes implementation of [PDF 1905.02244 Searching for MobileNetV3](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.02244.pdf).\n\n  | Model               | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------------- | ------ | ------- | ----- | -------- | ------------ |\n  | 
[MobileNetV3Small050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_050_imagenet.h5) | 1.29M  | 24.92M  | 224   | 57.89    | 2458.28 qps  |\n  | [MobileNetV3Small075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_075_imagenet.h5) | 2.04M  | 44.35M  | 224   | 65.24    | 2286.44 qps  |\n  | [MobileNetV3Small100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_100_imagenet.h5) | 2.54M  | 57.62M  | 224   | 67.66    | 2058.06 qps  |\n  | [MobileNetV3Large075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_075_imagenet.h5) | 3.99M  | 156.30M | 224   | 73.44    | 1643.78 qps  |\n  | [MobileNetV3Large100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_100_imagenet.h5) | 5.48M  | 218.73M | 224   | 75.77    | 1629.44 qps  |\n  | - [miil](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_100_mill.h5)              | 5.48M  | 218.73M | 224   | 77.92    | 1629.44 qps  |\n## MobileViT\n  - [Keras MobileViT](keras_cv_attention_models\u002Fmobilevit) is for [PDF 2110.02178 MOBILEVIT: LIGHT-WEIGHT, GENERAL-PURPOSE, AND MOBILE-FRIENDLY VISION TRANSFORMER](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.02178.pdf).\n\n  | Model         | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [MobileViT_XXS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_xxs_imagenet.h5) | 1.3M   | 0.42G | 256   | 69.0     | 1319.43 qps  |\n  | [MobileViT_XS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_xs_imagenet.h5)  | 2.3M   | 1.05G | 256   | 74.7     | 1019.57 qps  |\n  | [MobileViT_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_s_imagenet.h5)   | 5.6M   | 2.03G | 256   | 78.3     | 790.943 qps  |\n## MobileViT_V2\n  - [Keras MobileViT_V2](keras_cv_attention_models\u002Fmobilevit) is for [PDF 2206.02680 Separable Self-attention for Mobile Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.02680.pdf).\n\n  | Model              | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------------ | ------ | ----- | ----- | -------- | ------------ |\n  | [MobileViT_V2_050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_050_256_imagenet.h5)   | 1.37M  | 0.47G | 256   | 70.18    | 718.337 qps  |\n  | [MobileViT_V2_075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_075_256_imagenet.h5)   | 2.87M  | 1.04G | 256   | 75.56    | 642.323 qps  |\n  | 
[MobileViT_V2_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_100_256_imagenet.h5)   | 4.90M  | 1.83G | 256   | 78.09    | 591.217 qps  |\n  | [MobileViT_V2_125](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_125_256_imagenet.h5)   | 7.48M  | 2.84G | 256   | 79.65    | 510.25 qps   |\n  | [MobileViT_V2_150](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_256_imagenet.h5)   | 10.6M  | 4.07G | 256   | 80.38    | 466.482 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_256_imagenet22k.h5)      | 10.6M  | 4.07G | 256   | 81.46    | 466.482 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_384_imagenet22k.h5) | 10.6M  | 9.15G | 384   | 82.60    | 278.834 qps  |\n  | [MobileViT_V2_175](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_256_imagenet.h5)   | 14.3M  | 5.52G | 256   | 80.84    | 412.759 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_256_imagenet22k.h5)      | 14.3M  | 5.52G | 256   | 81.94    | 412.759 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_384_imagenet22k.h5) | 14.3M  | 12.4G | 384   | 82.93    | 247.108 qps  |\n  | [MobileViT_V2_200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_256_imagenet.h5)   | 18.4M  | 7.12G | 256   | 81.17    | 394.325 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_256_imagenet22k.h5)      | 18.4M  | 7.12G | 256   | 82.36    | 394.325 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_384_imagenet22k.h5) | 18.4M  | 16.2G | 384   | 83.41    | 229.399 qps  |\n## MogaNet\n  - [Keras MogaNet](keras_cv_attention_models\u002Fmoganet) is for [PDF 2211.03295 Efficient Multi-order Gated Aggregation Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.03295.pdf).\n\n  | Model        | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [MogaNetXtiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_xtiny_imagenet.h5) | 2.96M  | 806M   | 224   | 76.5     | 398.488 qps  |\n  | [MogaNetTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_tiny_224_imagenet.h5)  | 5.20M  | 1.11G  | 224   | 79.0     | 362.409 qps  |\n  | - 
[256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_tiny_256_imagenet.h5)        | 5.20M  | 1.45G  | 256   | 79.6     | 335.372 qps  |\n  | [MogaNetSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_small_imagenet.h5) | 25.3M  | 4.98G  | 224   | 83.4     | 249.807 qps  |\n  | [MogaNetBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_base_imagenet.h5)  | 43.7M  | 9.96G  | 224   | 84.2     | 133.071 qps  |\n  | [MogaNetLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_large_imagenet.h5) | 82.5M  | 15.96G | 224   | 84.6     | 84.2045 qps  |\n## NAT\n  - [Keras NAT](keras_cv_attention_models\u002Fnat) is for [PDF 2204.07143 Neighborhood Attention Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.07143.pdf).\n\n  | Model     | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | --------- | ------ | ------ | ----- | -------- | ------------ |\n  | [NAT_Mini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_mini_imagenet.h5)  | 20.0M  | 2.73G  | 224   | 81.8     | 85.2324 qps  |\n  | [NAT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_tiny_imagenet.h5)  | 27.9M  | 4.34G  | 224   | 83.2     | 62.6147 qps  |\n  | [NAT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_small_imagenet.h5) | 50.7M  | 7.84G  | 224   | 83.7     | 41.1545 qps  |\n  | [NAT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_base_imagenet.h5)  | 89.8M  | 13.76G | 224   | 84.3     | 30.8989 qps  |\n## NFNets\n  - [Keras NFNets](keras_cv_attention_models\u002Fnfnets) is for [PDF 2102.06171 High-Performance Large-Scale Image Recognition Without Normalization](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.06171.pdf).\n\n  | Model        | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [NFNetL0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetl0_imagenet.h5)      | 35.07M | 7.13G   | 288   | 82.75    |              |\n  | [NFNetF0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf0_imagenet.h5)      | 71.5M  | 12.58G  | 256   | 83.6     |              |\n  | [NFNetF1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf1_imagenet.h5)      | 132.6M | 35.95G  | 320   | 84.7     |              |\n  | [NFNetF2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf2_imagenet.h5)      | 193.8M | 63.24G  | 352   | 85.1     |              |\n  | [NFNetF3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf3_imagenet.h5)      | 254.9M | 115.75G | 416   | 85.7     |              |\n  | 
[NFNetF4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf4_imagenet.h5)      | 316.1M | 216.78G | 512   | 85.9     |              |\n  | [NFNetF5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf5_imagenet.h5)      | 377.2M | 291.73G | 544   | 86.0     |              |\n  | [NFNetF6, sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf6_imagenet.h5) | 438.4M | 379.75G | 576   | 86.5     |              |\n  | NFNetF7      | 499.5M | 481.80G | 608   |          |              |\n  | [ECA_NFNetL0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl0_imagenet.h5)  | 24.14M | 7.12G   | 288   | 82.58    |              |\n  | [ECA_NFNetL1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl1_imagenet.h5)  | 41.41M | 14.93G  | 320   | 84.01    |              |\n  | [ECA_NFNetL2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl2_imagenet.h5)  | 56.72M | 30.12G  | 384   | 84.70    |              |\n  | ECA_NFNetL3  | 72.04M | 52.73G  | 448   |          |              |\n## PVT_V2\n  - [Keras PVT_V2](keras_cv_attention_models\u002Fpvt) is for [PDF 2106.13797 PVTv2: Improved Baselines with Pyramid Vision Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.13797.pdf).\n\n  | Model           | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | --------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [PVT_V2B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b0_imagenet.h5)        | 3.7M   | 580.3M | 224   | 70.5     | 561.593 qps  |\n  | [PVT_V2B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b1_imagenet.h5)        | 14.0M  | 2.14G  | 224   | 78.7     | 392.408 qps  |\n  | [PVT_V2B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b2_imagenet.h5)        | 25.4M  | 4.07G  | 224   | 82.0     | 210.476 qps  |\n  | [PVT_V2B2_linear](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b2_linear_imagenet.h5) | 22.6M  | 3.94G  | 224   | 82.1     | 226.791 qps  |\n  | [PVT_V2B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b3_imagenet.h5)        | 45.2M  | 6.96G  | 224   | 83.1     | 135.51 qps   |\n  | [PVT_V2B4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b4_imagenet.h5)        | 62.6M  | 10.19G | 224   | 83.6     | 97.666 qps   |\n  | [PVT_V2B5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b5_imagenet.h5)        | 82.0M  | 11.81G | 224   | 83.8     | 81.4798 qps  |\n## RegNetY\n  - [Keras RegNetY](keras_cv_attention_models\u002Fresnet_family#regnety) is for [PDF 2003.13678 Designing Network Design Spaces](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.13678.pdf).\n\n  
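  All of the families in this list follow the same instantiation pattern. As a minimal sketch (not stated in these tables: the `resnet_family` module name follows the section link above, and the `pretrained="imagenet"` argument plus the "torch" preprocessing mode are assumptions), the snippet below loads one of the RegNetY models from the following table and runs a single prediction:

  ```py
  # Minimal usage sketch: class names from the table are assumed to be exposed under the
  # module each section links to (here `resnet_family` for RegNetY), and `pretrained="imagenet"`
  # is assumed to fetch the released .h5 weights linked in the table.
  import tensorflow as tf
  from tensorflow import keras
  from skimage.data import chelsea  # any sample image works for a quick sanity check
  from keras_cv_attention_models import resnet_family

  mm = resnet_family.RegNetY040(pretrained="imagenet")  # swap in any other class from these tables
  # Preprocessing may differ per family; "torch" mean/std normalization is assumed here.
  img = keras.applications.imagenet_utils.preprocess_input(chelsea().astype("float32"), mode="torch")
  img = tf.expand_dims(tf.image.resize(img, mm.input_shape[1:3]), 0)
  pred = mm(img).numpy()
  print(keras.applications.imagenet_utils.decode_predictions(pred)[0])
  ```

  Running a model at one of the other listed "Input" resolutions should only require passing a matching `input_shape` at construction, though that keyword is likewise an assumption here rather than something guaranteed by these tables.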
| Model      | Params  | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ---------- | ------- | ------ | ----- | -------- | ------------ |\n  | [RegNetY040](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_040_imagenet.h5) | 20.65M  | 3.98G  | 224   | 82.3     | 749.277 qps  |\n  | [RegNetY064](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_064_imagenet.h5) | 30.58M  | 6.36G  | 224   | 83.0     | 436.946 qps  |\n  | [RegNetY080](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_080_imagenet.h5) | 39.18M  | 7.97G  | 224   | 83.17    | 513.43 qps   |\n  | [RegNetY160](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_160_imagenet.h5) | 83.59M  | 15.92G | 224   | 82.0     | 338.046 qps  |\n  | [RegNetY320](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_320_imagenet.h5) | 145.05M | 32.29G | 224   | 82.5     | 188.508 qps  |\n## RegNetZ\n  - [Keras RegNetZ](keras_cv_attention_models\u002Fresnet_family#regnetz) includes implementation of [Github timm\u002Fmodels\u002Fbyobnet.py](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\u002Fblob\u002Fmaster\u002Ftimm\u002Fmodels\u002Fbyobnet.py).\n  - Related paper [PDF 2004.02967 Evolving Normalization-Activation Layers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02967.pdf)\n\n  | Model          | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | -------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [RegNetZB16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_b16_imagenet.h5)     | 9.72M  | 1.44G | 224   | 79.868   | 751.035 qps  |\n  | [RegNetZC16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_c16_imagenet.h5)     | 13.46M | 2.50G | 256   | 82.164   | 636.549 qps  |\n  | [RegNetZC16_EVO](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_c16_evo_imagenet.h5) | 13.49M | 2.55G | 256   | 81.9     |              |\n  | [RegNetZD32](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d32_imagenet.h5)     | 27.58M | 5.96G | 256   | 83.422   | 459.204 qps  |\n  | [RegNetZD8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d8_imagenet.h5)      | 23.37M | 3.95G | 256   | 83.5     | 460.021 qps  |\n  | [RegNetZD8_EVO](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d8_evo_imagenet.h5)  | 23.46M | 4.61G | 256   | 83.42    |              |\n  | [RegNetZE8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_e8_imagenet.h5)      | 57.70M | 9.88G | 256   | 84.5     | 274.97 qps   |\n## RepViT\n  - [Keras RepViT](keras_cv_attention_models\u002Frepvit) is for [PDF 2307.09283 RepViT: Revisiting 
Mobile CNN From ViT Perspective](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09283.pdf).\n\n  | Model                    | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | ------------------------ | ------ | ----- | ----- | -------- | -------- |\n  | [RepViT_M09, distillation](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_09_imagenet.h5) | 5.10M  | 0.82G | 224   | 79.1     |  |\n  | - deploy=True            | 5.07M  | 0.82G | 224   | 79.1     | 966.72 qps  |\n  | [RepViT_M10, distillation](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_10_imagenet.h5) | 6.85M  | 1.12G | 224   | 80.3     | 1157.8 qps  |\n  | - deploy=True            | 6.81M  | 1.12G | 224   | 80.3     |          |\n  | [RepViT_M11, distillation](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_11_imagenet.h5) | 8.29M  | 1.35G | 224   | 81.2     | 846.682 qps  |\n  | - deploy=True            | 8.24M  | 1.35G | 224   | 81.2     | 1027.5 qps  |\n  | [RepViT_M15, distillation](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_15_imagenet.h5) | 14.13M | 2.30G | 224   | 82.5     |   |\n  | - deploy=True            | 14.05M | 2.30G | 224   | 82.5     |   |\n  | [RepViT_M23, distillation](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_23_imagenet.h5) | 23.01M | 4.55G | 224   | 83.7     |  |\n  | - deploy=True            | 22.93M | 4.55G | 224   | 83.7     |          |\n## ResMLP\n  - [Keras ResMLP](keras_cv_attention_models\u002Fmlp_family#resmlp) includes implementation of [PDF 2105.03404 ResMLP: Feedforward networks for image classification with data-efficient training](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03404.pdf).\n\n  | Model         | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ------- | ----- | -------- | ------------ |\n  | [ResMLP12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp12_imagenet.h5)      | 15M    | 3.02G   | 224   | 77.8     | 928.402 qps  |\n  | [ResMLP24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp24_imagenet.h5)      | 30M    | 5.98G   | 224   | 80.8     | 420.709 qps  |\n  | [ResMLP36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp36_imagenet.h5)      | 116M   | 8.94G   | 224   | 81.1     | 309.513 qps  |\n  | [ResMLP_B24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp_b24_imagenet.h5)    | 129M   | 100.39G | 224   | 83.6     | 78.3015 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp_b24_imagenet22k.h5) | 129M   | 100.39G | 224   | 84.4     | 78.3015 qps  |\n## ResNeSt\n  - [Keras ResNeSt](keras_cv_attention_models\u002Fresnest) is for [PDF 2004.08955 ResNeSt: Split-Attention Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.08955.pdf).\n\n  | Model          | Params | FLOPs  | Input | Top1 Acc | T4 
Inference |\n  | -------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [ResNest50](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest50_imagenet.h5)      | 28M    | 5.38G  | 224   | 81.03    | 534.627 qps  |\n  | [ResNest101](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest101_imagenet.h5)     | 49M    | 13.33G | 256   | 82.83    | 257.074 qps  |\n  | [ResNest200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest200_imagenet.h5)     | 71M    | 35.55G | 320   | 83.84    | 118.183 qps  |\n  | [ResNest269](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest269_imagenet.h5)     | 111M   | 77.42G | 416   | 84.54    | 61.167 qps   |\n## ResNetD\n  - [Keras ResNetD](keras_cv_attention_models\u002Fresnet_family#resnetd) includes implementation of [PDF 1812.01187 Bag of Tricks for Image Classification with Convolutional Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.01187.pdf)\n\n  | Model      | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ---------- | ------ | ------ | ----- | -------- | ------------ |\n  | [ResNet50D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet50d_imagenet.h5)  | 25.58M | 4.33G  | 224   | 80.530   | 930.214 qps  |\n  | [ResNet101D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet101d_imagenet.h5) | 44.57M | 8.04G  | 224   | 83.022   | 502.268 qps  |\n  | [ResNet152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet152d_imagenet.h5) | 60.21M | 11.75G | 224   | 83.680   | 353.279 qps  |\n  | [ResNet200D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet200d_imagenet.h5) | 64.69M | 15.25G | 224   | 83.962   | 287.73 qps   |\n## ResNetQ\n  - [Keras ResNetQ](keras_cv_attention_models\u002Fresnet_family#resnetq) includes implementation of [Github timm\u002Fmodels\u002Fresnet.py](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\u002Fblob\u002Fmaster\u002Ftimm\u002Fmodels\u002Fresnet.py)\n\n  | Model     | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | --------- | ------ | ----- | ----- | -------- | ------------ |\n  | [ResNet51Q](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet51q_imagenet.h5) | 35.7M  | 4.87G | 224   | 82.36    | 838.754 qps  |\n  | ResNet61Q | 36.8M  | 5.96G | 224   |          | 730.245 qps  |\n## ResNeXt\n  - [Keras ResNeXt](keras_cv_attention_models\u002Fresnet_family#resnext) includes implementation of [PDF 1611.05431 Aggregated Residual Transformations for Deep Neural Networks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.05431.pdf).\n  - `SWSL` means `Semi-Weakly Supervised ResNe*t` from [Github facebookresearch\u002Fsemi-supervised-ImageNet1K-models](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsemi-supervised-ImageNet1K-models). 
**Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only**.\n\n  | Model                      | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [ResNeXt50, (32x4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50_imagenet.h5)         | 25M    | 4.23G  | 224   | 79.768   | 1041.46 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50_swsl.h5)                     | 25M    | 4.23G  | 224   | 82.182   | 1041.46 qps  |\n  | [ResNeXt50D, (32x4d + deep)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50d_imagenet.h5) | 25M    | 4.47G  | 224   | 79.676   | 1010.94 qps  |\n  | [ResNeXt101, (32x4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_imagenet.h5)        | 42M    | 7.97G  | 224   | 80.334   | 571.652 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_swsl.h5)                     | 42M    | 7.97G  | 224   | 83.230   | 571.652 qps  |\n  | [ResNeXt101W, (32x8d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_imagenet.h5)       | 89M    | 16.41G | 224   | 79.308   | 367.431 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101w_swsl.h5)                     | 89M    | 16.41G | 224   | 84.284   | 367.431 qps  |\n  | [ResNeXt101W_64, (64x4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101w_64_imagenet.h5)    | 83.46M | 15.46G | 224   | 82.46    | 377.83 qps   |\n## SwinTransformerV2\n  - [Keras SwinTransformerV2](keras_cv_attention_models\u002Fswin_transformer_v2) includes implementation of [PDF 2111.09883 Swin Transformer V2: Scaling Up Capacity and Resolution](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.09883.pdf).\n\n  | Model                                | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------------------------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [SwinTransformerV2Tiny_ns](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_ns_224_imagenet.h5)             | 28.3M  | 4.69G  | 224   | 81.8     | 289.205 qps  |\n  | [SwinTransformerV2Small_ns](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_ns_224_imagenet.h5)            | 49.7M  | 9.12G  | 224   | 83.5     | 169.645 qps  |\n  | [SwinTransformerV2Tiny_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_window8_256_imagenet.h5)        | 28.3M  | 5.99G  | 256   | 81.8     | 275.547 qps  |\n  | 
[SwinTransformerV2Tiny_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_window16_256_imagenet.h5)       | 28.3M  | 6.75G  | 256   | 82.8     | 217.207 qps  |\n  | [SwinTransformerV2Small_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_window8_256_imagenet.h5)       | 49.7M  | 11.63G | 256   | 83.7     | 157.559 qps  |\n  | [SwinTransformerV2Small_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_window16_256_imagenet.h5)      | 49.7M  | 12.93G | 256   | 84.1     | 129.953 qps  |\n  | [SwinTransformerV2Base_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window8_256_imagenet.h5)        | 87.9M  | 20.44G | 256   | 84.2     | 126.294 qps  |\n  | [SwinTransformerV2Base_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window16_256_imagenet.h5)       | 87.9M  | 22.17G | 256   | 84.6     | 99.634 qps   |\n  | [SwinTransformerV2Base_window16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window16_256_imagenet22k.h5)  | 87.9M  | 22.17G | 256   | 86.2     | 99.634 qps   |\n  | [SwinTransformerV2Base_window24, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window24_384_imagenet22k.h5)  | 87.9M  | 55.89G | 384   | 87.1     | 35.0508 qps  |\n  | [SwinTransformerV2Large_window16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_large_window16_256_imagenet22k.h5) | 196.7M | 48.03G | 256   | 86.9     |              |\n  | [SwinTransformerV2Large_window24, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_large_window24_384_imagenet22k.h5) | 196.7M | 117.1G | 384   | 87.6     |              |\n## TinyNet\n  - [Keras TinyNet](keras_cv_attention_models\u002Fmobilenetv3_family#tinynet) includes implementation of [PDF 2010.14819 Model Rubik’s Cube: Twisting Resolution, Depth and Width for TinyNets](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.14819.pdf).\n\n  | Model    | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | -------- | ------ | ------- | ----- | -------- | ------------ |\n  | [TinyNetE](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_e_imagenet.h5) | 2.04M  | 25.22M  | 106   | 59.86    | 2152.36 qps  |\n  | [TinyNetD](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_d_imagenet.h5) | 2.34M  | 53.35M  | 152   | 66.96    | 1905.56 qps  |\n  | 
[TinyNetC](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_c_imagenet.h5) | 2.46M  | 103.22M | 184   | 71.23    | 1353.44 qps  |\n  | [TinyNetB](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_b_imagenet.h5) | 3.73M  | 206.28M | 188   | 74.98    | 1196.06 qps  |\n  | [TinyNetA](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_a_imagenet.h5) | 6.19M  | 343.74M | 192   | 77.65    | 981.976 qps  |\n## TinyViT\n  - [Keras TinyViT](keras_cv_attention_models\u002Ftinyvit) includes implementation of [PDF 2207.10666 TinyViT: Fast Pretraining Distillation for Small Vision Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.10666.pdf).\n\n  | Model                | Params | FLOPs | Input | Top1 Acc | T4 Inference |\n  | -------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [TinyViT_5M, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_5m_224_imagenet.h5)  | 5.4M   | 1.3G  | 224   | 79.1     | 631.414 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_5m_224_imagenet21k-ft1k.h5)   | 5.4M   | 1.3G  | 224   | 80.7     | 631.414 qps  |\n  | [TinyViT_11M, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_11m_224_imagenet.h5) | 11M    | 2.0G  | 224   | 81.5     | 509.818 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_11m_224_imagenet21k-ft1k.h5)   | 11M    | 2.0G  | 224   | 83.2     | 509.818 qps  |\n  | [TinyViT_21M, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_224_imagenet.h5) | 21M    | 4.3G  | 224   | 83.1     | 410.676 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_224_imagenet21k-ft1k.h5)   | 21M    | 4.3G  | 224   | 84.8     | 410.676 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_384_imagenet21k-ft1k.h5)           | 21M    | 13.8G | 384   | 86.2     | 199.458 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_512_imagenet21k-ft1k.h5)           | 21M    | 27.0G | 512   | 86.5     | 122.846 qps  |\n## UniFormer\n  - [Keras UniFormer](keras_cv_attention_models\u002Funiformer) includes implementation of [PDF 2201.09450 UniFormer: Unifying Convolution and Self-attention for Visual Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.09450.pdf).\n\n  | Model                | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | -------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [UniformerSmall32, 
token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_32_224_token_label.h5) | 22M    | 3.66G  | 224   | 83.4     | 577.334 qps  |\n  | [UniformerSmall64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_64_224_imagenet.h5)     | 22M    | 3.66G  | 224   | 82.9     | 562.794 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_64_224_token_label.h5)     | 22M    | 3.66G  | 224   | 83.4     | 562.794 qps  |\n  | [UniformerSmallPlus32](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_32_224_imagenet.h5) | 24M    | 4.24G  | 224   | 83.4     | 546.82 qps   |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_32_224_token_label.h5)     | 24M    | 4.24G  | 224   | 83.9     | 546.82 qps   |\n  | [UniformerSmallPlus64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_64_224_imagenet.h5) | 24M    | 4.23G  | 224   | 83.4     | 538.193 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_64_224_token_label.h5)     | 24M    | 4.23G  | 224   | 83.6     | 538.193 qps  |\n  | [UniformerBase32, token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_32_224_token_label.h5)  | 50M    | 8.32G  | 224   | 85.1     | 272.485 qps  |\n  | [UniformerBase64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_64_224_imagenet.h5)      | 50M    | 8.31G  | 224   | 83.8     | 286.963 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_64_224_token_label.h5)     | 50M    | 8.31G  | 224   | 84.8     | 286.963 qps  |\n  | [UniformerLarge64, token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_large_64_224_token_label.h5) | 100M   | 19.79G | 224   | 85.6     | 154.761 qps  |\n  | - [token_label, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_large_64_384_token_label.h5)            | 100M   | 63.11G | 384   | 86.3     | 75.3487 qps  |\n## VanillaNet\n  - [Keras VanillaNet](keras_cv_attention_models\u002Fvanillanet) is for [PDF 2305.12972 VanillaNet: the Power of Minimalism in Deep Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.12972.pdf).\n\n  | Model         | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [VanillaNet5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_5_imagenet.h5)   | 22.33M | 8.46G  | 224   | 72.49    | 598.964 qps  |\n  | - 
[deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_5_deploy_imagenet.h5) | 15.52M | 5.17G  | 224   | 72.49    | 798.199 qps  |\n  | [VanillaNet6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_6_imagenet.h5)   | 56.12M | 10.11G | 224   | 76.36    | 465.031 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_6_deploy_imagenet.h5) | 32.51M | 6.00G  | 224   | 76.36    | 655.944 qps  |\n  | [VanillaNet7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_7_imagenet.h5)   | 56.67M | 11.84G | 224   | 77.98    | 375.479 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_7_deploy_imagenet.h5) | 32.80M | 6.90G  | 224   | 77.98    | 527.723 qps  |\n  | [VanillaNet8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_8_imagenet.h5)   | 65.18M | 13.50G | 224   | 79.13    | 341.157 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_8_deploy_imagenet.h5) | 37.10M | 7.75G  | 224   | 79.13    | 479.328 qps  |\n  | [VanillaNet9](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_9_imagenet.h5)   | 73.68M | 15.17G | 224   | 79.87    | 312.815 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_9_deploy_imagenet.h5) | 41.40M | 8.59G  | 224   | 79.87    | 443.464 qps  |\n  | [VanillaNet10](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_10_imagenet.h5)  | 82.19M | 16.83G | 224   | 80.57    | 277.871 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_10_deploy_imagenet.h5) | 45.69M | 9.43G  | 224   | 80.57    | 408.082 qps  |\n  | VanillaNet11  | 90.69M | 18.49G | 224   | 81.08    | 267.026 qps  |\n  | - deploy=True | 50.00M | 10.27G | 224   | 81.08    | 377.239 qps  |\n  | VanillaNet12  | 99.20M | 20.16G | 224   | 81.55    | 229.987 qps  |\n  | - deploy=True | 54.29M | 11.11G | 224   | 81.55    | 358.076 qps  |\n  | VanillaNet13  | 107.7M | 21.82G | 224   | 82.05    | 218.256 qps  |\n  | - deploy=True | 58.59M | 11.96G | 224   | 82.05    | 334.244 qps  |\n## ViT-5\n  - [Keras ViT-5](keras_cv_attention_models\u002Fbeit) includes models from [PDF 2602.08071 ViT-5: Vision Transformers for The Mid-2020s](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.08071).\n\n  | Model              | Params | FLOPs  | Input | Top1 Acc |\n  | ------------------ | ------ | ------ | ----- | -------- |\n  | [ViT5_Small_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_small_patch16_224_imagenet.h5) | 22.04M | 4.73G  | 224   | 82.2     |\n  | 
[ViT5_Base_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_base_patch16_224_imagenet.h5) | 86.54M | 18.00G | 224   | 84.2     |\n  | [ViT5_Base_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_base_patch16_384_imagenet.h5) | 86.83M | 56.19G | 384   | 85.4     |\n  | [ViT5_Large_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_large_patch16_224_imagenet.h5) | 304.3M | 63.01G | 224   | 84.9     |\n  | ViT5_Large_Patch16 | 304.6M | 193.2G | 384   | 86.0     |\n## VOLO\n  - [Keras VOLO](keras_cv_attention_models\u002Fvolo) is for [PDF 2106.13112 VOLO: Vision Outlooker for Visual Recognition](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.13112.pdf).\n\n  | Model   | Params | FLOPs   | Input | Top1 Acc | T4 Inference |\n  | ------- | ------ | ------- | ----- | -------- | ------------ |\n  | [VOLO_d1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d1_224_imagenet.h5) | 27M    | 4.82G   | 224   | 84.2     |              |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d1_384_imagenet.h5)   | 27M    | 14.22G  | 384   | 85.2     |              |\n  | [VOLO_d2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d2_224_imagenet.h5) | 59M    | 9.78G   | 224   | 85.2     |              |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d2_384_imagenet.h5)   | 59M    | 28.84G  | 384   | 86.0     |              |\n  | [VOLO_d3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d3_224_imagenet.h5) | 86M    | 13.80G  | 224   | 85.4     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d3_448_imagenet.h5)   | 86M    | 55.50G  | 448   | 86.3     |              |\n  | [VOLO_d4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d4_224_imagenet.h5) | 193M   | 29.39G  | 224   | 85.7     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d4_448_imagenet.h5)   | 193M   | 117.81G | 448   | 86.8     |              |\n  | [VOLO_d5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_224_imagenet.h5) | 296M   | 53.34G  | 224   | 86.1     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_448_imagenet.h5)   | 296M   | 213.72G | 448   | 87.0     |              |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_512_imagenet.h5)   | 296M   | 279.36G | 512   | 87.1     |              |\n## WaveMLP\n  - [Keras WaveMLP](keras_cv_attention_models\u002Fmlp_family#wavemlp) includes implementation of [PDF 2111.12294 An Image Patch is a Wave: 
Quantum Inspired Vision MLP](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.12294.pdf).\n\n  | Model     | Params | FLOPs  | Input | Top1 Acc | T4 Inference |\n  | --------- | ------ | ------ | ----- | -------- | ------------ |\n  | [WaveMLP_T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_t_imagenet.h5) | 17M    | 2.47G  | 224   | 80.9     | 523.4 qps    |\n  | [WaveMLP_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_s_imagenet.h5) | 30M    | 4.55G  | 224   | 82.9     | 203.445 qps  |\n  | [WaveMLP_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_m_imagenet.h5) | 44M    | 7.92G  | 224   | 83.3     | 147.155 qps  |\n  | WaveMLP_B | 63M    | 10.26G | 224   | 83.6     |              |\n***\n\n# Detection Models\n## EfficientDet\n  - [Keras EfficientDet](keras_cv_attention_models\u002Fefficientdet) includes implementation of [Paper 1911.09070 EfficientDet: Scalable and Efficient Object Detection](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.09070.pdf).\n  - `Det-AdvProp + AutoAugment` [Paper 2103.13886 Robust and Accurate Object Detection via Adversarial Learning](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.13886.pdf).\n\n  | Model              | Params | FLOPs   | Input | COCO val AP | test AP | T4 Inference |\n  | ------------------ | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [EfficientDetD0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d0_512_coco.h5)     | 3.9M   | 2.55G   | 512   | 34.3        | 34.6    | 248.009 qps  |\n  | - Det-AdvProp      | 3.9M   | 2.55G   | 512   | 35.1        | 35.3    | 248.009 qps  |\n  | [EfficientDetD1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d1_640_coco.h5)     | 6.6M   | 6.13G   | 640   | 40.2        | 40.5    | 133.139 qps  |\n  | - Det-AdvProp      | 6.6M   | 6.13G   | 640   | 40.8        | 40.9    | 133.139 qps  |\n  | [EfficientDetD2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d2_768_coco.h5)     | 8.1M   | 11.03G  | 768   | 43.5        | 43.9    | 89.0523 qps  |\n  | - Det-AdvProp      | 8.1M   | 11.03G  | 768   | 44.3        | 44.3    | 89.0523 qps  |\n  | [EfficientDetD3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d3_896_coco.h5)     | 12.0M  | 24.95G  | 896   | 46.8        | 47.2    | 50.0498 qps  |\n  | - Det-AdvProp      | 12.0M  | 24.95G  | 896   | 47.7        | 48.0    | 50.0498 qps  |\n  | [EfficientDetD4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d4_1024_coco.h5)     | 20.7M  | 55.29G  | 1024  | 49.3        | 49.7    | 28.0086 qps  |\n  | - Det-AdvProp      | 20.7M  | 55.29G  | 1024  | 50.4        | 50.4    | 28.0086 qps  |\n  | [EfficientDetD5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d5_1280_coco.h5)     | 33.7M  | 135.62G | 1280  | 51.2        | 51.5    |              |\n  | 
- Det-AdvProp      | 33.7M  | 135.62G | 1280  | 52.2        | 52.5    |              |\n  | [EfficientDetD6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d6_1280_coco.h5)     | 51.9M  | 225.93G | 1280  | 52.1        | 52.6    |              |\n  | [EfficientDetD7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d7_1536_coco.h5)     | 51.9M  | 325.34G | 1536  | 53.4        | 53.7    |              |\n  | [EfficientDetD7X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d7x_1536_coco.h5)    | 77.0M  | 410.87G | 1536  | 54.4        | 55.1    |              |\n  | [EfficientDetLite0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite0_320_coco.h5)  | 3.2M   | 0.98G   | 320   | 27.5        | 26.41   | 599.616 qps  |\n  | [EfficientDetLite1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite1_384_coco.h5)  | 4.2M   | 1.97G   | 384   | 32.6        | 31.50   | 369.273 qps  |\n  | [EfficientDetLite2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite2_448_coco.h5)  | 5.3M   | 3.38G   | 448   | 36.2        | 35.06   | 278.263 qps  |\n  | [EfficientDetLite3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite3_512_coco.h5)  | 8.4M   | 7.50G   | 512   | 39.9        | 38.77   | 180.871 qps  |\n  | [EfficientDetLite3X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite3x_640_coco.h5) | 9.3M   | 14.01G  | 640   | 44.0        | 42.64   | 115.271 qps  |\n  | [EfficientDetLite4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite4_640_coco.h5)  | 15.1M  | 20.20G  | 640   | 44.4        | 43.18   | 95.4122 qps  |\n## YOLO_NAS\n  - [Keras YOLO_NAS](keras_cv_attention_models\u002Fyolov8) includes implementation of [Github Deci-AI\u002Fsuper-gradients](https:\u002F\u002Fgithub.com\u002FDeci-AI\u002Fsuper-gradients) YOLO-NAS models.\n\n  | Model                   | Params | FLOPs  | Input | COCO val AP | test AP | T4 Inference |\n  | ----------------------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLO_NAS_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_s_before_reparam_coco.h5) | 12.88M | 16.96G | 640   | 47.5        |         | 240.087 qps  |\n  | - [use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_s_coco.h5)  | 12.18M | 15.92G | 640   | 47.5        |         | 345.595 qps  |\n  | [YOLO_NAS_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_m_before_reparam_coco.h5) | 33.86M | 47.12G | 640   | 51.55       |         | 128.96 qps   |\n  | - 
[use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_m_coco.h5)  | 31.92M | 43.91G | 640   | 51.55       |         | 167.935 qps  |\n  | [YOLO_NAS_L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_l_before_reparam_coco.h5) | 44.53M | 64.53G | 640   | 52.22       |         | 98.6069 qps  |\n  | - [use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_l_coco.h5)  | 42.02M | 59.95G | 640   | 52.22       |         | 131.11 qps   |\n## YOLOR\n  - [Keras YOLOR](keras_cv_attention_models\u002Fyolor) includes implementation of [Paper 2105.04206 You Only Learn One Representation: Unified Network for Multiple Tasks](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.04206.pdf).\n\n  | Model      | Params | FLOPs   | Input | COCO val AP | test AP | T4 Inference |\n  | ---------- | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [YOLOR_CSP](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_csp_coco.h5)  | 52.9M  | 60.25G  | 640   | 50.0        | 52.8    | 118.746 qps  |\n  | [YOLOR_CSPX](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_csp_x_coco.h5) | 99.8M  | 111.11G | 640   | 51.5        | 54.8    | 67.9444 qps  |\n  | [YOLOR_P6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_p6_coco.h5)   | 37.3M  | 162.87G | 1280  | 52.5        | 55.7    | 49.3128 qps  |\n  | [YOLOR_W6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_w6_coco.h5)   | 79.9M  | 226.67G | 1280  | 53.6 ?      | 56.9    | 40.2355 qps  |\n  | [YOLOR_E6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_e6_coco.h5)   | 115.9M | 341.62G | 1280  | 50.3 ?      | 57.6    | 21.5719 qps  |\n  | [YOLOR_D6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_d6_coco.h5)   | 151.8M | 467.88G | 1280  | 50.8 ?      
| 58.2    | 16.6061 qps  |\n## YOLOV7\n  - [Keras YOLOV7](keras_cv_attention_models\u002Fyolov7) includes implementation of [Paper 2207.02696 YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.02696.pdf).\n\n  | Model       | Params | FLOPs  | Input | COCO val AP | test AP | T4 Inference |\n  | ----------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLOV7_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_tiny_coco.h5) | 6.23M  | 2.90G  | 416   | 33.3        |         | 845.903 qps  |\n  | [YOLOV7_CSP](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_csp_coco.h5)  | 37.67M | 53.0G  | 640   | 51.4        |         | 137.441 qps  |\n  | [YOLOV7_X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_x_coco.h5)    | 71.41M | 95.0G  | 640   | 53.1        |         | 82.0534 qps  |\n  | [YOLOV7_W6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_w6_coco.h5)   | 70.49M | 180.1G | 1280  | 54.9        |         | 49.9841 qps  |\n  | [YOLOV7_E6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_e6_coco.h5)   | 97.33M | 257.6G | 1280  | 56.0        |         | 31.3852 qps  |\n  | [YOLOV7_D6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_d6_coco.h5)   | 133.9M | 351.4G | 1280  | 56.6        |         | 26.1346 qps  |\n  | [YOLOV7_E6E](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_e6e_coco.h5)  | 151.9M | 421.7G | 1280  | 56.8        |         | 20.1331 qps  |\n## YOLOV8\n  - [Keras YOLOV8](keras_cv_attention_models\u002Fyolov8) includes implementation of [Github ultralytics\u002Fultralytics](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fultralytics) detection and classification models.\n\n  | Model     | Params | FLOPs  | Input | COCO val AP | test AP | T4 Inference |\n  | --------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLOV8_N](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_n_coco.h5)   | 3.16M  | 4.39G  | 640   | 37.3        |         | 614.042 qps  |\n  | [YOLOV8_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_s_coco.h5)   | 11.17M | 14.33G | 640   | 44.9        |         | 349.528 qps  |\n  | [YOLOV8_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_m_coco.h5)   | 25.90M | 39.52G | 640   | 50.2        |         | 160.212 qps  |\n  | [YOLOV8_L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_l_coco.h5)   | 43.69M | 82.65G | 640   | 52.9        |         | 104.452 qps  |\n  | [YOLOV8_X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x_coco.h5)   | 68.23M | 129.0G | 640   | 53.9        |         | 
66.0428 qps  |\n  | [YOLOV8_X6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x6_coco.h5) | 97.42M | 522.6G | 1280  | 56.7 ?      |         | 17.4368 qps  |\n## YOLOX\n  - [Keras YOLOX](keras_cv_attention_models\u002Fyolox) includes implementation of [Paper 2107.08430 YOLOX: Exceeding YOLO Series in 2021](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.08430.pdf).\n\n  | Model     | Params | FLOPs   | Input | COCO val AP | test AP | T4 Inference |\n  | --------- | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [YOLOXNano](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_nano_coco.h5) | 0.91M  | 0.53G   | 416   | 25.8        |         | 930.57 qps   |\n  | [YOLOXTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_tiny_coco.h5) | 5.06M  | 3.22G   | 416   | 32.8        |         | 745.2 qps    |\n  | [YOLOXS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_s_coco.h5)    | 9.0M   | 13.39G  | 640   | 40.5        | 40.5    | 380.38 qps   |\n  | [YOLOXM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_m_coco.h5)    | 25.3M  | 36.84G  | 640   | 46.9        | 47.2    | 181.084 qps  |\n  | [YOLOXL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_l_coco.h5)    | 54.2M  | 77.76G  | 640   | 49.7        | 50.1    | 111.517 qps  |\n  | [YOLOXX](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_x_coco.h5)    | 99.1M  | 140.87G | 640   | 51.5        | 51.5    | 62.3189 qps  |\n***\n\n# Language Models\n## GPT2\n  - [Keras GPT2](keras_cv_attention_models\u002Fgpt2) includes implementation of [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage-models.pdf). 
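A minimal shape-check sketch follows; it assumes the table names below map to constructors the same way the vision models in this package do, and that the model maps `[batch, seq]` token ids to per-position logits over the listed `vocab_size`:\n    ```py\n    import numpy as np\n    from keras_cv_attention_models import gpt2\n\n    # Assumption: `GPT2_Base` is exposed from the gpt2 module and loads the webtext weights by default.\n    mm = gpt2.GPT2_Base()\n    print(mm.count_params())  # roughly 163M, matching the table below\n\n    # Dummy token ids at the [1, 1024] shape used for the T4 benchmark; expect one logit vector per position.\n    dummy_ids = np.random.randint(0, 50257, size=[1, 1024])\n    logits = mm(dummy_ids)\n    print(logits.shape)  # expected (1, 1024, 50257)\n    ```\n    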
`T4 Inference` is tested using `input_shape=[1, 1024]`.\n\n  | Model            | Params  | FLOPs   | vocab_size | LAMBADA PPL | T4 Inference |\n  | ---------------- | ------- | ------- | ---------- | ----------- | ------------ |\n  | [GPT2_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_base_webtext.h5)        | 163.04M | 146.42G | 50257      | 35.13       | 51.4483 qps  |\n  | [GPT2_Medium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_medium_webtext.h5)      | 406.29M | 415.07G | 50257      | 15.60       | 21.756 qps   |\n  | [GPT2_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_large_webtext.h5)       | 838.36M | 890.28G | 50257      | 10.87       |              |\n  | [GPT2_XLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_xlarge_webtext.1.h5), [+.2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_xlarge_webtext.2.h5) | 1.638B  | 1758.3G | 50257      | 8.63        |              |\n## LLaMA2\n  - [Keras LLaMA2](keras_cv_attention_models\u002Fllama2) includes implementation of [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf).\n  - `tiny_stories` weights ported from [Github karpathy\u002Fllama2.c](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002Fllama2.c), and `LLaMA2_1B` model weights ported from [Github jzhang38\u002FTinyLlama](https:\u002F\u002Fgithubfast.com\u002Fjzhang38\u002FTinyLlama) `TinyLlama-1.1B-Chat-V0.4` one.\n\n  | Model       | Params | FLOPs  | vocab_size | Val loss | T4 Inference |\n  | ----------- | ------ | ------ | ---------- | -------- | ------------ |\n  | [LLaMA2_15M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_15m_tiny_stories.h5)  | 24.41M | 4.06G  | 32000      | 1.072    |  |\n  | [LLaMA2_42M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_42m_tiny_stories.h5)  | 58.17M | 50.7G  | 32000      | 0.847    |  |\n  | [LLaMA2_110M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_110m_tiny_stories.h5) | 134.1M | 130.2G | 32000      | 0.760    |  |\n  | [LLaMA2_1B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_1b_tiny_llama_1.1B_chat_v0.4.h5) | 1.10B  | 2.50T  | 32003      |          |\n  | LLaMA2_7B   | 6.74B  | 14.54T | 32000      |          |  |\n***\n\n# Stable Diffusion\n  - [Keras Stable Diffusion](keras_cv_attention_models\u002Fstable_diffusion) includes implementation of [PDF 2112.10752 High-Resolution Image Synthesis with Latent Diffusion Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.10752.pdf). 
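A hedged end-to-end sketch follows; the `StableDiffusion` wrapper name and its `text_to_image` helper are assumptions here, so treat them as placeholders for wiring together the text encoder, UNet and decoder listed in the table below:\n    ```py\n    from keras_cv_attention_models import stable_diffusion\n\n    # Assumed wrapper: builds the CLIP text encoder ([None, 77] tokens), the UNet denoiser and the\n    # decoder ([None, 64, 64, 4] latents) from the table below, then runs the usual latent-diffusion loop.\n    mm = stable_diffusion.StableDiffusion()  # hypothetical entry point\n    images = mm.text_to_image('a photograph of an astronaut riding a horse')  # hypothetical helper\n    print(images.shape)  # expected (batch, 512, 512, 3) at the default resolution\n    ```\n    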
Weights ported from [Github runwayml\u002Fstable-diffusion](https:\u002F\u002Fgithub.com\u002Frunwayml\u002Fstable-diffusion) `sd-v1-5.ckpt`.\n\n  | Model               | Params | FLOPs   | Input               | Download            |\n  | ------------------- | ------ | ------- | ------------------- | ------------------- |\n  | ViTTextLargePatch14 | 123.1M | 6.67G   | [None, 77]          | [vit_text_large_patch14_clip.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit_text_large_patch14_clip.h5) |\n  | Encoder             | 34.16M | 559.6G  | [None, 512, 512, 3] | [encoder_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Fencoder_v1_5.h5) |\n  | UNet                | 859.5M | 404.4G  | [None, 64, 64, 4]   | [unet_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Funet_v1_5.h5) |\n  | Decoder             | 49.49M | 1259.5G | [None, 64, 64, 4]   | [decoder_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Fdecoder_v1_5.h5) |\n***\n\n# Segmentation Models\n## YOLOV8 Segmentation\n  - [Keras YOLOV8](keras_cv_attention_models\u002Fyolov8) includes implementation of [Github ultralytics\u002Fultralytics](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fultralytics) segmentation models.\n\n  | Model        | Params | FLOPs   | Input | COCO val mask AP | T4 Inference |\n  | ------------ | ------ | ------- | ----- | ---------------- | ------------ |\n  | [YOLOV8_N_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_n_seg_coco.h5) | 3.41M  | 6.02G   | 640   | 30.5             |  |\n  | [YOLOV8_S_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_s_seg_coco.h5) | 11.82M | 20.08G  | 640   | 36.8             |  |\n  | [YOLOV8_M_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_m_seg_coco.h5) | 27.29M | 52.33G  | 640   | 40.8             |  |\n  | [YOLOV8_L_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_l_seg_coco.h5) | 46.00M | 105.29G | 640   | 42.6             |  |\n  | [YOLOV8_X_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x_seg_coco.h5) | 71.83M | 164.30G | 640   | 43.4             |  |\n## Segment Anything\n  - [Keras Segment Anything](keras_cv_attention_models\u002Fsegment_anything) includes implementation of [PDF 2304.02643 Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643).\n\n  | Model               | Params | FLOPs | Input | COCO val mask AP | T4 Inference |\n  | ------------------- | ------ | ----- | ----- | ---------------- | ------------ |\n  | [MobileSAM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Fmobile_sam_5m_image_encoder_1024_sam.h5)           | 5.74M  | 39.4G | 1024  | 41.0             |   |\n  | 
[TinySAM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Ftinysam_5m_image_encoder_1024_sam.h5)           | 5.74M  | 39.4G | 1024  | 41.9             |   |\n  | [EfficientViT_SAM_L0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Fefficientvit_sam_l0_image_encoder_1024_sam.h5) | 30.73M | 35.4G | 512   | 45.7             |   |\n***\n\n# Licenses\n  - This part is copied and modified according to [Github rwightman\u002Fpytorch-image-models](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models).\n  - **Code**. The code here is licensed MIT. It is your responsibility to ensure you comply with licenses here and conditions of any dependent licenses. Where applicable, I've linked the sources\u002Freferences for various components in docstrings. If you think I've missed anything please create an issue. So far all of the pretrained weights available here are pretrained on ImageNet and COCO with a select few that have some additional pretraining.\n  - **ImageNet Pretrained Weights**. ImageNet was released for non-commercial research purposes only (https:\u002F\u002Fimage-net.org\u002Fdownload). It's not clear what the implications of that are for the use of pretrained weights from that dataset. Any models I have trained with ImageNet are done for research purposes and one should assume that the original dataset license applies to the weights. It's best to seek legal advice if you intend to use the pretrained weights in a commercial product.\n  - **COCO Pretrained Weights**. Should follow [cocodataset termsofuse](https:\u002F\u002Fcocodataset.org\u002F#termsofuse). The annotations in COCO dataset belong to the COCO Consortium and are licensed under a [Creative Commons Attribution 4.0 License](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby\u002F4.0\u002Flegalcode). The COCO Consortium does not own the copyright of the images. Use of the images must abide by the [Flickr Terms of Use](https:\u002F\u002Fwww.flickr.com\u002Fcreativecommons\u002F). The users of the images accept full responsibility for the use of the dataset, including but not limited to the use of any copies of copyrighted images that they may create from the dataset.\n  - **Pretrained on more than ImageNet and COCO**. Several weights included or references here were pretrained with proprietary datasets that I do not have access to. These include the Facebook WSL, SSL, SWSL ResNe(Xt) and the Google Noisy Student EfficientNet models. The Facebook models have an explicit non-commercial license (CC-BY-NC 4.0, https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsemi-supervised-ImageNet1K-models, https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FWSL-Images). The Google models do not appear to have any restriction beyond the Apache 2.0 license (and ImageNet concerns). 
In either case, you should contact Facebook or Google with any questions.\n***\n\n# Citing\n  - **BibTeX**\n    ```bibtex\n    @misc{leondgarse,\n      author = {Leondgarse},\n      title = {Keras CV Attention Models},\n      year = {2022},\n      publisher = {GitHub},\n      journal = {GitHub repository},\n      doi = {10.5281\u002Fzenodo.6506947},\n      howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models}}\n    }\n    ```\n  - **Latest DOI**: [![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F391777965.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F391777965)\n***\n","# ___Keras_cv_attention_models___\n***\n- **警告：目前与 `keras 3.x` 不兼容。如果使用 `tensorflow>=2.16.0`，需要手动安装 `pip install tf-keras~=$(pip show tensorflow | awk -F ': ' '\u002FVersion\u002F{print $2}')`。导入时，请先于 Tensorflow 导入本包，或设置 `export TF_USE_LEGACY_KERAS=1`。**\n- **不建议直接从 h5 文件下载并加载模型，最好通过构建模型后再加载权重，例如 `import kecam; mm = kecam.models.LCNet050()`。**\n- **用于 TF 的 coco_train_script.py 仍在测试中……**\n\u003C!-- TOC depthFrom:1 depthTo:6 withLinks:1 updateOnSave:1 orderedList:0 -->\n\n- [___>>>> 路线图与待办事项 \u003C\u003C\u003C\u003C___](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fwiki\u002FRoadmap)\n- [通用用法](#general-usage)\n  - [基础](#basic)\n  - [T4 推理](#t4-inference)\n  - [层](#layers)\n  - [模型手术](#model-surgery)\n  - [ImageNet 训练与评估](#imagenet-training-and-evaluating)\n  - [COCO 训练与评估](#coco-training-and-evaluating)\n  - [CLIP 训练与评估](#clip-training-and-evaluating)\n  - [文本训练](#text-training)\n  - [DDPM 训练](#ddpm-training)\n  - [可视化](#visualizing)\n  - [TFLite 转换](#tflite-conversion)\n  - [使用 PyTorch 作为后端](#using-pytorch-as-backend)\n  - [使用 keras core 作为后端](#using-keras-core-as-backend)\n- [识别模型](#recognition-models)\n  - [AotNet](#aotnet)\n  - [BEiT](#beit)\n  - [BEiTV2](#beitv2)\n  - [BotNet](#botnet)\n  - [CAFormer](#caformer)\n  - [CMT](#cmt)\n  - [CoaT](#coat)\n  - [CoAtNet](#coatnet)\n  - [ConvNeXt](#convnext)\n  - [ConvNeXtV2](#convnextv2)\n  - [CoTNet](#cotnet)\n  - [CSPNeXt](#cspnext)\n  - [DaViT](#davit)\n  - [DiNAT](#dinat)\n  - [DINOv2](#dinov2)\n  - [EdgeNeXt](#edgenext)\n  - [EfficientFormer](#efficientformer)\n  - [EfficientFormerV2](#efficientformerv2)\n  - [EfficientNet](#efficientnet)\n  - [EfficientNetEdgeTPU](#efficientnetedgetpu)\n  - [EfficientNetV2](#efficientnetv2)\n  - [EfficientViT_B](#efficientvit_b)\n  - [EfficientViT_M](#efficientvit_m)\n  - [EVA](#eva)\n  - [EVA02](#eva02)\n  - [FasterNet](#fasternet)\n  - [FasterViT](#fastervit)\n  - [FastViT](#fastvit)\n  - [FBNetV3](#fbnetv3)\n  - [FlexiViT](#flexivit)\n  - [GCViT](#gcvit)\n  - [GhostNet](#ghostnet)\n  - [GhostNetV2](#ghostnetv2)\n  - [GMLP](#gmlp)\n  - [GPViT](#gpvit)\n  - [HaloNet](#halonet)\n  - [Hiera](#hiera)\n  - [HorNet](#hornet)\n  - [IFormer](#iformer)\n  - [InceptionNeXt](#inceptionnext)\n  - [LCNet](#lcnet)\n  - [LeViT](#levit)\n  - [MaxViT](#maxvit)\n  - [MetaTransFormer](#metatransformer)\n  - [MLP 混合器](#mlp-mixer)\n  - [MobileNetV3](#mobilenetv3)\n  - [MobileViT](#mobilevit)\n  - [MobileViT_V2](#mobilevit_v2)\n  - [MogaNet](#moganet)\n  - [NAT](#nat)\n  - [NFNets](#nfnets)\n  - [PVT_V2](#pvt_v2)\n  - [RegNetY](#regnety)\n  - [RegNetZ](#regnetz)\n  - [RepViT](#repvit)\n  - [ResMLP](#resmlp)\n  - [ResNeSt](#resnest)\n  - [ResNetD](#resnetd)\n  - [ResNetQ](#resnetq)\n  - [ResNeXt](#resnext)\n  - [SwinTransformerV2](#swintransformerv2)\n  - [TinyNet](#tinynet)\n  - [TinyViT](#tinyvit)\n  - [UniFormer](#uniformer)\n  - [VanillaNet](#vanillanet)\n 
 - [ViT-5](#vit-5)\n  - [VOLO](#volo)\n  - [WaveMLP](#wavemlp)\n- [检测模型](#detection-models)\n  - [EfficientDet](#efficientdet)\n  - [YOLO_NAS](#yolo_nas)\n  - [YOLOR](#yolor)\n  - [YOLOV7](#yolov7)\n  - [YOLOV8](#yolov8)\n  - [YOLOX](#yolox)\n- [语言模型](#language-models)\n  - [GPT2](#gpt2)\n  - [LLaMA2](#llama2)\n- [稳定扩散](#stable-diffusion)\n- [分割模型](#segmentation-models)\n  - [YOLOV8 分割](#yolov8-segmentation)\n  - [Segment Anything](#segment-anything)\n- [许可证](#licenses)\n- [引用](#citing)\n\n\u003C!-- \u002FTOC -->\n***\n\n# 通用用法\n## 基础\n  - **默认导入** 在 README 中使用时不会特别说明。\n    ```py\n    import os\n    import sys\n    import tensorflow as tf\n    import numpy as np\n    import pandas as pd\n    import matplotlib.pyplot as plt\n    from tensorflow import keras\n    ```\n  - 以 pip 包形式安装。`kecam` 是本包的简称。**注意**：pip 包 `kecam` 不设定任何后端要求，请确保事先已安装 Tensorflow 或 PyTorch。如需使用 PyTorch 后端，请参阅 [Keras PyTorch 后端](keras_cv_attention_models\u002Fpytorch_backend)。\n    ```sh\n    pip install -U kecam\n    # 或\n    pip install -U keras-cv-attention-models\n    # 或\n    pip install -U git+https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\n    ```\n    具体用法请参考各子目录。\n  - **基础模型预测**\n    ```py\n    from keras_cv_attention_models import volo\n    mm = volo.VOLO_d1(pretrained=\"imagenet\")\n\n    \"\"\" 运行预测 \"\"\"\n    import tensorflow as tf\n    from tensorflow import keras\n    from keras_cv_attention_models.test_images import cat\n    img = cat()\n    imm = keras.applications.imagenet_utils.preprocess_input(img, mode='torch')\n    pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy()\n    pred = tf.nn.softmax(pred).numpy()  # 如果分类器激活函数不是 softmax\n    print(keras.applications.imagenet_utils.decode_predictions(pred)[0])\n    # [('n02124075', '埃及猫', 0.99664897),\n    #  ('n02123045', '虎斑猫', 0.0007249644),\n    #  ('n02123159', '虎猫', 0.00020345),\n    #  ('n02127052', '猞猁', 5.4973923e-05),\n    #  ('n02123597', '暹罗猫', 2.675306e-05)]\n    ```\n    或者直接使用模型预设的 `preprocess_input` 和 `decode_predictions`\n    ```py\n    from keras_cv_attention_models import coatnet\n    mm = coatnet.CoAtNet0()\n\n    from keras_cv_attention_models.test_images import cat\n    preds = mm(mm.preprocess_input(cat()))\n    print(mm.decode_predictions(preds))\n    # [[('n02124075', '埃及猫', 0.9999875), ('n02123045', '虎斑猫', 5.194884e-06), ...]]\n    ```\n    预设的 `preprocess_input` 和 `decode_predictions` 也兼容 PyTorch 后端。\n    ```py\n    os.environ['KECAM_BACKEND'] = 'torch'\n\n    from keras_cv_attention_models import caformer\n    mm = caformer.CAFormerS18()\n    # >>>> 使用 PyTorch 后端\n    # >>>> 对齐输入形状：[3, 224, 224]\n    # >>>> 从 ~\u002F.keras\u002Fmodels\u002Fcaformer_s18_224_imagenet.h5 加载预训练权重\n\nfrom keras_cv_attention_models.test_images import cat\n    preds = mm(mm.preprocess_input(cat()))\n    print(preds.shape)\n    # torch.Size([1, 1000])\n    print(mm.decode_predictions(preds))\n    # [[('n02124075', '埃及猫', 0.8817097), ('n02123045', '虎斑猫', 0.009335292), ...]]\n    ```\n  - 设置 **`num_classes=0`** 以排除模型顶部的 `GlobalAveragePooling2D + Dense` 层。\n    ```py\n    from keras_cv_attention_models import resnest\n    mm = resnest.ResNest50(num_classes=0)\n    print(mm.output_shape)\n    # (None, 7, 7, 2048)\n    ```\n  - 如果 **`num_classes={自定义输出类别}`** 不是 `1000` 或 `0`，则会跳过加载头部的 Dense 层权重。这是因为使用了 `model.load_weights(weight_file, by_name=True, skip_mismatch=True)` 来加载权重。\n    ```py\n    from keras_cv_attention_models import swin_transformer_v2\n\n    mm = 
swin_transformer_v2.SwinTransformerV2Tiny_window8(num_classes=64)\n    # >>>> 从 ~\u002F.keras\u002Fmodels\u002Fswin_transformer_v2_tiny_window8_256_imagenet.h5 加载预训练权重\n    # WARNING:tensorflow:由于权重 predictions\u002Fkernel:0 的形状不匹配，跳过加载第 601 层（名为 predictions）的权重。该权重期望形状为 (768, 64)，而保存的权重形状为 (768, 1000)。\n    # WARNING:tensorflow:由于权重 predictions\u002Fbias:0 的形状不匹配，跳过加载第 601 层（名为 predictions）的权重。该权重期望形状为 (64,)，而保存的权重形状为 (1000,)。\n    ```\n  - 可以通过设置 **`pretrained=\"xxx.h5\"`** 重新加载自己的模型权重。与直接调用 `model.load_weights` 相比，这种方法在重新加载具有不同 `input_shape` 且权重形状不匹配的模型时更为优越。\n    ```py\n    import os\n    from keras_cv_attention_models import coatnet\n    pretrained = os.path.expanduser('~\u002F.keras\u002Fmodels\u002Fcoatnet0_224_imagenet.h5')\n    mm = coatnet.CoAtNet1(input_shape=(384, 384, 3), pretrained=pretrained)  # 没什么意义，只是为了展示用法\n    ```\n  - 可以使用别名 **`kecam`** 代替 `keras_cv_attention_models`。它只是一个仅包含 `from keras_cv_attention_models import *` 的 `__init__.py` 文件。\n    ```py\n    import kecam\n    mm = kecam.yolor.YOLOR_CSP()\n    imm = kecam.test_images.dog_cat()\n    preds = mm(mm.preprocess_input(imm))\n    bboxs, lables, confidences = mm.decode_predictions(preds)[0]\n    kecam.coco.show_image_with_bboxes(imm, bboxs, lables, confidences)\n    ```\n  - 使用 [TF 2.0 功能：FLOPs 计算 #32809](https:\u002F\u002Fgithub.com\u002Ftensorflow\u002Ftensorflow\u002Fissues\u002F32809#issuecomment-849439287) 中的方法计算 FLOPs。对于 PyTorch 后端，需要安装 `thop`：`pip install thop`。\n    ```py\n    from keras_cv_attention_models import coatnet, resnest, model_surgery\n\n    model_surgery.get_flops(coatnet.CoAtNet0())\n    # >>>> FLOPs: 4,221,908,559, GFLOPs: 4.2219G\n    model_surgery.get_flops(resnest.ResNest50())\n    # >>>> FLOPs: 5,378,399,992, GFLOPs: 5.3784G\n    ```\n  - **[已弃用] `tensorflow_addons`** 默认不会被导入。如果直接从 `h5` 文件中加载依赖于 `GroupNormalization` 的模型（如 `MobileViTV2`），则需要先手动导入 `tensorflow_addons`。\n    ```py\n    import tensorflow_addons as tfa\n\n    model_path = os.path.expanduser('~\u002F.keras\u002Fmodels\u002Fmobilevit_v2_050_256_imagenet.h5')\n    mm = keras.models.load_model(model_path)\n    ```\n  - 将 TF 模型导出为 ONNX 格式。对于 TF 需要 `tf2onnx`：`pip install onnx tf2onnx onnxsim onnxruntime`。对于 PyTorch 后端，PyTorch 本身支持导出 ONNX 模型。\n    ```py\n    from keras_cv_attention_models import volo, nat, model_surgery\n    mm = nat.DiNAT_Small(pretrained=True)\n    model_surgery.export_onnx(mm, fuse_conv_bn=True, batch_size=1, simplify=True)\n    # 导出的简化 ONNX：dinat_small.onnx\n\n    # 运行测试\n    from keras_cv_attention_models.imagenet import eval_func\n    aa = eval_func.ONNXModelInterf(mm.name + '.onnx')\n    inputs = np.random.uniform(size=[1, *mm.input_shape[1:]]).astype('float32')\n    print(f\"{np.allclose(aa(inputs), mm(inputs), atol=1e-5) = }\")\n    # np.allclose(aa(inputs), mm(inputs), atol=1e-5) = True\n    ```\n  - **模型摘要** `model_summary.csv` 包含汇总的模型信息。\n    - `params` 表示模型参数数量，单位为 M\n    - `flops` 表示 FLOPs 数量，单位为 G\n    - `input` 表示模型输入形状\n    - `acc_metrics` 表示识别模型的 `Imagenet Top1 Accuracy`，检测模型的 `COCO val AP`\n    - `inference_qps` 表示使用 `batch_size=1 + trtexec` 时的 `T4 推理每秒查询数`\n    - `extra` 表示是否有额外的训练信息。\n    ```py\n    from keras_cv_attention_models import plot_func\n    plot_series = [\n        \"efficientnetv2\", 'tinynet', 'lcnet', 'mobilenetv3', 'fasternet', 'fastervit', 'ghostnet',\n        'inceptionnext', 'efficientvit_b', 'mobilevit', 'convnextv2', 'efficientvit_m', 'hiera',\n    ]\n    plot_func.plot_model_summary(\n        plot_series, model_table=\"model_summary.csv\", log_scale_x=True, 
allow_extras=['mae_in1k_ft1k']\n    )\n    ```\n    ![model_summary](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_ffe507351a3b.png)\n  - **代码格式** 使用 `line-length=160`：\n    ```sh\n    find .\u002F* -name \"*.py\" | grep -v __init__ | xargs -I {} black -l 160 {}\n    ```\n\n\n## T4 推理\n  - 模型表格中的 **T4 推理** 数据是在 `Tesla T4` 上使用 `trtexec` 测试得到的，使用的环境为 `CUDA=12.0.1-1, Driver=525.60.13`。所有模型均使用 PyTorch 后端导出为 ONNX 格式，且仅使用 `batch_size=1`。**注意：这些数据仅供参考，在不同的批量大小、基准测试工具、平台或实现方式下可能会有所不同**。\n  - 所有结果均在 colab 的 [trtexec.ipynb](https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1xLwfvbZNqadkdAZu9b0UzOrETLo657oc?usp=drive_link) 中测试完成，因此任何人都可以复现。\n    ```py\n    os.environ[\"KECAM_BACKEND\"] = \"torch\"\n\n    from keras_cv_attention_models import convnext, test_images, imagenet\n    # >>>> 使用 PyTorch 后端\n    mm = convnext.ConvNeXtTiny()\n    mm.export_onnx(simplify=True)\n    # 导出的 ONNX：convnext_tiny.onnx\n    # 正在运行 onnxsim.simplify...\n    # 导出的简化 ONNX：convnext_tiny.onnx\n\n    # ONNX 运行测试\n    tt = imagenet.eval_func.ONNXModelInterf('convnext_tiny.onnx')\n    print(mm.decode_predictions(tt(mm.preprocess_input(test_images.cat()))))\n    # [[('n02124075', '埃及猫', 0.880507), ('n02123045', '虎斑猫', 0.0047998047), ...]]\n\n    \"\"\" 运行 trtexec 基准测试 \"\"\"\n    !trtexec --onnx=convnext_tiny.onnx --fp16 --allowGPUFallback --useSpinWait --useCudaGraph\n    ```\n## 层\n  - [attention_layers](keras_cv_attention_models\u002Fattention_layers) 仅是一个 `__init__.py` 文件，它导入了模型架构中定义的核心层。例如来自 `botnet` 的 `RelativePositionalEmbedding`，来自 `volo` 的 `outlook_attention`，以及其他许多 `Positional Embedding Layers` \u002F `Attention Blocks`。\n    ```py\n    from keras_cv_attention_models import attention_layers\n    aa = attention_layers.RelativePositionalEmbedding()\n    print(f\"{aa(tf.ones([1, 4, 14, 16, 256])).shape = }\")\n    # aa(tf.ones([1, 4, 14, 16, 256])).shape = TensorShape([1, 4, 14, 16, 14, 16])\n    ```\n\n## 模型手术\n  - [model_surgery](keras_cv_attention_models\u002Fmodel_surgery) 包含在模型构建后用于修改模型参数的函数。\n  ```py\n  from keras_cv_attention_models import model_surgery\n  mm = keras.applications.ResNet50()  # 可训练参数：25,583,592\n\n  # 将所有ReLU替换为PReLU。可训练参数：25,606,312\n  mm = model_surgery.replace_ReLU(mm, target_activation='PReLU')\n\n  # 融合卷积层和批归一化层。可训练参数：25,553,192\n  mm = model_surgery.convert_to_fused_conv_bn_model(mm)\n  ```\n## ImageNet 训练与评估\n  - [ImageNet](keras_cv_attention_models\u002Fimagenet) 包含更详细的使用说明及一些对比结果。\n  - [使用 tensorflow_datasets 初始化 ImageNet 数据集 #9](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F9)。\n  - 对于自定义数据集，可以使用 `custom_dataset_script.py` 创建一个 `json` 格式的文件，该文件可用作训练时的 `--data_name xxx.json` 参数；详细用法请参见 [自定义识别数据集](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussion-3971513)。\n  - 另一种创建自定义数据集的方法是使用 `tfds.load`，请参考 [编写自定义数据集](https:\u002F\u002Fwww.tensorflow.org\u002Fdatasets\u002Fadd_dataset) 和 @Medicmind 的 [从 tfds 创建私有 tensorflow_datasets #48](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F48)。\n  - 使用 `keras_cv_attention_models` 在 AWS Sagemaker 上运行估算器任务的示例，请参见 @Medicmind 提供的 [AWS Sagemaker 脚本示例](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F107)。\n  - `aotnet.AotNet50` 的默认参数设置是一种典型的 `ResNet50` 架构，其中 `Conv2D` 使用 `use_bias=False`，且填充方式类似于 `PyTorch`。\n  - `train_script.py` 的默认参数配置类似于 [ResNet 再出击：timm 
中改进的训练流程](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.00476.pdf) 中的 `A3` 配置，即 `batch_size=256, input_shape=(160, 160)`。\n    ```sh\n    # 默认启用抗锯齿缩放，可通过设置 `--disable_antialias` 关闭。\n    CUDA_VISIBLE_DEVICES='0' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 train_script.py --seed 0 -s aotnet50\n    ```\n    ```sh\n    # 使用输入尺寸 (224, 224) 进行评估。\n    # 抗锯齿的使用应与训练时一致。\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m aotnet50_epoch_103_val_acc_0.7674.h5 -i 224 --central_crop 0.95\n    # >>>> 准确率 top1: 0.78466 top5: 0.94088\n    ```\n    ![aotnet50_imagenet](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_441ce7e7b0f5.png)\n  - **从断点恢复**：通过设置 `--restore_path` 和 `--initial_epoch` 来实现，其他参数保持不变。`restore_path` 的优先级高于 `model` 和 `additional_model_kwargs`，同时会恢复 `optimizer` 和 `loss`。`initial_epoch` 主要用于学习率调度器。如果不确定停止的位置，可以查看 `checkpoints\u002F{save_name}_hist.json`。\n    ```py\n    import json\n    with open(\"checkpoints\u002Faotnet50_hist.json\", \"r\") as ff:\n        aa = json.load(ff)\n    len(aa['lr'])\n    # 41 ==> 已完成 41 个 epoch，因此 initial_epoch 为 41，从第 42 个 epoch 开始继续训练。\n    ```\n    ```sh\n    CUDA_VISIBLE_DEVICES='0' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 train_script.py --seed 0 -r checkpoints\u002Faotnet50_latest.h5 -I 41\n    # >>>> 从模型：checkpoints\u002Faotnet50_latest.h5 恢复\n    # 第 42\u002F105 个 epoch\n    ```\n  - **`eval_script.py`** 用于评估模型的准确率。[EfficientNetV2 自测 ImageNet 准确率 #19](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F19) 展示了不同参数如何影响模型的准确率。\n    ```sh\n    # 评估预训练的内置模型\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m regnet.RegNetZD8\n    # 评估预训练的 timm 模型\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m timm.models.resmlp_12_224 --input_shape 224\n\n    # 评估特定的 h5 模型\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m checkpoints\u002Fxxx.h5\n    # 评估特定的 tflite 模型\n    CUDA_VISIBLE_DEVICES='1' python3 eval_script.py -m xxx.tflite\n    ```\n  - **渐进式训练** 参考 [PDF 2104.00298 EfficientNetV2：更小的模型和更快的训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.00298.pdf)。AotNet50 A3 渐进式输入尺寸 `96 128 160`：\n    ```sh\n    CUDA_VISIBLE_DEVICES='1' TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python3 progressive_train_script.py \\\n    --progressive_epochs 33 66 -1 \\\n    --progressive_input_shapes 96 128 160 \\\n    --progressive_magnitudes 2 4 6 \\\n    -s aotnet50_progressive_3_lr_steps_100 --seed 0\n    ```\n    ![aotnet50_progressive_160](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_0ac3b49a6c69.png)\n  - 使用 `freeze_backbone` 或 `freeze_norm_layers` 进行迁移学习：[EfficientNetV2B0 在 cifar10 上进行迁移学习，测试冻结骨干网络 #55](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F55)。\n  - [CIFAR10 上的 Token Label 训练与测试 #57](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F57)。**目前效果未达预期**。`Token label` 是对 [Github zihangJiang\u002FTokenLabeling](https:\u002F\u002Fgithub.com\u002FzihangJiang\u002FTokenLabeling) 的实现，相关论文为 [PDF 2104.10858 所有 token 都重要：用于训练更好视觉 Transformer 的 Token Labeling](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.10858.pdf)。\n\n## COCO 训练与评估\n  - **目前仍在测试中**。\n  - [COCO](keras_cv_attention_models\u002Fcoco) 提供了更详细的使用说明。\n  - `custom_dataset_script.py` 可用于生成 `json` 格式的文件，该文件可作为 `--data_name xxx.json` 参数用于训练。详细用法请参见 
[自定义检测数据集](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussioncomment-2460664)。\n  - `coco_train_script.py` 的默认参数为 `EfficientDetD0`，配置为 `input_shape=(256, 256, 3), batch_size=64, mosaic_mix_prob=0.5, freeze_backbone_epochs=32, total_epochs=105`。从技术上讲，任何 `金字塔结构骨干` + `EfficientDet \u002F YOLOX 头部 \u002F YOLOR 头部` + `无锚点 \u002F yolor \u002F efficientdet 锚点` 的组合都是支持的。\n  - 目前支持四种类型的锚点，参数 **`anchors_mode`** 用于控制使用哪种锚点，取值为 `[\"efficientdet\", \"anchor_free\", \"yolor\", \"yolov8\"]`。对于 `det_header` 预设，默认为 `None`。\n  - **注意：`YOLOV8` 的边界框输出长度默认为 `regression_len=64`。通常其他检测模型为 `4`，而对于 yolov8 则是 `reg_max=16 -> regression_len = 16 * 4 == 64`。**\n\n    | anchors_mode | use_object_scores | num_anchors | anchor_scale | aspect_ratios | num_scales | grid_zero_start |\n    | ------------ | ----------------- | ----------- | ------------ | ------------- | ---------- | --------------- |\n    | efficientdet | False             | 9           | 4            | [1, 2, 0.5]   | 3          | False           |\n    | anchor_free  | True              | 1           | 1            | [1]           | 1          | True            |\n    | yolor        | True              | 3           | None         | 预设          | None       | offset=0.5      |\n    | yolov8       | False             | 1           | 1            | [1]           | 1          | False           |\n\n    ```sh\n    # 默认 EfficientDetD0\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py\n    # 默认 EfficientDetD0 使用 input_shape 512、优化器 adamw、冻结骨干 16 轮、总共 50 + 5 轮\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py -i 512 -p adamw --freeze_backbone_epochs 16 --lr_decay_steps 50\n\n    # EfficientNetV2B0 骨干 + EfficientDetD0 检测头部\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone efficientnet.EfficientNetV2B0 --det_header efficientdet.EfficientDetD0\n    # ResNest50 骨干 + EfficientDetD0 头部，使用类似 yolox 的无锚点锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone resnest.ResNest50 --anchors_mode anchor_free\n    # UniformerSmall32 骨干 + EfficientDetD0 头部，使用 yolor 锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone uniformer.UniformerSmall32 --anchors_mode yolor\n\n    # 典型的 YOLOXS，使用无锚点锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolox.YOLOXS --freeze_backbone_epochs 0\n    # YOLOXS 使用 efficientdet 锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolox.YOLOXS --anchors_mode efficientdet --freeze_backbone_epochs 0\n    # CoAtNet0 骨干 + YOLOX 头部，使用 yolor 锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone coatnet.CoAtNet0 --det_header yolox.YOLOX --anchors_mode yolor\n\n    # 典型的 YOLOR_P6，使用 yolor 锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolor.YOLOR_P6 --freeze_backbone_epochs 0\n    # YOLOR_P6 使用无锚点锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --det_header yolor.YOLOR_P6 --anchors_mode anchor_free --freeze_backbone_epochs 0\n    # ConvNeXtTiny 骨干 + YOLOR 头部，使用 efficientdet 锚点\n    CUDA_VISIBLE_DEVICES='0' python3 coco_train_script.py --backbone convnext.ConvNeXtTiny --det_header yolor.YOLOR --anchors_mode yolor\n    ```\n    **注：COCO 训练仍在测试中，参数和默认行为可能会发生变化。如果您愿意参与开发，请自行承担风险。**\n  - **`coco_eval_script.py`** 用于在 COCO 验证集上评估模型的 AP \u002F AR。它依赖于 `pip install pycocotools`，该包不在项目依赖中。更多用法请参见 [COCO 评估](keras_cv_attention_models\u002Fcoco#evaluation)。\n    ```sh\n    # EfficientDetD0 使用双线性插值，不启用抗锯齿\n    CUDA_VISIBLE_DEVICES='1' 
python3 coco_eval_script.py -m efficientdet.EfficientDetD0 --resize_method bilinear --disable_antialias\n    # >>>> [COCOEvalCallback] input_shape: (512, 512), pyramid_levels: [3, 7], anchors_mode: efficientdet\n\n    # YOLOX 使用 BGR 输入格式\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m yolox.YOLOXTiny --use_bgr_input --nms_method hard --nms_iou_or_sigma 0.65\n    # >>>> [COCOEvalCallback] input_shape: (416, 416), pyramid_levels: [3, 5], anchors_mode: anchor_free\n\n    # YOLOR \u002F YOLOV7 使用 letterbox_pad 等技巧\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m yolor.YOLOR_CSP --nms_method hard --nms_iou_or_sigma 0.65 \\\n    --nms_max_output_size 300 --nms_topk -1 --letterbox_pad 64 --input_shape 704\n    # >>>> [COCOEvalCallback] input_shape: (704, 704), pyramid_levels: [3, 5], anchors_mode: yolor\n\n    # 指定 h5 模型\n    CUDA_VISIBLE_DEVICES='1' python3 coco_eval_script.py -m checkpoints\u002Fyoloxtiny_yolor_anchor.h5\n    # >>>> [COCOEvalCallback] input_shape: (416, 416), pyramid_levels: [3, 5], anchors_mode: yolor\n    ```\n  - **[实验性] 使用 PyTorch 后端进行训练**\n    ```py\n    import os, sys, torch\n    os.environ[\"KECAM_BACKEND\"] = \"torch\"\n\n    from keras_cv_attention_models.yolov8 import train, yolov8\n    from keras_cv_attention_models import efficientnet\n\n    global_device = torch.device(\"cuda:0\") if torch.cuda.is_available() and int(os.environ.get(\"CUDA_VISIBLE_DEVICES\", \"0\")) >= 0 else torch.device(\"cpu\")\n    # 模型可训练参数：7,023,904，GFLOPs：8.1815G\n    bb = efficientnet.EfficientNetV2B0(input_shape=(3, 640, 640), num_classes=0)\n    model = yolov8.YOLOV8_N(backbone=bb, classifier_activation=None, pretrained=None).to(global_device)  # 注意：classifier_activation=None\n    # model = yolov8.YOLOV8_N(input_shape=(3, None, None), classifier_activation=None, pretrained=None).to(global_device)\n    ema = train.train(model, dataset_path=\"coco.json\", initial_epoch=0)\n    ```\n    ![yolov8_training](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_9ca0b522c34d.png)\n\n## CLIP 训练与评估\n  - [CLIP](keras_cv_attention_models\u002Fclip) 提供了更详细的使用说明。\n  - `custom_dataset_script.py` 可用于生成 `tsv` 或 `json` 格式的文件，该文件可作为 `--data_name xxx.tsv` 用于训练。详细用法请参见 [自定义字幕数据集](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussioncomment-6516154)。\n  - **使用 `clip_train_script.py` 在 COCO 字幕数据上训练** 默认的 `--data_path` 是一个测试数据集 `datasets\u002Fcoco_dog_cat\u002Fcaptions.tsv`。\n    ```sh\n    CUDA_VISIBLE_DEVICES=1 TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python clip_train_script.py -i 160 -b 128 \\\n    --text_model_pretrained None --data_path coco_captions.tsv\n    ```\n    **通过设置 `KECAM_BACKEND='torch'` 使用 PyTorch 后端进行训练**\n    ```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python clip_train_script.py -i 160 -b 128 \\\n    --text_model_pretrained None --data_path coco_captions.tsv\n    ```\n    ![clip_torch_tf](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_884ee09338b0.png)\n## 文本训练\n  - 目前这只是一个简单的实现，基于 [Github karpathy\u002FnanoGPT](https:\u002F\u002Fgithub.com\u002FkarpathY\u002FnanoGPT) 修改而来。\n  - **使用 `text_train_script.py` 进行训练** 由于数据集是随机采样的，需要指定 `steps_per_epoch`。\n    ```sh\n    CUDA_VISIBLE_DEVICES=1 TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" python text_train_script.py -m LLaMA2_15M \\\n    --steps_per_epoch 8000 --batch_size 8 --tokenizer SentencePieceTokenizer\n    ```\n
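    **训练后生成文本** 下面是一个最小示意：训练结束后构建同样的模型结构、加载保存的权重，并用 `run_prediction` 生成文本（用法与后文 LLaMA2_42M 的示例一致）。其中的检查点文件名仅为假设，请替换为 `checkpoints\u002F` 下实际生成的权重文件。\n    ```py\n    from keras_cv_attention_models import llama2\n\n    mm = llama2.LLaMA2_15M(pretrained=None)  # 构建与训练时相同的模型结构\n    mm.load_weights('checkpoints\u002Ftext_llama2_15m_latest.h5')  # 假设的检查点文件名，请替换为实际保存的权重\n    print(mm.run_prediction('Once upon a time'))  # run_prediction 会自动加载 sentencepiece 分词器\n    ```\n    **通过设置 `KECAM_BACKEND='torch'` 使用 PyTorch 后端进行训练**\n    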
```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python text_train_script.py -m LLaMA2_15M \\\n    --steps_per_epoch 8000 --batch_size 8 --tokenizer SentencePieceTokenizer\n    ```\n    **绘图**\n    ```py\n    from keras_cv_attention_models import plot_func\n    hists = ['checkpoints\u002Ftext_llama2_15m_tensorflow_hist.json', 'checkpoints\u002Ftext_llama2_15m_torch_hist.json']\n    plot_func.plot_hists(hists, addition_plots=['val_loss', 'lr'], skip_first=3)\n    ```\n    ![text_tf_torch](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_169d0748adf2.png)\n## DDPM 训练\n  - [Stable Diffusion](keras_cv_attention_models\u002Fstable_diffusion) 提供了更详细的使用说明。\n  - **注意：使用 PyTorch 后端效果更好，TensorFlow 后端在类似 `--epochs 200` 的训练日志中似乎更容易过拟合，且评估速度大约慢 5 倍。[???]**\n  - **数据集** 可以是一个仅包含图像的目录，用于仅使用图像的基础 DDPM 训练；也可以是一个按照 [自定义识别数据集](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F52#discussion-3971513) 创建的识别 JSON 文件，该文件将使用标签作为指令进行训练。\n    ```sh\n    python custom_dataset_script.py --train_images cifar10\u002Ftrain\u002F --test_images cifar10\u002Ftest\u002F\n    # >>>> 总训练样本数：50000，总测试样本数：10000，类别数：10\n    # >>>> 已保存至：cifar10.json\n    ```\n  - **使用 `ddpm_train_script.py` 在带有标签的 CIFAR10 数据集上训练** 默认的 `--data_path` 是内置的 `cifar10`。\n    ```sh\n    # 将 --eval_interval 设置为 50，因为 TensorFlow 的评估速度较慢 [???]\n    TF_XLA_FLAGS=\"--tf_xla_auto_jit=2\" CUDA_VISIBLE_DEVICES=1 python ddpm_train_script.py --eval_interval 50\n    ```\n    **通过设置 `KECAM_BACKEND='torch'` 使用 PyTorch 后端进行训练**\n    ```sh\n    KECAM_BACKEND='torch' CUDA_VISIBLE_DEVICES=1 python ddpm_train_script.py\n    ```\n    ![ddpm_unet_test_E100](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_aae031eaab57.png)\n## 可视化\n  - [Visualizing](keras_cv_attention_models\u002Fvisualizing) 用于可视化卷积神经网络的滤波器或注意力图得分。\n  - **make_and_apply_gradcam_heatmap** 用于 Grad-CAM 类激活可视化。\n    ```py\n    from keras_cv_attention_models import visualizing, test_images, resnest\n    mm = resnest.ResNest50()\n    img = test_images.dog()\n    superimposed_img, heatmap, preds = visualizing.make_and_apply_gradcam_heatmap(mm, img, layer_name=\"auto\")\n    ```\n    ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_b9beb662002f.png)\n  - **plot_attention_score_maps** 用于模型注意力得分图的可视化。\n    ```py\n    from keras_cv_attention_models import visualizing, test_images, botnet\n    img = test_images.dog()\n    _ = visualizing.plot_attention_score_maps(botnet.BotNetSE33T(), img)\n    ```\n    ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_readme_086c1a2556dc.png)\n\n## TFLite 转换\n  - 目前 `TFLite` 不支持 `tf.image.extract_patches` 和 `perm` 长度大于 4 的 `tf.transpose`。某些操作在最新版本或 `tf-nightly` 版本中可能已支持，例如之前不支持的 `gelu` 和 `groups>1` 的 `Conv2D` 现在已经可以正常使用。如果遇到问题，可以尝试更新 TensorFlow 版本。\n  - 更多讨论请参见 [将训练好的 Keras CV 注意力模型转换为 TFLite #17](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F17)。一些速度测试结果可以在 [如何加速量化模型的推理 #44](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fdiscussions\u002F44#discussioncomment-2348910) 中找到。\n  - 使用最新版 TensorFlow 时，无需再使用诸如 `model_surgery.convert_groups_conv2d_2_split_conv2d` 和 `model_surgery.convert_gelu_to_approximate` 等函数。\n  - 不支持将 `VOLO` 和 `HaloNet` 模型转换为 TFLite 格式，因为这些模型需要更长的 `tf.transpose` `perm`。\n
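  - **基本转换示意** 对于算子均受 TFLite 支持的模型，直接使用 `tf.lite.TFLiteConverter` 即可完成转换。下面是一个最小示意（以 `EfficientNetV2B0` 为例），并用 `eval_func.TFLiteModelInterf` 检查转换前后的输出是否一致：\n    ```py\n    import numpy as np\n    import tensorflow as tf\n    from keras_cv_attention_models import efficientnet\n    from keras_cv_attention_models.imagenet import eval_func\n\n    # 构建模型并转换为 TFLite\n    mm = efficientnet.EfficientNetV2B0()\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + '.tflite', 'wb').write(converter.convert())\n\n    # 用随机输入对比原模型与 tflite 模型的输出\n    test_inputs = np.random.uniform(size=[1, *mm.input_shape[1:]])\n    print(np.allclose(mm(test_inputs), eval_func.TFLiteModelInterf(mm.name + '.tflite')(test_inputs), atol=1e-5))\n    ```\n  - **model_surgery.convert_dense_to_conv** 会将所有具有 3D 或 4D 输入的 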
`Dense` 层转换为 `Conv1D` 或 `Conv2D`，因为当前 TFLite 的 xnnpack 尚不支持此类操作。\n    ```py\n    from keras_cv_attention_models import beit, model_surgery, efficientformer, mobilevit\n\n    mm = efficientformer.EfficientFormerL1()\n    mm = model_surgery.convert_dense_to_conv(mm)  # 将所有 Dense 层转换\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", \"wb\").write(converter.convert())\n    ```\n    | 模型             | Dense, use_xnnpack=false  | Conv, use_xnnpack=false   | Conv, use_xnnpack=true    |\n    | ----------------- | ------------------------- | ------------------------- | ------------------------- |\n    | MobileViT_S       | 推理（平均）215371 us | 推理（平均）163836 us | 推理（平均）163817 us |\n    | EfficientFormerL1 | 推理（平均）126829 us | 推理（平均）107053 us | 推理（平均）107132 us |\n  - **model_surgery.convert_extract_patches_to_conv** 会将 `tf.image.extract_patches` 转换为等效的 `Conv2D` 实现：\n    ```py\n    from keras_cv_attention_models import cotnet, model_surgery\n    from keras_cv_attention_models.imagenet import eval_func\n\n    mm = cotnet.CotNetSE50D()\n    mm = model_surgery.convert_groups_conv2d_2_split_conv2d(mm)\n    # mm = model_surgery.convert_gelu_to_approximate(mm)  # 如果使用最新版 TFLite，则无需此步骤\n    mm = model_surgery.convert_extract_patches_to_conv(mm)\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", \"wb\").write(converter.convert())\n    test_inputs = np.random.uniform(size=[1, *mm.input_shape[1:]])\n    print(np.allclose(mm(test_inputs), eval_func.TFLiteModelInterf(mm.name + '.tflite')(test_inputs), atol=1e-7))\n    # True\n    ```\n  - **model_surgery.prepare_for_tflite** 是上述功能的组合：\n    ```py\n    from keras_cv_attention_models import beit, model_surgery\n\n    mm = beit.BeitBasePatch16()\n    mm = model_surgery.prepare_for_tflite(mm)\n    converter = tf.lite.TFLiteConverter.from_keras_model(mm)\n    open(mm.name + \".tflite\", \"wb\").write(converter.convert())\n    ```\n  - **检测模型** 包括 `efficinetdet`、`yolox` 和 `yolor`，可以直接转换为 TFLite 格式。如果需要将 [DecodePredictions](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fblob\u002Fmain\u002Fkeras_cv_attention_models\u002Fcoco\u002Feval_func.py#L8) 也包含在 TFLite 模型中，则需将 `DecodePredictions` 的 `use_static_output` 设置为 `True`，因为 TFLite 要求输出形状更加静态。模型的输出形状将固定为 `[batch, max_output_size, 6]`。其中最后一维 `6` 表示 `[bbox_top, bbox_left, bbox_bottom, bbox_right, label_index, confidence]`，有效预测是那些 `confidence > 0` 的结果。\n    ```py\n    \"\"\" 初始化模型 \"\"\"\n    from keras_cv_attention_models import efficientdet\n    model = efficientdet.EfficientDetD0(pretrained=\"coco\")\n\n    \"\"\" 创建带有 DecodePredictions 的模型，并设置 `use_static_output=True` \"\"\"\n    model.decode_predictions.use_static_output = True\n    # 如有需要，可调整 score_threshold 和 iou_or_sigma 等参数。\n    nn = model.decode_predictions(model.outputs[0], score_threshold=0.5)\n    bb = keras.models.Model(model.inputs[0], nn)\n\n    \"\"\" 转换为 TFLite \"\"\"\n    converter = tf.lite.TFLiteConverter.from_keras_model(bb)\n    open(bb.name + \".tflite\", \"wb\").write(converter.convert())\n\n    \"\"\" 推理测试 \"\"\"\n    from keras_cv_attention_models.imagenet import eval_func\n    from keras_cv_attention_models import test_images\n\n    dd = eval_func.TFLiteModelInterf(bb.name + \".tflite\")\n    imm = test_images.cat()\n    inputs = tf.expand_dims(tf.image.resize(imm, dd.input_shape[1:-1]), 0)\n    inputs = keras.applications.imagenet_utils.preprocess_input(inputs, mode='torch')\n    preds = dd(inputs)[0]\n    print(f\"{preds.shape = 
}\")\n    # preds.shape = (100, 6)\n\n    pred = preds[preds[:, -1] > 0]\n    bboxes, labels, confidences = pred[:, :4], pred[:, 4], pred[:, -1]\n    print(f\"{bboxes = }, {labels = }, {confidences = }\")\n    # bboxes = array([[0.22825494, 0.47238672, 0.816262  , 0.8700745 ]], dtype=float32),\n    # labels = array([16.], dtype=float32),\n    # confidences = array([0.8309707], dtype=float32)\n\n    \"\"\" 显示结果 \"\"\"\n    from keras_cv_attention_models.coco import data\n    data.show_image_with_bboxes(imm, bboxes, labels, confidences, num_classes=90)\n    ```\n\n## 使用 PyTorch 作为后端\n  - **实验性** [Keras PyTorch 后端](keras_cv_attention_models\u002Fpytorch_backend)。\n  - **设置环境变量 `export KECAM_BACKEND='torch'` 以启用此 PyTorch 后端。**\n  - 目前支持大多数识别和检测模型，除了 hornet*gf \u002F nfnets \u002F volo。对于检测模型，在运行预测时使用 `torchvision.ops.nms`。\n  - **基本模型构建和预测**。\n    - 如果可用，将加载与 TensorFlow 版本相同的 `h5` 权重。\n    - 注意：`input_shape` 将自动适配图像数据格式。给定 `input_shape=(224, 224, 3)` 或 `input_shape=(3, 224, 224)`，如果使用 `channels_first` 格式，两者都会被设置为 `(3, 224, 224)`。\n    - 注意：模型默认设置为 `eval` 模式。\n    ```py\n    os.environ['KECAM_BACKEND'] = 'torch'\n    from keras_cv_attention_models import res_mlp\n    mm = res_mlp.ResMLP12()\n    # >>>> 从 ~\u002F.keras\u002Fmodels\u002Fresmlp12_imagenet.h5 加载预训练权重\n    print(f\"{mm.input_shape = }\")\n    # mm.input_shape = [None, 3, 224, 224]\n\n    import torch\n    print(f\"{isinstance(mm, torch.nn.Module) = }\")\n    # isinstance(mm, torch.nn.Module) = True\n\n    # 运行预测\n    from keras_cv_attention_models.test_images import cat\n    print(mm.decode_predictions(mm(mm.preprocess_input(cat())))[0])\n    # [('n02124075', 'Egyptian_cat', 0.9597896), ('n02123045', 'tabby', 0.012809471), ...]\n    ```\n  - **导出典型的 PyTorch onnx \u002F pth**。\n    ```py\n    import torch\n    torch.onnx.export(mm, torch.randn(1, 3, *mm.input_shape[2:]), mm.name + \".onnx\")\n\n    # 或者通过 export_onnx\n    mm.export_onnx()\n    # 导出的 onnx 文件：resmlp12.onnx\n\n    mm.export_pth()\n    # 导出的 pth 文件：resmlp12.pth\n    ```\n  - **将权重保存为 h5 文件**。该 `h5` 文件也可以在典型的 TensorFlow 后端模型中加载。目前仅支持权重，不包含模型结构。\n    ```py\n    mm.save_weights(\"foo.h5\")\n    ```\n  - **使用 compile 和 fit 进行训练** 注意：损失函数的参数应为 `y_true, y_pred`，而典型的 PyTorch 损失函数使用 `y_pred, y_true`。\n    ```py\n    import torch\n    from keras_cv_attention_models.backend import models, layers\n    mm = models.Sequential([layers.Input([3, 32, 32]), layers.Conv2D(32, 3), layers.GlobalAveragePooling2D(), layers.Dense(10)])\n    if torch.cuda.is_available():\n        _ = mm.to(\"cuda\")\n    xx = torch.rand([64, *mm.input_shape[1:]])\n    yy = torch.functional.F.one_hot(torch.randint(0, mm.output_shape[-1], size=[64]), mm.output_shape[-1]).float()\n    loss = lambda y_true, y_pred: (y_true - y_pred.float()).abs().mean()\n    # 将检查关键字参数以调用 `self.train_compile` 或 `torch.nn.Module.compile`\n    mm.compile(optimizer=\"AdamW\", loss=loss, metrics='acc', grad_accumulate=4)\n    mm.fit(xx, yy, epochs=2, batch_size=4)\n    ```\n## 使用 keras core 作为后端\n  - **[实验性] 设置环境变量 `export KECAM_BACKEND='keras_core'` 以启用此 `keras_core` 后端。不使用 `keras>3.0`，因为它仍然无法与 TensorFlow==2.15.0 编译。**\n  - `keras-core` 有自己的后端，支持 tensorflow \u002F torch \u002F jax，只需编辑 `~\u002F.keras\u002Fkeras.json` 中的 `\"backend\"` 值即可。\n  - 当前大多数识别模型都支持，除了 `HaloNet` \u002F `BotNet`，同时支持 `GPT2` \u002F `LLaMA2`。\n  - **基本模型构建和预测**。\n    ```py\n    !pip install sentencepiece  # llama2 分词器所需\n    os.environ['KECAM_BACKEND'] = 'keras_core'\n    os.environ['KERAS_BACKEND'] = 'jax'\n    import kecam\n    print(f\"{kecam.backend.backend() 
= }\")\n    # kecam.backend.backend() = 'jax'\n    mm = kecam.llama2.LLaMA2_42M()\n    # >>>> 从 ~\u002F.keras\u002Fmodels\u002Fllama2_42m_tiny_stories.h5 加载预训练权重\n    mm.run_prediction('当夜幕降临，一位少女站在森林边缘。她手中，')\n    # >>>> 从文件 ~\u002F.keras\u002Fdatasets\u002Fllama_tokenizer.model 加载分词器\n    # \u003Cs>\n    # 当夜幕降临，一位少女站在森林边缘。她手中拿着一颗美丽的钻石。所有人都惊讶地望着它。\n    # “那是什么？”一个孩子问道。\n    # “那是一颗钻石，”少女说道。\n    # ...\n    ```\n***\n\n# 识别模型\n## AotNet\n  - [Keras AotNet](keras_cv_attention_models\u002Faotnet) 只是一个类似于 `ResNet` \u002F `ResNetV2` 的框架，通过设置如 `attn_types` 和 `se_ratio` 等参数来应用不同类型的注意力层。其工作方式类似于 `timm` 中的 `byoanet` \u002F `byobnet`。\n  - 默认参数设置为典型的 `ResNet` 架构，其中 `Conv2D` 不使用偏置，填充方式类似于 PyTorch。\n    ```py\n    from keras_cv_attention_models import aotnet\n    # 混合了 se、outlook、halo、mhsa 和 cot_attention，总参数量为 2100 万。\n    # 50 只是一个大于相对 `num_block` 的数字。\n    attn_types = [None, \"outlook\", [\"bot\", \"halo\"] * 50, \"cot\"],\n    se_ratio = [0.25, 0, 0, 0],\n    model = aotnet.AotNet50V2(attn_types=attn_types, se_ratio=se_ratio, stem_type=\"deep\", strides=1)\n    model.summary()\n    ```\n## BEiT\n  - [Keras BEiT](keras_cv_attention_models\u002Fbeit) 包含来自 [PDF 2106.08254 BEiT: BERT Pre-Training of Image Transformers](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.08254.pdf) 的模型。\n\n  | 模型                      | 参数  | FLOPs   | 输入 | Top1 Acc | T4 推理 |\n  | -------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [BeitBasePatch16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_base_patch16_224_imagenet21k-ft1k.h5)  | 86.53M  | 17.61G  | 224   | 85.240   | 321.226 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_base_patch16_384_imagenet21k-ft1k.h5)            | 86.74M  | 55.70G  | 384   | 86.808   | 164.705 qps  |\n  | [BeitLargePatch16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_224_imagenet21k-ft1k.h5) | 304.43M | 61.68G  | 224   | 87.476   | 105.998 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_384_imagenet21k-ft1k.h5)            | 305.00M | 191.65G | 384   | 88.382   | 45.7307 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_large_patch16_512_imagenet21k-ft1k.h5)            | 305.67M | 363.46G | 512   | 88.584   | 21.3097 qps  |\n\n## BEiTV2\n  - [Keras BEiT](keras_cv_attention_models\u002Fbeit) 包含来自 BeitV2 论文 [PDF 2208.06366 BEiT v2: 带有向量量化视觉分词器的掩码图像建模](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2208.06366.pdf) 的模型。\n\n  | 模型              | 参数量  | FLOPs  | 输入 | Top1 精度 | T4 推理 |\n  | ------------------ | ------- | ------ | ----- | -------- | ------------ |\n  | BeitV2BasePatch16  | 86.53M  | 17.61G | 224   | 85.5     | 322.52 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_v2_base_patch16_224_imagenet21k-ft1k.h5) | 86.53M          | 17.61G | 224   | 86.5     | 322.52 qps   |\n  | BeitV2LargePatch16 | 304.43M | 61.68G | 224   | 87.3     | 105.734 qps  |\n  | - 
[21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fbeit_v2_large_patch16_224_imagenet21k-ft1k.h5)         | 304.43M | 61.68G | 224   | 88.4     | 105.734 qps  |\n## BotNet\n  - [Keras BotNet](keras_cv_attention_models\u002Fbotnet) 对应于 [PDF 2101.11605 用于视觉识别的瓶颈 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2101.11605.pdf)。\n\n  | 模型         | 参数量 | FLOPs  | 输入 | Top1 精度 | T4 推理 |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | BotNet50      | 21M    | 5.42G  | 224   |          | 746.454 qps  |\n  | BotNet101     | 41M    | 9.13G  | 224   |          | 448.102 qps  |\n  | BotNet152     | 56M    | 12.84G | 224   |          | 316.671 qps  |\n  | [BotNet26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnet26t_256_imagenet.h5)     | 12.5M  | 3.30G  | 256   | 79.246   | 1188.84 qps  |\n  | [BotNextECA26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnext_eca26t_256_imagenet.h5) | 10.59M | 2.45G  | 256   | 79.270   | 1038.19 qps  |\n  | [BotNetSE33T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbotnet\u002Fbotnet_se33t_256_imagenet.h5)   | 13.7M  | 3.89G  | 256   | 81.2     | 610.429 qps  |\n\n## CAFormer\n  - [Keras CAFormer](keras_cv_attention_models\u002Fcaformer) 对应于 [PDF 2210.13452 MetaFormer 视觉基准模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2210.13452.pdf)。`CAFormer` 使用两个 Transformer 堆栈，而 `ConvFormer` 则完全由卷积块组成。\n\n  | 模型                   | 参数量 | FLOPs | 输入分辨率 | Top1 准确率 | T4 推理速度 |\n  | ----------------------- | ------ | ----- | -------- | ---------- | ------------ |\n  | [CAFormerS18](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_224_imagenet.h5)             | 26M    | 4.1G  | 224   | 83.6     | 399.127 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_384_imagenet.h5)                   | 26M    | 13.4G | 384   | 85.0     | 181.993 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_224_imagenet21k-ft1k.h5)      | 26M    | 4.1G  | 224   | 84.1     | 399.127 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s18_384_imagenet21k-ft1k.h5) | 26M    | 13.4G | 384   | 85.4     | 181.993 qps  |\n  | [CAFormerS36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_224_imagenet.h5)             | 39M    | 8.0G  | 224   | 84.5     | 204.328 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_384_imagenet.h5)                   | 39M    | 26.0G | 384   | 85.7     | 102.04 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_224_imagenet21k-ft1k.h5)      | 39M    | 8.0G  | 224   | 85.8     | 204.328 qps  |\n  | - [21k_ft1k, 
384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_s36_384_imagenet21k-ft1k.h5) | 39M    | 26.0G | 384   | 86.9     | 102.04 qps   |\n  | [CAFormerM36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_224_imagenet.h5)             | 56M    | 13.2G | 224   | 85.2     | 162.257 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_384_imagenet.h5)                   | 56M    | 42.0G | 384   | 86.2     | 65.6188 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_224_imagenet21k-ft1k.h5)      | 56M    | 13.2G | 224   | 86.6     | 162.257 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_m36_384_imagenet21k-ft1k.h5) | 56M    | 42.0G | 384   | 87.5     | 65.6188 qps  |\n  | [CAFormerB36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_224_imagenet.h5)             | 99M    | 23.2G | 224   | 85.5     | 116.865 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_384_imagenet.h5)                   | 99M    | 72.2G | 384   | 86.4     | 50.0244 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_224_imagenet21k-ft1k.h5)      | 99M    | 23.2G | 224   | 87.4     | 116.865 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fcaformer_b36_384_imagenet21k-ft1k.h5) | 99M    | 72.2G | 384   | 88.1     | 50.0244 qps  |\n\n  | 模型                   | 参数量 | FLOPs | 输入分辨率 | Top1 准确率 | T4 推理速度 |\n  | ----------------------- | ------ | ----- | -------- | ---------- | ------------ |\n  | [ConvFormerS18](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_224_imagenet.h5)           | 27M    | 3.9G  | 224   | 83.0     | 295.114 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_384_imagenet.h5)                   | 27M    | 11.6G | 384   | 84.4     | 145.923 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s18_224_imagenet21k-ft1k.h5)      | 27M    | 3.9G  | 224   | 83.7     | 295.114 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet21k-ft1k.h5) | 27M    | 11.6G | 384   | 85.0     | 145.923 qps  |\n  | [ConvFormerS36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_224_imagenet.h5)           | 40M    | 7.6G  | 224   | 84.1     | 161.609 qps  |\n  | - 
[384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet.h5)                   | 40M    | 22.4G | 384   | 85.4     | 80.2101 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_224_imagenet21k-ft1k.h5)      | 40M    | 7.6G  | 224   | 85.4     | 161.609 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_s36_384_imagenet21k-ft1k.h5) | 40M    | 22.4G | 384   | 86.4     | 80.2101 qps  |\n  | [ConvFormerM36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_224_imagenet.h5)           | 57M    | 12.8G | 224   | 84.5     | 130.161 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_384_imagenet.h5)                   | 57M    | 37.7G | 384   | 85.6     | 63.9712 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_224_imagenet21k-ft1k.h5)      | 57M    | 12.8G | 224   | 86.1     | 130.161 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_m36_384_imagenet21k-ft1k.h5) | 57M    | 37.7G | 384   | 86.9     | 63.9712 qps  |\n  | [ConvFormerB36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_224_imagenet.h5)           | 100M   | 22.6G | 224   | 84.8     | 98.0751 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_384_imagenet.h5)                   | 100M   | 66.5G | 384   | 85.7     | 48.5897 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_224_imagenet21k-ft1k.h5)      | 100M   | 22.6G | 224   | 87.0     | 98.0751 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcaformer\u002Fconvformer_b36_384_imagenet21k-ft1k.h5) | 100M   | 66.5G | 384   | 87.6     | 48.5897 qps  |\n\n## CMT\n  - [Keras CMT](keras_cv_attention_models\u002Fcmt) 对应论文 [PDF 2107.06263 CMT: 卷积神经网络与视觉Transformer的结合](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.06263.pdf)。\n\n  | 模型                              | 参数量 | FLOPs | 输入尺寸 | Top1准确率 | T4推理速度 |\n  | ---------------------------------- | ------ | ----- | ------- | ---------- | ---------- |\n  | CMTTiny, (自训练105轮)             | 9.5M   | 0.65G | 160     | 77.4       | 315.566 qps |\n  | - [(305轮)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_160_imagenet.h5)                     | 9.5M   | 0.65G | 160     | 78.94    | 315.566 qps |\n  | - [224, (微调69轮)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_224_imagenet.h5)      | 9.5M   | 1.32G | 224     | 80.73    | 254.87 qps   |\n  | [CMTTiny_torch, 
(1000轮)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_tiny_torch_160_imagenet.h5)       | 9.5M   | 0.65G | 160     | 79.2     | 338.207 qps  |\n  | [CMTXS_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_xs_torch_192_imagenet.h5)                        | 15.2M  | 1.58G | 192     | 81.8     | 241.288 qps  |\n  | [CMTSmall_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_small_torch_224_imagenet.h5)                     | 25.1M  | 4.09G | 224     | 83.5     | 171.109 qps  |\n  | [CMTBase_torch](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcmt\u002Fcmt_base_torch_256_imagenet.h5)                      | 45.7M  | 9.42G | 256     | 84.5     | 103.34 qps   |\n## CoaT\n  - [Keras CoaT](keras_cv_attention_models\u002Fcoat) 对应论文 [PDF 2104.06399 CoaT: 协尺度卷积-注意力图像Transformer](http:\u002F\u002Farxiv.org\u002Fabs\u002F2104.06399)。\n\n  | 模型         | 参数量 | FLOPs | 输入尺寸 | Top1准确率 | T4推理速度 |\n  | ------------- | ------ | ----- | ------- | ---------- | ---------- |\n  | [CoaTLiteTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_tiny_imagenet.h5)  | 5.7M   | 1.60G | 224     | 77.5     | 450.27 qps   |\n  | [CoaTLiteMini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_mini_imagenet.h5)  | 11M    | 2.00G | 224     | 79.1     | 452.884 qps  |\n  | [CoaTLiteSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_lite_small_imagenet.h5) | 20M    | 3.97G | 224     | 81.9     | 248.846 qps  |\n  | [CoaTTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_tiny_imagenet.h5)      | 5.5M   | 4.33G | 224     | 78.3     | 152.495 qps  |\n  | [CoaTMini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoat\u002Fcoat_mini_imagenet.h5)      | 10M    | 6.78G | 224     | 81.0     | 124.845 qps  |\n## CoAtNet\n  - [Keras CoAtNet](keras_cv_attention_models\u002Fcoatnet) 对应论文 [PDF 2106.04803 CoAtNet: 将卷积与注意力结合以适应所有数据规模](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.04803.pdf)。\n\n  | 模型                               | 参数量 | FLOPs  | 输入尺寸 | Top1准确率 | T4推理速度 |\n  | ----------------------------------- | ------ | ------ | ------- | ---------- | ---------- |\n  | [CoAtNet0, 160, (105轮)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoatnet\u002Fcoatnet0_160_imagenet.h5) | 23.3M  | 2.09G  | 160     | 80.48    | 584.059 qps  |\n  | [CoAtNet0, (305轮)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcoatnet\u002Fcoatnet0_224_imagenet.h5) | 23.8M  | 4.22G  | 224     | 82.79    | 400.333 qps  |\n  | CoAtNet0                            | 25M    | 4.6G   | 224     | 82.0     | 400.333 qps  |\n  | - use_dw_strides=False              | 25M    | 4.2G   | 224     | 81.6     | 461.197 qps  |\n  | CoAtNet1                            | 42M    | 8.8G   | 224     | 83.5     | 206.954 qps  |\n  | - use_dw_strides=False              | 42M    | 8.4G   | 224     | 
83.3     | 228.938 qps  |\n  | CoAtNet2                            | 75M    | 16.6G  | 224     | 84.1     | 156.359 qps  |\n  | - use_dw_strides=False              | 75M    | 15.7G  | 224     | 84.1     | 165.846 qps  |\n  | CoAtNet2, 21k_ft1k                  | 75M    | 16.6G  | 224     | 87.1     | 156.359 qps  |\n  | CoAtNet3                            | 168M   | 34.7G  | 224     | 84.5     | 95.0703 qps  |\n  | CoAtNet3, 21k_ft1k                  | 168M   | 34.7G  | 224     | 87.6     | 95.0703 qps  |\n  | CoAtNet3, 21k_ft1k                  | 168M   | 203.1G | 512     | 87.9     | 95.0703 qps  |\n  | CoAtNet4, 21k_ft1k                  | 275M   | 360.9G | 512     | 88.1     | 74.6022 qps  |\n  | CoAtNet4, 21k_ft1k, PT-RA-E150      | 275M   | 360.9G | 512     | 88.56    | 74.6022 qps  |\n\n## ConvNeXt\n  - [Keras ConvNeXt](keras_cv_attention_models\u002Fconvnext) 对应论文 [PDF 2201.03545 A ConvNet for the 2020s](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.03545.pdf)。\n\n  | 模型                   | 参数量 | FLOPs   | 输入尺寸 | Top1准确率 | T4推理速度 |\n  | ----------------------- | ------ | ------- | -------- | ---------- | ------------ |\n  | [ConvNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_imagenet.h5)            | 28M    | 4.49G   | 224      | 82.1       | 361.58 qps   |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_224_imagenet21k-ft1k.h5)      | 28M    | 4.49G   | 224      | 82.9       | 361.58 qps   |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_tiny_384_imagenet21k-ft1k.h5) | 28M    | 13.19G  | 384      | 84.1       | 182.134 qps  |\n  | [ConvNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_imagenet.h5)           | 50M    | 8.73G   | 224      | 83.1       | 202.007 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_224_imagenet21k-ft1k.h5)      | 50M    | 8.73G   | 224      | 84.6       | 202.007 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_small_384_imagenet21k-ft1k.h5) | 50M    | 25.67G  | 384      | 85.8       | 108.125 qps  |\n  | [ConvNeXtBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_224_imagenet.h5)            | 89M    | 15.42G  | 224      | 83.8       | 160.036 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_384_imagenet.h5)                   | 89M    | 45.32G  | 384      | 85.1       | 83.3095 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_224_imagenet21k-ft1k.h5)      | 89M    | 15.42G  | 224      | 85.8       | 160.036 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_base_384_imagenet21k-ft1k.h5) | 89M    | 45.32G  | 
384      | 86.8       | 83.3095 qps  |\n  | [ConvNeXtLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_224_imagenet.h5)           | 198M   | 34.46G  | 224      | 84.3       | 102.27 qps   |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_384_imagenet.h5)                   | 198M   | 101.28G | 384      | 85.5       | 47.2086 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_224_imagenet21k-ft1k.h5)      | 198M   | 34.46G  | 224      | 86.6       | 102.27 qps   |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_large_384_imagenet21k-ft1k.h5) | 198M   | 101.28G | 384      | 87.5       | 47.2086 qps  |\n  | [ConvNeXtXlarge, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xlarge_224_imagenet21k-ft1k.h5)     | 350M   | 61.06G  | 224      | 87.0       | 40.5776 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xlarge_384_imagenet21k-ft1k.h5)              | 350M   | 179.43G | 384      | 87.8       | 21.797 qps   |\n  | [ConvNeXtXXLarge, clip](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_xxlarge_clip-ft1k.h5)   | 846M   | 198.09G | 256      | 88.6       |              |\n\n## ConvNeXtV2\n  - [Keras ConvNeXt](keras_cv_attention_models\u002Fconvnext) 包含对 [PDF 2301.00808 ConvNeXt V2：与掩码自编码器协同设计和扩展卷积网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2301.00808.pdf) 的实现。**请注意，这些权重采用 CC-BY-NC 4.0 许可证，仅限非商业用途**。\n\n  | 模型                   | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ----------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [ConvNeXtV2Atto](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_atto_imagenet.h5)          | 3.7M   | 0.55G  | 224   | 76.7     | 705.822 qps  |\n  | [ConvNeXtV2Femto](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_femto_imagenet.h5)         | 5.2M   | 0.78G  | 224   | 78.5     | 728.02 qps   |\n  | [ConvNeXtV2Pico](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_pico_imagenet.h5)          | 9.1M   | 1.37G  | 224   | 80.3     | 591.502 qps  |\n  | [ConvNeXtV2Nano](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_imagenet.h5)          | 15.6M  | 2.45G  | 224   | 81.9     | 471.918 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_224_imagenet21k-ft1k.h5)      | 15.6M  | 2.45G  | 224   | 82.1     | 471.918 qps  |\n  | - [21k_ft1k, 
384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_nano_384_imagenet21k-ft1k.h5) | 15.6M  | 7.21G  | 384   | 83.4     | 213.802 qps  |\n  | [ConvNeXtV2Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_imagenet.h5)          | 28.6M  | 4.47G  | 224   | 83.0     | 301.982 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_224_imagenet21k-ft1k.h5)      | 28.6M  | 4.47G  | 224   | 83.9     | 301.982 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_tiny_384_imagenet21k-ft1k.h5) | 28.6M  | 13.1G  | 384   | 85.1     | 139.578 qps  |\n  | [ConvNeXtV2Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_imagenet.h5)          | 89M    | 15.4G  | 224   | 84.9     | 132.575 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_224_imagenet21k-ft1k.h5)      | 89M    | 15.4G  | 224   | 86.8     | 132.575 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_base_384_imagenet21k-ft1k.h5) | 89M    | 45.2G  | 384   | 87.7     | 66.5729 qps  |\n  | [ConvNeXtV2Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_imagenet.h5)         | 198M   | 34.4G  | 224   | 85.8     | 86.8846 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_224_imagenet21k-ft1k.h5)      | 198M   | 34.4G  | 224   | 87.3     | 86.8846 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_large_384_imagenet21k-ft1k.h5) | 198M   | 101.1G | 384   | 88.2     | 24.4542 qps  |\n  | [ConvNeXtV2Huge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_imagenet.h5)          | 660M   | 115G   | 224   | 86.3     |              |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_384_imagenet21k-ft1k.h5)      | 660M   | 337.9G | 384   | 88.7     |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fconvnext\u002Fconvnext_v2_huge_512_imagenet21k-ft1k.h5) | 660M   | 600.8G | 512   | 88.9     |              |\n## CoTNet\n  - [Keras CoTNet](keras_cv_attention_models\u002Fcotnet) 对应于 [PDF 2107.12292 用于视觉识别的上下文变换器网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.12292.pdf)。\n\n  | 模型        | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------ |:------:| ------ | ----- |:--------:| ------------ |\n  | 
[CotNet50](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet50_224_imagenet.h5)     | 22.2M  | 3.25G  | 224   |   81.3   | 324.913 qps  |\n  | [CotNetSE50D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se50d_224_imagenet.h5)  | 23.1M  | 4.05G  | 224   |   81.6   | 513.077 qps  |\n  | [CotNet101](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet101_224_imagenet.h5)    | 38.3M  | 6.07G  | 224   |   82.8   | 183.824 qps  |\n  | [CotNetSE101D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se101d_224_imagenet.h5) | 40.9M  | 8.44G  | 224   |   83.2   | 251.487 qps  |\n  | [CotNetSE152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se152d_224_imagenet.h5) | 55.8M  | 12.22G | 224   |   84.0   | 175.469 qps  |\n  | [CotNetSE152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcotnet\u002Fcotnet_se152d_320_imagenet.h5) | 55.8M  | 24.92G | 320   |   84.6   | 175.469 qps  |\n## CSPNeXt\n  - [Keras CSPNeXt](keras_cv_attention_models\u002Fcspnext) 是 [PDF 2212.07784 RTMDet：实时目标检测器设计的实证研究](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07784) 中主干网络的实现。\n\n  | 模型         | 参数量 | FLOPs | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------ | ----- | ----- | -------- | -------- |\n  | [CSPNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_tiny_imagenet.h5)   | 2.73M  | 0.34G | 224   | 69.44    |  |\n  | [CSPNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_small_imagenet.h5)  | 4.89M  | 0.66G | 224   | 74.41    |  |\n  | [CSPNeXtMedium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_medium_imagenet.h5) | 13.05M | 1.92G | 224   | 79.27    |  |\n  | [CSPNeXtLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_large_imagenet.h5)  | 27.16M | 4.19G | 224   | 81.30    |  |\n  | [CSPNeXtXLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fcspnext\u002Fcspnext_xlarge_imagenet.h5) | 48.85M | 7.75G | 224   | 82.10    |  |\n\n## DaViT\n  - [Keras DaViT](keras_cv_attention_models\u002Fdavit) 对应论文 [PDF 2204.03645 DaViT: 双注意力视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.03645.pdf)。\n\n  | 模型              | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------------ | ------ | ------ | -------- | ----------- | ------------ |\n  | [DaViT_T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_t_imagenet.h5)            | 28.36M | 4.56G  | 224      | 82.8%     | 224.563 qps  |\n  | [DaViT_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_s_imagenet.h5)            | 49.75M | 8.83G  | 224      | 84.2%     | 145.838 qps  |\n  | 
[DaViT_B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fdavit\u002Fdavit_b_imagenet.h5)            | 87.95M | 15.55G | 224      | 84.6%     | 114.527 qps  |\n  | DaViT_L, 21k_ft1k  | 196.8M | 103.2G | 384      | 87.5%     | 34.7015 qps  |\n  | DaViT_H, 1.5B      | 348.9M | 327.3G | 512      | 90.2%     | 12.363 qps   |\n  | DaViT_G, 1.5B      | 1.406B | 1.022T | 512      | 90.4%     |              |\n## DiNAT\n  - [Keras DiNAT](keras_cv_attention_models\u002Fnat) 对应论文 [PDF 2209.15001 扩张邻域注意力 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2209.15001.pdf)。\n\n  | 模型                     | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------------------- | ------ | ------ | -------- | ----------- | ------------ |\n  | [DiNAT_Mini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_mini_imagenet.h5)                | 20.0M  | 2.73G  | 224      | 81.8%     | 83.9943 qps  |\n  | [DiNAT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_tiny_imagenet.h5)                | 27.9M  | 4.34G  | 224      | 82.7%     | 61.1902 qps  |\n  | [DiNAT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_small_imagenet.h5)               | 50.7M  | 7.84G  | 224      | 83.8%     | 41.0343 qps  |\n  | [DiNAT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_base_imagenet.h5)                | 89.8M  | 13.76G | 224      | 84.4%     | 30.1332 qps  |\n  | [DiNAT_Large, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_224_imagenet21k-ft1k.h5)     | 200.9M | 30.58G | 224      | 86.6%     | 18.4936 qps  |\n  | - [21k, (num_classes=21841)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_imagenet21k.h5)   | 200.9M | 30.58G | 224      |          |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_384_imagenet21k-ft1k.h5)           | 200.9M | 89.86G | 384      | 87.4%     |              |\n  | [DiNAT_Large_K11, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fdinat_large_k11_imagenet21k-ft1k.h5) | 201.1M | 92.57G | 384      | 87.5%     |              |\n## DINOv2\n  - [Keras DINOv2](keras_cv_attention_models\u002Fbeit) 包含来自论文 [PDF 2304.07193 DINOv2: 无监督学习鲁棒视觉特征](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2304.07193.pdf) 的模型。\n\n  | 模型              | 参数量  | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------------ | ------- | ------- | -------- | ----------- | ------------ |\n  | [DINOv2_ViT_Small14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_small14_518_imagenet.h5) | 22.83M  | 47.23G  | 518      | 81.1%     | 165.271 qps  |\n  | [DINOv2_ViT_Base14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_base14_518_imagenet.h5)  | 88.12M  | 152.6G  | 518      | 84.5%     | 54.9769 qps  |\n  | 
[DINOv2_ViT_Large14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_large14_518_imagenet.h5) | 306.4M  | 509.6G  | 518      | 86.3%     | 17.4108 qps  |\n  | [DINOv2_ViT_Giant14](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fdinov2_vit_giant14_518_imagenet.h5) | 1139.6M | 1790.3G | 518      | 86.5%     |              |\n## EdgeNeXt\n  - [Keras EdgeNeXt](keras_cv_attention_models\u002Fedgenext) 对应论文 [PDF 2206.10589 EdgeNeXt: 面向移动视觉应用的高效融合 CNN-Transformer 架构](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.10589.pdf)。\n\n  | 模型             | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ----------------- | ------ | ------ | -------- | ----------- | ------------ |\n  | [EdgeNeXt_XX_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_xx_small_256_imagenet.h5) | 1.33M  | 266M   | 256      | 71.23%    | 902.957 qps  |\n  | [EdgeNeXt_X_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_x_small_256_imagenet.h5)  | 2.34M  | 547M   | 256      | 74.96%    | 638.346 qps  |\n  | [EdgeNeXt_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_small_256_imagenet.h5)    | 5.59M  | 1.27G  | 256      | 79.41%    | 536.762 qps  |\n  | - [usi](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_small_256_usi.h5)             | 5.59M  | 1.27G  | 256      | 81.07%    | 536.762 qps  |\n  | [EdgeNeXt_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_imagenet.h5)     | 18.5M  | 3.86G  | 256      | 82.47%    | 383.461 qps  |\n  | - [usi](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_usi.h5)             | 18.5M  | 3.86G  | 256      | 83.31%    | 383.461 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fedgenext\u002Fedgenext_base_256_imagenet-ft1k.h5)        | 18.5M  | 3.86G  | 256      | 83.68%    | 383.461 qps  |\n## EfficientFormer\n  - [Keras EfficientFormer](keras_cv_attention_models\u002Fefficientformer) 对应论文 [PDF 2206.01191 EfficientFormer: MobileNet 速度的视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.01191.pdf)。\n\n  | 模型                      | 参数量 | FLOPs | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------------------------- | ------ | ----- | -------- | ----------- | ------------ |\n  | [EfficientFormerL1, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l1_224_imagenet.h5) | 12.3M  | 1.31G | 224      | 79.2%     | 1214.22 qps  |\n  | [EfficientFormerL3, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l3_224_imagenet.h5) | 31.4M  | 3.95G | 224      | 82.4%     | 596.705 qps  |\n  | [EfficientFormerL7, 
distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Fefficientformer_l7_224_imagenet.h5) | 74.4M  | 9.79G | 224      | 83.3%     | 298.434 qps  |\n\n## EfficientFormerV2\n  - [Keras EfficientFormer](keras_cv_attention_models\u002Fefficientformer) 包含对 [PDF 2212.08059 重新思考适用于移动端的视觉Transformer：尺寸与速度](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.08059.pdf) 的实现。\n\n  | 模型                        | 参数量 | FLOPs  | 输入 | Top1准确率 | T4推理 |\n  | ---------------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [EfficientFormerV2S0, 知识蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s0_224_imagenet.h5) | 3.60M  | 405.2M | 224   | 76.2     | 1114.38 qps  |\n  | [EfficientFormerV2S1, 知识蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s1_224_imagenet.h5) | 6.19M  | 665.6M | 224   | 79.7     | 841.186 qps  |\n  | [EfficientFormerV2S2, 知识蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_s2_224_imagenet.h5) | 12.7M  | 1.27G  | 224   | 82.0     | 573.9 qps    |\n  | [EfficientFormerV2L, 知识蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientformer\u002Fefficientformer_v2_l_224_imagenet.h5)  | 26.3M  | 2.59G  | 224   | 83.5     | 377.224 qps  |\n## EfficientNet\n  - [Keras EfficientNet](keras_cv_attention_models\u002Fefficientnet) 包含对 [PDF 1911.04252 噪声学生自训练提升ImageNet分类性能](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04252.pdf) 的实现。\n\n  | 模型                          | 参数量 | FLOPs   | 输入 | Top1准确率 | T4推理 |\n  | ------------------------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [EfficientNetV1B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b0-imagenet.h5)               | 5.3M   | 0.39G   | 224   | 77.6     | 1129.93 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b0-noisy_student.h5)                 | 5.3M   | 0.39G   | 224   | 78.8     | 1129.93 qps  |\n  | [EfficientNetV1B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b1-imagenet.h5)               | 7.8M   | 0.70G   | 240   | 79.6     | 758.639 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b1-noisy_student.h5)                 | 7.8M   | 0.70G   | 240   | 81.5     | 758.639 qps  |\n  | [EfficientNetV1B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b2-imagenet.h5)               | 9.1M   | 1.01G   | 260   | 80.5     | 668.959 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b2-noisy_student.h5)                 | 9.1M   | 1.01G   | 260   | 82.4     | 668.959 qps  |\n  | 
[EfficientNetV1B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b3-imagenet.h5)               | 12.2M  | 1.86G   | 300   | 81.9     | 473.607 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b3-noisy_student.h5)                 | 12.2M  | 1.86G   | 300   | 84.1     | 473.607 qps  |\n  | [EfficientNetV1B4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b4-imagenet.h5)               | 19.3M  | 4.46G   | 380   | 83.3     | 265.244 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b4-noisy_student.h5)                 | 19.3M  | 4.46G   | 380   | 85.3     | 265.244 qps  |\n  | [EfficientNetV1B5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b5-imagenet.h5)               | 30.4M  | 10.40G  | 456   | 84.3     | 146.758 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b5-noisy_student.h5)                 | 30.4M  | 10.40G  | 456   | 86.1     | 146.758 qps  |\n  | [EfficientNetV1B6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b6-imagenet.h5)               | 43.0M  | 19.29G  | 528   | 84.8     | 88.0369 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b6-noisy_student.h5)                 | 43.0M  | 19.29G  | 528   | 86.4     | 88.0369 qps  |\n  | [EfficientNetV1B7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b7-imagenet.h5)               | 66.3M  | 38.13G  | 600   | 85.2     | 52.6616 qps  |\n  | - [噪声学生](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-b7-noisy_student.h5)                 | 66.3M  | 38.13G  | 600   | 86.9     | 52.6616 qps  |\n  | [EfficientNetV1L2, 知识蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetv1-l2-noisy_student.h5) | 480.3M | 477.98G | 800   | 88.4     |              |\n## EfficientNetEdgeTPU\n  - [Keras EfficientNetEdgeTPU](keras_cv_attention_models\u002Fefficientnet) 包含对 [PDF 1911.04252 噪声学生自训练提升ImageNet分类性能](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.04252.pdf) 的实现。\n\n  | 模型                          | 参数量 | FLOPs   | 输入 | Top1准确率 | T4推理 |\n  | ------------------------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [EfficientNetEdgeTPUSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-small-imagenet.h5)       | 5.49M  | 1.79G   | 224   | 78.07    | 1459.38 qps  |\n  | 
[EfficientNetEdgeTPUMedium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-medium-imagenet.h5)      | 6.90M  | 3.01G   | 240   | 79.25    | 1028.95 qps  |\n  | [EfficientNetEdgeTPULarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv1_pretrained\u002Fefficientnetedgetpu-large-imagenet.h5)       | 10.59M | 7.94G   | 300   | 81.32    | 527.034 qps  |\n\n## EfficientNetV2\n  - [Keras EfficientNet](keras_cv_attention_models\u002Fefficientnet) 包含了对 [PDF 2104.00298 EfficientNetV2：更小的模型和更快的训练](https:\u002F\u002Farxiv.org\u002Fabs\u002F2104.00298) 的实现。\n\n  | 模型                      | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理吞吐量 |\n  | -------------------------- | ------ | ------ | -------- | ----------- | -------------- |\n  | [EfficientNetV2B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b0-imagenet.h5)           | 710万  | 0.72G  | 224      | 78.7%     | 1109.84 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b0-21k-ft1k.h5)         | 710万  | 0.72G  | 224      | 77.55%?   | 1109.84 qps  |\n  | [EfficientNetV2B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b1-imagenet.h5)           | 810万  | 1.21G  | 240      | 79.8%     | 842.372 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b1-21k-ft1k.h5)         | 810万  | 1.21G  | 240      | 79.03%?   | 842.372 qps  |\n  | [EfficientNetV2B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b2-imagenet.h5)           | 1010万 | 1.71G  | 260      | 80.5%     | 762.865 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b2-21k-ft1k.h5)         | 1010万 | 1.71G  | 260      | 79.48%?   | 762.865 qps  |\n  | [EfficientNetV2B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b3-imagenet.h5)           | 1440万 | 3.03G  | 300      | 82.1%     | 548.501 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-b3-21k-ft1k.h5)         | 1440万 | 3.03G  | 300      | 82.46%?   
| 548.501 qps  |\n  | [EfficientNetV2T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-t-imagenet.h5)            | 1360万 | 3.18G  | 288      | 82.34%    | 496.483 qps  |\n  | [EfficientNetV2T_GC](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-t-gc-imagenet.h5)         | 1370万 | 3.19G  | 288      | 82.46%    | 368.763 qps  |\n  | [EfficientNetV2S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-s-imagenet.h5)            | 2150万 | 8.41G  | 384      | 83.9%     | 344.109 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-s-21k-ft1k.h5)         | 2150万 | 8.41G  | 384      | 84.9%     | 344.109 qps  |\n  | [EfficientNetV2M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-m-imagenet.h5)            | 5410万 | 24.69G | 480      | 85.2%     | 145.346 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-m-21k-ft1k.h5)         | 5410万 | 24.69G | 480      | 86.2%     | 145.346 qps  |\n  | [EfficientNetV2L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-l-imagenet.h5)            | 1.195亿 | 56.27G | 480      | 85.7%     | 85.6514 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-l-21k-ft1k.h5)         | 1.195亿 | 56.27G | 480      | 86.9%     | 85.6514 qps  |\n  | [EfficientNetV2XL, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_efficientnet_v2\u002Freleases\u002Fdownload\u002Feffnetv2_pretrained\u002Fefficientnetv2-xl-21k-ft1k.h5) | 2.068亿 | 93.66G | 512      | 87.2%     | 55.141 qps   |\n\n## EfficientViT_B\n  - [Keras EfficientViT_B](keras_cv_attention_models\u002Fefficientvit) 对应论文 [PDF 2205.14756 EfficientViT: 轻量级多尺度注意力机制用于设备端语义分割](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.14756.pdf)。\n\n  | 模型           | 参数量 | FLOPs | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------------- | ------ | ----- | ------- | ---------- | ------------ |\n  | [EfficientViT_B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b0_224_imagenet.h5) | 3.41M  | 0.12G | 224   | 71.6 ?   
| 1581.76 qps  |\n  | [EfficientViT_B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_224_imagenet.h5) | 9.10M  | 0.58G | 224   | 79.4     | 943.587 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_256_imagenet.h5)           | 9.10M  | 0.78G | 256   | 79.9     | 840.844 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b1_288_imagenet.h5)            | 9.10M  | 1.03G | 288   | 80.4     | 680.088 qps  |\n  | [EfficientViT_B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_224_imagenet.h5) | 24.33M | 1.68G | 224   | 82.1     | 583.295 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_256_imagenet.h5)            | 24.33M | 2.25G | 256   | 82.7     | 507.187 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b2_288_imagenet.h5)            | 24.33M | 2.92G | 288   | 83.1     | 419.93 qps   |\n  | [EfficientViT_B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_224_imagenet.h5) | 48.65M | 4.14G | 224   | 83.5     | 329.764 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_256_imagenet.h5)            | 48.65M | 5.51G | 256   | 83.8     | 288.605 qps  |\n  | - [288](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_b3_288_imagenet.h5)            | 48.65M | 7.14G | 288   | 84.2     | 229.992 qps  |\n  | [EfficientViT_L1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l1_224_imagenet.h5) | 52.65M | 5.28G | 224   | 84.48    | 503.068 qps |\n  | [EfficientViT_L2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l2_224_imagenet.h5) | 63.71M | 6.98G | 224   | 85.05    | 396.255 qps |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l2_384_imagenet.h5)            | 63.71M | 20.7G | 384   | 85.98    | 207.322 qps |\n  | [EfficientViT_L3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l3_224_imagenet.h5) | 246.0M | 27.6G | 224   | 85.814   | 174.926 qps |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_l3_384_imagenet.h5)            | 246.0M | 81.6G | 384   | 86.408   | 86.895 qps  |\n## EfficientViT_M\n  - [Keras EfficientViT_M](keras_cv_attention_models\u002Fefficientvit) 对应论文 [PDF 2305.07027 EfficientViT: 基于级联分组注意力的内存高效视觉Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.07027.pdf)。\n\n  | 模型        
   | 参数量 | FLOPs | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------------- | ------ | ----- | ------- | ---------- | ------------ |\n  | [EfficientViT_M0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m0_224_imagenet.h5) | 2.35M  | 79.4M | 224   | 63.2     | 814.522 qps  |\n  | [EfficientViT_M1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m1_224_imagenet.h5) | 2.98M  | 167M  | 224   | 68.4     | 948.041 qps  |\n  | [EfficientViT_M2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m2_224_imagenet.h5) | 4.19M  | 201M  | 224   | 70.8     | 906.286 qps  |\n  | [EfficientViT_M3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m3_224_imagenet.h5) | 6.90M  | 263M  | 224   | 73.4     | 758.086 qps  |\n  | [EfficientViT_M4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m4_224_imagenet.h5) | 8.80M  | 299M  | 224   | 74.3     | 672.891 qps  |\n  | [EfficientViT_M5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientvit\u002Fefficientvit_m5_224_imagenet.h5) | 12.47M | 522M  | 224   | 77.1     | 577.254 qps  |\n## EVA\n  - [Keras EVA](keras_cv_attention_models\u002Fbeit) 包含来自论文 [PDF 2211.07636 EVA: 探索大规模掩码视觉表征学习的极限](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.07636.pdf) 的模型。\n\n  | 模型                 | 参数量  | FLOPs    | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------------------- | ------- | -------- | ------- | ---------- | ------------ |\n  | [EvaLargePatch14, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_large_patch14_196_imagenet21k-ft1k.h5)  | 304.14M | 61.65G   | 196   | 88.59    | 115.532 qps  |\n  | - [21k_ft1k, 336](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_large_patch14_336_imagenet21k-ft1k.h5)            | 304.53M | 191.55G  | 336   | 89.20    | 53.3467 qps  |\n  | [EvaGiantPatch14, clip](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_224_imagenet21k-ft1k.h5) | 1012.6M | 267.40G  | 224   | 89.10    |              |\n  | - [m30m](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_336_imagenet21k-ft1k.h5)                | 1013.0M | 621.45G  | 336   | 89.57    |              |\n  | - [m30m](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva_giant_patch14_560_imagenet21k-ft1k.h5)                | 1014.4M | 1911.61G | 560   | 89.80    |              |\n\n## EVA02\n  - [Keras EVA02](keras_cv_attention_models\u002Fbeit) 包含来自 [PDF 2303.11331 EVA-02: 新世纪福音战士的视觉表征](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.11331.pdf) 的模型。\n\n  | 模型                                  | 参数量  | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------------------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [EVA02TinyPatch14, 
mim_in22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_tiny_patch14_336_mim_in22k_ft1k.h5)       | 5.76M   | 4.72G   | 336   | 80.658   | 320.123 qps  |\n  | [EVA02SmallPatch14, mim_in22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_small_patch14_336_mim_in22k_ft1k.h5)      | 22.13M  | 15.57G  | 336   | 85.74    | 161.774 qps  |\n  | [EVA02BasePatch14, mim_in22k_ft22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_base_patch14_448_mim_in22k_ft22k_ft1k.h5) | 87.12M  | 107.6G  | 448   | 88.692   | 34.3962 qps  |\n  | [EVA02LargePatch14, mim_m38m_ft22k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Feva02_large_patch14_448_mim_m38m_ft22k_ft1k.h5) | 305.08M | 363.68G | 448   | 90.054   |              |\n## FasterNet\n  - [Keras FasterNet](keras_cv_attention_models\u002Ffasternet) 包含对 [PDF 2303.03667 跑起来，别走着：追求更高 FLOPs 以实现更快的神经网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.03667.pdf) 的实现。\n\n  | 模型       | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ----------- | ------ | ------ | ----- | -------- | ------------ |\n  | [FasterNetT0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t0_imagenet.h5) | 3.9M   | 0.34G  | 224   | 71.9     | 1890.83 qps  |\n  | [FasterNetT1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t1_imagenet.h5) | 7.6M   | 0.85G  | 224   | 76.2     | 1788.16 qps  |\n  | [FasterNetT2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_t2_imagenet.h5) | 15.0M  | 1.90G  | 224   | 78.9     | 1353.12 qps  |\n  | [FasterNetS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_s_imagenet.h5)  | 31.1M  | 4.55G  | 224   | 81.3     | 818.814 qps  |\n  | [FasterNetM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_m_imagenet.h5)  | 53.5M  | 8.72G  | 224   | 83.0     | 436.383 qps  |\n  | [FasterNetL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffasternet\u002Ffasternet_l_imagenet.h5)  | 93.4M  | 15.49G | 224   | 83.5     | 319.809 qps  |\n## FasterViT\n  - [Keras FasterViT](keras_cv_attention_models\u002Ffastervit) 包含对 [PDF 2306.06189 FasterViT: 具有层次化注意力的快速视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.06189.pdf) 的实现。\n\n  | 模型      | 参数量   | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ---------- | -------- | ------- | ----- | -------- | ------------ |\n  | [FasterViT0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_0_224_imagenet.h5) | 31.40M   | 3.51G   | 224   | 82.1     | 716.809 qps  |\n  | [FasterViT1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_1_224_imagenet.h5) | 53.37M   | 5.52G   | 224   | 83.2     | 491.971 qps  |\n  | 
[FasterViT2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_2_224_imagenet.h5) | 75.92M   | 9.00G   | 224   | 84.2     | 377.006 qps  |\n  | [FasterViT3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_3_224_imagenet.h5) | 159.55M  | 18.75G  | 224   | 84.9     | 216.481 qps  |\n  | [FasterViT4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_4_224_imagenet.h5) | 351.12M  | 41.57G  | 224   | 85.4     | 71.6303 qps  |\n  | [FasterViT5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_5_224_imagenet.h5) | 957.52M  | 114.08G | 224   | 85.6     |              |\n  | [FasterViT6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_6_224_imagenet.1.h5), [+.2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastervit\u002Ffastervit_6_224_imagenet.2.h5) | 1360.33M | 144.13G | 224   | 85.8     |              |\n\n## FastViT\n  - [Keras FastViT](keras_cv_attention_models\u002Ffastvit) 包含对 [PDF 2303.14189 FastViT：一种基于结构重参数化的快速混合视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.14189.pdf) 的实现。\n\n  | 模型         | 参数量 | FLOPs  | 输入分辨率 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------ | ----- | ------- | ---------- | ----------- |\n  | [FastViT_T8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t8_imagenet.h5)     | 4.03M  | 0.65G | 256   | 76.2     | 1020.29 qps  |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t8_distill.h5)       | 4.03M  | 0.65G | 256   | 77.2     | 1020.29 qps  |\n  | - deploy=True | 3.99M  | 0.64G | 256   | 76.2     | 1323.14 qps  |\n  | [FastViT_T12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t12_imagenet.h5)   | 7.55M  | 1.34G | 256   | 79.3     | 734.867 qps  |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_t12_distill.h5)      | 7.55M  | 1.34G | 256   | 80.3     | 734.867 qps  |\n  | - deploy=True | 7.50M  | 1.33G | 256   | 79.3     | 956.332 qps  |\n  | [FastViT_S12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_s12_imagenet.h5)   | 9.47M  | 1.74G | 256   | 79.9     | 666.669 qps  |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_s12_distill.h5)      | 9.47M  | 1.74G | 256   | 81.1     | 666.669 qps  |\n  | - deploy=True | 9.42M  | 1.74G | 256   | 79.9     | 881.429 qps  |\n  | [FastViT_SA12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa12_imagenet.h5) | 11.58M | 1.88G | 256   | 80.9     | 656.95 qps   |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa12_distill.h5)     | 
11.58M | 1.88G | 256   | 81.9     | 656.95 qps   |\n  | - deploy=True | 11.54M | 1.88G | 256   | 80.9     | 833.011 qps  |\n  | [FastViT_SA24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa24_imagenet.h5) | 21.55M | 3.66G | 256   | 82.7     | 371.84 qps   |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa24_distill.h5)     | 21.55M | 3.66G | 256   | 83.4     | 371.84 qps   |\n  | - deploy=True | 21.49M | 3.66G | 256   | 82.7     | 444.055 qps  |\n  | [FastViT_SA36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa36_imagenet.h5) | 31.53M | 5.44G | 256   | 83.6     | 267.986 qps  |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_sa36_distill.h5)     | 31.53M | 5.44G | 256   | 84.2     | 267.986 qps  |\n  | - deploy=True | 31.44M | 5.43G | 256   | 83.6     | 325.967 qps  |\n  | [FastViT_MA36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_ma36_imagenet.h5) | 44.07M | 7.64G | 256   | 83.9     | 211.928 qps  |\n  | - [蒸馏版](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ffastvit\u002Ffastvit_ma36_distill.h5)     | 44.07M | 7.64G | 256   | 84.6     | 211.928 qps  |\n  | - deploy=True | 43.96M | 7.63G | 256   | 83.9     | 274.559 qps  |\n## FBNetV3\n  - [Keras FBNetV3](keras_cv_attention_models\u002Fmobilenetv3_family#fbnetv3) 包含对 [PDF 2006.02049 FBNetV3：利用预测器预训练进行架构与配方联合搜索](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2006.02049.pdf) 的实现。\n\n  | 模型    | 参数量 | FLOPs    | 输入分辨率 | Top1 准确率 | T4 推理速度 |\n  | -------- | ------ | -------- | ---------- | ---------- | ----------- |\n  | [FBNetV3B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_b_imagenet.h5) | 5.57M  | 539.82M  | 256   | 79.15    | 713.882 qps  |\n  | [FBNetV3D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_d_imagenet.h5) | 10.31M | 665.02M  | 256   | 79.68    | 635.963 qps  |\n  | [FBNetV3G](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ffbnetv3_g_imagenet.h5) | 16.62M | 1379.30M | 256   | 82.05    | 478.835 qps  |\n## FlexiViT\n  - [Keras FlexiViT](keras_cv_attention_models\u002Fbeit) 包含来自 [PDF 2212.08013 FlexiViT：一种适用于所有补丁大小的模型](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.08013.pdf) 的模型。\n\n  | 模型         | 参数量  | FLOPs  | 输入分辨率 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------- | ------ | ---------- | ---------- | ----------- |\n  | [FlexiViTSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_small_240_imagenet.h5) | 22.06M  | 5.36G  | 240   | 82.53    | 744.578 qps  |\n  | [FlexiViTBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_base_240_imagenet.h5)  | 86.59M  | 20.33G | 240   | 84.66    | 301.948 qps  |\n  | 
[FlexiViTLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fflexivit_large_240_imagenet.h5) | 304.47M | 71.09G | 240   | 85.64    | 105.187 qps  |\n\n## GCViT\n  - [Keras GCViT](keras_cv_attention_models\u002Fgcvit) 包含对 [PDF 2206.09959 全局上下文视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.09959.pdf) 的实现。\n\n  | 模型           | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------------- | ------ | ------ | ------- | ----------- | ------------ |\n  | [GCViT_XXTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_xx_tiny_224_imagenet.h5)    | 12.0M  | 2.15G  | 224   | 79.9     | 337.7 qps   |\n  | [GCViT_XTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_x_tiny_224_imagenet.h5)     | 20.0M  | 2.96G  | 224   | 82.0     | 255.625 qps   |\n  | [GCViT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_tiny_224_imagenet.h5)      | 28.2M  | 4.83G  | 224   | 83.5     | 174.553 qps   |\n  | [GCViT_Tiny2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_tiny2_224_imagenet.h5)     | 34.5M  | 6.28G  | 224   | 83.7     |  |\n  | [GCViT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_small_224_imagenet.h5)     | 51.1M  | 8.63G  | 224   | 84.3     | 131.577 qps   |\n  | [GCViT_Small2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_small2_224_imagenet.h5)    | 68.6M  | 11.7G  | 224   | 84.8     |  |\n  | [GCViT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_base_224_imagenet.h5)      | 90.3M  | 14.9G  | 224   | 85.0     | 105.845 qps   |\n  | [GCViT_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_224_imagenet.h5)     | 202.1M | 32.8G  | 224   | 85.7     |  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_224_imagenet21k-ft1k.h5)      | 202.1M | 32.8G  | 224   | 86.6     |  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_384_imagenet21k-ft1k.h5) | 202.9M | 105.1G | 384   | 87.4     |  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgcvit\u002Fgcvit_large_512_imagenet21k-ft1k.h5) | 203.8M | 205.1G | 512   | 87.6     |  |\n## GhostNet\n  - [Keras GhostNet](keras_cv_attention_models\u002Fghostnet) 包含对 [PDF 1911.11907 GhostNet: 更多来自廉价操作的特征](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.11907.pdf) 的实现。\n\n  | 模型        | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------ | ------ | ------ | ------- | ----------- | ------------ |\n  | [GhostNet_050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_050_imagenet.h5) | 2.59M  | 42.6M  | 224   | 66.88    | 1272.25 qps  |\n  | 
[GhostNet_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_100_imagenet.h5) | 5.18M  | 141.7M | 224   | 74.16    | 1167.4 qps   |\n  | [GhostNet_130](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_130_imagenet.h5) | 7.36M  | 227.7M | 224   | 75.79    | 1024.49 qps  |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnet_130_ssld.h5)       | 7.36M  | 227.7M | 224   | 79.38    | 1024.49 qps  |\n## GhostNetV2\n  - [Keras GhostNet](keras_cv_attention_models\u002Fghostnet) 包含对 [PDF GhostNetV2: 通过长距离注意力增强廉价操作](https:\u002F\u002Fopenreview.net\u002Fpdf\u002F6db544c65bbd0fa7d7349508454a433c112470e2.pdf) 的实现。\n\n  | 模型          | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | -------------- | ------ | ------ | ------- | ----------- | ------------ |\n  | [GhostNetV2_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_100_imagenet.h5)    | 6.12M  | 168.5M | 224   | 75.3     | 797.088 qps  |\n  | [GhostNetV2_130](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_130_imagenet.h5)    | 8.96M  | 271.1M | 224   | 76.9     | 722.668 qps  |\n  | [GhostNetV2_160](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fghostnetv2\u002Fghostnetv2_160_imagenet.h5)    | 12.39M | 400.9M | 224   | 77.8     | 572.268 qps  |\n## GMLP\n  - [Keras GMLP](keras_cv_attention_models\u002Fmlp_family#gmlp) 包含对 [PDF 2105.08050 关注 MLP](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.08050.pdf) 的实现。\n\n  | 模型      | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ---------- | ------ | ------ | ------- | ----------- | ------------ |\n  | GMLPTiny16 | 6M     | 1.35G  | 224   | 72.3     | 234.187 qps  |\n  | [GMLPS16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fgmlp_s16_imagenet.h5)    | 20M    | 4.44G  | 224   | 79.6     | 138.363 qps  |\n  | GMLPB16    | 73M    | 15.82G | 224   | 81.6     | 77.816 qps   |\n## GPViT\n  - [Keras GPViT](keras_cv_attention_models\u002Fgpvit) 包含对 [PDF 2212.06795 GPVIT: 一种高分辨率非层次化视觉 Transformer，采用组传播机制](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2212.06795.pdf) 的实现。\n\n  | 模型    | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | -------- | ------ | ------ | ------- | ----------- | ------------ |\n  | [GPViT_L1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l1_224_imagenet.h5) | 9.59M  | 6.15G  | 224   | 80.5     | 210.166 qps  |\n  | [GPViT_L2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l2_224_imagenet.h5) | 24.2M  | 15.74G | 224   | 83.4     | 139.656 qps  |\n  | [GPViT_L3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l3_224_imagenet.h5) | 36.7M  | 23.54G | 224   | 84.1     | 131.284 qps  |\n  | [GPViT_L4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpvit\u002Fgpvit_l4_224_imagenet.h5) | 75.5M  | 48.29G | 224   | 84.3     | 
94.1899 qps  |\n\n## HaloNet\n  - [Keras HaloNet](keras_cv_attention_models\u002Fhalonet) 对应于 [PDF 2103.12731 参数高效的视觉骨干网络中的局部自注意力扩展](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.12731.pdf)。\n\n  | 模型          | 参数量 | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------------- | ------ | ------- | ----- | -------- | ------------ |\n  | [HaloNextECA26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonext_eca26t_256_imagenet.h5) | 10.7M  | 2.43G   | 256   | 79.50    | 1028.93 qps  |\n  | [HaloNet26T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet26t_256_imagenet.h5)     | 12.5M  | 3.18G   | 256   | 79.13    | 1096.79 qps  |\n  | [HaloNetSE33T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet_se33t_256_imagenet.h5)   | 13.7M  | 3.55G   | 256   | 80.99    | 582.008 qps  |\n  | [HaloRegNetZB](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhaloregnetz_b_224_imagenet.h5)   | 11.68M | 1.97G   | 224   | 81.042   | 575.961 qps  |\n  | [HaloNet50T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalonet50t_256_imagenet.h5)     | 22.7M  | 5.29G   | 256   | 81.70    | 512.677 qps  |\n  | [HaloBotNet50T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhalonet\u002Fhalobotnet50t_256_imagenet.h5)  | 22.6M  | 5.02G   | 256   | 82.0     | 431.616 qps  |\n## Hiera\n  - [Keras Hiera](keras_cv_attention_models\u002Fhiera) 对应于 [PDF 2306.00989 Hiera：一种无花哨的层次化视觉 Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2306.00989.pdf)。\n\n  | 模型                        | 参数量  | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ---------------------------- | ------- | ------- | ----- | -------- | ------------ |\n  | [HieraTiny, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_tiny_224_mae_in1k_ft1k.h5)     | 27.91M  | 4.93G   | 224   | 82.8     | 644.356 qps  |\n  | [HieraSmall, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_small_224_mae_in1k_ft1k.h5)    | 35.01M  | 6.44G   | 224   | 83.8     | 491.669 qps  |\n  | [HieraBase, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_base_224_mae_in1k_ft1k.h5)     | 51.52M  | 9.43G   | 224   | 84.5     | 351.542 qps  |\n  | [HieraBasePlus, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_base_plus_224_mae_in1k_ft1k.h5) | 69.90M  | 12.71G  | 224   | 85.2     | 291.446 qps  |\n  | [HieraLarge, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_large_224_mae_in1k_ft1k.h5)    | 213.74M | 40.43G  | 224   | 86.1     | 111.042 qps  |\n  | [HieraHuge, mae_in1k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhiera\u002Fhiera_huge_224_mae_in1k_ft1k.h5)     | 672.78M | 125.03G | 224   | 86.9     |              |\n## HorNet\n  - [Keras 
HorNet](keras_cv_attention_models\u002Fhornet) 对应于 [PDF 2207.14284 HorNet：基于递归门控卷积的高效高阶空间交互](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.14284.pdf)。\n\n  | 模型         | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [HorNetTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_tiny_224_imagenet.h5)    | 22.4M  | 4.01G  | 224   | 82.8     | 222.665 qps  |\n  | [HorNetTinyGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_tiny_gf_224_imagenet.h5)  | 23.0M  | 3.94G  | 224   | 83.0     |              |\n  | [HorNetSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_small_224_imagenet.h5)   | 49.5M  | 8.87G  | 224   | 83.8     | 166.998 qps  |\n  | [HorNetSmallGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_small_gf_224_imagenet.h5) | 50.4M  | 8.77G  | 224   | 84.0     |              |\n  | [HorNetBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_base_224_imagenet.h5)    | 87.3M  | 15.65G | 224   | 84.2     | 133.842 qps  |\n  | [HorNetBaseGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_base_gf_224_imagenet.h5)  | 88.4M  | 15.51G | 224   | 84.3     |              |\n  | [HorNetLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_224_imagenet22k.h5)   | 194.5M | 34.91G | 224   | 86.8     | 89.8254 qps  |\n  | [HorNetLargeGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_gf_224_imagenet22k.h5) | 196.3M | 34.72G | 224   | 87.0     |              |\n  | [HorNetLargeGF](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fhornet\u002Fhornet_large_gf_384_imagenet22k.h5) | 201.8M | 102.0G | 384   | 87.7     |              |\n## IFormer\n  - [Keras IFormer](keras_cv_attention_models\u002Fiformer) 对应于 [PDF 2205.12956 Inception Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2205.12956.pdf)。\n\n  | 模型        | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | ------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [IFormerSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_small_224_imagenet.h5) | 19.9M  | 4.88G  | 224   | 83.4     | 254.392 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_small_384_imagenet.h5)        | 20.9M  | 16.29G | 384   | 84.6     | 128.98 qps   |\n  | [IFormerBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_base_224_imagenet.h5)  | 47.9M  | 9.44G  | 224   | 84.6     | 147.868 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_base_384_imagenet.h5)        | 48.9M  | 30.86G | 384   | 85.7     | 77.8391 qps  
|\n  | [IFormerLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_largel_224_imagenet.h5) | 86.6M  | 14.12G | 224   | 84.6     | 113.434 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fiformer\u002Fiformer_largel_384_imagenet.h5)        | 87.7M  | 45.74G | 384   | 85.8     | 60.0292 qps  |\n\n## InceptionNeXt\n  - [Keras InceptionNeXt](keras_cv_attention_models\u002Finceptionnext) 对应论文 [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2303.16900.pdf)。\n\n  | 模型              | 参数量 | FLOPs | 输入大小 | Top1准确率 | T4推理速度 |\n  | ------------------ | ------ | ------ | ----- | -------- | ------------ |\n  | [InceptionNeXtTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_tiny_imagenet.h5)  | 28.05M | 4.21G  | 224   | 82.3     | 606.527 qps  |\n  | [InceptionNeXtSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_small_imagenet.h5) | 49.37M | 8.39G  | 224   | 83.5     | 329.01 qps   |\n  | [InceptionNeXtBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_base_224_imagenet.h5)  | 86.67M | 14.88G | 224   | 84.0     | 260.639 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Finceptionnext\u002Finceptionnext_base_384_imagenet.h5)              | 86.67M | 43.73G | 384   | 85.2     | 142.888 qps  |\n## LCNet\n  - [Keras LCNet](keras_cv_attention_models\u002Fmobilenetv3_family#lcnet) 包含对论文 [PDF 2109.15099 PP-LCNet: A Lightweight CPU Convolutional Neural Network](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2109.15099.pdf) 的实现。\n\n  | 模型    | 参数量 | FLOPs   | 输入大小 | Top1准确率 | T4推理速度 |\n  | -------- | ------ | ------- | ----- | -------- | ------------ |\n  | [LCNet050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_050_imagenet.h5) | 1.88M  | 46.02M  | 224   | 63.10    | 3107.89 qps  |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_050_ssld.h5)   | 1.88M  | 46.02M  | 224   | 66.10    | 3107.89 qps  |\n  | [LCNet075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_075_imagenet.h5) | 2.36M  | 96.82M  | 224   | 68.82    | 3083.55 qps  |\n  | [LCNet100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_100_imagenet.h5) | 2.95M  | 158.28M | 224   | 72.10    | 2752.6 qps   |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_100_ssld.h5)   | 2.95M  | 158.28M | 224   | 74.39    | 2752.6 qps   |\n  | [LCNet150](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_150_imagenet.h5) | 4.52M  | 338.05M | 224   | 73.71    | 2250.69 qps  |\n  | 
[LCNet200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_200_imagenet.h5) | 6.54M  | 585.35M | 224   | 75.18    | 2028.31 qps  |\n  | [LCNet250](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_250_imagenet.h5) | 9.04M  | 900.16M | 224   | 76.60    | 1686.7 qps   |\n  | - [ssld](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Flcnet_250_ssld.h5)   | 9.04M  | 900.16M | 224   | 80.82    | 1686.7 qps   |\n## LeViT\n  - [Keras LeViT](keras_cv_attention_models\u002Flevit) 对应论文 [PDF 2104.01136 LeViT: a Vision Transformer in ConvNet’s Clothing for Faster Inference](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2104.01136.pdf)。\n\n  | 模型              | 参数量 | FLOPs | 输入大小 | Top1准确率 | T4推理速度 |\n  | ------------------ | ------ | ----- | ----- | -------- | ------------ |\n  | [LeViT128S, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit128s_imagenet.h5) | 7.8M   | 0.31G | 224   | 76.6     | 800.53 qps   |\n  | [LeViT128, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit128_imagenet.h5)  | 9.2M   | 0.41G | 224   | 78.6     | 628.714 qps  |\n  | [LeViT192, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit192_imagenet.h5)  | 11M    | 0.66G | 224   | 80.0     | 597.299 qps  |\n  | [LeViT256, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit256_imagenet.h5)  | 19M    | 1.13G | 224   | 81.6     | 538.885 qps  |\n  | [LeViT384, distill](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Flevit384_imagenet.h5)  | 39M    | 2.36G | 224   | 82.6     | 460.139 qps  |\n\n## MaxViT\n  - [Keras MaxViT](keras_cv_attention_models\u002Fmaxvit) 对应论文 [PDF 2204.01697 MaxViT: 多轴视觉Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.01697.pdf)。\n\n  | 模型                      | 参数量 | FLOPs  | 输入分辨率 | Top1 精度 | T4 推理速度 |\n  | -------------------------- | ------ | ------ | ---------- | ---------- | ------------ |\n  | [MaxViT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_224_imagenet.h5)                | 31M    | 5.6G   | 224      | 83.62%   | 195.283 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_384_imagenet.h5)                      | 31M    | 17.7G  | 384      | 85.24%   | 92.5725 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_tiny_512_imagenet.h5)                      | 31M    | 33.7G  | 512      | 85.72%   | 52.6485 qps  |\n  | [MaxViT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_224_imagenet.h5)               | 69M    | 11.7G  | 224      | 84.45%   | 149.286 qps  |\n  | - 
[384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_384_imagenet.h5)                    | 69M    | 36.1G  | 384      | 85.74%   | 61.5757 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_small_512_imagenet.h5)                    | 69M    | 67.6G  | 512      | 86.19%   | 34.7002 qps  |\n  | [MaxViT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_224_imagenet.h5)                | 119M   | 24.2G  | 224      | 84.95%   | 74.7351 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_384_imagenet.h5)                      | 119M   | 74.2G  | 384      | 86.34%   | 31.9028 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_512_imagenet.h5)                      | 119M   | 138.5G | 512      | 86.66%   | 17.8139 qps  |\n  | - [imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_224_imagenet21k.h5)              | 135M   | 24.2G  | 224      |          | 74.7351 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_384_imagenet21k-ft1k.h5)     | 119M   | 74.2G  | 384      | 88.24%   | 31.9028 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_base_512_imagenet21k-ft1k.h5)     | 119M   | 138.5G | 512      | 88.38%   | 17.8139 qps  |\n  | [MaxViT_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_224_imagenet.h5)               | 212M   | 43.9G  | 224      | 85.17%   | 58.0967 qps  |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_384_imagenet.h5)                    | 212M   | 133.1G | 384      | 86.40%   | 24.1388 qps  |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_512_imagenet.h5)                    | 212M   | 245.4G | 512      | 86.70%   | 13.063 qps   |\n  | - [imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_224_imagenet21k.h5)              | 233M   | 43.9G  | 224      |          | 58.0967 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_384_imagenet21k-ft1k.h5)     | 212M   | 133.1G | 384      | 88.32%   | 24.1388 qps  |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_large_512_imagenet21k-ft1k.h5)     | 212M   | 245.4G | 512      | 88.46%   | 13.063 qps   |\n  | [MaxViT_XLarge, 
imagenet21k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_224_imagenet21k.h5) | 507M   | 97.7G  | 224      |          |              |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_384_imagenet21k-ft1k.h5)    | 475M   | 293.7G | 384      | 88.51%   |              |\n  | - [21k_ft1k, 512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmaxvit\u002Fmaxvit_xlarge_512_imagenet21k-ft1k.h5)    | 475M   | 535.2G | 512      | 88.70%   |              |\n## MetaTransFormer\n  - [Keras MetaTransFormer](keras_cv_attention_models\u002Fbeit) 包含来自论文 [PDF 2307.10802 Meta-Transformer: 多模态学习的统一框架](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.10802) 的模型。\n\n  | 模型                                 | 参数量  | FLOPs  | 输入分辨率 | Top1 精度 | T4 推理速度 |\n  | ------------------------------------- | ------- | ------ | ---------- | ---------- | ------------ |\n  | [MetaTransformerBasePatch16, laion_2b](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fmeta_transformer_base_patch16_384_laion_2b.h5)  | 86.86M  | 55.73G | 384      | 85.4%    | 150.731 qps  |\n  | [MetaTransformerLargePatch14, laion_2b](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fmeta_transformer_large_patch14_336_laion_2b.h5) | 304.53M | 191.6G | 336      | 88.1%    | 50.1536 qps |\n\n## MLP混合器\n  - [Keras MLP混合器](keras_cv_attention_models\u002Fmlp_family#mlp-mixer) 包含对 [PDF 2105.01601 MLP-Mixer：一种全MLP的视觉架构](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.01601.pdf) 的实现。\n\n  | 模型            | 参数量 | FLOPs   | 输入 | Top1准确率 | T4推理 |\n  | ---------------- | ------ | ------- | ----- | -------- | ------------ |\n  | MLPMixerS32, JFT | 19.1M  | 1.01G   | 224   | 68.70    | 488.839 qps  |\n  | MLPMixerS16, JFT | 18.5M  | 3.79G   | 224   | 73.83    | 451.962 qps  |\n  | MLPMixerB32, JFT | 60.3M  | 3.25G   | 224   | 75.53    | 247.629 qps  |\n  | - [sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b32_imagenet_sam.h5)   | 60.3M  | 3.25G   | 224   | 72.47    | 247.629 qps  |\n  | [MLPMixerB16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet.h5)      | 59.9M  | 12.64G  | 224   | 76.44    | 207.423 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet21k.h5)    | 59.9M  | 12.64G  | 224   | 80.64    | 207.423 qps  |\n  | - [sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_b16_imagenet_sam.h5)   | 59.9M  | 12.64G  | 224   | 77.36    | 207.423 qps  |\n  | - JFT            | 59.9M  | 12.64G  | 224   | 80.00    | 207.423 qps  |\n  | MLPMixerL32, JFT | 206.9M | 11.30G  | 224   | 80.67    | 95.1865 qps  |\n  | [MLPMixerL16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_l16_imagenet.h5)      | 208.2M | 44.66G  | 224   | 71.76    | 77.9928 qps  |\n  | - 
[21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fmlp_mixer_l16_imagenet21k.h5)    | 208.2M | 44.66G  | 224   | 82.89    | 77.9928 qps  |\n  | - JFT            | 208.2M | 44.66G  | 224   | 84.82    | 77.9928 qps  |\n  | - 448            | 208.2M | 178.54G | 448   | 83.91    |              |\n  | - 448, JFT       | 208.2M | 178.54G | 448   | 86.78    |              |\n  | MLPMixerH14, JFT | 432.3M | 121.22G | 224   | 86.32    |              |\n  | - 448, JFT       | 432.3M | 484.73G | 448   | 87.94    |              |\n## MobileNetV3\n  - [Keras MobileNetV3](keras_cv_attention_models\u002Fmobilenetv3_family#mobilenetv3) 包含对 [PDF 1905.02244 寻找MobileNetV3](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1905.02244.pdf) 的实现。\n\n  | 模型               | 参数量 | FLOPs   | 输入 | Top1准确率 | T4推理 |\n  | ------------------- | ------ | ------- | ----- | -------- | ------------ |\n  | [MobileNetV3Small050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_050_imagenet.h5) | 1.29M  | 24.92M  | 224   | 57.89    | 2458.28 qps  |\n  | [MobileNetV3Small075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_075_imagenet.h5) | 2.04M  | 44.35M  | 224   | 65.24    | 2286.44 qps  |\n  | [MobileNetV3Small100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_small_100_imagenet.h5) | 2.54M  | 57.62M  | 224   | 67.66    | 2058.06 qps  |\n  | [MobileNetV3Large075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_075_imagenet.h5) | 3.99M  | 156.30M | 224   | 73.44    | 1643.78 qps  |\n  | [MobileNetV3Large100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_100_imagenet.h5) | 5.48M  | 218.73M | 224   | 75.77    | 1629.44 qps  |\n  | - [miil](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Fmobilenetv3_large_100_mill.h5)              | 5.48M  | 218.73M | 224   | 77.92    | 1629.44 qps  |\n## MobileViT\n  - [Keras MobileViT](keras_cv_attention_models\u002Fmobilevit) 对应于 [PDF 2110.02178 MOBILEVIT：轻量级、通用且适合移动端的视觉Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2110.02178.pdf)。\n\n  | 模型         | 参数量 | FLOPs | 输入 | Top1准确率 | T4推理 |\n  | ------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [MobileViT_XXS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_xxs_imagenet.h5) | 1.3M   | 0.42G | 256   | 69.0     | 1319.43 qps  |\n  | [MobileViT_XS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_xs_imagenet.h5)  | 2.3M   | 1.05G | 256   | 74.7     | 1019.57 qps  |\n  | [MobileViT_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_s_imagenet.h5)   | 5.6M   | 2.03G | 256   | 78.3     | 790.943 qps  |\n\n## MobileViT_V2\n  - [Keras MobileViT_V2](keras_cv_attention_models\u002Fmobilevit) 对应论文 [PDF 2206.02680 
移动视觉Transformer中的可分离自注意力](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2206.02680.pdf)。\n\n  | 模型              | 参数量 | FLOPs | 输入分辨率 | Top1准确率 | T4推理速度 |\n  | ------------------ | ------ | ----- | ---------- | ---------- | ---------- |\n  | [MobileViT_V2_050](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_050_256_imagenet.h5)   | 1.37M  | 0.47G | 256      | 70.18    | 718.337 qps  |\n  | [MobileViT_V2_075](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_075_256_imagenet.h5)   | 2.87M  | 1.04G | 256      | 75.56    | 642.323 qps  |\n  | [MobileViT_V2_100](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_100_256_imagenet.h5)   | 4.90M  | 1.83G | 256      | 78.09    | 591.217 qps  |\n  | [MobileViT_V2_125](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_125_256_imagenet.h5)   | 7.48M  | 2.84G | 256      | 79.65    | 510.25 qps   |\n  | [MobileViT_V2_150](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_256_imagenet.h5)   | 10.6M  | 4.07G | 256      | 80.38    | 466.482 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_256_imagenet22k.h5)      | 10.6M  | 4.07G | 256      | 81.46    | 466.482 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_150_384_imagenet22k.h5) | 10.6M  | 9.15G | 384      | 82.60    | 278.834 qps  |\n  | [MobileViT_V2_175](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_256_imagenet.h5)   | 14.3M  | 5.52G | 256      | 80.84    | 412.759 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_256_imagenet22k.h5)      | 14.3M  | 5.52G | 256      | 81.94    | 412.759 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_175_384_imagenet22k.h5) | 14.3M  | 12.4G | 384      | 82.93    | 247.108 qps  |\n  | [MobileViT_V2_200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_256_imagenet.h5)   | 18.4M  | 7.12G | 256      | 81.17    | 394.325 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_256_imagenet22k.h5)      | 18.4M  | 7.12G | 256      | 82.36    | 394.325 qps  |\n  | - [21k_ft1k, 384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilevit\u002Fmobilevit_v2_200_384_imagenet22k.h5) | 18.4M  | 16.2G | 384      | 83.41    | 229.399 qps  |\n## MogaNet\n  - [Keras MogaNet](keras_cv_attention_models\u002Fmoganet) 对应论文 [PDF 2211.03295 高效多阶门控聚合网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2211.03295.pdf)。\n\n  | 模型        | 参数量 | FLOPs  | 
输入分辨率 | Top1准确率 | T4推理速度 |\n  | ------------ | ------ | ------ | ---------- | ---------- | ---------- |\n  | [MogaNetXtiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_xtiny_imagenet.h5) | 2.96M  | 806M   | 224      | 76.5     | 398.488 qps  |\n  | [MogaNetTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_tiny_224_imagenet.h5)  | 5.20M  | 1.11G  | 224      | 79.0     | 362.409 qps  |\n  | - [256](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_tiny_256_imagenet.h5)        | 5.20M  | 1.45G  | 256      | 79.6     | 335.372 qps  |\n  | [MogaNetSmall](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_small_imagenet.h5) | 25.3M  | 4.98G  | 224      | 83.4     | 249.807 qps  |\n  | [MogaNetBase](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_base_imagenet.h5)  | 43.7M  | 9.96G  | 224      | 84.2     | 133.071 qps  |\n  | [MogaNetLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmoganet\u002Fmoganet_large_imagenet.h5) | 82.5M  | 15.96G | 224      | 84.6     | 84.2045 qps  |\n## NAT\n  - [Keras NAT](keras_cv_attention_models\u002Fnat) 对应论文 [PDF 2204.07143 邻域注意力Transformer](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2204.07143.pdf)。\n\n  | 模型     | 参数量 | FLOPs  | 输入分辨率 | Top1准确率 | T4推理速度 |\n  | --------- | ------ | ------ | ---------- | ---------- | ---------- |\n  | [NAT_Mini](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_mini_imagenet.h5)  | 20.0M  | 2.73G  | 224      | 81.8     | 85.2324 qps  |\n  | [NAT_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_tiny_imagenet.h5)  | 27.9M  | 4.34G  | 224      | 83.2     | 62.6147 qps  |\n  | [NAT_Small](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_small_imagenet.h5) | 50.7M  | 7.84G  | 224      | 83.7     | 41.1545 qps  |\n  | [NAT_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnat\u002Fnat_base_imagenet.h5)  | 89.8M  | 13.76G | 224      | 84.3     | 30.8989 qps  |\n\n## NFNets\n  - [Keras NFNets](keras_cv_attention_models\u002Fnfnets) 对应论文为 [PDF 2102.06171 高性能大规模图像识别无需归一化](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2102.06171.pdf)。\n\n  | 模型        | 参数量 | FLOPs   | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------ | ------ | ------- | ----- | -------- | ------------ |\n  | [NFNetL0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetl0_imagenet.h5)      | 35.07M | 7.13G   | 288   | 82.75    |              |\n  | [NFNetF0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf0_imagenet.h5)      | 71.5M  | 12.58G  | 256   | 83.6     |              |\n  | [NFNetF1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf1_imagenet.h5)      | 132.6M | 35.95G  | 320   | 84.7     | 
             |\n  | [NFNetF2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf2_imagenet.h5)      | 193.8M | 63.24G  | 352   | 85.1     |              |\n  | [NFNetF3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf3_imagenet.h5)      | 254.9M | 115.75G | 416   | 85.7     |              |\n  | [NFNetF4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf4_imagenet.h5)      | 316.1M | 216.78G | 512   | 85.9     |              |\n  | [NFNetF5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf5_imagenet.h5)      | 377.2M | 291.73G | 544   | 86.0     |              |\n  | [NFNetF6, sam](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Fnfnetf6_imagenet.h5) | 438.4M | 379.75G | 576   | 86.5     |              |\n  | NFNetF7      | 499.5M | 481.80G | 608   |          |              |\n  | [ECA_NFNetL0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl0_imagenet.h5)  | 24.14M | 7.12G   | 288   | 82.58    |              |\n  | [ECA_NFNetL1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl1_imagenet.h5)  | 41.41M | 14.93G  | 320   | 84.01    |              |\n  | [ECA_NFNetL2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fnfnets\u002Feca_nfnetl2_imagenet.h5)  | 56.72M | 30.12G  | 384   | 84.70    |              |\n  | ECA_NFNetL3  | 72.04M | 52.73G  | 448   |          |              |\n## PVT_V2\n  - [Keras PVT_V2](keras_cv_attention_models\u002Fpvt) 对应论文为 [PDF 2106.13797 PVTv2：基于金字塔视觉Transformer的改进基线](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.13797.pdf)。\n\n  | 模型           | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [PVT_V2B0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b0_imagenet.h5)        | 3.7M   | 580.3M | 224   | 70.5     | 561.593 qps  |\n  | [PVT_V2B1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b1_imagenet.h5)        | 14.0M  | 2.14G  | 224   | 78.7     | 392.408 qps  |\n  | [PVT_V2B2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b2_imagenet.h5)        | 25.4M  | 4.07G  | 224   | 82.0     | 210.476 qps  |\n  | [PVT_V2B2_linear](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b2_linear_imagenet.h5) | 22.6M  | 3.94G  | 224   | 82.1     | 226.791 qps  |\n  | [PVT_V2B3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b3_imagenet.h5)        | 45.2M  | 6.96G  | 224   | 83.1     | 135.51 qps   |\n  | [PVT_V2B4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b4_imagenet.h5)        | 62.6M  | 10.19G | 224   | 83.6     | 97.666 qps   |\n  | 
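\n  - 下面是一个在自有数据集上微调分类模型的简单示意，以上方 NFNets 表中的 NFNetF0 为例。本库的分类模型构造函数通常接受 `input_shape`、`num_classes`、`pretrained` 等参数（具体签名以实际版本为准，此处仅供参考）：\n\n  ```python\n  from keras_cv_attention_models import nfnets\n\n  # 示意：加载 ImageNet 预训练权重，并将分类头替换为 10 类\n  # （num_classes 与预训练权重不一致时，分类层权重通常会被跳过，以实际行为为准）\n  model = nfnets.NFNetF0(input_shape=(256, 256, 3), num_classes=10, pretrained=\"imagenet\")\n  model.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"acc\"])\n  # 随后即可用 model.fit(...) 在自己的数据集上训练\n  ```\n  | 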
[PVT_V2B5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fpvt\u002Fpvt_v2_b5_imagenet.h5)        | 82.0M  | 11.81G | 224   | 83.8     | 81.4798 qps  |\n## RegNetY\n  - [Keras RegNetY](keras_cv_attention_models\u002Fresnet_family#regnety) 对应论文为 [PDF 2003.13678 设计网络设计空间](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2003.13678.pdf)。\n\n  | 模型      | 参数量  | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ---------- | ------- | ------ | ----- | -------- | ------------ |\n  | [RegNetY040](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_040_imagenet.h5) | 20.65M  | 3.98G  | 224   | 82.3     | 749.277 qps  |\n  | [RegNetY064](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_064_imagenet.h5) | 30.58M  | 6.36G  | 224   | 83.0     | 436.946 qps  |\n  | [RegNetY080](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_080_imagenet.h5) | 39.18M  | 7.97G  | 224   | 83.17    | 513.43 qps   |\n  | [RegNetY160](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_160_imagenet.h5) | 83.59M  | 15.92G | 224   | 82.0     | 338.046 qps  |\n  | [RegNetY320](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnety_320_imagenet.h5) | 145.05M | 32.29G | 224   | 82.5     | 188.508 qps  |\n\n## RegNetZ\n  - [Keras RegNetZ](keras_cv_attention_models\u002Fresnet_family#regnetz) 包含了对 [Github timm\u002Fmodels\u002Fbyobnet.py](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\u002Fblob\u002Fmaster\u002Ftimm\u002Fmodels\u002Fbyobnet.py) 的实现。\n  - 相关论文 [PDF 2004.02967 演化的归一化-激活层](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.02967.pdf)\n\n  | 模型          | 参数量 | FLOPs | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | -------------- | ------ | ----- | ------- | ---------- | ------------ |\n  | [RegNetZB16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_b16_imagenet.h5)     | 9.72M  | 1.44G | 224   | 79.868   | 751.035 qps  |\n  | [RegNetZC16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_c16_imagenet.h5)     | 13.46M | 2.50G | 256   | 82.164   | 636.549 qps  |\n  | [RegNetZC16_EVO](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_c16_evo_imagenet.h5) | 13.49M | 2.55G | 256   | 81.9     |              |\n  | [RegNetZD32](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d32_imagenet.h5)     | 27.58M | 5.96G | 256   | 83.422   | 459.204 qps  |\n  | [RegNetZD8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d8_imagenet.h5)      | 23.37M | 3.95G | 256   | 83.5     | 460.021 qps  |\n  | [RegNetZD8_EVO](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_d8_evo_imagenet.h5)  | 23.46M | 4.61G | 256   | 83.42    |              |\n  | 
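\n  - 下文 RepViT 与 VanillaNet 表中的 `deploy=True` 行表示结构重参数化（分支融合）后的推理形态：参数量更少、速度更快，而精度保持不变。下面给出构建部署形态模型的一个示意（以 VanillaNet5 为例，参数名与权重加载行为以实际版本为准）：\n\n  ```python\n  from keras_cv_attention_models import vanillanet\n\n  # 训练形态（默认），包含可重参数化的分支\n  train_model = vanillanet.VanillaNet5(pretrained=\"imagenet\")\n  # 部署（推理）形态：分支融合后的等价结构，精度相同但更小更快（示意用法）\n  deploy_model = vanillanet.VanillaNet5(deploy=True, pretrained=\"imagenet\")\n  print(train_model.count_params(), deploy_model.count_params())\n  ```\n  | 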
[RegNetZE8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fregnetz_e8_imagenet.h5)      | 57.70M | 9.88G | 256   | 84.5     | 274.97 qps   |\n## RepViT\n  - [Keras RepViT](keras_cv_attention_models\u002Frepvit) 对应于 [PDF 2307.09283 RepViT：从 ViT 视角重审移动端 CNN](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09283.pdf)。\n\n  | 模型                    | 参数量 | FLOPs | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------------------ | ------ | ----- | ------- | ---------- | -------- |\n  | [RepViT_M09, 蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_09_imagenet.h5) | 5.10M  | 0.82G | 224   | 79.1     |  |\n  | - deploy=True            | 5.07M  | 0.82G | 224   | 79.1     | 966.72 qps  |\n  | [RepViT_M10, 蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_10_imagenet.h5) | 6.85M  | 1.12G | 224   | 80.3     | 1157.8 qps  |\n  | - deploy=True            | 6.81M  | 1.12G | 224   | 80.3     |          |\n  | [RepViT_M11, 蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_11_imagenet.h5) | 8.29M  | 1.35G | 224   | 81.2     | 846.682 qps  |\n  | - deploy=True            | 8.24M  | 1.35G | 224   | 81.2     | 1027.5 qps  |\n  | [RepViT_M15, 蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_15_imagenet.h5) | 14.13M | 2.30G | 224   | 82.5     |   |\n  | - deploy=True            | 14.05M | 2.30G | 224   | 82.5     |   |\n  | [RepViT_M23, 蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Flevit\u002Frepvit_m_23_imagenet.h5) | 23.01M | 4.55G | 224   | 83.7     |  |\n  | - deploy=True            | 22.93M | 4.55G | 224   | 83.7     |          |\n## ResMLP\n  - [Keras ResMLP](keras_cv_attention_models\u002Fmlp_family#resmlp) 包含了对 [PDF 2105.03404 ResMLP：用于图像分类的前馈网络，支持数据高效训练](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.03404.pdf) 的实现。\n\n  | 模型         | 参数量 | FLOPs   | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------ | ------- | ------- | ---------- | ------------ |\n  | [ResMLP12](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp12_imagenet.h5)      | 15M    | 3.02G   | 224   | 77.8     | 928.402 qps  |\n  | [ResMLP24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp24_imagenet.h5)      | 30M    | 5.98G   | 224   | 80.8     | 420.709 qps  |\n  | [ResMLP36](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp36_imagenet.h5)      | 116M   | 8.94G   | 224   | 81.1     | 309.513 qps  |\n  | [ResMLP_B24](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp_b24_imagenet.h5)    | 129M   | 100.39G | 224   | 83.6     | 78.3015 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fresmlp_b24_imagenet22k.h5) | 129M   | 100.39G | 224   | 84.4     | 78.3015 qps  |\n## ResNeSt\n  - [Keras ResNeSt](keras_cv_attention_models\u002Fresnest) 对应于 [PDF 
2004.08955 ResNeSt：分割注意力网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2004.08955.pdf)。\n\n  | 模型          | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | -------------- | ------ | ------ | ------- | ---------- | ------------ |\n  | [ResNest50](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest50_imagenet.h5)      | 28M    | 5.38G  | 224   | 81.03    | 534.627 qps  |\n  | [ResNest101](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest101_imagenet.h5)     | 49M    | 13.33G | 256   | 82.83    | 257.074 qps  |\n  | [ResNest200](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest200_imagenet.h5)     | 71M    | 35.55G | 320   | 83.84    | 118.183 qps  |\n  | [ResNest269](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnest\u002Fresnest269_imagenet.h5)     | 111M   | 77.42G | 416   | 84.54    | 61.167 qps   |\n## ResNetD\n  - [Keras ResNetD](keras_cv_attention_models\u002Fresnet_family#resnetd) 包含了对 [PDF 1812.01187 用于卷积神经网络图像分类的技巧大全](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1812.01187.pdf) 的实现。\n\n  | 模型      | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ---------- | ------ | ------ | ------- | ---------- | ------------ |\n  | [ResNet50D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet50d_imagenet.h5)  | 25.58M | 4.33G  | 224   | 80.530   | 930.214 qps  |\n  | [ResNet101D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet101d_imagenet.h5) | 44.57M | 8.04G  | 224   | 83.022   | 502.268 qps  |\n  | [ResNet152D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet152d_imagenet.h5) | 60.21M | 11.75G | 224   | 83.680   | 353.279 qps  |\n  | [ResNet200D](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet200d_imagenet.h5) | 64.69M | 15.25G | 224   | 83.962   | 287.73 qps   |\n\n## ResNetQ\n  - [Keras ResNetQ](keras_cv_attention_models\u002Fresnet_family#resnetq) 包含了对 [Github timm\u002Fmodels\u002Fresnet.py](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models\u002Fblob\u002Fmaster\u002Ftimm\u002Fmodels\u002Fresnet.py) 的实现。\n\n  | 模型     | 参数量 | FLOPs | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | --------- | ------ | ----- | ------- | ---------- | ------------ |\n  | [ResNet51Q](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnet51q_imagenet.h5) | 35.7M  | 4.87G | 224   | 82.36    | 838.754 qps  |\n  | ResNet61Q | 36.8M  | 5.96G | 224   |          | 730.245 qps  |\n## ResNeXt\n  - [Keras ResNeXt](keras_cv_attention_models\u002Fresnet_family#resnext) 包含了对 [PDF 1611.05431 深度神经网络的聚合残差变换](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1611.05431.pdf) 的实现。\n  - `SWSL` 是指来自 [Github facebookresearch\u002Fsemi-supervised-ImageNet1K-models](https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsemi-supervised-ImageNet1K-models) 的 `半弱监督 ResNe*t`。**请注意这些权重采用 CC-BY-NC 4.0 许可证，仅限非商业用途**。\n\n  | 模型                      | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | -------------------------- | 
------ | ------ | ------- | ---------- | ------------ |\n  | [ResNeXt50, (32×4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50_imagenet.h5)         | 25M    | 4.23G  | 224   | 79.768   | 1041.46 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50_swsl.h5)                     | 25M    | 4.23G  | 224   | 82.182   | 1041.46 qps  |\n  | [ResNeXt50D, (32×4d + 深层结构)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext50d_imagenet.h5) | 25M    | 4.47G  | 224   | 79.676   | 1010.94 qps  |\n  | [ResNeXt101, (32×4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_imagenet.h5)        | 42M    | 7.97G  | 224   | 80.334   | 571.652 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_swsl.h5)                     | 42M    | 7.97G  | 224   | 83.230   | 571.652 qps  |\n  | [ResNeXt101W, (32×8d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101_imagenet.h5)       | 89M    | 16.41G | 224   | 79.308   | 367.431 qps  |\n  | - [SWSL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101w_swsl.h5)                     | 89M    | 16.41G | 224   | 84.284   | 367.431 qps  |\n  | [ResNeXt101W_64, (64×4d)](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fresnet_family\u002Fresnext101w_64_imagenet.h5)    | 83.46M | 15.46G | 224   | 82.46    | 377.83 qps   |\n## SwinTransformerV2\n  - [Keras SwinTransformerV2](keras_cv_attention_models\u002Fswin_transformer_v2) 包含了对 [PDF 2111.09883 Swin Transformer V2：扩展容量和分辨率](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.09883.pdf) 的实现。\n\n  | 模型                                | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------------------------------ | ------ | ------ | ------- | ---------- | ------------ |\n  | [SwinTransformerV2Tiny_ns](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_ns_224_imagenet.h5)             | 28.3M  | 4.69G  | 224   | 81.8     | 289.205 qps  |\n  | [SwinTransformerV2Small_ns](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_ns_224_imagenet.h5)            | 49.7M  | 9.12G  | 224   | 83.5     | 169.645 qps  |\n  | [SwinTransformerV2Tiny_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_window8_256_imagenet.h5)        | 28.3M  | 5.99G  | 256   | 81.8     | 275.547 qps  |\n  | [SwinTransformerV2Tiny_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_tiny_window16_256_imagenet.h5)       | 28.3M  | 6.75G  | 256   | 82.8     | 217.207 qps  |\n  | 
[SwinTransformerV2Small_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_window8_256_imagenet.h5)       | 49.7M  | 11.63G | 256   | 83.7     | 157.559 qps  |\n  | [SwinTransformerV2Small_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_small_window16_256_imagenet.h5)      | 49.7M  | 12.93G | 256   | 84.1     | 129.953 qps  |\n  | [SwinTransformerV2Base_window8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window8_256_imagenet.h5)        | 87.9M  | 20.44G | 256   | 84.2     | 126.294 qps  |\n  | [SwinTransformerV2Base_window16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window16_256_imagenet.h5)       | 87.9M  | 22.17G | 256   | 84.6     | 99.634 qps   |\n  | [SwinTransformerV2Base_window16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window16_256_imagenet22k.h5)  | 87.9M  | 22.17G | 256   | 86.2     | 99.634 qps   |\n  | [SwinTransformerV2Base_window24, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_base_window24_384_imagenet22k.h5)  | 87.9M  | 55.89G | 384   | 87.1     | 35.0508 qps  |\n  | [SwinTransformerV2Large_window16, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_large_window16_256_imagenet22k.h5) | 196.7M | 48.03G | 256   | 86.9     |              |\n  | [SwinTransformerV2Large_window24, 21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fswin_transformer_v2\u002Fswin_transformer_v2_large_window24_384_imagenet22k.h5) | 196.7M | 117.1G | 384   | 87.6     |              |\n\n## TinyNet\n  - [Keras TinyNet](keras_cv_attention_models\u002Fmobilenetv3_family#tinynet) 包含对 [PDF 2010.14819 文章《模型魔方：通过调整深度和宽度优化 TinyNet》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.14819.pdf) 的实现。\n\n  | 模型    | 参数量 | FLOPs   | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------- | ------ | ------- | ----- | -------- | ------------ |\n  | [TinyNetE](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_e_imagenet.h5) | 2.04M  | 25.22M  | 106   | 59.86    | 2152.36 qps  |\n  | [TinyNetD](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_d_imagenet.h5) | 2.34M  | 53.35M  | 152   | 66.96    | 1905.56 qps  |\n  | [TinyNetC](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_c_imagenet.h5) | 2.46M  | 103.22M | 184   | 71.23    | 1353.44 qps  |\n  | [TinyNetB](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_b_imagenet.h5) | 3.73M  | 206.28M | 188   | 74.98    | 1196.06 qps  |\n  | 
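\n  - 上方 SwinTransformerV2 与下文 TinyViT 等表中的 `21k_ft1k` 表示先在 ImageNet-21k 上预训练、再在 ImageNet-1k 上微调得到的权重，`384`、`512` 等后缀表示更高的输入分辨率。下面是加载此类权重的一个示意（以 TinyViT_21M 为例，`pretrained` 的具体取值以实际版本为准，这里假设为 \"imagenet21k-ft1k\"）：\n\n  ```python\n  from keras_cv_attention_models import tinyvit\n\n  # 示意：加载 21k 预训练、1k 微调的权重，并使用 384 输入分辨率\n  model = tinyvit.TinyViT_21M(input_shape=(384, 384, 3), pretrained=\"imagenet21k-ft1k\")\n  print(model.input_shape, model.output_shape)\n  ```\n  | 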
[TinyNetA](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmobilenetv3_family\u002Ftinynet_a_imagenet.h5) | 6.19M  | 343.74M | 192   | 77.65    | 981.976 qps  |\n## TinyViT\n  - [Keras TinyViT](keras_cv_attention_models\u002Ftinyvit) 包含对 [PDF 2207.10666 文章《TinyViT：小型视觉 Transformer 的快速预训练蒸馏》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.10666.pdf) 的实现。\n\n  | 模型                | 参数量 | FLOPs | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------------------- | ------ | ----- | ----- | -------- | ------------ |\n  | [TinyViT_5M，蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_5m_224_imagenet.h5)  | 5.4M   | 1.3G  | 224   | 79.1     | 631.414 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_5m_224_imagenet21k-ft1k.h5)   | 5.4M   | 1.3G  | 224   | 80.7     | 631.414 qps  |\n  | [TinyViT_11M，蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_11m_224_imagenet.h5) | 11M    | 2.0G  | 224   | 81.5     | 509.818 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_11m_224_imagenet21k-ft1k.h5)   | 11M    | 2.0G  | 224   | 83.2     | 509.818 qps  |\n  | [TinyViT_21M，蒸馏](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_224_imagenet.h5) | 21M    | 4.3G  | 224   | 83.1     | 410.676 qps  |\n  | - [21k_ft1k](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_224_imagenet21k-ft1k.h5)   | 21M    | 4.3G  | 224   | 84.8     | 410.676 qps  |\n  | - [21k_ft1k，384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_384_imagenet21k-ft1k.h5)           | 21M    | 13.8G | 384   | 86.2     | 199.458 qps  |\n  | - [21k_ft1k，512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Ftinyvit\u002Ftiny_vit_21m_512_imagenet21k-ft1k.h5)           | 21M    | 27.0G | 512   | 86.5     | 122.846 qps  |\n## UniFormer\n  - [Keras UniFormer](keras_cv_attention_models\u002Funiformer) 包含对 [PDF 2201.09450 文章《UniFormer：统一卷积与自注意力用于视觉识别》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2201.09450.pdf) 的实现。\n\n  | 模型                | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | -------------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [UniformerSmall32，token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_32_224_token_label.h5) | 22M    | 3.66G  | 224   | 83.4     | 577.334 qps  |\n  | [UniformerSmall64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_64_224_imagenet.h5)     | 22M    | 3.66G  | 224   | 82.9     | 562.794 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_64_224_token_label.h5)     | 22M    | 3.66G  | 224   | 83.4     | 562.794 qps  |\n  | 
[UniformerSmallPlus32](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_32_224_imagenet.h5) | 24M    | 4.24G  | 224   | 83.4     | 546.82 qps   |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_32_224_token_label.h5)     | 24M    | 4.24G  | 224   | 83.9     | 546.82 qps   |\n  | [UniformerSmallPlus64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_64_224_imagenet.h5) | 24M    | 4.23G  | 224   | 83.4     | 538.193 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_small_plus_64_224_token_label.h5)     | 24M    | 4.23G  | 224   | 83.6     | 538.193 qps  |\n  | [UniformerBase32，token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_32_224_token_label.h5)  | 50M    | 8.32G  | 224   | 85.1     | 272.485 qps  |\n  | [UniformerBase64](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_64_224_imagenet.h5)      | 50M    | 8.31G  | 224   | 83.8     | 286.963 qps  |\n  | - [token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_base_64_224_token_label.h5)     | 50M    | 8.31G  | 224   | 84.8     | 286.963 qps  |\n  | [UniformerLarge64，token_label](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_large_64_224_token_label.h5) | 100M   | 19.79G | 224   | 85.6     | 154.761 qps  |\n  | - [token_label，384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Funiformer\u002Funiformer_large_64_384_token_label.h5)            | 100M   | 63.11G | 384   | 86.3     | 75.3487 qps  |\n\n## VanillaNet\n  - [Keras VanillaNet](keras_cv_attention_models\u002Fvanillanet) 对应论文为 [PDF 2305.12972 VanillaNet: 深度学习中极简主义的力量](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2305.12972.pdf)。\n\n  | 模型         | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------------- | ------ | ------ | ----- | -------- | ------------ |\n  | [VanillaNet5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_5_imagenet.h5)   | 22.33M | 8.46G  | 224   | 72.49    | 598.964 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_5_deploy_imagenet.h5) | 15.52M | 5.17G  | 224   | 72.49    | 798.199 qps  |\n  | [VanillaNet6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_6_imagenet.h5)   | 56.12M | 10.11G | 224   | 76.36    | 465.031 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_6_deploy_imagenet.h5) | 32.51M | 6.00G  | 224   | 76.36    | 655.944 qps  |\n  | 
[VanillaNet7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_7_imagenet.h5)   | 56.67M | 11.84G | 224   | 77.98    | 375.479 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_7_deploy_imagenet.h5) | 32.80M | 6.90G  | 224   | 77.98    | 527.723 qps  |\n  | [VanillaNet8](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_8_imagenet.h5)   | 65.18M | 13.50G | 224   | 79.13    | 341.157 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_8_deploy_imagenet.h5) | 37.10M | 7.75G  | 224   | 79.13    | 479.328 qps  |\n  | [VanillaNet9](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_9_imagenet.h5)   | 73.68M | 15.17G | 224   | 79.87    | 312.815 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_9_deploy_imagenet.h5) | 41.40M | 8.59G  | 224   | 79.87    | 443.464 qps  |\n  | [VanillaNet10](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_10_imagenet.h5)  | 82.19M | 16.83G | 224   | 80.57    | 277.871 qps  |\n  | - [deploy=True](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvanillanet\u002Fvanillanet_10_deploy_imagenet.h5) | 45.69M | 9.43G  | 224   | 80.57    | 408.082 qps  |\n  | VanillaNet11  | 90.69M | 18.49G | 224   | 81.08    | 267.026 qps  |\n  | - deploy=True | 50.00M | 10.27G | 224   | 81.08    | 377.239 qps  |\n  | VanillaNet12  | 99.20M | 20.16G | 224   | 81.55    | 229.987 qps  |\n  | - deploy=True | 54.29M | 11.11G | 224   | 81.55    | 358.076 qps  |\n  | VanillaNet13  | 107.7M | 21.82G | 224   | 82.05    | 218.256 qps  |\n  | - deploy=True | 58.59M | 11.96G | 224   | 82.05    | 334.244 qps  |\n## ViT-5\n  - [Keras ViT-5](keras_cv_attention_models\u002Fbeit) 包含来自论文 [PDF 2602.08071 ViT-5: 面向2020年代中期的视觉Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.08071) 的模型。\n\n  | 模型              | 参数量 | FLOPs  | 输入尺寸 | Top1 准确率 |\n  | ------------------ | ------ | ------ | ----- | -------- |\n  | [ViT5_Small_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_small_patch16_224_imagenet.h5) | 22.04M | 4.73G  | 224   | 82.2     |\n  | [ViT5_Base_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_base_patch16_224_imagenet.h5) | 86.54M | 18.00G | 224   | 84.2     |\n  | [ViT5_Base_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_base_patch16_384_imagenet.h5) | 86.83M | 56.19G | 384   | 85.4     |\n  | [ViT5_Large_Patch16](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit5_large_patch16_224_imagenet.h5) | 304.3M | 63.01G | 224   | 84.9     |\n  | ViT5_Large_Patch16 | 304.6M | 193.2G | 384   | 86.0     |\n## VOLO\n  - [Keras 
VOLO](keras_cv_attention_models\u002Fvolo) 对应论文为 [PDF 2106.13112 VOLO: 用于视觉识别的视觉观察者](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2106.13112.pdf)。\n\n  | 模型   | 参数量 | FLOPs   | 输入尺寸 | Top1 准确率 | T4 推理速度 |\n  | ------- | ------ | ------- | ----- | -------- | ------------ |\n  | [VOLO_d1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d1_224_imagenet.h5) | 27M    | 4.82G   | 224   | 84.2     |              |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d1_384_imagenet.h5)   | 27M    | 14.22G  | 384   | 85.2     |              |\n  | [VOLO_d2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d2_224_imagenet.h5) | 59M    | 9.78G   | 224   | 85.2     |              |\n  | - [384](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d2_384_imagenet.h5)   | 59M    | 28.84G  | 384   | 86.0     |              |\n  | [VOLO_d3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d3_224_imagenet.h5) | 86M    | 13.80G  | 224   | 85.4     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d3_448_imagenet.h5)   | 86M    | 55.50G  | 448   | 86.3     |              |\n  | [VOLO_d4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d4_224_imagenet.h5) | 193M   | 29.39G  | 224   | 85.7     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d4_448_imagenet.h5)   | 193M   | 117.81G | 448   | 86.8     |              |\n  | [VOLO_d5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_224_imagenet.h5) | 296M   | 53.34G  | 224   | 86.1     |              |\n  | - [448](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_448_imagenet.h5)   | 296M   | 213.72G | 448   | 87.0     |              |\n  | - [512](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fvolo\u002Fvolo_d5_512_imagenet.h5)   | 296M   | 279.36G | 512   | 87.1     |              |\n\n## WaveMLP\n  - [Keras WaveMLP](keras_cv_attention_models\u002Fmlp_family#wavemlp) 包含对 [PDF 2111.12294《一个图像块就是一种波：受量子启发的视觉 MLP》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2111.12294.pdf) 的实现。\n\n  | 模型     | 参数量 | FLOPs  | 输入大小 | Top1 准确率 | T4 推理速度 |\n  | --------- | ------ | ------ | ----- | -------- | ------------ |\n  | [WaveMLP_T](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_t_imagenet.h5) | 17M    | 2.47G  | 224   | 80.9     | 523.4 qps    |\n  | [WaveMLP_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_s_imagenet.h5) | 30M    | 4.55G  | 224   | 82.9     | 203.445 qps  |\n  | [WaveMLP_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fmlp_family\u002Fwavemlp_m_imagenet.h5) | 
44M    | 7.92G  | 224   | 83.3     | 147.155 qps  |\n  | WaveMLP_B | 63M    | 10.26G | 224   | 83.6     |              |\n***\n\n# 检测模型\n## EfficientDet\n  - [Keras EfficientDet](keras_cv_attention_models\u002Fefficientdet) 包含对 [论文 1911.09070《EfficientDet：可扩展且高效的物体检测》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1911.09070.pdf) 的实现。\n  - `Det-AdvProp + AutoAugment` [论文 2103.13886《通过对抗学习实现鲁棒且准确的物体检测》](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2103.13886.pdf)。\n\n  | 模型              | 参数量 | FLOPs   | 输入大小 | COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | ------------------ | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [EfficientDetD0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d0_512_coco.h5)     | 3.9M   | 2.55G   | 512   | 34.3        | 34.6    | 248.009 qps  |\n  | - Det-AdvProp      | 3.9M   | 2.55G   | 512   | 35.1        | 35.3    | 248.009 qps  |\n  | [EfficientDetD1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d1_640_coco.h5)     | 6.6M   | 6.13G   | 640   | 40.2        | 40.5    | 133.139 qps  |\n  | - Det-AdvProp      | 6.6M   | 6.13G   | 640   | 40.8        | 40.9    | 133.139 qps  |\n  | [EfficientDetD2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d2_768_coco.h5)     | 8.1M   | 11.03G  | 768   | 43.5        | 43.9    | 89.0523 qps  |\n  | - Det-AdvProp      | 8.1M   | 11.03G  | 768   | 44.3        | 44.3    | 89.0523 qps  |\n  | [EfficientDetD3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d3_896_coco.h5)     | 12.0M  | 24.95G  | 896   | 46.8        | 47.2    | 50.0498 qps  |\n  | - Det-AdvProp      | 12.0M  | 24.95G  | 896   | 47.7        | 48.0    | 50.0498 qps  |\n  | [EfficientDetD4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d4_1024_coco.h5)     | 20.7M  | 55.29G  | 1024  | 49.3        | 49.7    | 28.0086 qps  |\n  | - Det-AdvProp      | 20.7M  | 55.29G  | 1024  | 50.4        | 50.4    | 28.0086 qps  |\n  | [EfficientDetD5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d5_1280_coco.h5)     | 33.7M  | 135.62G | 1280  | 51.2        | 51.5    |              |\n  | - Det-AdvProp      | 33.7M  | 135.62G | 1280  | 52.2        | 52.5    |              |\n  | [EfficientDetD6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d6_1280_coco.h5)     | 51.9M  | 225.93G | 1280  | 52.1        | 52.6    |              |\n  | [EfficientDetD7](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d7_1536_coco.h5)     | 51.9M  | 325.34G | 1536  | 53.4        | 53.7    |              |\n  | [EfficientDetD7X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_d7x_1536_coco.h5)    | 77.0M  | 410.87G | 1536  | 54.4        | 55.1    |              |\n  | 
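\n  - 下面是使用上方 EfficientDet 表中 COCO 预训练模型做目标检测推理的简单示意。检测模型同样提供 `preprocess_input` 与 `decode_predictions` 等辅助方法，返回边框、类别与置信度（接口细节以实际版本为准）：\n\n  ```python\n  from keras_cv_attention_models import efficientdet, test_images\n\n  # 示意：加载 COCO 预训练的 EfficientDetD0，并对内置测试图片做检测\n  model = efficientdet.EfficientDetD0(pretrained=\"coco\")\n  img = test_images.cat()\n  preds = model(model.preprocess_input(img))\n  # decode_predictions 返回每张图片的 (bboxes, labels, confidences)，此处取第一张\n  bboxes, labels, confidences = model.decode_predictions(preds)[0]\n  print(bboxes[:3], labels[:3], confidences[:3])\n  ```\n  | 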
[EfficientDetLite0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite0_320_coco.h5)  | 3.2M   | 0.98G   | 320   | 27.5        | 26.41   | 599.616 qps  |\n  | [EfficientDetLite1](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite1_384_coco.h5)  | 4.2M   | 1.97G   | 384   | 32.6        | 31.50   | 369.273 qps  |\n  | [EfficientDetLite2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite2_448_coco.h5)  | 5.3M   | 3.38G   | 448   | 36.2        | 35.06   | 278.263 qps  |\n  | [EfficientDetLite3](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite3_512_coco.h5)  | 8.4M   | 7.50G   | 512   | 39.9        | 38.77   | 180.871 qps  |\n  | [EfficientDetLite3X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite3x_640_coco.h5) | 9.3M   | 14.01G  | 640   | 44.0        | 42.64   | 115.271 qps  |\n  | [EfficientDetLite4](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fefficientdet\u002Fefficientdet_lite4_640_coco.h5)  | 15.1M  | 20.20G  | 640   | 44.4        | 43.18   | 95.4122 qps  |\n## YOLO_NAS\n  - [Keras YOLO_NAS](keras_cv_attention_models\u002Fyolov8) 包含对 [Github Deci-AI\u002Fsuper-gradients](https:\u002F\u002Fgithub.com\u002FDeci-AI\u002Fsuper-gradients) 提供的 YOLO-NAS 模型的实现。\n\n  | 模型                   | 参数量 | FLOPs  | 输入大小 | COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | ----------------------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLO_NAS_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_s_before_reparam_coco.h5) | 12.88M | 16.96G | 640   | 47.5        |         | 240.087 qps  |\n  | - [use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_s_coco.h5)  | 12.18M | 15.92G | 640   | 47.5        |         | 345.595 qps  |\n  | [YOLO_NAS_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_m_before_reparam_coco.h5) | 33.86M | 47.12G | 640   | 51.55       |         | 128.96 qps   |\n  | - [use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_m_coco.h5)  | 31.92M | 43.91G | 640   | 51.55       |         | 167.935 qps  |\n  | [YOLO_NAS_L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_l_before_reparam_coco.h5) | 44.53M | 64.53G | 640   | 52.22       |         | 98.6069 qps  |\n  | - [use_reparam_conv=False](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolo_nas_l_coco.h5)  | 42.02M | 59.95G | 640   | 52.22       |         | 131.11 qps   |\n\n## YOLOR\n  - [Keras YOLOR](keras_cv_attention_models\u002Fyolor) 包含对论文 [2105.04206 你只需学习一种表示：用于多任务的统一网络](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2105.04206.pdf) 的实现。\n\n  | 模型      | 参数量 | FLOPs   | 输入大小 | 
COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | ---------- | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [YOLOR_CSP](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_csp_coco.h5)  | 52.9M  | 60.25G  | 640   | 50.0        | 52.8    | 118.746 qps  |\n  | [YOLOR_CSPX](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_csp_x_coco.h5) | 99.8M  | 111.11G | 640   | 51.5        | 54.8    | 67.9444 qps  |\n  | [YOLOR_P6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_p6_coco.h5)   | 37.3M  | 162.87G | 1280  | 52.5        | 55.7    | 49.3128 qps  |\n  | [YOLOR_W6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_w6_coco.h5)   | 79.9M  | 226.67G | 1280  | 53.6 ?      | 56.9    | 40.2355 qps  |\n  | [YOLOR_E6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_e6_coco.h5)   | 115.9M | 341.62G | 1280  | 50.3 ?      | 57.6    | 21.5719 qps  |\n  | [YOLOR_D6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolor\u002Fyolor_d6_coco.h5)   | 151.8M | 467.88G | 1280  | 50.8 ?      | 58.2    | 16.6061 qps  |\n## YOLOV7\n  - [Keras YOLOV7](keras_cv_attention_models\u002Fyolov7) 包含对论文 [2207.02696 YOLOv7：可训练的免费工具包为实时目标检测器树立了新的行业标杆](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2207.02696.pdf) 的实现。\n\n  | 模型       | 参数量 | FLOPs  | 输入大小 | COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | ----------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLOV7_Tiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_tiny_coco.h5) | 6.23M  | 2.90G  | 416   | 33.3        |         | 845.903 qps  |\n  | [YOLOV7_CSP](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_csp_coco.h5)  | 37.67M | 53.0G  | 640   | 51.4        |         | 137.441 qps  |\n  | [YOLOV7_X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_x_coco.h5)    | 71.41M | 95.0G  | 640   | 53.1        |         | 82.0534 qps  |\n  | [YOLOV7_W6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_w6_coco.h5)   | 70.49M | 180.1G | 1280  | 54.9        |         | 49.9841 qps  |\n  | [YOLOV7_E6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_e6_coco.h5)   | 97.33M | 257.6G | 1280  | 56.0        |         | 31.3852 qps  |\n  | [YOLOV7_D6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_d6_coco.h5)   | 133.9M | 351.4G | 1280  | 56.6        |         | 26.1346 qps  |\n  | [YOLOV7_E6E](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov7\u002Fyolov7_e6e_coco.h5)  | 151.9M | 421.7G | 1280  | 56.8        |         | 20.1331 qps  |\n## YOLOV8\n  - [Keras YOLOV8](keras_cv_attention_models\u002Fyolov8) 包含对 [Github 
ultralytics\u002Fultralytics](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fultralytics) 检测和分类模型的实现。\n\n  | 模型     | 参数量 | FLOPs  | 输入大小 | COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | --------- | ------ | ------ | ----- | ----------- | ------- | ------------ |\n  | [YOLOV8_N](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_n_coco.h5)   | 3.16M  | 4.39G  | 640   | 37.3        |         | 614.042 qps  |\n  | [YOLOV8_S](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_s_coco.h5)   | 11.17M | 14.33G | 640   | 44.9        |         | 349.528 qps  |\n  | [YOLOV8_M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_m_coco.h5)   | 25.90M | 39.52G | 640   | 50.2        |         | 160.212 qps  |\n  | [YOLOV8_L](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_l_coco.h5)   | 43.69M | 82.65G | 640   | 52.9        |         | 104.452 qps  |\n  | [YOLOV8_X](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x_coco.h5)   | 68.23M | 129.0G | 640   | 53.9        |         | 66.0428 qps  |\n  | [YOLOV8_X6](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x6_coco.h5) | 97.42M | 522.6G | 1280  | 56.7 ?      |         | 17.4368 qps  |\n## YOLOX\n  - [Keras YOLOX](keras_cv_attention_models\u002Fyolox) 包含对论文 [2107.08430 YOLOX：在2021年超越YOLO系列](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2107.08430.pdf) 的实现。\n\n  | 模型     | 参数量 | FLOPs   | 输入大小 | COCO 验证集 AP | 测试集 AP | T4 推理速度 |\n  | --------- | ------ | ------- | ----- | ----------- | ------- | ------------ |\n  | [YOLOXNano](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_nano_coco.h5) | 0.91M  | 0.53G   | 416   | 25.8        |         | 930.57 qps   |\n  | [YOLOXTiny](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_tiny_coco.h5) | 5.06M  | 3.22G   | 416   | 32.8        |         | 745.2 qps    |\n  | [YOLOXS](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_s_coco.h5)    | 9.0M   | 13.39G  | 640   | 40.5        | 40.5    | 380.38 qps   |\n  | [YOLOXM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_m_coco.h5)    | 25.3M  | 36.84G  | 640   | 46.9        | 47.2    | 181.084 qps  |\n  | [YOLOXL](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_l_coco.h5)    | 54.2M  | 77.76G  | 640   | 49.7        | 50.1    | 111.517 qps  |\n  | [YOLOXX](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolox\u002Fyolox_x_coco.h5)    | 99.1M  | 140.87G | 640   | 51.5        | 51.5    | 62.3189 qps  |\n\n***\n\n# 语言模型\n\n## GPT2\n  - [Keras GPT2](keras_cv_attention_models\u002Fgpt2) 包含对 [Language Models are Unsupervised Multitask Learners](https:\u002F\u002Fd4mucfpksywv.cloudfront.net\u002Fbetter-language-models\u002Flanguage-models.pdf) 的实现。`T4 推理` 使用 `input_shape=[1, 1024]` 
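进行测试。\n\n  - 下面给出一个调用 GPT2 模型的最小示意。这里假定模型输入为形状 `[batch, 序列长度]` 的 token id、输出为各位置的词表 logits（与上述 `input_shape=[1, 1024]` 的测试方式一致）；分词与完整的文本生成流程此处省略，构造参数与默认权重请以实际版本为准：\n\n  ```python\n  import numpy as np\n  from keras_cv_attention_models import gpt2\n\n  # 示意：构建 GPT2_Base（是否默认加载 webtext 权重以实际版本为准）\n  mm = gpt2.GPT2_Base()\n\n  # 假设输入为分词后的 token id，这里用随机 id 代替，仅演示调用方式\n  token_ids = np.random.randint(0, 50257, size=[1, 32])\n  logits = mm(token_ids)                    # 预期输出形状为 [1, 32, 词表大小]\n  next_id = int(np.argmax(logits[0, -1]))   # 贪心取下一个 token；实际生成还需循环采样与反分词\n  print(logits.shape, next_id)\n  ```\n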
进行测试。\n\n  | 模型            | 参数量  | FLOPs   | 词汇表大小 | LAMBADA PPL | T4 推理 |\n  | ---------------- | ------- | ------- | ---------- | ----------- | ------------ |\n  | [GPT2_Base](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_base_webtext.h5)        | 163.04M | 146.42G | 50257      | 35.13       | 51.4483 qps  |\n  | [GPT2_Medium](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_medium_webtext.h5)      | 406.29M | 415.07G | 50257      | 15.60       | 21.756 qps   |\n  | [GPT2_Large](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_large_webtext.h5)       | 838.36M | 890.28G | 50257      | 10.87       |              |\n  | [GPT2_XLarge](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_xlarge_webtext.1.h5), [+.2](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fgpt2\u002Fgpt2_xlarge_webtext.2.h5) | 1.638B  | 1758.3G | 50257      | 8.63        |              |\n## LLaMA2\n  - [Keras LLaMA2](keras_cv_attention_models\u002Fllama2) 包含对 [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2307.09288.pdf) 的实现。\n  - `tiny_stories` 权重移植自 [Github karpathy\u002Fllama2.c](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002Fllama2.c)，而 `LLaMA2_1B` 模型权重则移植自 [Github jzhang38\u002FTinyLlama](https:\u002F\u002Fgithubfast.com\u002Fjzhang38\u002FTinyLlama) 的 `TinyLlama-1.1B-Chat-V0.4` 版本。\n\n  | 模型       | 参数量 | FLOPs  | 词汇表大小 | 验证损失 | T4 推理 |\n  | ----------- | ------ | ------ | ---------- | -------- | ------------ |\n  | [LLaMA2_15M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_15m_tiny_stories.h5)  | 24.41M | 4.06G  | 32000      | 1.072    |  |\n  | [LLaMA2_42M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_42m_tiny_stories.h5)  | 58.17M | 50.7G  | 32000      | 0.847    |  |\n  | [LLaMA2_110M](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_110m_tiny_stories.h5) | 134.1M | 130.2G | 32000      | 0.760    |  |\n  | [LLaMA2_1B](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fllama2\u002Fllama2_1b_tiny_llama_1.1B_chat_v0.4.h5) | 1.10B  | 2.50T  | 32003      |          |\n  | LLaMA2_7B   | 6.74B  | 14.54T | 32000      |          |  |\n***\n\n# Stable Diffusion\n  - [Keras Stable Diffusion](keras_cv_attention_models\u002Fstable_diffusion) 包含对 [PDF 2112.10752 High-Resolution Image Synthesis with Latent Diffusion Models](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2112.10752.pdf) 的实现。权重移植自 [Github runwayml\u002Fstable-diffusion](https:\u002F\u002Fgithub.com\u002Frunwayml\u002Fstable-diffusion) 的 `sd-v1-5.ckpt`。\n\n  | 模型               | 参数量 | FLOPs   | 输入               | 下载            |\n  | ------------------- | ------ | ------- | ------------------- | ------------------- |\n  | ViTTextLargePatch14 | 123.1M | 6.67G   | [None, 77]          | 
[vit_text_large_patch14_clip.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fbeit\u002Fvit_text_large_patch14_clip.h5) |\n  | 编码器             | 34.16M | 559.6G  | [None, 512, 512, 3] | [encoder_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Fencoder_v1_5.h5) |\n  | UNet                | 859.5M | 404.4G  | [None, 64, 64, 4]   | [unet_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Funet_v1_5.h5) |\n  | 解码器             | 49.49M | 1259.5G | [None, 64, 64, 4]   | [decoder_v1_5.h5](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fstable_diffusion\u002Fdecoder_v1_5.h5) |\n***\n\n# 分割模型\n## YOLOV8 分割\n  - [Keras YOLOV8](keras_cv_attention_models\u002Fyolov8) 包含对 [Github ultralytics\u002Fultralytics](https:\u002F\u002Fgithub.com\u002Fultralytics\u002Fultralytics) 分割模型的实现。\n\n  | 模型        | 参数量 | FLOPs   | 输入 | COCO val mask AP | T4 推理 |\n  | ------------ | ------ | ------- | ----- | ---------------- | ------------ |\n  | [YOLOV8_N_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_n_seg_coco.h5) | 3.41M  | 6.02G   | 640   | 30.5             |  |\n  | [YOLOV8_S_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_s_seg_coco.h5) | 11.82M | 20.08G  | 640   | 36.8             |  |\n  | [YOLOV8_M_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_m_seg_coco.h5) | 27.29M | 52.33G  | 640   | 40.8             |  |\n  | [YOLOV8_L_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_l_seg_coco.h5) | 46.00M | 105.29G | 640   | 42.6             |  |\n  | [YOLOV8_X_SEG](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fyolov8\u002Fyolov8_x_seg_coco.h5) | 71.83M | 164.30G | 640   | 43.4             |  |\n## Segment Anything\n  - [Keras Segment Anything](keras_cv_attention_models\u002Fsegment_anything) 包含对 [PDF 2304.02643 Segment Anything](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.02643) 的实现。\n\n  | 模型               | 参数量 | FLOPs | 输入 | COCO val mask AP | T4 推理 |\n  | ------------------- | ------ | ----- | ----- | ---------------- | ------------ |\n  | [MobileSAM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Fmobile_sam_5m_image_encoder_1024_sam.h5)           | 5.74M  | 39.4G | 1024  | 41.0             |   |\n  | [TinySAM](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Ftinysam_5m_image_encoder_1024_sam.h5)           | 5.74M  | 39.4G | 1024  | 41.9             |   |\n  | [EfficientViT_SAM_L0](https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Freleases\u002Fdownload\u002Fsegment_anything\u002Fefficientvit_sam_l0_image_encoder_1024_sam.h5) | 30.73M | 35.4G | 512   | 45.7             |   |\n***\n\n# 许可证\n  - 本部分内容根据 [Github rwightman\u002Fpytorch-image-models](https:\u002F\u002Fgithub.com\u002Frwightman\u002Fpytorch-image-models) 
复制并修改而来。\n  - **代码**。此处的代码采用 MIT 许可证。您有责任确保遵守此处的许可证以及任何依赖许可证的条款。在适用的情况下，我在文档字符串中链接了各个组件的来源或参考文献。如果您认为我遗漏了某些内容，请提交一个问题。迄今为止，此处提供的所有预训练权重均基于 ImageNet 和 COCO 数据集进行预训练，其中仅有少数几个模型还进行了额外的预训练。\n  - **ImageNet 预训练权重**。ImageNet 数据集仅用于非商业性的研究目的（https:\u002F\u002Fimage-net.org\u002Fdownload）。这对其所生成的预训练权重的使用有何具体影响尚不明确。我使用 ImageNet 数据集训练的所有模型均出于研究目的，因此应假定原始数据集的许可证同样适用于这些权重。如果您打算将这些预训练权重用于商业产品，最好咨询法律意见。\n  - **COCO 预训练权重**。应遵循 [cocodataset 使用条款](https:\u002F\u002Fcocodataset.org\u002F#termsofuse)。COCO 数据集中的标注属于 COCO 联盟，并根据 [知识共享署名 4.0 许可证](https:\u002F\u002Fcreativecommons.org\u002Flicenses\u002Fby\u002F4.0\u002Flegalcode)授权。COCO 联盟并不拥有图像的版权。图像的使用必须遵守 [Flickr 使用条款](https:\u002F\u002Fwww.flickr.com\u002Fcreativecommons\u002F)。图像的使用者应对数据集的使用承担全部责任，包括但不限于使用其从该数据集中创建的任何受版权保护的图像副本。\n  - **基于除 ImageNet 和 COCO 之外的数据集进行预训练**。此处包含或引用的若干权重是使用我无法访问的专有数据集进行预训练的，其中包括 Facebook 的 WSL、SSL、SWSL ResNe(Xt) 以及 Google 的 Noisy Student EfficientNet 模型。Facebook 的相关模型具有明确的非商业性许可（CC-BY-NC 4.0，https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fsemi-supervised-ImageNet1K-models、https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002FWSL-Images）。而 Google 的模型似乎除了 Apache 2.0 许可证（以及对 ImageNet 的顾虑）之外并无其他限制。无论哪种情况，如有任何疑问，您都应联系 Facebook 或 Google。\n***\n\n# 引用\n  - **BibTeX**（引用条目中的标题与仓库类型保持英文原文，以便直接复制使用）\n    ```bibtex\n    @misc{leondgarse,\n      author = {Leondgarse},\n      title = {Keras CV Attention Models},\n      year = {2022},\n      publisher = {GitHub},\n      journal = {GitHub repository},\n      doi = {10.5281\u002Fzenodo.6506947},\n      howpublished = {\\url{https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models}}\n    }\n    ```\n  - **最新 DOI**：[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F391777965.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F391777965)\n***","# Keras CV Attention Models 快速上手指南\n\n`keras_cv_attention_models` (简称 `kecam`) 是一个集成了大量最新计算机视觉模型（如 ConvNeXt, Swin Transformer, YOLOv8, MobileViT 等）的 Keras 工具库，支持图像分类、目标检测、分割等任务。\n\n## 1. 环境准备\n\n### 系统要求\n- **Python**: 3.7+\n- **后端框架**:\n  - **TensorFlow**: 推荐 `2.15` 及以下版本。\n  - **注意**: 目前**不兼容** `keras 3.x`。如果你使用 `tensorflow>=2.16.0`，必须手动安装 legacy keras：\n    ```bash\n    pip install tf-keras~=$(pip show tensorflow | awk -F ': ' '\u002FVersion\u002F{print $2}')\n    ```\n    并在导入包之前设置环境变量或优先导入本库：\n    ```bash\n    export TF_USE_LEGACY_KERAS=1\n    ```\n  - **PyTorch**: 可选后端，需额外安装 `torch` 和 `thop` (用于计算 FLOPs)。\n\n### 前置依赖\n确保已安装以下基础科学计算库（若未安装）：\n```bash\npip install numpy pandas matplotlib\n```\n\n## 2. 安装步骤\n\n推荐使用国内镜像源加速安装。\n\n### 方式一：通过 PyPI 安装（推荐）\n```bash\n# 使用阿里云镜像源\npip install -U kecam -i https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F\n\n# 或者使用完整包名\npip install -U keras-cv-attention-models -i https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F\n```\n\n### 方式二：从 GitHub 源码安装（获取最新功能）\n```bash\npip install -U git+https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models -i https:\u002F\u002Fmirrors.aliyun.com\u002Fpypi\u002Fsimple\u002F\n```\n\n> **提示**: 安装包 `kecam` 不强制绑定后端，请确保在安装前已自行安装好 TensorFlow 或 PyTorch。\n\n## 3. 基本使用\n\n### 3.1 图像分类（最简示例）\n\n加载预训练模型并进行预测。库内置了 `preprocess_input` 和 `decode_predictions` 方便使用。\n\n```python\nimport tensorflow as tf\nfrom keras_cv_attention_models import coatnet\nfrom keras_cv_attention_models.test_images import cat\n\n# 1. 加载预训练模型 (自动下载权重)\n# 支持众多模型，如: convnext, swin_transformer_v2, mobilevit 等\nmm = coatnet.CoAtNet0(pretrained=\"imagenet\")\n\n# 2. 准备图片并预处理\nimg = cat() # 加载测试图片\ninputs = mm.preprocess_input(img)\n\n# 3. 推理预测\npreds = mm(tf.expand_dims(inputs, 0))\n\n# 4. 
解码结果\nresults = mm.decode_predictions(preds)\nprint(results[0]) \n# 输出示例: [('n02124075', 'Egyptian_cat', 0.9999875), ...]\n```\n\n### 3.2 自定义输出类别数\n\n如果需要迁移学习，可以指定 `num_classes`。若类别数与预训练权重（通常为 1000）不一致，库会自动跳过顶层全连接层的权重加载。\n\n```python\nfrom keras_cv_attention_models import swin_transformer_v2\n\n# 设置为 64 类，顶层权重将不会加载，需重新训练该层\nmm = swin_transformer_v2.SwinTransformerV2Tiny_window8(num_classes=64)\nprint(mm.summary())\n```\n\n### 3.3 提取特征（去除顶层）\n\n设置 `num_classes=0` 可移除全局平均池化和全连接层，直接输出特征图。\n\n```python\nfrom keras_cv_attention_models import resnest\n\n# 输出形状将为 (None, H, W, Channels)，无分类头\nmm = resnest.ResNest50(num_classes=0)\nprint(mm.output_shape) \n# 示例输出: (None, 7, 7, 2048)\n```\n\n### 3.4 使用别名简化导入\n\n可以使用 `kecam` 作为简短别名：\n\n```python\nimport kecam\n\n# 加载目标检测模型示例\nmodel = kecam.yolor.YOLOR_CSP(pretrained=\"coco\")\nimgs = kecam.test_images.dog_cat()\npreds = model(model.preprocess_input(imgs))\n\n# 解码检测结果 (bbox, labels, confidences)\nbboxes, labels, scores = model.decode_predictions(preds)[0]\n```\n\n### 3.5 切换 PyTorch 后端 (可选)\n\n如果希望使用 PyTorch 作为后端运行模型：\n\n```python\nimport os\nos.environ['KECAM_BACKEND'] = 'torch'\n\nfrom keras_cv_attention_models import caformer\n\n# 此时将使用 PyTorch 加载模型\nmm = caformer.CAFormerS18(pretrained=\"imagenet\")\nprint(f\"Backend is using: {type(mm)}\")\n```","某工业质检团队需要在边缘设备（如 NVIDIA Jetson）上部署高精度缺陷检测模型，以实时识别生产线上的微小瑕疵。\n\n### 没有 keras_cv_attention_models 时\n- **模型选择受限**：开发者只能依赖官方 TensorFlow 库中有限的预训练模型（如 ResNet 或基础 YOLO），难以尝试最新的 EfficientViT、FastViT 等专为边缘计算优化的架构。\n- **复现成本高昂**：若想使用论文中提出的先进模型（如 CoAtNet 或 Hiera），需手动从零编写复杂的注意力机制代码，耗时数周且极易引入 Bug。\n- **部署流程繁琐**：将自定义模型转换为 TFLite 格式常在算子支持上报错，缺乏统一的转换脚本，导致模型无法在移动端流畅运行。\n- **权重加载困难**：不同来源的预训练权重格式混乱，缺乏标准化的加载接口，经常因维度不匹配导致训练失败。\n\n### 使用 keras_cv_attention_models 后\n- **架构即插即用**：通过 `kecam.models.FastViT()` 等一行代码即可调用数十种前沿模型，直接获得针对边缘设备优化的网络结构。\n- **零样本快速验证**：内置了完整的模型构建与权重自动下载功能，无需手动处理权重映射，几分钟内即可完成新架构的性能基准测试。\n- **一键端侧部署**：工具原生支持 TFLite 转换流程，自动处理算子兼容性问题，让复杂注意力模型能顺利部署到产线工控机上。\n- **统一开发体验**：提供标准化的训练与评估脚本（如 COCO\u002FImagenet），屏蔽了不同模型间的实现差异，让团队能专注于数据优化而非代码调试。\n\nkeras_cv_attention_models 将原本需要数周的研究与工程落地工作压缩至小时级，让中小团队也能轻松驾驭最前沿的视觉算法。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fleondgarse_keras_cv_attention_models_ffe50735.png","leondgarse",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fleondgarse_1948d3be.jpg","leondgarse@gmail.com","https:\u002F\u002Fgithub.com\u002Fleondgarse",[79],{"name":80,"color":81,"percentage":82},"Python","#3572A5",100,625,97,"2026-04-03T05:57:35","MIT","未说明","可选。推理测试基于 NVIDIA Tesla T4 (CUDA 12.0.1)；支持 CPU 运行；若使用 PyTorch 后端需相应 GPU 环境",{"notes":90,"python":87,"dependencies":91},"重要警告：当前不兼容 Keras 3.x。若使用 TensorFlow>=2.16.0，必须手动安装 tf-keras 并设置环境变量 export TF_USE_LEGACY_KERAS=1，或在导入 Tensorflow 前先导入本包。不建议直接加载 .h5 模型文件，应通过代码构建模型后加载权重。支持切换 PyTorch 或 Keras Core 作为后端。",[92,93,94,95,96,97,98,99],"tensorflow>=2.16.0 (需注意 Keras 兼容性)","tf-keras (当 TF>=2.16.0 时必需)","torch (可选后端)","thop (PyTorch 后端计算 FLOPs 时必需)","tf2onnx, onnx, onnxsim, onnxruntime (导出 ONNX 时必需)","matplotlib","pandas","numpy",[35,15,52,101,14],"音频",[103,104,105,106,107,108,109,110,111,112,113,114,115,116,117],"tensorflow","visualizing","keras","attention","model","imagenet","coco","recognition","detection","tf","tf2","clip","stable-diffusion","segment-anything","ddpm","2026-03-27T02:49:30.150509","2026-04-07T22:59:44.533427",[121,126,131,136,141,146],{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},22870,"MaxViT、HorNet、SwinTransformerV2 和 BeiT 等模型运行时出现问题或结果不佳，如何解决？","这通常是因为预训练权重对输入值的范围非常敏感。建议尝试不同的预处理模式：\n1. 
使用 \"tf\" 模式（范围约 -1 到 1）：\n   ```python\n   from keras.preprocessing.image import ImageDataGenerator\n   aa = ImageDataGenerator(preprocessing_function=lambda img: (img - 128) \u002F 128)\n   ```\n2. 使用 \"torch\" 模式（标准化）：\n   ```python\n   import tensorflow as tf\n   mean, std = tf.constant([0.485, 0.456, 0.406]) * 255.0, tf.constant([0.229, 0.224, 0.225]) * 255.0\n   aa = ImageDataGenerator(preprocessing_function=lambda img: (img - mean) \u002F std)\n   ```\n请根据模型要求选择合适的预处理方式。","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F110",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},22871,"在 TPU 上运行 VOLO 模型时遇到 'ExtractImagePatches' 编译错误怎么办？","这是因为 TPU 的 XLA 编译器不支持 `ExtractImagePatches` 操作。解决方法是启用软放置（soft placement），让不支持的操作在 CPU 上运行：\n```python\nimport tensorflow as tf\ntf.config.set_soft_device_placement(True)\n```\n注意：这可能会带来一定的性能损失。维护者已将此问题列入路线图，未来可能会优化对 TPU 的支持。","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F8",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},22872,"如何将 EfficientFormer 或 MobileViT 模型成功转换为 TFLite 格式？","直接转换可能会因为某些算子（如 Group Convolution, GELU, Extract Patches）不被支持而失败。在转换前需要使用 `model_surgery` 模块对模型进行手术式修改：\n```python\nfrom keras_cv_attention_models import efficientformer, model_surgery\n\n# 加载模型\nbb = efficientformer.EfficientFormerL1()\n\n# 1. 将使用 groups 的 Conv2D 转换为 SplitConv2D\nmm = model_surgery.convert_groups_conv2d_2_split_conv2d(bb)\n\n# 2. 将 GELU 激活函数转换为近似版本\nmm = model_surgery.convert_gelu_to_approximate(mm)\n\n# 3. 将 extract_patches 操作转换为卷积操作\nmm = model_surgery.convert_extract_patches_to_conv(mm)\n\n# 验证修改前后输出是否一致\ntest_inputs = np.random.uniform(size=[1, *mm.input_shape[1:]])\nprint(np.allclose(mm(test_inputs), bb(test_inputs))) # 应输出 True\n\n# 执行转换\nconverter = tf.lite.TFLiteConverter.from_keras_model(mm)\ntflite_model = converter.convert()\n```","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F137",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},22873,"如何不使用自带的训练脚本，直接通过 `model.fit()` 训练自定义模型（如结合 Hornet 骨干网络）？","可以直接使用 Keras 的标准流程。如果是加载已有模型并进行微调或修改结构，需要注意自定义层的注册和锚点（Anchor）初始化问题：\n1. 确保导入库以注册自定义层：`import keras_cv_attention_models`。\n2. 加载模型时设置 `compile=False`：\n   ```python\n   from tensorflow import keras\n   model = keras.models.load_model(\"xxxx.h5\", compile=False)\n   # 如果需要构建子模型，可以重新定义输入输出\n   model = keras.models.Model(model.outputs[0], model.outputs[-1])\n   ```\n3. 对于 Anchor-Free 模式，锚点初始化通常只需要模型的 `input_shape`。只要提供正确的输入和输出形状，直接使用 `model.fit()` 即可。如果加载后跳过了一些层，请检查日志中不匹配的原因。","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F134",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},22874,"导入特定模型（如 MaxViT）时出现 'ModuleNotFoundError' 错误怎么办？","这通常是由于安装方式或依赖项问题导致的。建议：\n1. 确保安装了最新的包：`pip install kecam`（注意包名可能是 kecam 而不是完整的仓库名）。\n2. 新版本（如 1.3.11+）已支持 PyTorch 后端并减少了对 TensorFlow 的强制依赖。如果你在使用 macOS 或其他特殊环境，可能需要调整 `setup.py` 中的依赖配置，或者暂时注释掉特定的 TensorFlow 版本限制（如 `tensorflow-macos`）。\n3. 
确认导入路径是否正确，部分模型可能位于子模块中，需确保包结构完整。","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F112",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},22875,"如何使用 Swin Transformer 作为 YOLO-R 的骨干网络进行推理？","可以将 Swin Transformer 实例化后作为 `backbone` 参数传递给 YOLO-R 模型。示例代码如下：\n```python\nfrom keras_cv_attention_models import yolor\nfrom keras_cv_attention_models import swin_transformer_v2\n\n# 初始化 Swin Transformer V2 Small 作为骨干网络\nbb = swin_transformer_v2.SwinTransformerV2Small_window16(input_shape=(256, 256, 3), num_classes=1000)\n\n# 构建 YOLO-R 模型\nmodel = yolor.YOLOR(backbone=bb)\n\n# 进行测试推理\nfrom keras_cv_attention_models import test_images\nimm = test_images.dog_cat()\npreds = model.predict(imm)\n```\n注意确保输入图像尺寸与骨干网络定义的 `input_shape` 一致。","https:\u002F\u002Fgithub.com\u002Fleondgarse\u002Fkeras_cv_attention_models\u002Fissues\u002F71",[152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228],{"id":153,"version":154,"summary_zh":74,"released_at":155},136621,"cspnext","2024-01-13T06:55:40",{"id":157,"version":158,"summary_zh":74,"released_at":159},136622,"segment_anything","2023-11-28T10:25:12",{"id":161,"version":162,"summary_zh":74,"released_at":163},136623,"stable_diffusion","2023-09-26T14:45:27",{"id":165,"version":166,"summary_zh":74,"released_at":167},136624,"fastvit","2023-08-19T11:31:12",{"id":169,"version":170,"summary_zh":74,"released_at":171},136625,"llama2","2023-08-05T05:51:14",{"id":173,"version":174,"summary_zh":74,"released_at":175},136626,"repvit","2023-07-31T02:40:58",{"id":177,"version":178,"summary_zh":74,"released_at":179},136627,"fastervit","2023-06-19T13:37:56",{"id":181,"version":182,"summary_zh":74,"released_at":183},136628,"hiera","2023-06-11T09:16:36",{"id":185,"version":186,"summary_zh":74,"released_at":187},136629,"assets","2023-06-02T06:55:26",{"id":189,"version":190,"summary_zh":74,"released_at":191},136630,"gpt2","2023-06-02T08:49:54",{"id":193,"version":194,"summary_zh":74,"released_at":195},136631,"vanillanet","2023-05-25T06:36:49",{"id":197,"version":198,"summary_zh":74,"released_at":199},136632,"efficientvit","2023-05-23T12:11:31",{"id":201,"version":202,"summary_zh":74,"released_at":203},136633,"yolov8","2023-04-05T12:51:17",{"id":205,"version":206,"summary_zh":74,"released_at":207},136634,"inceptionnext","2023-04-05T07:58:05",{"id":209,"version":210,"summary_zh":74,"released_at":211},136635,"fasternet","2023-03-23T13:02:29",{"id":213,"version":214,"summary_zh":74,"released_at":215},136636,"moganet","2023-01-18T13:09:31",{"id":217,"version":218,"summary_zh":74,"released_at":219},136637,"tinyvit","2023-01-17T13:07:16",{"id":221,"version":222,"summary_zh":74,"released_at":223},136638,"caformer","2023-01-17T01:16:48",{"id":225,"version":226,"summary_zh":74,"released_at":227},136639,"gpvit","2023-01-15T11:52:43",{"id":229,"version":230,"summary_zh":74,"released_at":231},136640,"iformer","2023-01-14T12:51:02"]