[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-mlfoundations--open_clip":3,"tool-mlfoundations--open_clip":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",140436,2,"2026-04-05T23:32:43",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":79,"owner_email":79,"owner_twitter":79,"owner_website":80,"owner_url":81,"languages":82,"stars":95,"forks":96,"last_commit_at":97,"license":98,"difficulty_score":10,"env_os":99,"env_gpu":100,"env_ram":99,"env_deps":101,"category_tags":109,"github_topics":110,"view_count":119,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":120,"updated_at":121,"faqs":122,"releases":151},489,"mlfoundations\u002Fopen_clip","open_clip","An open source implementation of CLIP.","open_clip 是一个开源实现的 CLIP（对比语言-图像预训练）模型库，旨在帮助用户通过统一的文本-图像对比学习框架，实现跨模态的内容理解和生成。它基于 OpenAI 提出的 CLIP 理论，提供了多种预训练模型和训练代码，支持从数百万到数十亿级规模的数据集进行模型训练与优化。\n\n这一工具解决了传统图像识别和文本匹配任务中依赖大量标注数据的痛点，通过零样本迁移能力，使模型能直接理解未见过的类别。例如，用户无需手动标注\"粉色独角兽\"图片，模型即可通过文本描述完成识别。其核心价值在于降低了多模态 AI 的研发门槛，为研究人员和开发者提供可复现、可扩展的实验基础。\n\n主要面向 AI 研究者、算法工程师和计算机视觉开发者，尤其适合需要处理图像检索、跨模态生成、视觉语义分析等场景的技术团队。技术亮点包括：提供覆盖不同精度和性能需求的 10+ 预训练模型（最高 ImageNet 零样本准确率达 85.4%），支持 LAION、DataComp 等主流数据集的灵活训练配置，并通过 PyTorch 实现了模块化代码架构。用户可通过 pip 安装，结合 Colab 示例快速上手，所有模型参数和训练","open_clip 是一个开源实现的 CLIP（对比语言-图像预训练）模型库，旨在帮助用户通过统一的文本-图像对比学习框架，实现跨模态的内容理解和生成。它基于 OpenAI 提出的 CLIP 理论，提供了多种预训练模型和训练代码，支持从数百万到数十亿级规模的数据集进行模型训练与优化。\n\n这一工具解决了传统图像识别和文本匹配任务中依赖大量标注数据的痛点，通过零样本迁移能力，使模型能直接理解未见过的类别。例如，用户无需手动标注\"粉色独角兽\"图片，模型即可通过文本描述完成识别。其核心价值在于降低了多模态 AI 的研发门槛，为研究人员和开发者提供可复现、可扩展的实验基础。\n\n主要面向 AI 研究者、算法工程师和计算机视觉开发者，尤其适合需要处理图像检索、跨模态生成、视觉语义分析等场景的技术团队。技术亮点包括：提供覆盖不同精度和性能需求的 10+ 预训练模型（最高 ImageNet 零样本准确率达 85.4%），支持 LAION、DataComp 等主流数据集的灵活训练配置，并通过 PyTorch 实现了模块化代码架构。用户可通过 pip 安装，结合 Colab 示例快速上手，所有模型参数和训练细节均在 Hugging Face 平台公开。","# OpenCLIP\n\n[[Paper]](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07143) [[Citations]](#citing) [[Clip Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_clip.ipynb) [[Coca Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_coca.ipynb)\n[![pypi](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fopen_clip_torch.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fopen_clip_torch)\n\nWelcome to an open source 
implementation of OpenAI's [CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) (Contrastive Language-Image Pre-training).\n\nUsing this codebase, we have trained several models on a variety of data sources and compute budgets, ranging from [small-scale experiments](docs\u002FLOW_ACC.md) to larger runs including models trained on datasets such as [LAION-400M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02114), [LAION-2B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08402) and [DataComp-1B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14108).\nMany of our models and their scaling properties are studied in detail in the paper [reproducible scaling laws for contrastive language-image learning](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07143).\nSome of the best models we've trained and their zero-shot ImageNet-1k accuracy are shown below, along with the ViT-L model trained by OpenAI and other state-of-the-art open source alternatives (all can be loaded via OpenCLIP).\nWe provide more details about our full collection of pretrained models [here](docs\u002FPRETRAINED.md), and zero-shot results for 38 datasets [here](docs\u002Fopenclip_results.csv).\n\n\n\n| Model    | Training data | Resolution | # of samples seen | ImageNet zero-shot acc. | \n| -------- | ------- |  ------- |  ------- |  ------- |  \n| ConvNext-Base | LAION-2B  | 256px | 13B | 71.5% |\n| ConvNext-Large | LAION-2B  | 320px | 29B | 76.9% |\n| ConvNext-XXLarge | LAION-2B | 256px | 34B | 79.5% |\n| ViT-B-32-256  | DataComp-1B  | 256px | 34B | 72.8% |\n| ViT-B-16  | DataComp-1B  | 224px | 13B | 73.5% |\n| ViT-L-14  | LAION-2B  | 224px | 32B | 75.3% |\n| ViT-H-14  | LAION-2B  | 224px | 32B | 78.0% |\n| ViT-L-14  | DataComp-1B  | 224px | 13B | 79.2% |\n| ViT-bigG-14  | LAION-2B  | 224px | 34B | 80.1% |\n|  |  |   |   |  |\n| ViT-L-14-quickgelu [(Original CLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) | WIT | 224px | 13B | 75.5% | \n| ViT-SO400M-14-SigLIP [(SigLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15343) | WebLI | 224px | 45B | 82.0% | \n| ViT-L-14 [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-2B | 224px | 39B | 82.2% | \n| ViT-L-16-256 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) |  WebLI (multi-lang) | 256px | 40B | 82.5% |\n| ViT-SO400M-14-SigLIP-384 [(SigLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15343) |  WebLI | 384px | 45B | 83.1% |\n| ViT-H-14-quickgelu [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-5B | 224px | 39B | 83.4% | \n| PE-Core-L-14-336 [(PE)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.13181) | MetaCLIP-5.4B | 336px | 58B | 83.5% |\n| ViT-SO400M-16-SigLIP2-384 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) |  WebLI (multi-lang) | 384px | 40B | 84.1% |\n| ViT-H-14-378-quickgelu [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-5B | 378px | 44B | 84.4% |\n| ViT-gopt-16-SigLIP2-384 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) | WebLI (multi-lang) | 384px | 40B | 85.0% |\n| PE-Core-bigG-14-448 [(PE)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.13181) | MetaCLIP-5.4B | 448px | 86B | 85.4% |\n\nModel cards with additional model specific details can be found on the Hugging Face Hub under the OpenCLIP library tag: https:\u002F\u002Fhuggingface.co\u002Fmodels?library=open_clip. 
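\n\nThe zero-shot numbers above come from prompting the text encoder with class names and assigning each image to the class whose prompt embedding it matches best. Below is a minimal sketch of that recipe, reusing the `ViT-B-32` model and `laion2b_s34b_b79k` weights from the Usage example further down; the three class names, the single prompt template, and the `cat.jpg` input are illustrative assumptions only (the reported ImageNet-1k protocol uses 1000 classes and an ensemble of prompt templates).\n\n```python\nimport torch\nimport torch.nn.functional as F\nfrom PIL import Image\nimport open_clip\n\n# Same model and pretrained tag as the Usage example below.\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')\nmodel.eval()\ntokenizer = open_clip.get_tokenizer('ViT-B-32')\n\n# Illustrative label set and prompt template; the reported ImageNet results use 1000 classes and multiple templates.\nclass_names = ['dog', 'cat', 'goldfish']\ntext = tokenizer([f'a photo of a {c}' for c in class_names])\nimage = preprocess(Image.open('cat.jpg')).unsqueeze(0)  # any local image\n\nwith torch.no_grad():\n    image_features = F.normalize(model.encode_image(image), dim=-1)\n    text_features = F.normalize(model.encode_text(text), dim=-1)\n    # Cosine similarity against each class prompt; the argmax is the zero-shot prediction.\n    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n\nprint('Predicted class:', class_names[probs.argmax(dim=-1).item()])\n```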
\n\nIf you found this repository useful, please consider [citing](#citing).\nWe welcome anyone to submit an issue or send an email if you have any other requests or suggestions.\n\nNote that portions of `src\u002Fopen_clip\u002F` modelling and tokenizer code are adaptations of OpenAI's official [repository](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP).\n\n## Approach\n\n| ![CLIP](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmlfoundations_open_clip_readme_3958bc6e52d9.png) |\n|:--:|\n| Image Credit: https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP |\n\n## Usage\n\n```\npip install open_clip_torch\n```\n\n```python\nimport torch\nfrom PIL import Image\nimport open_clip\n\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')\nmodel.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active\ntokenizer = open_clip.get_tokenizer('ViT-B-32')\n\nimage = preprocess(Image.open(\"docs\u002FCLIP.png\")).unsqueeze(0)\ntext = tokenizer([\"a diagram\", \"a dog\", \"a cat\"])\n\nwith torch.no_grad(), torch.autocast(\"cuda\"):\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text)\n    image_features \u002F= image_features.norm(dim=-1, keepdim=True)\n    text_features \u002F= text_features.norm(dim=-1, keepdim=True)\n\n    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n\nprint(\"Label probs:\", text_probs)  # prints: [[1., 0., 0.]]\n```\n\nIf model uses `timm` image encoders (convnext, siglip, eva, etc) ensure the latest timm is installed. Upgrade `timm` if you see 'Unknown model' errors for the image encoder.\n\nIf model uses transformers tokenizers, ensure `transformers` is installed.\n\nSee also this [[Clip Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_clip.ipynb).\n\nTo compute billions of embeddings efficiently, you can use [clip-retrieval](https:\u002F\u002Fgithub.com\u002From1504\u002Fclip-retrieval) which has openclip support.\n\n### Pretrained models\n\nWe offer a simple model interface to instantiate both pre-trained and untrained models.\nTo see which pretrained models are available, use the following code snippet.\nMore details about our pretrained models are available [here](docs\u002FPRETRAINED.md).\n\n```python\n>>> import open_clip\n>>> open_clip.list_pretrained()\n```\n\nYou can find more about the models we support (e.g. number of parameters, FLOPs) in [this table](docs\u002Fmodel_profile.csv).\n\nNOTE: Many existing checkpoints use the QuickGELU activation from the original OpenAI models. This activation is actually less efficient than native torch.nn.GELU in recent versions of PyTorch. The model defaults are now nn.GELU, so one should use model definitions with `-quickgelu` postfix for the OpenCLIP pretrained weights. All OpenAI pretrained weights will always default to QuickGELU. One can also use the non `-quickgelu` model definitions with pretrained weights using QuickGELU but there will be an accuracy drop, for fine-tune that will likely vanish for longer runs.\nFuture trained models will use nn.GELU.\n\n### Loading models\n\nModels can be loaded with `open_clip.create_model_and_transforms`, as shown in the example below. The model name and corresponding `pretrained` keys are compatible with the outputs of `open_clip.list_pretrained()`. 
\n\nThe `pretrained` argument also accepts local paths, for example `\u002Fpath\u002Fto\u002Fmy\u002Fb32.pt`.\nYou can also load checkpoints from huggingface this way. To do so, download the `open_clip_pytorch_model.bin` file (for example, [https:\u002F\u002Fhuggingface.co\u002Flaion\u002FCLIP-ViT-L-14-DataComp.XL-s13B-b90K\u002Ftree\u002Fmain](https:\u002F\u002Fhuggingface.co\u002Flaion\u002FCLIP-ViT-L-14-DataComp.XL-s13B-b90K\u002Fblob\u002Fmain\u002Fopen_clip_pytorch_model.bin)), and use `pretrained=\u002Fpath\u002Fto\u002Fopen_clip_pytorch_model.bin`.\n\n```python\n# pretrained also accepts local paths\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k') \n```\n\n## Fine-tuning on classification tasks\n\nThis repository is focused on training CLIP models. To fine-tune a *trained* zero-shot model on a downstream classification task such as ImageNet, please see [our other repository: WiSE-FT](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fwise-ft). The [WiSE-FT repository](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fwise-ft) contains code for our paper on [Robust Fine-tuning of Zero-shot Models](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01903), in which we introduce a technique for fine-tuning zero-shot models while preserving robustness under distribution shift.\n\n## Data\n\nTo download datasets as webdataset, we recommend [img2dataset](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset).\n\n### Conceptual Captions\n\nSee [cc3m img2dataset example](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fblob\u002Fmain\u002Fdataset_examples\u002Fcc3m.md).\n\n### YFCC and other datasets\n\nIn addition to specifying the training data via CSV files as mentioned above, our codebase also supports [webdataset](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset), which is recommended for larger scale datasets. The expected format is a series of `.tar` files. Each of these `.tar` files should contain two files for each training example, one for the image and one for the corresponding text. Both files should have the same name but different extensions. For instance, `shard_001.tar` could contain files such as `abc.jpg` and `abc.txt`. You can learn more about `webdataset` at [https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset). 
We use `.tar` files with 1,000 data points each, which we create using [tarp](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Ftarp).\n\nYou can download the YFCC dataset from [Multimedia Commons](http:\u002F\u002Fmmcommons.org\u002F).\nSimilar to OpenAI, we used a subset of YFCC to reach the aforementioned accuracy numbers.\nThe indices of images in this subset are in [OpenAI's CLIP repository](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP\u002Fblob\u002Fmain\u002Fdata\u002Fyfcc100m.md).\n\n\n## Training CLIP\n\n### Install\n\nWe advise you first create a virtual environment with:\n\n```\npython3 -m venv .env\nsource .env\u002Fbin\u002Factivate\npip install -U pip\n```\n\nYou can then install openclip for training with `pip install 'open_clip_torch[training]'`.\n\n#### Development\n\nIf you want to make changes to contribute code, you can clone openclip then run `make install` in openclip folder (after creating a virtualenv)\n\nInstall pip PyTorch as per https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F\n\nYou may run `make install-training` to install training deps\n\n#### Testing\n\nTest can be run with `make install-test` then `make test`\n\n`python -m pytest -x -s -v tests -k \"training\"` to run a specific test\n\nRunning regression tests against a specific git revision or tag:\n1. Generate testing data\n    ```sh\n    python tests\u002Futil_test.py --model RN50 RN101 --save_model_list models.txt --git_revision 9d31b2ec4df6d8228f370ff20c8267ec6ba39383\n    ```\n    **_WARNING_: This will invoke git and modify your working tree, but will reset it to the current state after data has been generated! \\\n    Don't modify your working tree while test data is being generated this way.**\n\n2. Run regression tests\n    ```sh\n    OPEN_CLIP_TEST_REG_MODELS=models.txt python -m pytest -x -s -v -m regression_test\n    ```\n\n### Sample single-process running code:\n\n```bash\npython -m open_clip_train.main \\\n    --save-frequency 1 \\\n    --zeroshot-frequency 1 \\\n    --report-to tensorboard \\\n    --train-data=\"\u002Fpath\u002Fto\u002Ftrain_data.csv\"  \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --csv-img-key filepath \\\n    --csv-caption-key title \\\n    --imagenet-val=\u002Fpath\u002Fto\u002Fimagenet\u002Froot\u002Fval\u002F \\\n    --warmup 10000 \\\n    --batch-size=128 \\\n    --lr=1e-3 \\\n    --wd=0.1 \\\n    --epochs=30 \\\n    --workers=8 \\\n    --model RN50\n```\n\nNote: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set!\nYou can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https:\u002F\u002Fraw.githubusercontent.com\u002Fsoumith\u002Fimagenetloader.torch\u002Fmaster\u002Fvalprep.sh).\n\n### Multi-GPU and Beyond\n\nThis code has been battle tested up to 1024 A100s and offers a variety of solutions\nfor distributed training. We include native support for SLURM clusters.\n\nAs the number of devices used to train increases, so does the space complexity of\nthe the logit matrix. Using a naïve all-gather scheme, space complexity will be\n`O(n^2)`. Instead, complexity may become effectively linear if the flags\n`--gather-with-grad` and `--local-loss` are used. 
This alteration produces numerical results that match the naïve method one-to-one.\n\n#### Epochs\n\nFor larger datasets (e.g. LAION-2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078`, which corresponds to 1\u002F16 of an epoch, in conjunction with `--dataset-resampled` to do sampling with replacement. This allows having frequent checkpoints to evaluate more often.\n\n#### Patch Dropout\n\n\u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.00794\">Recent research\u003C\u002Fa> has shown that one can drop out half to three-quarters of the visual tokens, leading to up to 2-3x faster training without loss of accuracy.\n\nYou can set this on your visual transformer config with the key `patch_dropout`.\n\nIn the paper, they also finetuned without the patch dropout at the end. You can do this with the command-line argument `--force-patch-dropout 0.`\n\n#### Multiple data sources\n\nOpenCLIP supports using multiple data sources, by separating different data paths with `::`.\nFor instance, to train on CC12M and on LAION, one might use `--train-data \"\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar::\u002Fdata\u002FLAION-400M\u002F{00000..41455}.tar\"`.\nUsing `--dataset-resampled` is recommended for these cases.\n\nBy default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source.\nFor instance, when training on one data source with size 400M and one with size 10M, samples from the first source are 40x more likely to be seen in expectation.\n\nWe also support different weighting of the data sources, by using the `--train-data-upsampling-factors` flag.\nFor instance, using `--train-data-upsampling-factors=1::1` in the above scenario is equivalent to not using the flag, and `--train-data-upsampling-factors=1::2` is equivalent to upsampling the second data source twice.\nIf you want to sample from data sources with the same frequency, the upsampling factors should be inversely proportional to the sizes of the data sources.\nFor instance, if dataset `A` has 1000 samples and dataset `B` has 100 samples, you can use `--train-data-upsampling-factors=0.001::0.01` (or analogously, `--train-data-upsampling-factors=1::10`).\n\n#### Single-Node\n\nWe make use of `torchrun` to launch distributed jobs. The following launches a\njob on a node of 4 GPUs:\n\n```bash\ncd open_clip\u002Fsrc\ntorchrun --nproc_per_node 4 -m open_clip_train.main \\\n    --train-data '\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar' \\\n    --train-num-samples 10968539 \\\n    --dataset-type webdataset \\\n    --batch-size 320 \\\n    --precision amp \\\n    --workers 4 \\\n    --imagenet-val \u002Fdata\u002Fimagenet\u002Fvalidation\u002F\n```\n\n#### Multi-Node\n\nThe same script above works, so long as users include information about the number\nof nodes and host node.\n\n```bash\ncd open_clip\u002Fsrc\ntorchrun --nproc_per_node=4 \\\n    --rdzv_endpoint=$HOSTE_NODE_ADDR \\\n    -m open_clip_train.main \\\n    --train-data '\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar' \\\n    --train-num-samples 10968539 \\\n    --dataset-type webdataset \\\n    --batch-size 320 \\\n    --precision amp \\\n    --workers 4 \\\n    --imagenet-val \u002Fdata\u002Fimagenet\u002Fvalidation\u002F\n```\n\n#### SLURM\n\nThis is likely the easiest solution to utilize. 
The following script was used to\ntrain our largest models:\n\n```bash\n#!\u002Fbin\u002Fbash -x\n#SBATCH --nodes=32\n#SBATCH --gres=gpu:4\n#SBATCH --ntasks-per-node=4\n#SBATCH --cpus-per-task=6\n#SBATCH --wait-all-nodes=1\n#SBATCH --job-name=open_clip\n#SBATCH --account=ACCOUNT_NAME\n#SBATCH --partition PARTITION_NAME\n\neval \"$(\u002Fpath\u002Fto\u002Fconda\u002Fbin\u002Fconda shell.bash hook)\" # init conda\nconda activate open_clip\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\nexport MASTER_PORT=12802\n\nmaster_addr=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\" | head -n 1)\nexport MASTER_ADDR=$master_addr\n\ncd \u002Fshared\u002Fopen_clip\nexport PYTHONPATH=\"$PYTHONPATH:$PWD\u002Fsrc\"\nsrun --cpu_bind=v --accel-bind=gn python -u src\u002Fopen_clip_train\u002Fmain.py \\\n    --save-frequency 1 \\\n    --report-to tensorboard \\\n    --train-data=\"\u002Fdata\u002FLAION-400M\u002F{00000..41455}.tar\" \\\n    --warmup 2000 \\\n    --batch-size=256 \\\n    --epochs=32 \\\n    --workers=8 \\\n    --model ViT-B-32 \\\n    --name \"ViT-B-32-Vanilla\" \\\n    --seed 0 \\\n    --local-loss \\\n    --gather-with-grad\n```\n\n### Resuming from a checkpoint:\n\n```bash\npython -m open_clip_train.main \\\n    --train-data=\"\u002Fpath\u002Fto\u002Ftrain_data.csv\" \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --resume \u002Fpath\u002Fto\u002Fcheckpoints\u002Fepoch_K.pt\n```\n\n### Training CoCa:\nTraining [CoCa](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01917) models is enabled through specifying a CoCa config using the ```--model``` parameter of the training script. Currently available configs are \"coca_base\", \"coca_ViT-B-32\", and \"coca_roberta-ViT-B-32\" (which uses RoBERTa as the text encoder). CoCa configs are different from CLIP configs because they have an additional \"multimodal_cfg\" component which specifies parameters for the multimodal text decoder. 
Here's an example from the coca_ViT-B-32 config:\n```json\n\"multimodal_cfg\": {\n\t\"context_length\": 76,\n\t\"vocab_size\": 49408,\n\t\"width\": 512,\n\t\"heads\": 8,\n\t\"layers\": 12,\n\t\"latent_dim\": 512,\n\t\"attn_pooler_heads\": 8\n}\n```\nCredit to [lucidrains](https:\u002F\u002Fgithub.com\u002Flucidrains) for [initial code](https:\u002F\u002Fgithub.com\u002Flucidrains\u002FCoCa-pytorch), [gpucce](https:\u002F\u002Fgithub.com\u002Fgpucce) for adapting the code to open_clip, and [iejMac](https:\u002F\u002Fgithub.com\u002FiejMac) for training the models.\n\n### Generating text with CoCa\n\n```python\nimport open_clip\nimport torch\nfrom PIL import Image\n\nmodel, _, transform = open_clip.create_model_and_transforms(\n  model_name=\"coca_ViT-L-14\",\n  pretrained=\"mscoco_finetuned_laion2B-s13B-b90k\"\n)\n\nim = Image.open(\"cat.jpg\").convert(\"RGB\")\nim = transform(im).unsqueeze(0)\n\nwith torch.no_grad(), torch.cuda.amp.autocast():\n  generated = model.generate(im)\n\nprint(open_clip.decode(generated[0]).split(\"\u003Cend_of_text>\")[0].replace(\"\u003Cstart_of_text>\", \"\"))\n```\n\nSee also this [[Coca Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_coca.ipynb)\n\n### Fine Tuning CoCa\n\nTo fine-tune coca on mscoco, first create the dataset, one way is using a csvdataset and perhaps the simplest way to do it is using [CLIP_benchmark](https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FCLIP_benchmark) which in turn uses [pycocotools](https:\u002F\u002Fgithub.com\u002Fcocodataset\u002Fcocoapi) (that can be used also by itself).\n\n```python\nfrom clip_benchmark.datasets.builder import build_dataset\nimport pandas as pd\nimport os\n\nroot_path = \"path\u002Fto\u002Fdata\u002Fdir\" # set this to smth meaningful\nds = build_dataset(\"mscoco_captions\", root=root_path, split=\"train\", task=\"captioning\") # this downloads the dataset if it is not there already\ncoco = ds.coco\nimgs = coco.loadImgs(coco.getImgIds())\nfuture_df = {\"filepath\":[], \"title\":[]}\nfor img in imgs:\n    caps = coco.imgToAnns[img[\"id\"]]\n    for cap in caps:\n        future_df[\"filepath\"].append(img[\"file_name\"])\n        future_df[\"title\"].append(cap[\"caption\"])\npd.DataFrame.from_dict(future_df).to_csv(\n  os.path.join(root_path, \"train2014.csv\"), index=False, sep=\"\\t\"\n)\n```\nThis should create a csv dataset that one can use to fine-tune coca with open_clip\n```bash\npython -m open_clip_train.main \\\n    --dataset-type \"csv\" \\\n    --train-data \"path\u002Fto\u002Fdata\u002Fdir\u002Ftrain2014.csv\" \\\n    --warmup 1000 \\\n    --batch-size 128 \\\n    --lr 1e-5 \\\n    --wd 0.1 \\\n    --epochs 1 \\\n    --workers 3 \\\n    --model \"coca_ViT-L-14\" \\\n    --report-to \"wandb\" \\\n    --coca-contrastive-loss-weight 0 \\\n    --coca-caption-loss-weight 1 \\\n    --log-every-n-steps 100\n```\n\nThis is a general setting, open_clip has very parameters that can be set, ```python -m open_clip_train.main --help``` should show them. 
The only relevant changes compared to pre-training are the two arguments\n\n```bash\n--coca-contrastive-loss-weight 0\n--coca-caption-loss-weight 1\n```\nwhich make the model only train the generative side.\n\n### Training with pre-trained language models as text encoder:\n\nIf you wish to use different language models as the text encoder for CLIP, you can do so by passing one of the Hugging Face model configs in ```src\u002Fopen_clip\u002Fmodel_configs``` and its tokenizer as the ```--model``` and ```--hf-tokenizer-name``` parameters respectively. Currently we only support RoBERTa (\"test-roberta\" config), however adding new models should be trivial. You can also determine how many layers, from the end, to leave unfrozen with the ```--lock-text-unlocked-layers``` parameter. Here's an example command to train CLIP with the RoBERTa LM that has its last 10 layers unfrozen:\n```bash\npython -m open_clip_train.main \\\n         --train-data=\"pipe:aws s3 cp s3:\u002F\u002Fs-mas\u002Fcc3m\u002F{00000..00329}.tar -\" \\\n         --train-num-samples 3000000 \\\n         --val-data=\"pipe:aws s3 cp s3:\u002F\u002Fs-mas\u002Fcc3m\u002F{00330..00331}.tar -\" \\\n         --val-num-samples 10000 \\\n         --dataset-type webdataset \\\n         --batch-size 256 \\\n         --warmup 2000 \\\n         --epochs 10 \\\n         --lr 5e-4 \\\n         --precision amp \\\n         --workers 6 \\\n         --model \"roberta-ViT-B-32\" \\\n         --lock-text \\\n         --lock-text-unlocked-layers 10 \\\n         --name \"10_unfrozen\" \\\n         --report-to \"tensorboard\" \\\n```\n\n### Loss Curves\n\nWhen run on a machine with 8 GPUs, the command should produce the following training curve for Conceptual Captions:\n\n![CLIP zero shot training curve](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmlfoundations_open_clip_readme_8f120cd69fc9.png)\n\nMore detailed curves for Conceptual Captions are given at [\u002Fdocs\u002Fclip_conceptual_captions.md](\u002Fdocs\u002Fclip_conceptual_captions.md).\n\nWhen training an RN50 on YFCC, the same hyperparameters as above are used, with the exception of `lr=5e-4` and `epochs=32`.\n\nNote that to use another model, like `ViT-B\u002F32` or `RN50x4` or `RN50x16` or `ViT-B\u002F16`, specify it with `--model RN50x4`.\n\n### Logging\n\nFor tensorboard logging, run:\n```bash\ntensorboard --logdir=logs\u002Ftensorboard\u002F --port=7777\n```\n\nFor wandb logging, we recommend looking at the `step` variable instead of `Step`, since the latter was not properly set in earlier versions of this codebase.\nFor older runs with models trained before https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F613, the `Step` variable should be ignored.\nFor newer runs, after that PR, the two variables are the same.\n\n## Evaluation \u002F Zero-Shot\n\nWe recommend https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FCLIP_benchmark#how-to-use for systematic evaluation on 40 datasets.\n\n### Evaluating local checkpoint:\n\n```bash\npython -m open_clip_train.main \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --model RN101 \\\n    --pretrained \u002Fpath\u002Fto\u002Fcheckpoints\u002Fepoch_K.pt\n```\n\n### Evaluating hosted pretrained checkpoint on ImageNet zero-shot prediction:\n\n```bash\npython -m open_clip_train.main \\\n    --imagenet-val \u002Fpath\u002Fto\u002Fimagenet\u002Fvalidation \\\n    --model ViT-B-32-quickgelu \\\n    --pretrained laion400m_e32\n```\n\n### Model distillation\n\nYou can distill from a 
pre-trained model by using `--distill-model` and `--distill-pretrained` to specify the model you'd like to distill from.\nFor instance, to distill from OpenAI ViT-L\u002F14 use `--distill-model ViT-L-14 --distill-pretrained openai`.\n\n### Gradient accumulation\n\nTo simulate larger batches use `--accum-freq k`. If the per-GPU batch size, `--batch-size`, is `m`, then the effective batch size will be `k * m * num_gpus`.\n\nWhen increasing `--accum-freq` from its default of 1, samples\u002Fs will remain approximately constant (batch size will double, as will time-per-batch). It is recommended to use other features to reduce batch size such as `--grad-checkpointing --local-loss --gather-with-grad` before increasing `--accum-freq`. `--accum-freq` can be used in addition to these features.\n\nInstead of 1 forward pass per example, there are now 2 forward passes per example. However, the first is done with `torch.no_grad`.\n\nThere is some additional GPU memory required --- the features and data from all `m` batches are stored in memory.\n\nThere are also `m` loss computations instead of the usual 1.\n\nFor more information see Cui et al. (https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.09331) or Pham et al. (https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.10050).\n\n### Int8 Support\n\nWe have beta support for int8 training and inference.\nYou can enable int8 training with `--use-bnb-linear SwitchBackLinearGlobal` or `--use-bnb-linear SwitchBackLinearGlobalMemEfficient`.\nPlease see the bitsandbytes library for definitions of these layers.\nFor CLIP ViT-Huge this should currently correspond to a 10% training speedup with no accuracy loss.\nMore speedups are coming when the attention layer is refactored so that linear layers may be replaced there, too.\n\nSee the tutorial https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmain\u002Ftutorials\u002Fint8_tutorial.ipynb or the [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13013).\n\n### Support for remote loading\u002Ftraining\n\nIt is always possible to resume directly from a remote file, e.g., a file in an s3 bucket. Just set `--resume s3:\u002F\u002F\u003Cpath-to-checkpoint>`.\nThis will work with any filesystem supported by `fsspec`.\n\nIt is also possible to train `open_clip` models while continuously backing up to s3. This can help to avoid slow local file systems.\n\nSay that your node has a local ssd `\u002Fscratch` and an s3 bucket `s3:\u002F\u002F\u003Cpath-to-bucket>`.\n\nIn that case, set `--logs \u002Fscratch` and `--remote-sync s3:\u002F\u002F\u003Cpath-to-bucket>`. Then, a background process will sync `\u002Fscratch\u002F\u003Crun-name>` to `s3:\u002F\u002F\u003Cpath-to-bucket>\u002F\u003Crun-name>`. After syncing, the background process will sleep for `--remote-sync-frequency` seconds, which defaults to 5 minutes.\n\nThere is also experimental support for syncing to other remote file systems, not just s3. To do so, specify `--remote-sync-protocol fsspec`. However, this is currently very slow and not recommended.\n\nAlso, to optionally avoid saving too many checkpoints locally when using these features, you can use `--delete-previous-checkpoint` which deletes the previous checkpoint after saving a new one.\n\nNote: if you are using this feature with `--resume latest`, there are a few warnings. First, use with `--save-most-recent` is not supported. Second, only `s3` is supported. 
Finally, since the sync happens in the background, it is possible that the most recent checkpoint may not be finished syncing to the remote.\n\n### Pushing Models to Hugging Face Hub\n\nThe module `open_clip.push_to_hf_hub` includes helpers for pushing models \u002Fw weights and config to the HF Hub.\n\nThe tool can be run from command line, ex:\n`python -m open_clip.push_to_hf_hub --model convnext_large_d_320 --pretrained \u002Ftrain\u002Fcheckpoints\u002Fepoch_12.pt --repo-id laion\u002FCLIP-convnext_large_d_320.laion2B-s29B-b131K-ft`\n\n\n\n## Acknowledgments\n\nWe gratefully acknowledge the Gauss Centre for Supercomputing e.V. (www.gauss-centre.eu) for funding this part of work by providing computing time through the John von Neumann Institute for Computing (NIC) on the GCS Supercomputer JUWELS Booster at Jülich Supercomputing Centre (JSC).\n\n## The Team\n\nCurrent development of this repository is led by [Ross Wightman](https:\u002F\u002Frwightman.com\u002F), [Romain Beaumont](https:\u002F\u002Fgithub.com\u002From1504), [Cade Gordon](http:\u002F\u002Fcadegordon.io\u002F), and [Vaishaal Shankar](http:\u002F\u002Fvaishaal.com\u002F).\n\nThe original version of this repository is from a group of researchers at UW, Google, Stanford, Amazon, Columbia, and Berkeley.\n\n[Gabriel Ilharco*](http:\u002F\u002Fgabrielilharco.com\u002F), [Mitchell Wortsman*](https:\u002F\u002Fmitchellnw.github.io\u002F), [Nicholas Carlini](https:\u002F\u002Fnicholas.carlini.com\u002F), [Rohan Taori](https:\u002F\u002Fwww.rohantaori.com\u002F), [Achal Dave](http:\u002F\u002Fwww.achaldave.com\u002F), [Vaishaal Shankar](http:\u002F\u002Fvaishaal.com\u002F), [John Miller](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~miller_john\u002F), [Hongseok Namkoong](https:\u002F\u002Fhsnamkoong.github.io\u002F), [Hannaneh Hajishirzi](https:\u002F\u002Fhomes.cs.washington.edu\u002F~hannaneh\u002F), [Ali Farhadi](https:\u002F\u002Fhomes.cs.washington.edu\u002F~ali\u002F), [Ludwig Schmidt](https:\u002F\u002Fpeople.csail.mit.edu\u002Fludwigs\u002F)\n\nSpecial thanks to [Jong Wook Kim](https:\u002F\u002Fjongwook.kim\u002F) and [Alec Radford](https:\u002F\u002Fgithub.com\u002FNewmu) for help with reproducing CLIP!\n\n## Citing\n\nIf you found this repository useful, please consider citing:\n```bibtex\n@software{ilharco_gabriel_2021_5143773,\n  author       = {Ilharco, Gabriel and\n                  Wortsman, Mitchell and\n                  Wightman, Ross and\n                  Gordon, Cade and\n                  Carlini, Nicholas and\n                  Taori, Rohan and\n                  Dave, Achal and\n                  Shankar, Vaishaal and\n                  Namkoong, Hongseok and\n                  Miller, John and\n                  Hajishirzi, Hannaneh and\n                  Farhadi, Ali and\n                  Schmidt, Ludwig},\n  title        = {OpenCLIP},\n  month        = jul,\n  year         = 2021,\n  note         = {If you use this software, please cite it as below.},\n  publisher    = {Zenodo},\n  version      = {0.1},\n  doi          = {10.5281\u002Fzenodo.5143773},\n  url          = {https:\u002F\u002Fdoi.org\u002F10.5281\u002Fzenodo.5143773}\n}\n```\n\n```bibtex\n@inproceedings{cherti2023reproducible,\n  title={Reproducible scaling laws for contrastive language-image learning},\n  author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},\n  booktitle={Proceedings of the IEEE\u002FCVF 
Conference on Computer Vision and Pattern Recognition},\n  pages={2818--2829},\n  year={2023}\n}\n```\n\n```bibtex\n@inproceedings{Radford2021LearningTV,\n  title={Learning Transferable Visual Models From Natural Language Supervision},\n  author={Alec Radford and Jong Wook Kim and Chris Hallacy and A. Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},\n  booktitle={ICML},\n  year={2021}\n}\n```\n\n```bibtex\n@inproceedings{schuhmann2022laionb,\n  title={{LAION}-5B: An open large-scale dataset for training next generation image-text models},\n  author={Christoph Schuhmann and\n          Romain Beaumont and\n          Richard Vencu and\n          Cade W Gordon and\n          Ross Wightman and\n          Mehdi Cherti and\n          Theo Coombes and\n          Aarush Katta and\n          Clayton Mullis and\n          Mitchell Wortsman and\n          Patrick Schramowski and\n          Srivatsa R Kundurthy and\n          Katherine Crowson and\n          Ludwig Schmidt and\n          Robert Kaczmarczyk and\n          Jenia Jitsev},\n  booktitle={Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year={2022},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=M3Y74vmsMcY}\n}\n```\n\n[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F390536799.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F390536799)\n","# OpenCLIP\n\n[[Paper]](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07143) [[Citations]](#citing) [[Clip Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_clip.ipynb) [[Coca Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_coca.ipynb)\n[![pypi](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fopen_clip_torch.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fopen_clip_torch)\n\n欢迎使用 OpenAI 的 [CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020)（Contrastive Language-Image Pre-training，对比语言 - 图像预训练）的开源实现。\n\n使用此代码库，我们在各种数据源和计算资源预算下训练了多个模型，范围从 [小规模实验](docs\u002FLOW_ACC.md) 到更大的运行，包括在 [LAION-400M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.02114)、[LAION-2B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2210.08402) 和 [DataComp-1B](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14108) 等数据集上训练的模型。我们在论文 [可复现的对比语言 - 图像学习缩放定律](https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.07143) 中详细研究了许多模型及其缩放特性。我们训练的一些最佳模型及其在 ImageNet-1k 上的 zero-shot（零样本）准确率如下所示，还包括 OpenAI 训练的 ViT-L 模型以及其他最先进的开源替代方案（所有这些都可以通过 OpenCLIP 加载）。关于我们完整的预训练模型集合的更多详细信息请见 [此处](docs\u002FPRETRAINED.md)，38 个数据集的 zero-shot 结果请见 [此处](docs\u002Fopenclip_results.csv)。\n\n| 模型    | 训练数据 | 分辨率 | 查看到的样本数量 | ImageNet 零样本准确率 | \n| -------- | ------- |  ------- |  ------- |  ------- |  \n| ConvNext-Base | LAION-2B  | 256px | 13B | 71.5% |\n| ConvNext-Large | LAION-2B  | 320px | 29B | 76.9% |\n| ConvNext-XXLarge | LAION-2B | 256px | 34B | 79.5% |\n| ViT-B-32-256  | DataComp-1B  | 256px | 34B | 72.8% |\n| ViT-B-16  | DataComp-1B  | 224px | 13B | 73.5% |\n| ViT-L-14  | LAION-2B  | 224px | 32B | 75.3% |\n| ViT-H-14  | LAION-2B  | 224px | 32B | 78.0% |\n| ViT-L-14  | DataComp-1B  | 224px | 13B | 79.2% |\n| ViT-bigG-14  | LAION-2B  | 224px | 34B | 80.1% |\n|  |  |   |   |  |\n| ViT-L-14-quickgelu [(原始 CLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2103.00020) 
| WIT | 224px | 13B | 75.5% | \n| ViT-SO400M-14-SigLIP [(SigLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15343) | WebLI | 224px | 45B | 82.0% | \n| ViT-L-14 [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-2B | 224px | 39B | 82.2% | \n| ViT-L-16-256 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) |  WebLI (多语言) | 256px | 40B | 82.5% |\n| ViT-SO400M-14-SigLIP-384 [(SigLIP)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.15343) |  WebLI | 384px | 45B | 83.1% |\n| ViT-H-14-quickgelu [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-5B | 224px | 39B | 83.4% | \n| PE-Core-L-14-336 [(PE)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.13181) | MetaCLIP-5.4B | 336px | 58B | 83.5% |\n| ViT-SO400M-16-SigLIP2-384 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) |  WebLI (多语言) | 384px | 40B | 84.1% |\n| ViT-H-14-378-quickgelu [(DFN)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.17425) | DFN-5B | 378px | 44B | 84.4% |\n| ViT-gopt-16-SigLIP2-384 [(SigLIP2)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14786) | WebLI (多语言) | 384px | 40B | 85.0% |\n| PE-Core-bigG-14-448 [(PE)](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.13181) | MetaCLIP-5.4B | 448px | 86B | 85.4% |\n\n包含其他模型特定详情的模型卡片可以在 Hugging Face Hub 上的 OpenCLIP 库标签下找到：https:\u002F\u002Fhuggingface.co\u002Fmodels?library=open_clip. \n\n如果您发现此仓库有用，请考虑 [引用](#citing)。\n如果您有其他请求或建议，欢迎提交问题或发送电子邮件。\n\n请注意，`src\u002Fopen_clip\u002F` 中的部分建模和 tokenizer（分词器）代码是 OpenAI 官方 [repository](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP) 的改编版本。\n\n## 方法\n\n| ![CLIP](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmlfoundations_open_clip_readme_3958bc6e52d9.png) |\n|:--:|\n| 图片来源：https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP |\n\n## 用法\n\n```\npip install open_clip_torch\n```\n\n```python\nimport torch\nfrom PIL import Image\nimport open_clip\n\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')\nmodel.eval()  # 默认情况下模型处于训练模式，会影响一些启用了 BatchNorm（批归一化）或 stochastic depth（随机深度）的模型\ntokenizer = open_clip.get_tokenizer('ViT-B-32')\n\nimage = preprocess(Image.open(\"docs\u002FCLIP.png\")).unsqueeze(0)\ntext = tokenizer([\"a diagram\", \"a dog\", \"a cat\"])\n\nwith torch.no_grad(), torch.autocast(\"cuda\"):\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text)\n    image_features \u002F= image_features.norm(dim=-1, keepdim=True)\n    text_features \u002F= text_features.norm(dim=-1, keepdim=True)\n\n    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n\nprint(\"Label probs:\", text_probs)  # 输出：[[1., 0., 0.]]\n```\n\n如果模型使用 `timm` 图像编码器（convnext, siglip, eva 等），请确保安装了最新版本的 timm。如果看到图像编码器的 'Unknown model' 错误，请升级 `timm`。\n\n如果模型使用 transformers 分词器，请确保已安装 `transformers`。\n\n另请参阅此 [[Clip Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_clip.ipynb)。\n\n为了高效地计算数十亿个 embeddings（嵌入），您可以使用支持 openclip 的 [clip-retrieval](https:\u002F\u002Fgithub.com\u002From1504\u002Fclip-retrieval)。\n\n### 预训练模型\n\n我们提供了一个简单的模型接口来实例化预训练和未训练的模型。\n要查看哪些预训练模型可用，请使用以下代码片段。\n关于我们预训练模型的更多详细信息可在 [此处](docs\u002FPRETRAINED.md) 找到。\n\n```python\n>>> import open_clip\n>>> open_clip.list_pretrained()\n```\n\n您可以在 [此表](docs\u002Fmodel_profile.csv) 中找到关于我们支持的模型的更多信息（例如参数量、FLOPs）。\n\n注意：许多现有的 checkpoints（检查点）使用来自原始 OpenAI 模型的 QuickGELU 激活函数。这种激活函数在最近版本的 PyTorch 中实际上不如原生的 torch.nn.GELU 高效。模型默认值现在是 
nn.GELU，因此对于 OpenCLIP 预训练权重，应使用带有 `-quickgelu` 后缀的模型定义。所有 OpenAI 预训练权重将始终默认为 QuickGELU。也可以使用非 `-quickgelu` 模型定义配合使用 QuickGELU 的预训练权重，但这会导致准确率下降，对于微调来说，这种差异在长时间运行后可能会消失。\n未来训练的模型将使用 nn.GELU。\n\n### 加载模型\n\n模型可以通过 `open_clip.create_model_and_transforms` 进行加载，如下例所示。模型名称和对应的 `pretrained` 参数（预训练）与 `open_clip.list_pretrained()` 的输出兼容。\n\n`pretrained` 参数也接受本地路径，例如 `\u002Fpath\u002Fto\u002Fmy\u002Fb32.pt`。你也可以通过这种方式从 Hugging Face 加载检查点 (checkpoints)。为此，下载 `open_clip_pytorch_model.bin` 文件（例如，[https:\u002F\u002Fhuggingface.co\u002Flaion\u002FCLIP-ViT-L-14-DataComp.XL-s13B-b90K\u002Ftree\u002Fmain](https:\u002F\u002Fhuggingface.co\u002Flaion\u002FCLIP-ViT-L-14-DataComp.XL-s13B-b90K\u002Fblob\u002Fmain\u002Fopen_clip_pytorch_model.bin)），并使用 `pretrained=\u002Fpath\u002Fto\u002Fopen_clip_pytorch_model.bin`。\n\n```python\n# pretrained also accepts local paths\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k') \n```\n\n## 在分类任务上进行微调\n\n本仓库专注于训练 CLIP 模型。若要针对下游分类任务（如 ImageNet）对 *已训练好的* 零样本 (zero-shot) 模型进行微调，请参见 [我们的另一个仓库：WiSE-FT](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fwise-ft)。[WiSE-FT 仓库](https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fwise-ft) 包含我们关于 [零样本模型的鲁棒微调](https:\u002F\u002Farxiv.org\u002Fabs\u002F2109.01903) 论文的代码，其中我们介绍了一种在分布偏移 (distribution shift) 下保持鲁棒性的同时微调零样本模型的技术。\n\n## 数据\n\n若要下载 WebDataset 格式的数据集，我们推荐 [img2dataset](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset)。\n\n### 概念性描述 (Conceptual Captions)\n\n参见 [cc3m img2dataset 示例](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fblob\u002Fmain\u002Fdataset_examples\u002Fcc3m.md)。\n\n### YFCC 及其他数据集\n\n除了上述通过 CSV 文件指定训练数据外，我们的代码库还支持 [WebDataset](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset)，这推荐用于更大规模的数据集。预期格式是一系列 `.tar` 文件。每个 `.tar` 文件应为每个训练示例包含两个文件，一个用于图像，一个用于相应的文本。这两个文件应具有相同的名称但不同的扩展名。例如，`shard_001.tar` 可以包含 `abc.jpg` 和 `abc.txt` 等文件。您可以在 [https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Fwebdataset) 了解更多关于 `webdataset` 的信息。我们使用每个包含 1,000 个数据点的 `.tar` 文件，这些是使用 [tarp](https:\u002F\u002Fgithub.com\u002Fwebdataset\u002Ftarp) 创建的。\n\n您可以从 [Multimedia Commons](http:\u002F\u002Fmmcommons.org\u002F) 下载 YFCC 数据集。与 OpenAI 类似，我们使用了 YFCC 的一个子集来达到上述准确率数值。该子集中图像的索引位于 [OpenAI 的 CLIP 仓库](https:\u002F\u002Fgithub.com\u002Fopenai\u002FCLIP\u002Fblob\u002Fmain\u002Fdata\u002Fyfcc100m.md) 中。\n\n## 训练 CLIP\n\n### 安装\n\n我们建议您首先创建一个虚拟环境 (virtual environment)：\n\n```\npython3 -m venv .env\nsource .env\u002Fbin\u002Factivate\npip install -U pip\n```\n\n然后您可以使用 `pip install 'open_clip_torch[training]'` 安装用于训练的 openclip。\n\n#### 开发\n\n如果您想进行修改以贡献代码，可以克隆 openclip，然后在 openclip 文件夹中运行 `make install`（创建虚拟环境后）。\n\n按照 https:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F 安装 pip 版本的 PyTorch。\n\n您可以运行 `make install-training` 来安装训练依赖项。\n\n#### 测试\n\n可以使用 `make install-test` 然后 `make test` 运行测试。\n\n运行 `python -m pytest -x -s -v tests -k \"training\"` 以运行特定测试。\n\n针对特定的 Git 修订版或标签运行回归测试：\n1. 生成测试数据\n    ```sh\n    python tests\u002Futil_test.py --model RN50 RN101 --save_model_list models.txt --git_revision 9d31b2ec4df6d8228f370ff20c8267ec6ba39383\n    ```\n    **_警告_：这将调用 git 并修改您的工作树，但在生成数据后会将其重置为当前状态！\\\n    在此方式生成测试数据时，请勿修改您的工作树。**\n\n2. 
运行回归测试\n    ```sh\n    OPEN_CLIP_TEST_REG_MODELS=models.txt python -m pytest -x -s -v -m regression_test\n    ```\n\n### 单进程运行代码示例：\n\n```bash\npython -m open_clip_train.main \\\n    --save-frequency 1 \\\n    --zeroshot-frequency 1 \\\n    --report-to tensorboard \\\n    --train-data=\"\u002Fpath\u002Fto\u002Ftrain_data.csv\"  \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --csv-img-key filepath \\\n    --csv-caption-key title \\\n    --imagenet-val=\u002Fpath\u002Fto\u002Fimagenet\u002Froot\u002Fval\u002F \\\n    --warmup 10000 \\\n    --batch-size=128 \\\n    --lr=1e-3 \\\n    --wd=0.1 \\\n    --epochs=30 \\\n    --workers=8 \\\n    --model RN50\n```\n\n注意：`imagenet-val` 是用于零样本评估的 ImageNet *验证* 集的路径，而不是训练集！\n如果您不想在整个训练过程中对 ImageNet 执行零样本评估，可以删除此参数。请注意，`val` 文件夹应包含子文件夹。如果没有，请使用 [此脚本](https:\u002F\u002Fraw.githubusercontent.com\u002Fsoumith\u002Fimagenetloader.torch\u002Fmaster\u002Fvalprep.sh)。\n\n### 多 GPU 及进阶\n\n此代码已历经实战测试，支持高达 1024 张 A100 显卡，并提供多种分布式训练 (distributed training) 解决方案。我们包含对 SLURM 集群的原生支持。\n\n随着用于训练的设备的增加，logit 矩阵的空间复杂度也会增加。使用朴素的 all-gather 方案，空间复杂度将为 `O(n^2)`。相反，如果使用 `--gather-with-grad` 和 `--local-loss` 标志，复杂度可能会变得有效线性。这种修改产生的数值结果与朴素方法一一对应。\n\n#### 轮次\n\n对于更大的数据集（例如 Laion2B），我们建议将 `--train-num-samples` 设置为低于完整轮次的值，例如 `--train-num-samples 135646078` 对应 1\u002F16 个轮次，并结合 `--dataset-resampled` 进行有放回采样。这允许拥有更频繁的 checkpoint 以便更频繁地进行评估。\n\n#### Patch Dropout（补丁丢弃）\n\n\u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2212.00794\">最近的研究\u003C\u002Fa> 表明可以丢弃一半到四分之三的视觉 token，从而在不损失准确性的情况下实现最高 2-3 倍的训练速度提升。\n\n你可以在视觉 Transformer 配置中使用键 `patch_dropout` 来设置此项。\n\n在论文中，他们最后也在没有 patch dropout 的情况下进行了微调。你可以通过命令行参数 `--force-patch-dropout 0` 来实现这一点。\n\n#### 多个数据源\n\nOpenCLIP 支持使用多个数据源，通过 `::` 分隔不同的数据路径。例如，要在 CC12M 和 LAION 上训练，可以使用 `--train-data \"\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar::\u002Fdata\u002FLAION-400M\u002F{00000..41455}.tar\"`。针对这些情况，建议使用 `--dataset-resampled`。\n\n默认情况下，期望上模型看到每个源样本的次数与该源的大小成正比。例如，当在一个大小为 400M 的数据源和一个大小为 10M 的数据源上训练时，来自第一个源的样本在期望上被看到的概率是 40 倍。\n\n我们还支持通过使用 `--train-data-upsampling-factors` 标志对数据源进行不同加权。例如，在上述场景中使用 `--train-data-upsampling-factors=1::1` 等同于不使用该标志，而 `--train-data-upsampling-factors=1::2` 等同于将第二个数据源上采样两次。如果你希望以相同的频率从数据源中采样，上采样因子应与数据源的大小成反比。例如，如果数据集 `A` 有 1000 个样本，数据集 `B` 有 100 个样本，你可以使用 `--train-data-upsampling-factors=0.001::0.01`（或类似地，`--train-data-upsampling-factors=1::10`）。\n\n#### 单节点\n\n我们使用 `torchrun` 启动分布式作业。以下命令将在一个拥有 4 张 GPU 的节点上启动作业：\n\n```bash\ncd open_clip\u002Fsrc\ntorchrun --nproc_per_node 4 -m open_clip_train.main \\\n    --train-data '\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar' \\\n    --train-num-samples 10968539 \\\n    --dataset-type webdataset \\\n    --batch-size 320 \\\n    --precision amp \\\n    --workers 4 \\\n    --imagenet-val \u002Fdata\u002Fimagenet\u002Fvalidation\u002F\n```\n\n#### 多节点\n\n只要用户包含有关节点数量和主节点的信息，上述脚本同样适用。\n\n```bash\ncd open_clip\u002Fsrc\ntorchrun --nproc_per_node=4 \\\n    --rdzv_endpoint=$HOSTE_NODE_ADDR \\\n    -m open_clip_train.main \\\n    --train-data '\u002Fdata\u002Fcc12m\u002Fcc12m-train-{0000..2175}.tar' \\\n    --train-num-samples 10968539 \\\n    --dataset-type webdataset \\\n    --batch-size 320 \\\n    --precision amp \\\n    --workers 4 \\\n    --imagenet-val \u002Fdata\u002Fimagenet\u002Fvalidation\u002F\n```\n\n#### SLURM\n\n这可能是最容易利用的解决方案。以下脚本用于训练我们最大的模型：\n\n```bash\n#!\u002Fbin\u002Fbash -x\n#SBATCH --nodes=32\n#SBATCH --gres=gpu:4\n#SBATCH --ntasks-per-node=4\n#SBATCH --cpus-per-task=6\n#SBATCH 
--wait-all-nodes=1\n#SBATCH --job-name=open_clip\n#SBATCH --account=ACCOUNT_NAME\n#SBATCH --partition PARTITION_NAME\n\neval \"$(\u002Fpath\u002Fto\u002Fconda\u002Fbin\u002Fconda shell.bash hook)\" # init conda\nconda activate open_clip\nexport CUDA_VISIBLE_DEVICES=0,1,2,3\nexport MASTER_PORT=12802\n\nmaster_addr=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\" | head -n 1)\nexport MASTER_ADDR=$master_addr\n\ncd \u002Fshared\u002Fopen_clip\nexport PYTHONPATH=\"$PYTHONPATH:$PWD\u002Fsrc\"\nsrun --cpu_bind=v --accel-bind=gn python -u src\u002Fopen_clip_train\u002Fmain.py \\\n    --save-frequency 1 \\\n    --report-to tensorboard \\\n    --train-data=\"\u002Fdata\u002FLAION-400M\u002F{00000..41455}.tar\" \\\n    --warmup 2000 \\\n    --batch-size=256 \\\n    --epochs=32 \\\n    --workers=8 \\\n    --model ViT-B-32 \\\n    --name \"ViT-B-32-Vanilla\" \\\n    --seed 0 \\\n    --local-loss \\\n    --gather-with-grad\n```\n\n### 从检查点 (checkpoint) 恢复：\n\n```bash\npython -m open_clip_train.main \\\n    --train-data=\"\u002Fpath\u002Fto\u002Ftrain_data.csv\" \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --resume \u002Fpath\u002Fto\u002Fcheckpoints\u002Fepoch_K.pt\n```\n\n### 训练 CoCa：\n\n通过使用训练脚本的 ```--model``` 参数指定 CoCa 配置，即可启用 [CoCa](https:\u002F\u002Farxiv.org\u002Fabs\u002F2205.01917) 模型的训练。当前可用的配置包括 \"coca_base\", \"coca_ViT-B-32\" 和 \"coca_roberta-ViT-B-32\"（后者使用 RoBERTa 作为文本编码器）。CoCa 配置与 CLIP 配置不同，因为它们有一个额外的 \"multimodal_cfg\" 组件，用于指定多模态文本解码器的参数。以下是来自 coca_ViT-B-32 配置的一个示例：\n```json\n\"multimodal_cfg\": {\n\t\"context_length\": 76,\n\t\"vocab_size\": 49408,\n\t\"width\": 512,\n\t\"heads\": 8,\n\t\"layers\": 12,\n\t\"latent_dim\": 512,\n\t\"attn_pooler_heads\": 8\n}\n```\n感谢 [lucidrains](https:\u002F\u002Fgithub.com\u002Flucidrains) 提供 [初始代码](https:\u002F\u002Fgithub.com\u002Flucidrains\u002FCoCa-pytorch)，[gpucce](https:\u002F\u002Fgithub.com\u002Fgpucce) 将代码适配到 open_clip，以及 [iejMac](https:\u002F\u002Fgithub.com\u002FiejMac) 训练模型。\n\n### 使用 CoCa 生成文本\n\n```python\nimport open_clip\nimport torch\nfrom PIL import Image\n\nmodel, _, transform = open_clip.create_model_and_transforms(\n  model_name=\"coca_ViT-L-14\",\n  pretrained=\"mscoco_finetuned_laion2B-s13B-b90k\"\n)\n\nim = Image.open(\"cat.jpg\").convert(\"RGB\")\nim = transform(im).unsqueeze(0)\n\nwith torch.no_grad(), torch.cuda.amp.autocast():\n  generated = model.generate(im)\n\nprint(open_clip.decode(generated[0]).split(\"\u003Cend_of_text>\")[0].replace(\"\u003Cstart_of_text>\", \"\"))\n```\n\n另请参阅此 [[Coca Colab]](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmaster\u002Fdocs\u002FInteracting_with_open_coca.ipynb)\n\n### CoCa (对比式图文生成模型) 微调\n\n要在 MSCOCO (微软通用对象上下文数据集) 上微调 CoCa，首先需要创建数据集。一种方法是使用 CSV 数据集，也许最简单的方法是使用 [CLIP_benchmark (CLIP 基准测试)](https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FCLIP_benchmark)，它反过来使用了 [pycocotools (COCO 工具包)](https:\u002F\u002Fgithub.com\u002Fcocodataset\u002Fcocoapi)（也可以单独使用）。\n\n```python\nfrom clip_benchmark.datasets.builder import build_dataset\nimport pandas as pd\nimport os\n\nroot_path = \"path\u002Fto\u002Fdata\u002Fdir\" # set this to smth meaningful\nds = build_dataset(\"mscoco_captions\", root=root_path, split=\"train\", task=\"captioning\") # this downloads the dataset if it is not there already\ncoco = ds.coco\nimgs = coco.loadImgs(coco.getImgIds())\nfuture_df = {\"filepath\":[], \"title\":[]}\nfor img in imgs:\n    caps = coco.imgToAnns[img[\"id\"]]\n    for cap in caps:\n        
future_df[\"filepath\"].append(img[\"file_name\"])\n        future_df[\"title\"].append(cap[\"caption\"])\npd.DataFrame.from_dict(future_df).to_csv(\n  os.path.join(root_path, \"train2014.csv\"), index=False, sep=\"\\t\"\n)\n```\n这将创建一个 CSV 数据集，可用于配合 open_clip (开源 CLIP) 微调 CoCa。\n```bash\npython -m open_clip_train.main \\\n    --dataset-type \"csv\" \\\n    --train-data \"path\u002Fto\u002Fdata\u002Fdir\u002Ftrain2014.csv\" \\\n    --warmup 1000 \\\n    --batch-size 128 \\\n    --lr 1e-5 \\\n    --wd 0.1 \\\n    --epochs 1 \\\n    --workers 3 \\\n    --model \"coca_ViT-L-14\" \\\n    --report-to \"wandb\" \\\n    --coca-contrastive-loss-weight 0 \\\n    --coca-caption-loss-weight 1 \\\n    --log-every-n-steps 100\n```\n\n这是一个通用设置，open_clip 有许多可设置的参数，运行 ```python -m open_clip_train.main --help``` 应该可以显示它们。与预训练相比，唯一相关的更改是两个参数：\n\n```bash\n--coca-contrastive-loss-weight 0\n--coca-caption-loss-weight 1\n```\n这使得模型仅训练生成端。\n\n### 使用预训练语言模型作为文本编码器进行训练：\n\n如果您希望为 CLIP 使用不同的语言模型作为文本编码器，您可以使用 ```src\u002Fopen_clip\u002Fmodel_configs``` 中的一个 Hugging Face (模型库) 模型配置，并将其分词器分别作为 ```--model``` 和 ```--hf-tokenizer-name``` 参数传入。目前我们仅支持 RoBERTa (优化的 BERT 模型)（\"test-roberta\" 配置），但添加新模型应该是微不足道的。您还可以使用 ```--lock-text-unlocked-layers``` 参数决定从末尾保留多少层未冻结。以下是一个使用 RoBERTa 语言模型 (LM) 训练 CLIP 的示例命令，该模型最后 10 层未冻结：\n```bash\npython -m open_clip_train.main \\\n         --train-data=\"pipe:aws s3 cp s3:\u002F\u002Fs-mas\u002Fcc3m\u002F{00000..00329}.tar -\" \\\n         --train-num-samples 3000000 \\\n         --val-data=\"pipe:aws s3 cp s3:\u002F\u002Fs-mas\u002Fcc3m\u002F{00330..00331}.tar -\" \\\n         --val-num-samples 10000 \\\n         --dataset-type webdataset \\\n         --batch-size 256 \\\n         --warmup 2000 \\\n         --epochs 10 \\\n         --lr 5e-4 \\\n         --precision amp \\\n         --workers 6 \\\n         --model \"roberta-ViT-B-32\" \\\n         --lock-text \\\n         --lock-text-unlocked-layers 10 \\\n         --name \"10_unfrozen\" \\\n         --report-to \"tensorboard\" \\\n```\n\n### 损失曲线\n\n在拥有 8 个 GPU (图形处理器) 的机器上运行时，该命令应为 Conceptual Captions (概念性描述) 生成以下训练曲线：\n\n![CLIP zero shot training curve](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmlfoundations_open_clip_readme_8f120cd69fc9.png)\n\nConceptual Captions 的更详细曲线位于 [\u002Fdocs\u002Fclip_conceptual_captions.md](\u002Fdocs\u002Fclip_conceptual_captions.md)。\n\n在 YFCC 上训练 RN50 时，使用与上述相同的超参数，除了 `lr=5e-4` 和 `epochs=32`。\n\n注意，要使用其他模型，如 `ViT-B\u002F32` 或 `RN50x4` 或 `RN50x16` 或 `ViT-B\u002F16`，请使用 `--model RN50x4` 指定。\n\n### 日志记录\n\n对于 tensorboard (可视化日志工具) 日志记录，请运行：\n```bash\ntensorboard --logdir=logs\u002Ftensorboard\u002F --port=7777\n```\n\n对于 wandb (Weights & Biases) 日志记录，我们建议查看 `step` 变量而不是 `Step`，因为后者在该代码库的早期版本中未正确设置。对于在 https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F613 之前训练的模型的旧运行，应忽略 `Step` 变量。对于该 PR 之后的较新运行，这两个变量是相同的。\n\n## 评估 \u002F 零样本\n\n我们推荐 https:\u002F\u002Fgithub.com\u002FLAION-AI\u002FCLIP_benchmark#how-to-use 用于在 40 个数据集上的系统评估。\n\n### 评估本地检查点：\n\n```bash\npython -m open_clip_train.main \\\n    --val-data=\"\u002Fpath\u002Fto\u002Fvalidation_data.csv\"  \\\n    --model RN101 \\\n    --pretrained \u002Fpath\u002Fto\u002Fcheckpoints\u002Fepoch_K.pt\n```\n\n### 评估托管的预训练检查点在 ImageNet (大规模视觉识别挑战) 零样本预测上的表现：\n\n```bash\npython -m open_clip_train.main \\\n    --imagenet-val \u002Fpath\u002Fto\u002Fimagenet\u002Fvalidation \\\n    --model ViT-B-32-quickgelu \\\n    --pretrained laion400m_e32\n```\n\n### 模型蒸馏\n\n您可以使用 `--distill-model` 和 `--distill-pretrained` 
来指定您想从中蒸馏的模型，从而进行蒸馏。\n例如，要从 OpenAI ViT-L\u002F14 蒸馏，请使用 `--distill-model ViT-L-14 --distill-pretrained openai`。\n\n### 梯度累积\n\n为了模拟更大的批次，请使用 `--accum-freq k`。如果每个 GPU 的批次大小 `--batch-size` 为 `m`，则有效批次大小将为 `k * m * num_gpus`。\n\n当将 `--accum-freq` 从其默认值 1 增加时，样本\u002F秒将保持大致恒定（批次大小将翻倍，每批次时间也将翻倍）。建议在增加 `--accum-freq` 之前使用其他功能来减小批次大小，例如 `--grad-checkpointing --local-loss --gather-with-grad`。`--accum-freq` 可以与这些功能一起使用。\n\n不再是每个示例 1 次前向传播，现在每个示例有 2 次前向传播。但是，第一次是在 `torch.no_grad` 下完成的。\n\n需要一些额外的 GPU 内存 --- 所有 `m` 个批次的特征和数据都存储在内存中。\n\n还有 `m` 次损失计算，而不是通常的 1 次。\n\n更多信息请参阅 Cui et al. (https:\u002F\u002Farxiv.org\u002Fabs\u002F2112.09331) 或 Pham et al. (https:\u002F\u002Farxiv.org\u002Fabs\u002F2111.10050)。\n\n### Int8 (8 位整数) 支持\n\n我们提供 int8 训练和推理的测试版支持。\n您可以使用 `--use-bnb-linear SwitchBackLinearGlobal` 或 `--use-bnb-linear SwitchBackLinearGlobalMemEfficient` 启用 int8 训练。\n有关这些层的定义，请参阅 bitsandbytes 库。\n对于 CLIP VIT-Huge，这目前应对应 10% 的训练加速且无精度损失。\n当注意力层被重构以便线性层也可以在那里替换时，将会有更多的加速。\n\n请参阅教程 https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fblob\u002Fmain\u002Ftutorials\u002Fint8_tutorial.ipynb 或 [论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13013)。\n\n### 支持远程加载\u002F训练\n\n始终可以直接从远程文件恢复，例如 S3（对象存储服务）存储桶中的文件。只需设置 `--resume s3:\u002F\u002F\u003Cpath-to-checkpoint>`。\n这将适用于 `fsspec`（Python 文件系统抽象层）支持的任何文件系统。\n\n也可以在持续备份到 S3 的同时训练 `open_clip`（OpenCLIP 模型库）模型。这有助于避免缓慢的本地文件系统。\n\n假设您的节点拥有一个本地 SSD（固态硬盘）`\u002Fscratch`，以及一个 S3 存储桶 `s3:\u002F\u002F\u003Cpath-to-bucket>`。\n\n在这种情况下，设置 `--logs \u002Fscratch` 和 `--remote-sync s3:\u002F\u002F\u003Cpath-to-bucket>`。然后，后台进程会将 `\u002Fscratch\u002F\u003Crun-name>` 同步到 `s3:\u002F\u002F\u003Cpath-to-bucket>\u002F\u003Crun-name>`。同步后，后台进程将休眠 `--remote-sync-frequency` 秒，默认为 5 分钟。\n\n此外，还有实验性支持以同步到其他远程文件系统，而不仅仅是 S3。为此，请指定 `--remote-sync-protocol fsspec`。然而，目前这非常慢，不推荐使用。\n\n另外，为了在使用这些功能时选择性地避免在本地保存太多检查点，您可以使用 `--delete-previous-checkpoint`，它会在保存新检查点后删除前一个检查点。\n\n注意：如果您使用 `--resume latest` 配合此功能，有几个警告。首先，不支持与 `--save-most-recent` 一起使用。其次，仅支持 `s3`。最后，由于同步是在后台进行的，最新的检查点可能尚未完成同步到远程端。\n\n### 推送模型到 Hugging Face Hub\n\n模块 `open_clip.push_to_hf_hub` 包含用于将模型及其权重和配置推送到 Hugging Face Hub（模型托管平台）的辅助工具。\n\n该工具可以从命令行运行，例如：\n`python -m open_clip.push_to_hf_hub --model convnext_large_d_320 --pretrained \u002Ftrain\u002Fcheckpoints\u002Fepoch_12.pt --repo-id laion\u002FCLIP-convnext_large_d_320.laion2B-s29B-b131K-ft`\n\n\n\n## 致谢\n\n我们要衷心感谢 Gauss 超级计算中心协会 (www.gauss-centre.eu)，通过于利希超级计算中心 (JSC) 的 GCS 超级计算机 JUWELS Booster 上的冯·诺依曼计算研究所 (NIC) 提供计算时间，资助了本部分工作。\n\n## 团队\n\n当前仓库的开发由 [Ross Wightman](https:\u002F\u002Frwightman.com\u002F)、[Romain Beaumont](https:\u002F\u002Fgithub.com\u002From1504)、[Cade Gordon](http:\u002F\u002Fcadegordon.io\u002F) 和 [Vaishaal Shankar](http:\u002F\u002Fvaishaal.com\u002F) 领导。\n\n本仓库的原始版本来自华盛顿大学、Google、斯坦福大学、亚马逊、哥伦比亚大学和伯克利大学的一组研究人员。\n\n[Gabriel Ilharco*](http:\u002F\u002Fgabrielilharco.com\u002F)、[Mitchell Wortsman*](https:\u002F\u002Fmitchellnw.github.io\u002F)、[Nicholas Carlini](https:\u002F\u002Fnicholas.carlini.com\u002F)、[Rohan Taori](https:\u002F\u002Fwww.rohantaori.com\u002F)、[Achal Dave](http:\u002F\u002Fwww.achaldave.com\u002F)、[Vaishaal Shankar](http:\u002F\u002Fvaishaal.com\u002F)、[John Miller](https:\u002F\u002Fpeople.eecs.berkeley.edu\u002F~miller_john\u002F)、[Hongseok Namkoong](https:\u002F\u002Fhsnamkoong.github.io\u002F)、[Hannaneh Hajishirzi](https:\u002F\u002Fhomes.cs.washington.edu\u002F~hannaneh\u002F)、[Ali Farhadi](https:\u002F\u002Fhomes.cs.washington.edu\u002F~ali\u002F)、[Ludwig 
Schmidt](https:\u002F\u002Fpeople.csail.mit.edu\u002Fludwigs\u002F)\n\n特别感谢 [Jong Wook Kim](https:\u002F\u002Fjongwook.kim\u002F) 和 [Alec Radford](https:\u002F\u002Fgithub.com\u002FNewmu) 在复现 CLIP 方面提供的帮助！\n\n## 引用\n\n如果您发现此仓库有用，请考虑引用：\n```bibtex\n@software{ilharco_gabriel_2021_5143773,\n  author       = {Ilharco, Gabriel and\n                  Wortsman, Mitchell and\n                  Wightman, Ross and\n                  Gordon, Cade and\n                  Carlini, Nicholas and\n                  Taori, Rohan and\n                  Dave, Achal and\n                  Shankar, Vaishaal and\n                  Namkoong, Hongseok and\n                  Miller, John and\n                  Hajishirzi, Hannaneh and\n                  Farhadi, Ali and\n                  Schmidt, Ludwig},\n  title        = {OpenCLIP},\n  month        = jul,\n  year         = 2021,\n  note         = {If you use this software, please cite it as below.},\n  publisher    = {Zenodo},\n  version      = {0.1},\n  doi          = {10.5281\u002Fzenodo.5143773},\n  url          = {https:\u002F\u002Fdoi.org\u002F10.5281\u002Fzenodo.5143773}\n}\n```\n\n```bibtex\n@inproceedings{cherti2023reproducible,\n  title={Reproducible scaling laws for contrastive language-image learning},\n  author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},\n  booktitle={Proceedings of the IEEE\u002FCVF Conference on Computer Vision and Pattern Recognition},\n  pages={2818--2829},\n  year={2023}\n}\n```\n\n```bibtex\n@inproceedings{Radford2021LearningTV,\n  title={Learning Transferable Visual Models From Natural Language Supervision},\n  author={Alec Radford and Jong Wook Kim and Chris Hallacy and A. 
Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},\n  booktitle={ICML},\n  year={2021}\n}\n```\n\n```bibtex\n@inproceedings{schuhmann2022laionb,\n  title={{LAION}-5B: An open large-scale dataset for training next generation image-text models},\n  author={Christoph Schuhmann and\n          Romain Beaumont and\n          Richard Vencu and\n          Cade W Gordon and\n          Ross Wightman and\n          Mehdi Cherti and\n          Theo Coombes and\n          Aarush Katta and\n          Clayton Mullis and\n          Mitchell Wortsman and\n          Patrick Schramowski and\n          Srivatsa R Kundurthy and\n          Katherine Crowson and\n          Ludwig Schmidt and\n          Robert Kaczmarczyk and\n          Jenia Jitsev},\n  booktitle={Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year={2022},\n  url={https:\u002F\u002Fopenreview.net\u002Fforum?id=M3Y74vmsMcY}\n}\n```\n\n[![DOI](https:\u002F\u002Fzenodo.org\u002Fbadge\u002F390536799.svg)](https:\u002F\u002Fzenodo.org\u002Fbadge\u002Flatestdoi\u002F390536799)","# OpenCLIP 快速上手指南\n\nOpenCLIP 是 OpenAI CLIP 模型的开源实现，支持多种预训练模型（如 ViT、ConvNeXt、SigLIP 等），适用于图像 - 文本对比学习及相关下游任务。\n\n## 环境准备\n\n- **Python**: 建议版本 3.8 及以上。\n- **PyTorch**: 需安装对应版本的 PyTorch（包含 CUDA 支持更佳）。\n- **其他依赖**:\n  - `Pillow`: 用于图像处理。\n  - `timm`: 若使用 ConvNeXt、SigLIP 等基于 timm 的图像编码器，请确保安装最新版。\n  - `transformers`: 若使用特定 Transformer 分词器，需安装此库。\n\n> **提示**：建议使用虚拟环境管理依赖，例如：\n> ```bash\n> python3 -m venv .env\n> source .env\u002Fbin\u002Factivate\n> ```\n\n## 安装步骤\n\n通过 PyPI 安装核心库：\n\n```bash\npip install open_clip_torch\n```\n\n如需进行模型训练，可安装额外依赖：\n\n```bash\npip install 'open_clip_torch[training]'\n```\n\n> **注意**：国内用户可通过配置 pip 镜像源加速下载（如清华源）：\n> ```bash\n> pip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple open_clip_torch\n> ```\n\n## 基本使用\n\n以下示例演示如何加载预训练模型并进行零样本推理（Zero-shot Inference）。\n\n### 1. 加载模型与预处理\n\n```python\nimport torch\nfrom PIL import Image\nimport open_clip\n\n# 创建模型及变换函数\nmodel, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')\nmodel.eval()  # 设置为评估模式\ntokenizer = open_clip.get_tokenizer('ViT-B-32')\n```\n\n### 2. 
编码与相似度计算\n\n```python\n# 读取并预处理图像\nimage = preprocess(Image.open(\"docs\u002FCLIP.png\")).unsqueeze(0)\n# 处理文本列表\ntext = tokenizer([\"a diagram\", \"a dog\", \"a cat\"])\n\n# 推理\nwith torch.no_grad(), torch.autocast(\"cuda\"):\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text)\n    \n    # 归一化特征\n    image_features \u002F= image_features.norm(dim=-1, keepdim=True)\n    text_features \u002F= text_features.norm(dim=-1, keepdim=True)\n\n    # 计算相似度概率\n    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n\nprint(\"Label probs:\", text_probs)  # 输出：[[1., 0., 0.]]\n```\n\n## 查看可用模型\n\n您可以通过以下命令列出所有支持的预训练模型名称：\n\n```python\n>>> import open_clip\n>>> open_clip.list_pretrained()\n```\n\n更多模型详情及性能指标可参考官方文档中的 `PRETRAINED.md` 文件。","某电商技术团队正在紧急开发移动端“以图搜货”功能，希望用户上传图片即可精准匹配相似商品，但面临数据标注不足的困境。\n\n### 没有 open_clip 时\n- 必须投入大量人力为百万级商品库进行图文标注，项目启动周期被拉长数月。\n- 从零训练对比学习模型收敛困难，且受限于自有数据规模，泛化能力严重不足。\n- 若采用商业闭源 API，不仅单次调用成本高，还存在核心用户数据外泄风险。\n- 现有方案固定了模型结构，无法根据服务器 GPU 性能动态调整推理速度与精度。\n\n### 使用 open_clip 后\n- 直接调用基于 LAION-2B 等大规模数据训练的预训练权重，实现零样本图像语义检索。\n- 提供 ViT-L、ConvNeXt 等多种架构选择，可依据部署环境灵活平衡计算资源与准确率。\n- 代码完全开源且支持本地部署，确保所有图片分析过程在内部服务器完成，杜绝隐私泄露。\n- 通过 Hugging Face Hub 一键加载最新 SOTA 模型，显著提升了新品上架首周的搜索匹配质量。\n\nopen_clip 凭借强大的零样本能力和灵活的模型生态，帮助团队在零标注成本下快速实现了高精度的视觉搜索服务。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmlfoundations_open_clip_6efba330.png","mlfoundations","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fmlfoundations_a182ca1f.png","",null,"https:\u002F\u002Fpeople.csail.mit.edu\u002Fludwigs\u002F","https:\u002F\u002Fgithub.com\u002Fmlfoundations",[83,87,91],{"name":84,"color":85,"percentage":86},"Python","#3572A5",99.4,{"name":88,"color":89,"percentage":90},"Shell","#89e051",0.5,{"name":92,"color":93,"percentage":94},"Makefile","#427819",0.1,13647,1270,"2026-04-05T17:59:01","NOASSERTION","未说明","未说明具体型号\u002F显存，需 CUDA 支持（示例代码使用 autocast cuda）",{"notes":102,"python":99,"dependencies":103},"训练需安装 [training] 扩展依赖；部分模型依赖最新 timm 和 transformers；注意 QuickGELU 与 nn.GELU 的激活函数差异；数据推荐使用 webdataset 格式 (.tar)；支持多卡分布式训练及 SLURM。",[104,105,106,107,108],"torch","Pillow","timm","transformers","webdataset",[26,14,13],[111,112,113,114,115,116,117,118],"deep-learning","pytorch","computer-vision","language-model","multi-modal-learning","contrastive-loss","zero-shot-classification","pretrained-models",15,"2026-03-27T02:49:30.150509","2026-04-06T09:46:12.541709",[123,128,132,137,141,146],{"id":124,"question_zh":125,"answer_zh":126,"source_url":127},1934,"如何获取单类零样本分类的 logits 和概率？","在多类别设置下，可以使用 `(100.0 * image_features @ text_features.T).softmax(dim=-1)` 获得校准的概率。但在单类别场景下，直接使用 sigmoid 可能无法提供良好校准。建议参考 CLIP 论文中的 logits 计算方式，确认 `model.logit_scale` 的值（约为 4.6，对应 exp(t)=100）。若需二分类，可构建 `[包含某类，不包含某类]` 的标签并使用 softmax。","https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fissues\u002F193",{"id":129,"question_zh":130,"answer_zh":131,"source_url":127},1935,"CLIP 模型的 logit_scale (t) 值在不同架构中是否一致？","是的，logit_scale 值通常是一致的。可以通过 `model.logit_scale` 进行确认。OpenAI 原始实现初始化 t=1\u002F.07（缩放因子约 14.29），经过训练后固定。在 OpenCLIP 中，该值用于控制温度系数，确保跨架构的一致性。",{"id":133,"question_zh":134,"answer_zh":135,"source_url":136},1936,"如何复现 OpenCLIP B\u002F32 在 ImageNet 上的报告准确率？","复现时需注意训练参数配置。例如使用 `--train-num-samples` 设置正确的样本数，并配合 `--dataset-type webdataset`。如果发现准确率偏低（如 40%-50% 而非报告的 62.9%），请检查是否使用了正确的数据集（如 laion-400m）以及优化器参数。","https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fissues\u002F385",{"id":138,"question_zh":139,"answer_zh":140,"source_url":136},1937,"OpenCLIP 
训练中局部损失 (local loss) 与全局损失 (global loss) 如何配置？","默认情况下两者可能存在差异。为了复现全局损失的效果，建议添加 `--gather-with-grad` 标志。经验证，使用 `local_loss + gather_with_grad` 后，损失曲线与全局损失一致，且收敛效果更好。",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},1938,"COCa 微调后生成的图像描述为何总是重复？","这可能与使用的代码版本有关。请确保使用的是最新版本的 open_clip 仓库代码。如果问题依旧，可能需要重新运行微调过程。旧版本可能存在 commit 差异导致生成结果异常。","https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fissues\u002F797",{"id":147,"question_zh":148,"answer_zh":149,"source_url":150},1939,"如何为每张图像生成多个描述（caption）？","可以通过对 batch 调用 `.repeat` 或 `.repeat_interleave` 来增加处理量，但这会增加耗时。另一种方法是调整采样策略，例如将默认的 beam search 改为 top p sampling，以便为单张图像生成多个不同的描述。","https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fissues\u002F462",[152,156,161,166,171,176,181,186,191,195,199,203,207,211,215,219,223,227,231,235],{"id":153,"version":154,"summary_zh":79,"released_at":155},111114,"v3.3.0","2026-02-27T00:32:41",{"id":157,"version":158,"summary_zh":159,"released_at":160},111115,"v3.2.0","## What's Changed\r\n* Remove non-existent MetaCLIP 2 L\u002F14 checkpoint by @voidism in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1111\r\n* Add MobileCLIP2 model configs & pretrained weights.  by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1112\r\n\r\n## New Contributors\r\n* @voidism made their first contribution in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1111\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv3.1.0...v3.2.0","2025-09-21T17:32:02",{"id":162,"version":163,"summary_zh":164,"released_at":165},111116,"v3.1.0","## What's Changed\r\n* Add support for MetaCLIP2 WorldWide models by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1100\r\n* Fix mask for CoCa generate by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1103\r\n* Add a text locking impl that works across CustomCLIP and CLIP. 
by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1104\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv3.0.0...v3.1.0","2025-08-06T14:48:35",{"id":167,"version":168,"summary_zh":169,"released_at":170},111117,"v3.0.0","## What's Changed\r\n* Initial work on adding local-dir: schema for model & tokenizer loading from local folder by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1069\r\n* Add --force-context-length argument by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1080\r\n* fix CustomTextCLIP.no_weight_decay by @thelaao in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1082\r\n* Fix an issue where CustomText models not gettin proper pos embed interpolation by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1085\r\n* Removing redundent checks by @shreyaskamathkm in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1096\r\n* PE Core by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1097\r\n* Alternative tokenizer support for CLIPS by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1081\r\n* Wire up custom attention block via config by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1086\r\n* An alternative text masking helper fn by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1084\r\n* Update min reqs, test on python 3.10 by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1098\r\n* Resize timm image encoders by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1099\r\n\r\n## New Contributors\r\n* @thelaao made their first contribution in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1082\r\n* @shreyaskamathkm made their first contribution in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1096\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv2.32.0...v3.0.0","2025-07-23T19:24:56",{"id":172,"version":173,"summary_zh":174,"released_at":175},111118,"v2.32.0","## What's Changed\r\n* API for getting intermediate image and text features, forward_intermediates() by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1035\r\n* Updated PyTorch code in README  by @Ja-Tink in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1050\r\n\r\n## New Contributors\r\n* @Ja-Tink made their first contribution in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1050\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv2.31.0...v2.32.0","2025-04-05T21:55:19",{"id":177,"version":178,"summary_zh":179,"released_at":180},111119,"v2.31.0","## What's Changed\r\n* Add SigLIP2 models by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F1033\r\n\r\n\r\n**Full Changelog**: 
https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv2.30.0...v2.31.0","2025-02-23T16:25:17",{"id":182,"version":183,"summary_zh":184,"released_at":185},111120,"v2.30.0","## What's Changed\r\n* Support using timm optimizers for alternative to adamw default by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F979\r\n* add missing ViTamin configs by @xywei00 in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F978\r\n* Experimenting with alternative siglip loss impl for better dist scaling by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F971\r\n\r\n## New Contributors\r\n* @xywei00 made their first contribution in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F978\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv2.29.0...v2.30.0","2025-01-04T16:10:47",{"id":187,"version":188,"summary_zh":189,"released_at":190},111121,"v2.29.0","## What's Changed\r\n* All default pretrained weights pushed to HF hub by @rwightman in https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fpull\u002F970\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fmlfoundations\u002Fopen_clip\u002Fcompare\u002Fv2.28.0...v2.29.0","2024-10-28T18:32:46",{"id":192,"version":193,"summary_zh":79,"released_at":194},111122,"v2.28.0","2024-10-21T22:07:54",{"id":196,"version":197,"summary_zh":79,"released_at":198},111123,"v2.27.1","2024-10-17T02:39:21",{"id":200,"version":201,"summary_zh":79,"released_at":202},111124,"v2.27.0","2024-10-15T20:10:54",{"id":204,"version":205,"summary_zh":79,"released_at":206},111125,"v2.26.1","2024-07-04T19:03:11",{"id":208,"version":209,"summary_zh":79,"released_at":210},111126,"v2.26.0","2024-07-04T19:00:12",{"id":212,"version":213,"summary_zh":79,"released_at":214},111127,"v2.24.0","2024-01-08T10:35:50",{"id":216,"version":217,"summary_zh":79,"released_at":218},111128,"v2.23.0","2023-10-24T16:03:10",{"id":220,"version":221,"summary_zh":79,"released_at":222},111129,"v2.22.0","2023-10-06T17:50:46",{"id":224,"version":225,"summary_zh":79,"released_at":226},111130,"v2.20.0","2023-05-15T15:41:34",{"id":228,"version":229,"summary_zh":79,"released_at":230},111131,"v2.19.0","2023-05-04T05:51:01",{"id":232,"version":233,"summary_zh":79,"released_at":234},111132,"v2.18.0","2023-04-26T20:53:09",{"id":236,"version":237,"summary_zh":79,"released_at":238},111133,"v2.17.2","2023-04-26T00:17:31"]