[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-rom1504--img2dataset":3,"tool-rom1504--img2dataset":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160784,2,"2026-04-19T11:32:54",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":78,"owner_email":79,"owner_twitter":80,"owner_website":81,"owner_url":82,"languages":83,"stars":104,"forks":105,"last_commit_at":106,"license":107,"difficulty_score":32,"env_os":108,"env_gpu":109,"env_ram":110,"env_deps":111,"category_tags":121,"github_topics":123,"view_count":32,"oss_zip_url":80,"oss_zip_packed_at":80,"status":17,"created_at":131,"updated_at":132,"faqs":133,"releases":164},9611,"rom1504\u002Fimg2dataset","img2dataset","Easily turn large sets of image urls to an image dataset. 
Can download, resize and package 100M urls in 20h on one machine.","img2dataset 是一款专为机器学习领域设计的高效数据准备工具，它能将海量的图片网址列表快速转化为标准化的图像数据集。在训练 AI 模型时，研究人员往往面临从互联网收集、下载并整理数百万甚至数十亿张图片的巨大挑战，手动操作不仅耗时费力，还难以保证数据格式的统一。img2dataset 正是为了解这一痛点而生，它支持批量下载、自动调整图片尺寸以及打包存储，甚至在单台机器上也能在约 20 小时内处理高达 1 亿个网址，极大提升了数据工程效率。\n\n除了纯图像数据，该工具还能同步保存对应的文本描述，非常适合构建用于图文多模态训练的数据集（如 LAION、COCO 等知名数据集均可通过它轻松获取）。其技术亮点在于卓越的性能优化，包括对高并发下载的支持以及对网站“禁止 AI 抓取”协议的尊重与灵活配置，确保数据采集过程既快速又合规。\n\nimg2dataset 主要面向 AI 开发者、数据科学家及机器学习研究人员，尤其是那些需要大规模预训练数据的团队。虽然它通过命令行运行，对普通用户有一定门槛，但其提供的 Colab 笔记本和详细文档让上手变得相对简单。如果你正致力于计算机视觉或生成式 AI ","img2dataset 是一款专为机器学习领域设计的高效数据准备工具，它能将海量的图片网址列表快速转化为标准化的图像数据集。在训练 AI 模型时，研究人员往往面临从互联网收集、下载并整理数百万甚至数十亿张图片的巨大挑战，手动操作不仅耗时费力，还难以保证数据格式的统一。img2dataset 正是为了解这一痛点而生，它支持批量下载、自动调整图片尺寸以及打包存储，甚至在单台机器上也能在约 20 小时内处理高达 1 亿个网址，极大提升了数据工程效率。\n\n除了纯图像数据，该工具还能同步保存对应的文本描述，非常适合构建用于图文多模态训练的数据集（如 LAION、COCO 等知名数据集均可通过它轻松获取）。其技术亮点在于卓越的性能优化，包括对高并发下载的支持以及对网站“禁止 AI 抓取”协议的尊重与灵活配置，确保数据采集过程既快速又合规。\n\nimg2dataset 主要面向 AI 开发者、数据科学家及机器学习研究人员，尤其是那些需要大规模预训练数据的团队。虽然它通过命令行运行，对普通用户有一定门槛，但其提供的 Colab 笔记本和详细文档让上手变得相对简单。如果你正致力于计算机视觉或生成式 AI 的研究，想要摆脱繁琐的数据清洗工作，专注于模型创新，那么 img2dataset 将是你不可或缺的效率助手。","# img2dataset\n[![pypi](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fimg2dataset.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fimg2dataset)\n[![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002From1504\u002Fimg2dataset\u002Fblob\u002Fmaster\u002Fnotebook\u002Fimg2dataset_getting_started.ipynb)\n[![Try it on gitpod](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Ftry-on%20gitpod-brightgreen.svg)](https:\u002F\u002Fgitpod.io\u002F#https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset)\n[![Chat on discord](https:\u002F\u002Fimg.shields.io\u002Fdiscord\u002F823813159592001537?color=5865F2&logo=discord&logoColor=white)](https:\u002F\u002Fdiscord.gg\u002Feq3cAMZtCC)\n\nEasily turn large sets of image urls to an image 
dataset.\nCan download, resize and package 100M urls in 20h on one machine.\n\nAlso supports saving captions for url+caption datasets.\n\nIf you believe in making reusable tools to make data easy to use for ML and you would like to contribute, please join the [DataToML](https:\u002F\u002Fdiscord.gg\u002Fep8yUUtCnp) chat.\n\n## Install\n\n```bash\npip install img2dataset\n```\n\nFor better performance, it's highly recommended to set up a fast dns resolver, see [this section](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset#setting-up-a-high-performance-dns-resolver)\n\n## Opt-out directives\n\nWebsites can pass the http headers `X-Robots-Tag: noai`, `X-Robots-Tag: noindex` , `X-Robots-Tag: noimageai` and `X-Robots-Tag: noimageindex`\nBy default img2dataset will ignore images with such headers.\n\nTo disable this behavior and download all images, you may pass --disallowed_header_directives '[]'\n\nSee [AI use impact](#ai-use-impact) to understand better why you may decide to enable or disable this feature.\n\n## Examples\n\nExample of datasets to download with example commands are available in the [dataset_examples](dataset_examples) folder. 
In particular:\n* [mscoco](dataset_examples\u002Fmscoco.md) 600k image\u002Ftext pairs that can be downloaded in 10min\n* [sbucaptions](dataset_examples\u002FSBUcaptions.md) 860K image\u002Ftext pairs can be downloaded in 20 mins.\n* [cc3m](dataset_examples\u002Fcc3m.md) 3M image\u002Ftext pairs that can be downloaded in one hour\n* [cc12m](dataset_examples\u002Fcc12m.md) 12M image\u002Ftext pairs that can be downloaded in five hour\n* [laion400m](dataset_examples\u002Flaion400m.md) 400M image\u002Ftext pairs that can be downloaded in 3.5 days\n* [laion5B](dataset_examples\u002Flaion5B.md) 5B image\u002Ftext pairs that can be downloaded in 7 days using 10 nodes\n* [laion-aesthetic](dataset_examples\u002Flaion-aesthetic.md) Laion aesthetic is a 120M laion5B subset with aesthetic > 7 pwatermark \u003C 0.8 punsafe \u003C 0.5\n* [laion-art](dataset_examples\u002Flaion-art.md) Laion aesthetic is a 8M laion5B subset with aesthetic > 8 pwatermark \u003C 0.8 punsafe \u003C 0.5\n* [laion-coco](dataset_examples\u002Flaion-coco.md) Laion-COCO is a 600M subset of LAION2B-EN, captioned with an ensemble of BLIP L\u002F14 and 2 CLIP versions (L\u002F14 and RN50x64).\n* [laion-high-resolution](dataset_examples\u002Flaion-high-resolution.md) Laion high resolution is a 170M resolution >= 1024x1024 subset of laion5B\n* [laion-face](dataset_examples\u002Flaion-face.md) Laion face is the human face subset of LAION-400M for large-scale face pretraining. 
It has 50M image-text pairs.\n* [coyo-700m](dataset_examples\u002Fcoyo-700m.md) COYO is a large-scale dataset that contains 747M image-text pairs as well as many other meta-attributes to increase the usability to train various models.\n* [commonpool](dataset_examples\u002Fcommon_pool.md) CommonPool is a large-scale dataset collected from CommonCrawl containing 12.8B image-text pairs.\n* [datacomp-1b](dataset_examples\u002Fdatacomp.md) DataComp-1B is a large-scale dataset with 1.4B image-text pairs filtered from CommonPool.\n\nFor all these examples, you may want to tweak the resizing to your preferences. The default is 256x256 with white borders.\nSee options below.\n\n## Usage\n\nFirst get some image url list. For example:\n\n```bash\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F305' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F304' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F303' >> myimglist.txt\n```\n\nThen, run the tool:\n\n```bash\nimg2dataset --url_list=myimglist.txt --output_folder=output_folder --thread_count=64 --image_size=256\n```\n\nThe tool will then automatically download the urls, resize them, and store them with that format:\n* output_folder\n    * 00000\n        * 000000000.jpg\n        * 000000001.jpg\n        * 000000002.jpg\n\nor as this format if choosing webdataset:\n* output_folder\n    * 00000.tar containing:\n        * 000000000.jpg\n        * 000000001.jpg\n        * 000000002.jpg\n\nwith each number being the position in the list. The subfolders avoids having too many files in a single folder.\n\nIf **captions** are provided, they will be saved as 0.txt, 1.txt, ...\n\nThis can then easily be fed into machine learning training or any other use case.\n\nAlso .json files named 0.json, 1.json,... 
are saved with these keys:\n* url\n* caption\n* key of the form 000010005 : the first 5 digits are the shard id, the last 4 are the index in the shard\n* status : whether the download succeeded\n* error_message\n* width\n* height\n* original_width\n* original_height\n* exif\n\nAlso a .parquet file will be saved with the same name as the subfolder\u002Ftar files containing these same metadata.\nIt can be used to analyze the results efficiently.\n\n.json files will also be saved with the same name suffixed by _stats, they contain stats collected during downloading (download time, number of success, ...)\n\n## Python examples\n\nCheckout these examples to call this as a lib:\n* [simple_example.py](examples\u002Fsimple_example.py)\n* [pyspark_example.py](examples\u002Fpyspark_example.py)\n* [distributed img2dataset tutorial](examples\u002Fdistributed_img2dataset_tutorial.md)\n\n## API\n\nThis module exposes a single function `download` which takes the same arguments as the command line tool:\n\n* **url_list** A file with the list of url of images to download. It can be a folder of such files. (*required*)\n* **image_size** The size to resize image to (default *256*)\n* **output_folder** The path to the output folder. (default *\"images\"*)\n* **processes_count** The number of processes used for downloading the pictures. This is important to be high for performance. (default *1*)\n* **thread_count** The number of threads used for downloading the pictures. This is important to be high for performance. 
(default *256*)\n* **resize_mode** The way to resize pictures, can be no, border or keep_ratio (default *border*)\n  * **no** doesn't resize at all\n  * **border** will make the image image_size x image_size and add a border\n  * **keep_ratio** will keep the ratio and make the smallest side of the picture image_size\n  * **keep_ratio_largest** will keep the ratio and make the largest side of the picture image_size\n  * **center_crop** will keep the ratio and center crop the largest side so the picture is squared\n* **resize_only_if_bigger** resize pictures only if bigger that the image_size (default *False*)\n* **upscale_interpolation** kind of upscale interpolation used for resizing (default *\"lanczos\"*)\n* **downscale_interpolation** kind of downscale interpolation used for resizing (default *\"area\"*)\n* **encode_quality** encode quality from 0 to 100, when using png it is the compression factor from 0 to 9 (default *95*)\n* **encode_format** encode format (default *jpg*)\n  * **jpg** jpeg format\n  * **png** png format\n  * **webp** webp format\n* **skip_reencode** whether to skip reencoding if no resizing is done (default *False*)\n* **output_format** decides how to save pictures (default *files*)\n  * **files** saves as a set of subfolder containing pictures\n  * **webdataset** saves as tars containing pictures\n  * **parquet** saves as parquet containing pictures as bytes\n  * **tfrecord** saves as tfrecord containing pictures as bytes\n  * **dummy** does not save. 
Useful for benchmarks\n* **input_format** decides how to load the urls (default *txt*)\n  * **txt** loads the urls as a text file of url, one per line\n  * **txt.gz** loads the urls as a compressed (gzip) txt.gz with a list of url, one per line\n  * **csv** loads the urls and optional caption as a csv\n  * **csv.gz** loads the urls and optional caption, as a compressed (gzip) csv.gz\n  * **tsv** loads the urls and optional caption as a tsv\n  * **tsv.gz** loads the urls and optional caption, as a compressed (gzip) tsv.gz\n  * **json** loads the urls and optional caption as a json\n  * **json.gz** loads the urls and optional caption, as a compressed (gzip) json.gz\n  * **jsonl** loads the urls and optional caption as a jsonl. see [jsonlines](https:\u002F\u002Fjsonlines.org\u002F) for more\n  * **jsonl.gz** loads the urls and optional caption, as a compressed (gzip) jsonl.gz. see [jsonlines](https:\u002F\u002Fjsonlines.org\u002F) for more\n  * **parquet** loads the urls and optional caption as a parquet\n* **url_col** the name of the url column for parquet and csv (default *url*)\n* **caption_col** the name of the caption column for parquet and csv (default *None*)\n* **bbox_col** the name of the bounding box column. Bounding boxes are assumed to have format ```[x_min, y_min, x_max, y_max]```, with all elements being floats in *[0,1]* (relative to the size of the image). 
If *None*, then no bounding box blurring is performed (default *None*)\n* **number_sample_per_shard** the number of sample that will be downloaded in one shard (default *10000*)\n* **extract_exif** if true, extract the exif information of the images and save it to the metadata (default *True*)\n* **save_additional_columns** list of additional columns to take from the csv\u002Fparquet files and save in metadata files (default *None*)\n* **timeout** maximum time (in seconds) to wait when trying to download an image (default *10*)\n* **enable_wandb** whether to enable wandb logging (default *False*)\n* **wandb_project** name of W&B project used (default *img2dataset*)\n* **oom_shard_count** the order of magnitude of the number of shards, used only to decide what zero padding to use to name the shard files (default *5*)\n* **compute_hash** the hash of raw images to compute and store in the metadata, one of *None*, *md5*, *sha256*, *sha512* (default *sha256*)\n* **verify_hash** if not *None*, then this is a list of two elements that will be used to verify hashes based on the provided input. The first element of this list is the label of the column containing the hashes in the input file, while the second one is the type of the hash that is being checked (default *None*)\n* **distributor** choose how to distribute the downloading (default *multiprocessing*)\n  * **multiprocessing** use a multiprocessing pool to spawn processes\n  * **pyspark** use a pyspark session to create workers on a spark cluster (see details below)\n  * **ray** use a ray cluster. See ray example.\n* **subjob_size** the number of shards to download in each subjob supporting it, a subjob can be a pyspark job for example (default *1000*)\n* **retries** number of time a download should be retried (default *0*)\n* **disable_all_reencoding** if set to True, this will keep the image files in their original state with no resizing and no conversion, will not even check if the image is valid. 
Useful for benchmarks. To use only if you plan to post process the images by another program and you have plenty of storage available. (default *False*)\n* **min_image_size** minimum size of the image to download (default *0*)\n* **max_image_area** maximum area of the image to download (default *inf*)\n* **max_aspect_ratio** maximum aspect ratio of the image to download (default *inf*)\n* **incremental_mode** Can be \"incremental\", \"overwrite\" or \"extend\". For \"incremental\", img2dataset will download all the shards that were not downloaded, for \"overwrite\" img2dataset will delete recursively the output folder then start from zero, for \"extend\" img2dataset will download shards from the next available shard number (default *incremental*)\n* **max_shard_retry** Number of time to retry failed shards at the end (default *1*)\n* **user_agent_token** Additional identifying token that will be added to the User-Agent header sent with HTTP requests to download images; for example: \"img2downloader\". (default *None*)\n* **disallowed_header_directives** List of X-Robots-Tags header directives that, if present in HTTP response when downloading an image, will cause the image to be excluded from the output dataset. To ignore x-robots-tags, pass '[]'. (default '[\"noai\", \"noimageai\", \"noindex\", \"noimageindex\"]')\n* **ignore_ssl_certificate** If set to True, SSL certificate verification will be disabled when downloading images. This allows downloading from servers with invalid or self-signed certificates, but reduces security. Use with caution. (default *False*)\n\n## Incremental mode\n\nIf a first download got interrupted for any reason, you can run again with --incremental \"incremental\" (this is the default) and using the same output folder , the same number_sample_per_shard and the same input urls, and img2dataset will complete the download.\n\n## Output format choice\n\nImg2dataset support several formats. 
There are trade off for which to choose:\n* files: this is the simplest one, images are simply saved as files. It's good for up to 1M samples on a local file system. Beyond that performance issues appear very fast. Handling more than a million files in standard filesystem does not work well.\n* webdataset: webdataset format saves samples in tar files, thanks to [webdataset](https:\u002F\u002Fwebdataset.github.io\u002Fwebdataset\u002F) library, this makes it possible to load the resulting dataset fast in both pytorch, tensorflow and jax. Choose this for most use cases. It works well for any filesystem\n* parquet: parquet is a columnar format that allows fast filtering. It's particularly easy to read it using pyarrow and pyspark. Choose this if the rest of your data ecosystem is based on pyspark. [petastorm](https:\u002F\u002Fgithub.com\u002Fuber\u002Fpetastorm) can be used to read the data but it's not as easy to use as webdataset\n* tfrecord: tfrecord is a protobuf based format. It's particularly easy to use from tensorflow and using [tf data](https:\u002F\u002Fwww.tensorflow.org\u002Fguide\u002Fdata). Use this if you plan to use the dataset only in the tensorflow ecosystem. The tensorflow writer does not use fsspec and as a consequence supports only a limited amount of filesystem, including local, hdfs, s3 and gcs. 
It is also less efficient than the webdataset writer when writing to other filesystems than local, losing some 30% performance.\n\n## Encode format choice\n\nImages can be encoded in jpeg, png or webp, with different quality settings.\n\nHere are a few comparisons of space used for 1M images at 256 x 256:\n\n| format | quality | compression | size (GB)  |\n| ------ | ------- | ----------- | ---------- |\n| jpg    | 100     | N\u002FA         | 54.2       |\n| jpg    | 95      | N\u002FA         | 29.9       |\n| png    | N\u002FA     | 0           | 187.9      |\n| png    | N\u002FA     | 9           | 97.7       |\n| webp   | 100     | N\u002FA         | 31.0       |\n| webp   | 95      | N\u002FA         | 23.8       |\n\nNotes:\n\n* jpeg at quality 100 is NOT lossless\n* png format is lossless\n* webp at quality >100 is lossless ([see OpenCV Docs](https:\u002F\u002Fdocs.opencv.org\u002F3.4\u002Fd8\u002Fd6a\u002Fgroup__imgcodecs__flags.html))\n* same quality scale between formats does not mean same image quality\n\n## Filtering the dataset\n\nWhenever feasible, you should pre-filter your dataset prior to downloading.\n\nIf needed, you can use:\n* --min_image_size SIZE : to filter out images with one side smaller than SIZE\n* --max_image_area SIZE : to filter out images with area larger than SIZE\n* --max_aspect_ratio RATIO : to filter out images with an aspect ratio greater than RATIO\n\nWhen filtering data, it is recommended to pre-shuffle your dataset to limit the impact on shard size distribution.\n\n## Hashes and security\n\nSome dataset (for example laion5B) expose hashes of original images.\n\nIf you want to be extra safe, you may automatically drop out the images that do not match theses hashes.\nIn that case you can use `--compute_hash \"md5\" --verify_hash '[\"md5\",\"md5\"]'` \nSome of those images are actually still good but have been slightly changed by the websites.\n\n## SSL Certificate Handling\n\nBy default, img2dataset verifies SSL certificates 
when downloading images. However, some servers may have invalid or self-signed certificates.\n\nTo download images from such servers, you can disable SSL certificate verification:\n\n```bash\nimg2dataset --url_list=myimglist.txt --output_folder=output_folder --ignore_ssl_certificate=True\n```\n\n**Warning**: Disabling SSL certificate verification reduces security by making downloads vulnerable to man-in-the-middle attacks. Only use this option when necessary and when you trust the source.\n\n## How to tweak the options\n\nThe default values should be good enough for small sized dataset. For larger ones, these tips may help you get the best performance:\n\n* set the processes_count as the number of cores your machine has\n* increase thread_count as long as your bandwidth and cpu are below the limits\n* I advise to set output_format to webdataset if your dataset has more than 1M elements, it will be easier to manipulate few tars rather than million of files\n* keeping metadata to True can be useful to check what items were already saved and avoid redownloading them\n\nTo benchmark your system, and img2dataset interactions with it, it may be interesting to enable these options (only for testing, not for real downloads)\n* --output_format dummy : will not save anything. Good to remove the storage bottleneck\n* --disable_all_reencoding True : will not reencode anything. Good to remove the cpu bottleneck\nWhen both these options are enabled, the only bottlenecks left are network related: eg dns setup, your bandwidth or the url servers bandwidth.\n\n## File system support\n\nThanks to [fsspec](https:\u002F\u002Ffilesystem-spec.readthedocs.io\u002Fen\u002Flatest\u002F), img2dataset supports reading and writing files in [many file systems](https:\u002F\u002Fgithub.com\u002Ffsspec\u002Ffilesystem_spec\u002Fblob\u002F6233f315548b512ec379323f762b70764efeb92c\u002Ffsspec\u002Fregistry.py#L87).\nTo use it, simply use the prefix of your filesystem before the path. 
For example `hdfs:\u002F\u002F`, `s3:\u002F\u002F`, `http:\u002F\u002F`, `gcs:\u002F\u002F`, `ssh:\u002F\u002F` or `hf:\u002F\u002F` (includes a [Dataset Viewer](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhub\u002Fdatasets-viewer)).\nSome of these file systems require installing an additional package (for example s3fs for s3, gcsfs for gcs, [fsspec\u002Fsshfs](https:\u002F\u002Fgithub.com\u002Ffsspec\u002Fsshfs) for ssh, [huggingface_hub](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fguides\u002Fhf_file_system) for hf).\nSee fsspec doc for all the details.\n\nIf you need specific configuration for your filesystem, you may handle this problem by using the [fsspec configuration system](https:\u002F\u002Ffilesystem-spec.readthedocs.io\u002Fen\u002Flatest\u002Ffeatures.html#configuration) that makes it possible to create a file such as `.config\u002Ffsspec\u002Fs3.json` and have information in it such as:\n\n```json\n{\n  \"s3\": {\n    \"client_kwargs\": {\n            \"endpoint_url\": \"https:\u002F\u002Fsome_endpoint\",\n            \"aws_access_key_id\": \"your_user\",\n           \"aws_secret_access_key\": \"your_password\"\n    }\n  }\n}\n```\n\nWhich may be necessary if using s3 compatible file systems such as [minio](https:\u002F\u002Fmin.io\u002F). 
That kind of configuration also work for all other fsspec-supported file systems.\n\n## Distribution modes\n\nImg2dataset supports several distributors.\n* multiprocessing which spawns a process pool and use these local processes for downloading\n* pyspark which spawns workers in a spark pool to do the downloading\n\nmultiprocessing is a good option for downloading on one machine, and as such it is the default.\nPyspark lets img2dataset use many nodes, which makes it as fast as the number of machines.\nIt can be particularly useful if downloading datasets with more than a billion image.\n\n### pyspark configuration\n\nIn order to use img2dataset with pyspark, you will need to do this:\n1. `pip install pyspark`\n2. use the `--distributor pyspark` option\n3. tweak the `--subjob_size 1000` option: this is the number of images to download in each subjob. Increasing it will mean a longer time of preparation to put the feather files in the temporary dir, a shorter time will mean sending less shards at a time to the pyspark job.\n\nBy default a local spark session will be created.\nYou may want to create a custom spark session depending on your specific spark cluster.\nTo do that check [pyspark_example.py](examples\u002Fpyspark_example.py), there you can plug your custom code to create a spark session, then\nrun img2dataset which will use it for downloading.\n\nTo create a spark cluster check the [distributed img2dataset tutorial](examples\u002Fdistributed_img2dataset_tutorial.md)\n\n## Integration with Weights & Biases\n\nTo enable wandb, use the `--enable_wandb=True` option.\n\nPerformance metrics are monitored through [Weights & Biases](https:\u002F\u002Fwandb.com\u002F).\n\n![W&B metrics](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002From1504_img2dataset_readme_66c7a17d177f.png)\n\nIn addition, most frequent errors are logged for easier debugging.\n\n![W&B 
table](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002From1504_img2dataset_readme_484aee86722e.png)\n\nOther features are available:\n\n* logging of environment configuration (OS, python version, CPU count, Hostname, etc)\n* monitoring of hardware resources (GPU\u002FCPU, RAM, Disk, Networking, etc)\n* custom graphs and reports\n* comparison of runs (convenient when optimizing parameters such as number of threads\u002Fcpus)\n\nWhen running the script for the first time, you can decide to either associate your metrics to your account or log them anonymously.\n\nYou can also log in (or create an account) before by running `wandb login`.\n\n## Road map\n\nThis tool works very well in the current state for up to 100M elements. Future goals include:\n\n* a benchmark for 1B pictures which may require\n  * further optimization on the resizing part\n  * better multi node support\n  * integrated support for incremental support (only download new elements)\n\n## Architecture notes\n\nThis tool is designed to download pictures as fast as possible.\nThis put a stress on various kind of resources. Some numbers assuming 1350 image\u002Fs:\n* Bandwidth: downloading a thousand average image per second requires about 130MB\u002Fs\n* CPU: resizing one image may take several milliseconds, several thousand per second can use up to 16 cores\n* DNS querying: million of urls mean million of domains, default OS setting usually are not enough. Setting up a local bind9 resolver may be required\n* Disk: if using resizing, up to 30MB\u002Fs write speed is necessary. If not using resizing, up to 130MB\u002Fs. Writing in few tar files make it possible to use rotational drives instead of a SSD.\n\nWith these information in mind, the design choice was done in this way:\n* the list of urls is split in K shards. 
K is chosen such that a shard has a reasonable size on disk (for example 256MB), by default K = 10000\n* N processes are started (using multiprocessing process pool)\n  * each process starts M threads. M should be maximized in order to use as much network as possible while keeping cpu usage below 100%.\n  * each of this thread download 1 image and returns it\n  * the parent thread handle resizing (which means there is at most N resizing running at once, using up the cores but not more)\n  * the parent thread saves to a tar file that is different from other process\n\nThis design make it possible to use the CPU resource efficiently by doing only 1 resize per core, reduce disk overhead by opening 1 file per core, while using the bandwidth resource as much as possible by using M thread per process.\n\nAlso see [architecture.md](img2dataset\u002Farchitecture.md) for the precise split in python modules.\n\n## Setting up a high performance dns resolver\n\nTo get the best performances with img2dataset, using an efficient dns resolver is needed.\n* knot resolver can run in parallel\n* bind resolver is the historic resolver and is mono core but very optimized\n\n\n### Setting up a knot resolver\n\nFollow [the official quick start](https:\u002F\u002Fknot-resolver.readthedocs.io\u002Fen\u002Fstable\u002Fquickstart-install.html) or run this on ubuntu:\n\ninstall knot with\n\n```bash\nwget https:\u002F\u002Fsecure.nic.cz\u002Ffiles\u002Fknot-resolver\u002Fknot-resolver-release.deb\nsudo dpkg -i knot-resolver-release.deb\nsudo apt update\nsudo apt install -y knot-resolver\nsudo sh -c 'echo `hostname -I` `hostname` >> \u002Fetc\u002Fhosts'\nsudo sh -c 'echo nameserver 127.0.0.1 > \u002Fetc\u002Fresolv.conf'\nsudo systemctl stop systemd-resolved\n```\n\nthen start 4 instances with\n\n```bash\nsudo systemctl start kresd@1.service\nsudo systemctl start kresd@2.service\nsudo systemctl start kresd@3.service\nsudo systemctl start kresd@4.service\n```\n\nCheck it works 
with\n\n```bash\ndig @localhost google.com\n```\n\n### Setting up a bind9 resolver\n\nIn order to keep the success rate high, it is necessary to use an efficient DNS resolver.\nI tried several options: systemd-resolved, dnsmasq and bind9 and reached the conclusion that bind9 reaches the best performance for this use case.\nHere is how to set this up on Ubuntu. Run:\n\n```bash\nsudo apt install bind9\nsudo vim \u002Fetc\u002Fbind\u002Fnamed.conf.options\n```\n\nAnd add this in `options`:\n\n```\n\trecursive-clients 10000;\n\tresolver-query-timeout 30000;\n\tmax-clients-per-query 10000;\n\tmax-cache-size 2000m;\n```\n\nThen, run:\n\n```bash\nsudo systemctl restart bind9\necho nameserver 127.0.0.1 | sudo tee -a \u002Fetc\u002Fresolv.conf\n```\n\nThis will make it possible to keep a high success rate while doing thousands of dns queries.\nYou may also want to [set up bind9 logging](https:\u002F\u002Fnsrc.org\u002Factivities\u002Fagendas\u002Fen\u002Fdnssec-3-days\u002Fdns\u002Fmaterials\u002Flabs\u002Fen\u002Fdns-bind-logging.html) in order to check that few dns errors happen.\n\n\n## AI use impact\n\nimg2dataset is used to retrieve images from the web and make them easily available for ML use cases. Use cases involve:\n* doing inference and indexing to better understand what is on the web (https:\u002F\u002From1504.github.io\u002Fclip-retrieval\u002F is an example of this)\n* training models\n\nModels that can be trained using image\u002Ftext datasets include:\n* CLIP: an image understanding model that makes it possible, for example, to know whether an image is safe, aesthetic, what animal it contains, ...\n* text to image models: generating images based on text\n\nThere is a lot of discussion regarding the consequences of text to image models. Some opinions include:\n* AI art is democratizing art and letting hundreds of millions of people express themselves through art. 
Making art much more prevalent and unique\n* AI models should not be trained on images that creators do not want to share\n\nThe opt-out directive tries to let creators who do not want to share their art keep it from being used for indexing and for training.\n\n\n## For development\n\nEither locally, or in [gitpod](https:\u002F\u002Fgitpod.io\u002F#https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset) (do `export PIP_USER=false` there)\n\nSet up a virtualenv:\n\n```bash\npython3 -m venv .env\nsource .env\u002Fbin\u002Factivate\npip install -e .\n```\n\nto run tests:\n\n```bash\npip install -r requirements-test.txt\n```\nthen\n\n```bash\nmake lint\nmake test\n```\n\nYou can use `make black` to reformat the code\n\n`python -m pytest -x -s -v tests -k \"dummy\"` to run a specific test\n\n## Benchmarks\n\n### 10000 image benchmark\n\n```bash\ncd tests\u002Ftest_files\nbash benchmark.sh\n```\n\n\n### 18M image benchmark\n\nDownload crawling at home first part, then:\n\n```bash\ncd tests\nbash large_bench.sh\n```\n\nIt takes 3.7h to download 18M pictures\n\n1350 images\u002Fs is the currently observed performance. 
4.8M images per hour, 116M images per 24h.\n\n\n### 36M image benchmark\n\ndownloading 2 parquet files of 18M items (result 936GB) took 7h24\naverage of 1345 image\u002Fs\n\n### 190M benchmark\n\ndownloading 190M images from the [crawling at home dataset](https:\u002F\u002Fgithub.com\u002From1504\u002Fcah-prepro) took 41h (result 5TB)\naverage of 1280 image\u002Fs\n\n### 5B benchmark\n\ndownloading 5.8B images from the [laion5B dataset](https:\u002F\u002Flaion.ai\u002Flaion-5b-a-new-era-of-open-large-scale-multi-modal-datasets\u002F) took 7 days (result 240TB), average of 9500 sample\u002Fs on 10 machines, [technical details](https:\u002F\u002From1504.medium.com\u002Fsemantic-search-at-billions-scale-95f21695689a)\n\n\n\n## Citation\n\n```bibtex\n@misc{beaumont-2021-img2dataset,\n  author = {Romain Beaumont},\n  title = {img2dataset: Easily turn large sets of image urls to an image dataset},\n  year = {2021},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset}}\n}\n```\n","# img2dataset\n[![pypi](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fimg2dataset.svg)](https:\u002F\u002Fpypi.python.org\u002Fpypi\u002Fimg2dataset)\n[![Open In Colab](https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg)](https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002From1504\u002Fimg2dataset\u002Fblob\u002Fmaster\u002Fnotebook\u002Fimg2dataset_getting_started.ipynb)\n[![Try it on gitpod](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Ftry-on%20gitpod-brightgreen.svg)](https:\u002F\u002Fgitpod.io\u002F#https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset)\n[![Chat on 
discord](https:\u002F\u002Fimg.shields.io\u002Fdiscord\u002F823813159592001537?color=5865F2&logo=discord&logoColor=white)](https:\u002F\u002Fdiscord.gg\u002Feq3cAMZtCC)\n\n轻松将大量图片URL转换为图像数据集。在单台机器上，可在20小时内下载、调整大小并打包1亿个URL。\n\n还支持保存URL+标题的数据集的标题信息。\n\n如果您相信构建可重用工具能够让机器学习中的数据更易于使用，并希望参与贡献，请加入 [DataToML](https:\u002F\u002Fdiscord.gg\u002Fep8yUUtCnp) 聊天群。\n\n## 安装\n\n```bash\npip install img2dataset\n```\n\n为了获得更好的性能，强烈建议设置一个高速DNS解析器，详情请参阅[此部分](https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset#setting-up-a-high-performance-dns-resolver)。\n\n## 退出指令\n\n网站可以通过HTTP头部传递 `X-Robots-Tag: noai`、`X-Robots-Tag: noindex`、`X-Robots-Tag: noimageai` 和 `X-Robots-Tag: noimageindex`。默认情况下，img2dataset会忽略带有这些头部的图片。\n\n若要禁用此行为并下载所有图片，可以传递 --disallowed_header_directives '[]'。\n\n请参阅[AI使用影响](#ai-use-impact)，以更好地理解为何您可能决定启用或禁用此功能。\n\n## 示例\n\n可在[dataset_examples](dataset_examples)文件夹中找到可下载的数据集示例及相应命令。具体包括：\n* [mscoco](dataset_examples\u002Fmscoco.md) 60万张图片\u002F文本对，可在10分钟内下载完成。\n* [sbucaptions](dataset_examples\u002FSBUcaptions.md) 86万张图片\u002F文本对，可在20分钟内下载完成。\n* [cc3m](dataset_examples\u002Fcc3m.md) 300万张图片\u002F文本对，可在1小时内下载完成。\n* [cc12m](dataset_examples\u002Fcc12m.md) 1200万张图片\u002F文本对，可在5小时内下载完成。\n* [laion400m](dataset_examples\u002Flaion400m.md) 4亿张图片\u002F文本对，可在3.5天内下载完成。\n* [laion5B](dataset_examples\u002Flaion5B.md) 50亿张图片\u002F文本对，使用10个节点可在7天内下载完成。\n* [laion-aesthetic](dataset_examples\u002Flaion-aesthetic.md) Laion aesthetic是包含1.2亿条记录的laion5B子集，其中审美评分>7，水印分数\u003C0.8，安全分数\u003C0.5。\n* [laion-art](dataset_examples\u002Flaion-art.md) Laion aesthetic是包含800万条记录的laion5B子集，其中审美评分>8，水印分数\u003C0.8，安全分数\u003C0.5。\n* [laion-coco](dataset_examples\u002Flaion-coco.md) Laion-COCO是LAION2B-EN中的6亿条记录子集，由BLIP L\u002F14和2种CLIP版本（L\u002F14和RN50x64）联合生成的标题。\n* [laion-high-resolution](dataset_examples\u002Flaion-high-resolution.md) Laion high resolution是从laion5B中筛选出的1.7亿条分辨率≥1024×1024的子集。\n* [laion-face](dataset_examples\u002Flaion-face.md) Laion 
face是从LAION-400M中筛选出的人脸子集，用于大规模人脸预训练，包含5000万张图片\u002F文本对。\n* [coyo-700m](dataset_examples\u002Fcoyo-700m.md) COYO是一个大规模数据集，包含7.47亿张图片\u002F文本对，以及许多其他元属性，以提高其在训练各种模型时的可用性。\n* [commonpool](dataset_examples\u002Fcommon_pool.md) CommonPool是从CommonCrawl收集的大规模数据集，包含128亿张图片\u002F文本对。\n* [datacomp-1b](dataset_examples\u002Fdatacomp.md) DataComp-1B是从CommonPool中筛选出的14亿张图片\u002F文本对的大规模数据集。\n\n对于以上所有示例，您可以根据自己的喜好调整图片的尺寸。默认设置为256×256像素，并添加白色边框。\n更多选项请见下文。\n\n## 使用方法\n\n首先获取一些图片URL列表。例如：\n\n```bash\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F305' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F304' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F303' >> myimglist.txt\n```\n\n然后运行该工具：\n\n```bash\nimg2dataset --url_list=myimglist.txt --output_folder=output_folder --thread_count=64 --image_size=256\n```\n\n工具将自动下载这些URL，调整图片大小，并按以下格式存储：\n* output_folder\n    * 00000\n        * 000000000.jpg\n        * 000000001.jpg\n        * 000000002.jpg\n\n或者如果选择webdataset格式，则会以如下形式存储：\n* output_folder\n    * 00000.tar 文件包含：\n        * 000000000.jpg\n        * 000000001.jpg\n        * 000000002.jpg\n\n每个编号对应于列表中的位置。子文件夹的设计避免了单个文件夹中文件过多的问题。\n\n如果提供了**标题**，它们将被保存为0.txt、1.txt等文件。\n\n之后，这些数据即可轻松输入到机器学习训练或其他应用场景中。\n\n此外，还会保存名为0.json、1.json等的JSON文件，包含以下键值：\n* url\n* 标题\n* 键值，格式为000010005：前5位数字表示分片ID，后4位表示分片内的索引\n* 状态：下载是否成功\n* 错误信息\n* 宽度\n* 高度\n* 原始宽度\n* 原始高度\n* EXIF信息\n\n同时，还会以与子文件夹或tar文件同名的方式保存一个.parquet文件，其中包含相同的元数据。这可用于高效地分析结果。\n\n另外，还会保存以_stats为后缀的JSON文件，其中包含下载过程中收集的统计信息（下载时间、成功次数等）。\n\n## Python示例\n\n请查看以下示例，了解如何将其作为库调用：\n* [simple_example.py](examples\u002Fsimple_example.py)\n* [pyspark_example.py](examples\u002Fpyspark_example.py)\n* [分布式img2dataset教程](examples\u002Fdistributed_img2dataset_tutorial.md)\n\n## API\n\n本模块暴露了一个名为 `download` 的函数，其参数与命令行工具的参数相同：\n\n* **url_list** 一个包含待下载图片 URL 列表的文件。也可以是一个包含此类文件的文件夹。（必填）\n* **image_size** 图片的目标尺寸（默认 *256*）\n* **output_folder** 输出文件夹的路径。（默认 *\"images\"*）\n* **processes_count** 
用于下载图片的进程数。为了性能，此值应设置得较高。（默认 *1*）\n* **thread_count** 用于下载图片的线程数。为了性能，此值应设置得较高。（默认 *256*）\n* **resize_mode** 图片的缩放方式，可选值为 no、border 或 keep_ratio（默认 *border*）：\n  * **no** 不进行任何缩放\n  * **border** 将图片调整为 `image_size x image_size` 大小，并添加边框\n  * **keep_ratio** 保持宽高比，使图片的短边长度为 `image_size`\n  * **keep_ratio_largest** 保持宽高比，使图片的长边长度为 `image_size`\n  * **center_crop** 保持宽高比，对长边进行居中裁剪，使图片变为正方形\n* **resize_only_if_bigger** 仅当图片大于目标尺寸时才进行缩放（默认 *False*）\n* **upscale_interpolation** 缩放时使用的插值方法（默认 *\"lanczos\"*）\n* **downscale_interpolation** 缩放时使用的插值方法（默认 *\"area\"*）\n* **encode_quality** 编码质量，范围为 0 到 100；使用 PNG 格式时，该参数表示压缩级别，范围为 0 到 9（默认 *95*）\n* **encode_format** 编码格式（默认 *jpg*）：\n  * **jpg** JPEG 格式\n  * **png** PNG 格式\n  * **webp** WebP 格式\n* **skip_reencode** 如果未进行缩放，是否跳过重新编码（默认 *False*）\n* **output_format** 决定如何保存图片（默认 *files*）：\n  * **files** 以包含图片的子文件夹集合形式保存\n  * **webdataset** 以包含图片的 tar 文件形式保存\n  * **parquet** 以包含图片字节数据的 Parquet 文件形式保存\n  * **tfrecord** 以包含图片字节数据的 TFRecord 文件形式保存\n  * **dummy** 不保存图片，适用于基准测试\n* **input_format** 决定如何加载 URL 列表（默认 *txt*）：\n  * **txt** 以每行一个 URL 的文本文件形式加载\n  * **txt.gz** 以 gzip 压缩的 txt.gz 文件形式加载，文件中每行包含一个 URL\n  * **csv** 以 CSV 文件形式加载 URL 及可选标题\n  * **csv.gz** 以 gzip 压缩的 csv.gz 文件形式加载 URL 及可选标题\n  * **tsv** 以 TSV 文件形式加载 URL 及可选标题\n  * **tsv.gz** 以 gzip 压缩的 tsv.gz 文件形式加载 URL 及可选标题\n  * **json** 以 JSON 文件形式加载 URL 及可选标题\n  * **json.gz** 以 gzip 压缩的 json.gz 文件形式加载 URL 及可选标题\n  * **jsonl** 以 JSONL 文件形式加载 URL 及可选标题。更多信息请参见 [jsonlines](https:\u002F\u002Fjsonlines.org\u002F)。\n  * **jsonl.gz** 以 gzip 压缩的 jsonl.gz 文件形式加载 URL 及可选标题。更多信息请参见 [jsonlines](https:\u002F\u002Fjsonlines.org\u002F)。\n  * **parquet** 以 Parquet 文件形式加载 URL 及可选标题\n* **url_col** Parquet 和 CSV 文件中 URL 列的名称（默认 *url*）\n* **caption_col** Parquet 和 CSV 文件中标题列的名称（默认 *None*）\n* **bbox_col** 边界框列的名称。假设边界框的格式为 ```[x_min, y_min, x_max, y_max]```，其中所有元素均为介于 *[0,1]* 之间的浮点数（相对于图片尺寸）。如果设置为 *None*，则不会对边界框进行模糊处理（默认 *None*）\n* **number_sample_per_shard** 每个分片将下载的样本数量（默认 *10000*）\n* 
**extract_exif** 如果为真，则提取图片的 EXIF 信息并将其保存到元数据中（默认 *True*）\n* **save_additional_columns** 从 CSV\u002FParquet 文件中提取并保存到元数据文件中的附加列列表（默认 *None*）\n* **timeout** 尝试下载一张图片时的最大等待时间（以秒为单位）（默认 *10*）\n* **enable_wandb** 是否启用 WandB 日志记录（默认 *False*）\n* **wandb_project** 使用的 W&B 项目名称（默认 *img2dataset*）\n* **oom_shard_count** 分片数量的数量级，仅用于决定为分片文件命名时应使用多少位零填充（默认 *5*）\n* **compute_hash** 需要计算并存储在元数据中的原始图片哈希值，可选值为 *None*、*md5*、*sha256*、*sha512*（默认 *sha256*）\n* **verify_hash** 如果不为 *None*，则该参数为一个包含两个元素的列表，用于根据提供的输入验证哈希值。列表的第一个元素是输入文件中包含哈希值的列的标签，第二个元素是所检查哈希值的类型（默认 *None*）\n* **distributor** 选择下载任务的分配方式（默认 *multiprocessing*）：\n  * **multiprocessing** 使用多进程池来启动进程\n  * **pyspark** 使用 PySpark 会话在 Spark 集群上创建工作节点（详情见下文）\n  * **ray** 使用 Ray 集群。请参阅 Ray 示例。\n* **subjob_size** 支持该功能的每个子任务中要下载的分片数量，例如，一个子任务可以是一个 PySpark 作业。（默认 *1000*）\n* **retries** 下载失败后重试的次数（默认 *0*）\n* **disable_all_reencoding** 如果设置为 True，图片文件将保持其原始状态，不进行任何缩放或格式转换，甚至不会检查图片是否有效。适用于基准测试。仅当您计划使用其他程序对图片进行后处理且拥有充足存储空间时才使用。（默认 *False*）\n* **min_image_size** 待下载图片的最小尺寸（默认 *0*）\n* **max_image_area** 待下载图片的最大面积（默认 *inf*）\n* **max_aspect_ratio** 待下载图片的最大宽高比（默认 *inf*）\n* **incremental_mode** 可选值为 \"incremental\"、\"overwrite\" 或 \"extend\"。对于 \"incremental\"，img2dataset 将下载所有尚未下载的分片；对于 \"overwrite\"，img2dataset 将递归删除输出文件夹并从头开始；对于 \"extend\"，img2dataset 将从下一个可用的分片编号开始下载。（默认 *incremental*）\n* **max_shard_retry** 在最后重试失败分片的次数（默认 *1*）\n* **user_agent_token** 额外的身份标识令牌，将被添加到用于下载图片的 HTTP 请求的 User-Agent 头中；例如：\"img2downloader\"。（默认 *None*）\n* **disallowed_header_directives** X-Robots-Tags 头部指令列表，如果在下载图片时的 HTTP 响应中出现这些指令，则该图片将被排除在输出数据集之外。若要忽略 X-Robots-Tags，请传递 '[]'。（默认 '[\"noai\", \"noimageai\", \"noindex\", \"noimageindex\"]'）\n* **ignore_ssl_certificate** 如果设置为 True，下载图片时将禁用 SSL 证书验证。这允许从使用无效或自签名证书的服务器下载图片，但会降低安全性。请谨慎使用。（默认 *False*)\n\n## 增量模式\n\n如果首次下载因任何原因中断，您可以再次运行命令，并添加 `--incremental \"incremental\"`（这是默认设置），同时使用相同的输出文件夹、相同的 `number_sample_per_shard` 参数以及相同的输入 URL 列表，img2dataset 将会继续完成剩余部分的下载。\n\n## 输出格式选择\n\nImg2dataset 
支持多种输出格式。不同格式各有优劣，需根据具体需求进行选择：\n\n* **files**：这是最简单的格式，图像直接以文件形式保存。适用于本地文件系统中最多 100 万张样本的情况。超过这个数量后，性能问题会迅速显现。在标准文件系统中处理超过 100 万个文件通常效果不佳。\n* **webdataset**：该格式将样本存储为 tar 文件，借助 [webdataset](https:\u002F\u002Fwebdataset.github.io\u002Fwebdataset\u002F) 库，可以高效地在 PyTorch、TensorFlow 和 JAX 中加载生成的数据集。对于大多数应用场景，推荐使用此格式。它适用于任何类型的文件系统。\n* **parquet**：Parquet 是一种列式存储格式，支持快速过滤。使用 PyArrow 和 PySpark 可以非常方便地读取这种格式的数据。如果您的数据生态系统主要基于 PySpark，则可以选择此格式。[Petastorm](https:\u002F\u002Fgithub.com\u002Fuber\u002Fpetastorm) 也可以用来读取数据，但其使用便捷性不如 webdataset。\n* **tfrecord**：tfrecord 是一种基于 Protocol Buffers 的格式，特别适合在 TensorFlow 生态系统中使用，并可通过 [tf.data](https:\u002F\u002Fwww.tensorflow.org\u002Fguide\u002Fdata) API 轻松操作。如果您计划仅在 TensorFlow 环境中使用该数据集，可选择此格式。不过，TensorFlow 的写入器不支持 fsspec，因此仅支持有限的文件系统，包括本地、HDFS、S3 和 GCS。此外，在非本地文件系统上写入时，其效率也低于 webdataset 写入器，性能可能降低约 30%。\n\n## 编码格式选择\n\n图像可以采用 JPEG、PNG 或 WebP 格式进行编码，并且每种格式都支持不同的质量设置。\n\n以下是 100 万张 256×256 分辨率图像在不同格式下的存储空间对比：\n\n| 格式   | 质量 | 压缩方式 | 大小 (GB) |\n| ------ | ---- | -------- | --------- |\n| JPG    | 100  | 无       | 54.2      |\n| JPG    | 95   | 无       | 29.9      |\n| PNG    | 无   | 0        | 187.9     |\n| PNG    | 无   | 9        | 97.7      |\n| WebP   | 100  | 无       | 31.0      |\n| WebP   | 95   | 无       | 23.8      |\n\n注意事项：\n\n* JPEG 在质量设置为 100 时并非无损压缩。\n* PNG 格式为无损压缩。\n* WebP 在质量设置高于 100 时为无损压缩（详见 [OpenCV 文档](https:\u002F\u002Fdocs.opencv.org\u002F3.4\u002Fd8\u002Fd6a\u002Fgroup__imgcodecs__flags.html)）。\n* 不同格式之间的相同质量等级并不意味着图像质量完全一致。\n\n## 数据集筛选\n\n在条件允许的情况下，建议在下载数据集之前先对其进行预筛选。\n\n如有需要，可以使用以下参数：\n* `--min_image_size SIZE`：用于过滤掉任意一边小于 SIZE 的图像。\n* `--max_image_area SIZE`：用于过滤掉面积大于 SIZE 的图像。\n* `--max_aspect_ratio RATIO`：用于过滤掉宽高比大于 RATIO 的图像。\n\n在进行数据筛选时，建议先对数据集进行打乱排序，以减少对分片大小分布的影响。\n\n## 哈希值与安全性\n\n某些数据集（例如 laion5B）会公开原始图像的哈希值。\n\n如果您希望更加安全，可以自动排除与这些哈希值不匹配的图像。此时可以使用 `--compute_hash \"md5\" --verify_hash '[\"md5\",\"md5\"]'` 参数。\n\n需要注意的是，部分图像虽然经过轻微修改，但仍属有效。\n\n## SSL 证书处理\n\n默认情况下，img2dataset 在下载图像时会验证 SSL 
证书。然而，某些服务器可能使用无效或自签名的证书。\n\n若要从这类服务器下载图像，可以禁用 SSL 证书验证：\n\n```bash\nimg2dataset --url_list=myimglist.txt --output_folder=output_folder --ignore_ssl_certificate=True\n```\n\n**警告**：禁用 SSL 证书验证会降低安全性，使下载过程容易受到中间人攻击。请仅在必要且信任来源的情况下使用此选项。\n\n## 参数调优建议\n\n对于小型数据集，默认设置通常已足够。但对于大型数据集，以下建议可能有助于提升性能：\n\n* 将 `processes_count` 设置为机器的核心数。\n* 在带宽和 CPU 使用未达到上限的前提下，适当增加 `thread_count`。\n* 如果数据集规模超过 100 万条，建议将 `output_format` 设置为 webdataset，这样只需管理少量的 tar 文件，而非数百万个单独的图像文件。\n* 保持 `metadata` 参数为 True，以便查看已保存的项目，避免重复下载。\n\n为了测试系统性能及 img2dataset 与其交互情况，可以启用以下选项（仅用于测试，不应用于实际下载）：\n* `--output_format dummy`：不会保存任何内容，可用于消除存储瓶颈。\n* `--disable_all_reencoding True`：不会对图像进行重新编码，可用于消除 CPU 瓶颈。\n\n当同时启用这两个选项时，剩下的唯一瓶颈就是网络相关因素，例如 DNS 配置、您的带宽或目标服务器的带宽。\n\n## 文件系统支持\n\n得益于 [fsspec](https:\u002F\u002Ffilesystem-spec.readthedocs.io\u002Fen\u002Flatest\u002F)，img2dataset 支持在多种文件系统中读取和写入文件（详见 [fsspec 文件系统列表](https:\u002F\u002Fgithub.com\u002Ffsspec\u002Ffilesystem_spec\u002Fblob\u002F6233f315548b512ec379323f762b70764efeb92c\u002Ffsspec\u002Fregistry.py#L87)）。使用时，只需在路径前加上相应文件系统的前缀即可，例如 `hdfs:\u002F\u002F`、`s3:\u002F\u002F`、`http:\u002F\u002F`、`gcs:\u002F\u002F`、`ssh:\u002F\u002F` 或 `hf:\u002F\u002F`（包含 [Dataset Viewer](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhub\u002Fdatasets-viewer) 功能）。\n\n部分文件系统需要额外安装相应的库，例如 S3 需要 s3fs，GCS 需要 gcsfs，SSH 需要 [fsspec\u002Fsshfs](https:\u002F\u002Fgithub.com\u002Ffsspec\u002Fsshfs)，而 HF 则需要 [huggingface_hub](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fguides\u002Fhf_file_system)。\n\n有关详细信息，请参阅 fsspec 官方文档。\n\n如果您的文件系统需要特定配置，可以通过 [fsspec 配置系统](https:\u002F\u002Ffilesystem-spec.readthedocs.io\u002Fen\u002Flatest\u002Ffeatures.html#configuration) 来解决。例如，您可以在 `.config\u002Ffsspec\u002Fs3.json` 文件中添加如下内容：\n\n```json\n{\n  \"s3\": {\n    \"client_kwargs\": {\n            \"endpoint_url\": \"https:\u002F\u002Fsome_endpoint\",\n            \"aws_access_key_id\": \"your_user\",\n           \"aws_secret_access_key\": \"your_password\"\n    }\n  
}\n}\n```\n\n此类配置在使用兼容 S3 协议的文件系统（如 MinIO）时尤为必要。类似的配置同样适用于其他受 fsspec 支持的文件系统。\n\n## 分发模式\n\nImg2dataset 支持多种分发方式：\n* `multiprocessing`：启动一个进程池，利用本地进程进行下载。\n* `pyspark`：在 Spark 集群中启动工作节点来执行下载任务。\n\n`multiprocessing` 适合在单机上进行下载，因此它是默认选项。而 `pyspark` 则可以利用多台机器并行下载，下载速度与机器数量成正比。当处理包含超过十亿张图片的数据集时，`pyspark` 尤其有用。\n\n### pyspark 配置\n\n要使用 `pyspark` 运行 img2dataset，您需要完成以下步骤：\n1. 安装 `pyspark`：`pip install pyspark`\n2. 使用 `--distributor pyspark` 选项\n3. 调整 `--subjob_size 1000` 参数：该参数指定每个子任务下载的图片数量。增大此值会延长将 Feather 文件放入临时目录的准备时间；减小则意味着每次发送给 PySpark 作业的分片更少。\n\n默认情况下，会创建一个本地 Spark 会话。根据您的具体 Spark 集群配置，您可能希望创建自定义的 Spark 会话。有关详细信息，请参阅 [pyspark_example.py](examples\u002Fpyspark_example.py)，您可以在其中插入自定义代码以创建 Spark 会话，然后运行 img2dataset，它将使用该会话进行下载。\n\n如需搭建 Spark 集群，请参考 [分布式 img2dataset 教程](examples\u002Fdistributed_img2dataset_tutorial.md)。\n\n## 与 Weights & Biases 的集成\n\n要启用 Weights & Biases 日志记录，使用 `--enable_wandb=True` 选项。\n\n性能指标将通过 [Weights & Biases](https:\u002F\u002Fwandb.com\u002F) 进行监控。\n\n![W&B 指标](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002From1504_img2dataset_readme_66c7a17d177f.png)\n\n此外，最常见的错误也会被记录下来，以便于调试。\n\n![W&B 表格](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002From1504_img2dataset_readme_484aee86722e.png)\n\n其他功能包括：\n* 记录环境配置（操作系统、Python 版本、CPU 核数、主机名等）\n* 监控硬件资源（GPU\u002FCPU、内存、磁盘、网络等）\n* 自定义图表和报告\n* 比较不同运行结果（在优化线程数或 CPU 数等参数时非常方便）\n\n首次运行脚本时，您可以选择将指标关联到您的账户，或者以匿名方式记录。\n\n您也可以提前通过运行 `wandb login` 登录（或创建账户）。\n\n## 路线图\n\n目前，该工具在最多 1 亿个元素的情况下表现良好。未来目标包括：\n* 针对 10 亿张图片的基准测试，这可能需要\n  * 对图像缩放部分进一步优化\n  * 更好的多节点支持\n  * 集成增量下载功能（仅下载新增元素）\n\n## 架构说明\n\n该工具旨在尽可能快速地下载图片，这会对各种资源造成压力。以下是一些假设每秒下载 1350 张图片时的估算：\n* 带宽：每秒下载一千张平均大小的图片大约需要 130 MB\u002Fs 的带宽。\n* CPU：缩放一张图片可能需要几毫秒，每秒数千张图片可能会占用多达 16 个核心。\n* DNS 查询：百万个 URL 意味着百万个域名，默认的操作系统设置通常不足以应对。可能需要配置本地 bind9 解析器。\n* 磁盘：如果使用缩放功能，写入速度需要达到 30 MB\u002Fs；如果不使用缩放，则需要 130 MB\u002Fs。通过将数据写入少量 tar 文件，可以使用机械硬盘代替 SSD。\n\n基于以上信息，设计选择了如下方案：\n* 将 URL 列表拆分为 K 个分片。K 的取值应使每个分片在磁盘上具有合理大小（例如 256 MB），默认 K = 10000。\n* 启动 N 个进程（使用 
multiprocessing 进程池）：\n  * 每个进程启动 M 个线程。M 应尽可能大，以便充分利用网络带宽，同时确保 CPU 使用率不超过 100%。\n  * 每个线程下载一张图片并返回。\n  * 主线程负责图像缩放（这意味着同一时刻最多只有 N 个缩放任务在运行，充分利用所有核心但不会超负荷）。\n  * 主线程将数据保存到与其他进程不同的 tar 文件中。\n\n这种设计能够高效利用 CPU 资源，每个核心只执行一次缩放操作；通过为每个核心打开一个文件来减少磁盘开销；同时利用每个进程中的 M 个线程最大限度地发挥带宽优势。\n\n有关 Python 模块的具体划分，请参阅 [architecture.md](img2dataset\u002Farchitecture.md)。\n\n## 搭建高性能 DNS 解析器\n\n为了获得 img2dataset 的最佳性能，使用高效的 DNS 解析器至关重要。\n* Knot Resolver 可以并行运行。\n* Bind Resolver 是历史悠久的解析器，虽然是单核的，但经过高度优化。\n\n### 搭建 Knot Resolver\n\n您可以按照 [官方快速入门指南](https:\u002F\u002Fknot-resolver.readthedocs.io\u002Fen\u002Fstable\u002Fquickstart-install.html) 操作，或者在 Ubuntu 上执行以下命令：\n\n安装 Knot Resolver：\n\n```bash\nwget https:\u002F\u002Fsecure.nic.cz\u002Ffiles\u002Fknot-resolver\u002Fknot-resolver-release.deb\nsudo dpkg -i knot-resolver-release.deb\nsudo apt update\nsudo apt install -y knot-resolver\nsudo sh -c 'echo `hostname -I` `hostname` >> \u002Fetc\u002Fhosts'\nsudo sh -c 'echo nameserver 127.0.0.1 > \u002Fetc\u002Fresolv.conf'\nsudo systemctl stop systemd-resolved\n```\n\n然后启动 4 个实例：\n\n```bash\nsudo systemctl start kresd@1.service\nsudo systemctl start kresd@2.service\nsudo systemctl start kresd@3.service\nsudo systemctl start kresd@4.service\n```\n\n通过以下命令验证是否正常工作：\n\n```bash\ndig @localhost google.com\n```\n\n### 搭建 Bind9 解析器\n\n为了保持较高的成功率，使用高效的 DNS 解析器是必要的。我尝试过 systemd-resolved、dnsmaskq 和 bind9 等多种解决方案，最终得出结论：bind9 在此场景下表现最佳。以下是如何在 Ubuntu 上进行配置的方法。运行以下命令：\n\n```bash\nsudo apt install bind9\nsudo vim \u002Fetc\u002Fbind\u002Fnamed.conf.options\n```\n\n并在 `options` 中添加以下内容：\n\n```\n\trecursive-clients 10000;\n\tresolver-query-timeout 30000;\n\tmax-clients-per-query 10000;\n\tmax-cache-size 2000m;\n```\n\n然后运行：\n\n```bash\nsudo systemctl restart bind9\necho nameserver 127.0.0.1 | sudo tee -a \u002Fetc\u002Fresolv.conf\n```\n\n这样可以在进行数千次 DNS 查询的同时，保持较高的成功率。您还可以参考 [Bind9 
日志记录指南](https:\u002F\u002Fnsrc.org\u002Factivities\u002Fagendas\u002Fen\u002Fdnssec-3-days\u002Fdns\u002Fmaterials\u002Flabs\u002Fen\u002Fdns-bind-logging.html)，以检查是否有少量 DNS 错误发生。\n\n## AI 使用的影响\n\nimg2dataset 用于从网络上抓取图像，并使其易于供机器学习用例使用。其应用场景包括：\n* 进行推理和索引，以更好地理解网络上的内容（例如 https:\u002F\u002From1504.github.io\u002Fclip-retrieval\u002F）；\n* 训练模型。\n\n可以使用图像\u002F文本数据集训练的模型包括：\n* CLIP：一种图像理解模型，可用于判断图像是否安全、是否具有美学价值、包含何种动物等；\n* 文本到图像模型：根据文本生成图像。\n\n关于文本到图像模型的后果，目前存在大量讨论。一些观点包括：\n* AI 艺术正在使艺术更加民主化，让数亿人能够通过艺术表达自我，从而使艺术作品变得更加普遍和独特；\n* 不应使用创作者不希望分享的图像来训练 AI 模型。\n\n“退出机制”旨在允许那些不希望自己的作品被用于索引和训练的创作者，确保其作品不会被使用。\n\n## 开发指南\n\n您可以在本地或在 [GitPod](https:\u002F\u002Fgitpod.io\u002F#https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset) 中进行开发（请注意，在 GitPod 中需要执行 `export PIP_USER=false` 命令）。\n\n设置虚拟环境：\n\n```bash\npython3 -m venv .env\nsource .env\u002Fbin\u002Factivate\npip install -e .\n```\n\n运行测试：\n\n```bash\npip install -r requirements-test.txt\n```\n然后执行：\n\n```bash\nmake lint\nmake test\n```\n\n您可以使用 `make black` 来格式化代码。\n\n运行特定测试的命令为：\n\n```bash\npython -m pytest -x -s -v tests -k \"dummy\"\n```\n\n## 基准测试\n\n### 10,000 张图像基准测试\n\n```bash\ncd tests\u002Ftest_files\nbash benchmark.sh\n```\n\n### 1800 万张图像基准测试\n\n首先下载“在家爬取”数据集的第一部分，然后执行以下命令：\n\n```bash\ncd tests\nbash large_bench.sh\n```\n\n下载 1800 万张图片耗时约 3.7 小时。\n\n目前观察到的性能为每秒 1350 张图像，即每小时 480 万张图像，每天可处理 1.16 亿张图像。\n\n### 3600 万张图像基准测试\n\n下载两个各包含 1800 万条记录的 Parquet 文件（总大小 936GB），耗时 7 小时 24 分钟，平均速度为每秒 1345 张图像。\n\n### 1.9 亿张图像基准测试\n\n从“在家爬取”数据集（[cah-prepro](https:\u002F\u002Fgithub.com\u002From1504\u002Fcah-prepro)）下载 1.9 亿张图像，耗时 41 小时（总大小 5TB），平均速度为每秒 1280 张图像。\n\n### 50 亿张图像基准测试\n\n从 LAION-5B 数据集（[laion5B dataset](https:\u002F\u002Flaion.ai\u002Flaion-5b-a-new-era-of-open-large-scale-multi-modal-datasets\u002F)）下载 58 亿张图像，耗时 7 天（总大小 240TB），在 10 台机器上平均速度为每秒 9500 个样本。详细技术说明请参见：[semantic search at billions scale](https:\u002F\u002From1504.medium.com\u002Fsemantic-search-at-billions-scale-95f21695689a)。\n\n## 
引用\n\n```bibtex\n@misc{beaumont-2021-img2dataset,\n  author = {Romain Beaumont},\n  title = {img2dataset: Easily turn large sets of image urls to an image dataset},\n  year = {2021},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset}}\n}\n```","# img2dataset 快速上手指南\n\n`img2dataset` 是一个高效的开源工具，能够将大规模图片 URL 列表快速下载、调整大小并打包为机器学习数据集。它支持单机在 20 小时内处理 1 亿个 URL，并可选保存对应的文本描述（Caption）。\n\n## 环境准备\n\n*   **操作系统**：Linux, macOS, Windows (推荐 Linux 以获得最佳性能)\n*   **Python 版本**：Python 3.8+\n*   **前置依赖**：\n    *   `pip` 包管理工具\n    *   (可选但强烈推荐) 高性能 DNS 解析器：在处理大规模数据时，配置快速的 DNS 解析器能显著提升下载速度。\n\n## 安装步骤\n\n使用 pip 直接安装：\n\n```bash\npip install img2dataset\n```\n\n> **国内加速建议**：如果下载速度较慢，建议使用国内镜像源安装：\n> ```bash\n> pip install img2dataset -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 基本使用\n\n### 1. 准备 URL 列表\n首先创建一个文本文件（例如 `myimglist.txt`），每行包含一个图片 URL：\n\n```bash\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F305' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F304' >> myimglist.txt\necho 'https:\u002F\u002Fpicsum.photos\u002F200\u002F303' >> myimglist.txt\n```\n\n### 2. 执行下载命令\n运行以下命令即可自动下载图片、调整尺寸并保存到指定文件夹：\n\n```bash\nimg2dataset --url_list=myimglist.txt --output_folder=output_folder --thread_count=64 --image_size=256\n```\n\n**参数说明：**\n*   `--url_list`: 输入的图片 URL 列表文件路径。\n*   `--output_folder`: 输出数据的存储目录。\n*   `--thread_count`: 下载线程数（设为 64 或更高可提升性能）。\n*   `--image_size`: 将图片调整为目标尺寸（默认 256x256）。\n\n### 3. 
输出结果\n工具运行完成后，`output_folder` 中将生成如下结构的数据：\n\n```text\noutput_folder\n├── 00000\n│   ├── 000000000.jpg\n│   ├── 000000001.jpg\n│   └── ...\n├── 00000.json      # 包含元数据 (URL, 状态，尺寸等)\n└── 00000_stats.json # 包含下载统计信息\n```\n\n如果提供了包含标题（caption）的输入文件，还会生成对应的 `.txt` 文本文件。该格式可直接用于机器学习模型的训练输入。","某计算机视觉团队急需构建一个包含百万级“街头招牌”图像的数据集，以训练新一代 OCR 模型。\n\n### 没有 img2dataset 时\n- **采集效率极低**：工程师需编写复杂的多线程爬虫脚本，手动处理下载失败、重试机制及 DNS 解析瓶颈，耗时数周仅能获取少量数据。\n- **数据格式混乱**：下载的图片尺寸不一、编码格式混杂，后续需额外开发脚本进行批量 resizing 和格式统一，预处理流程繁琐易错。\n- **元数据丢失风险**：难以将图片 URL 与其对应的文本描述（Caption）自动对齐并打包，导致构建图文对数据集时人工清洗成本巨大。\n- **资源利用率低**：单机无法有效并发处理海量链接，若要加速必须搭建昂贵的分布式集群，增加了基础设施维护负担。\n\n### 使用 img2dataset 后\n- **极速规模化采集**：只需提供一个包含百万个 URL 的文本文件，img2dataset 即可在单台机器上利用高效 DNS 解析，20 小时内完成下载、调整大小及打包全流程。\n- **自动化数据清洗**：工具内置图片解码与 Resize 功能，自动过滤损坏链接并按指定分辨率（如 256x256）输出标准化的 WebDataset 格式，开箱即用。\n- **图文对完美同步**：支持直接输入\"URL+ 标题”列表，自动将文本描述保存为配套元数据，轻松构建高质量的图文训练集。\n- **合规性与灵活性**：默认尊重网站的 `noai` 等robots 协议头，同时允许通过参数灵活控制下载策略，确保数据采集合法合规且适应不同需求。\n\nimg2dataset 将原本需要数周协作的工程难题转化为单条命令的自动化流程，让开发者能专注于模型算法而非数据搬运。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002From1504_img2dataset_ee211ddd.png","rom1504","Romain Beaumont","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002From1504_1196ce5a.png","Interested in machine learning (computer vision, natural language processing, deep learning), node.js (network, bots, web), and programming in general","@google","Palo Alto","romain.rom1@gmail.com",null,"http:\u002F\u002From1504.fr\u002F","https:\u002F\u002Fgithub.com\u002From1504",[84,88,92,96,100],{"name":85,"color":86,"percentage":87},"Python","#3572A5",80,{"name":89,"color":90,"percentage":91},"Jupyter Notebook","#DA5B0B",18,{"name":93,"color":94,"percentage":95},"Shell","#89e051",1.2,{"name":97,"color":98,"percentage":99},"Makefile","#427819",0.7,{"name":101,"color":102,"percentage":103},"Dockerfile","#384d54",0.1,4408,375,"2026-04-16T14:03:43","MIT","Linux, macOS, Windows","未说明 (该工具主要用于下载和调整图像大小，不依赖 GPU 进行深度学习推理或训练)","未说明 
(取决于并发线程数和图像缓存量，处理大规模数据集时建议较大内存)",{"notes":112,"python":113,"dependencies":114},"该工具主要通过多线程\u002F多进程进行网络下载和 CPU 图像处理。为了获得最佳性能，强烈建议配置高性能 DNS 解析器。支持多种分布式运行环境（如 PySpark, Ray）。默认会尊重网站的 'noai' 等 robots 标签，可通过参数禁用。输出格式支持文件目录、webdataset (tar)、parquet 和 tfrecord。","未说明 (通常支持 Python 3.6+)",[115,116,117,118,119,120],"requests","Pillow","pyarrow","webdataset","pandas","tqdm",[122,16,14,15],"其他",[124,125,126,127,128,129,130],"deep-learning","dataset","big-data","image","multimodal","image-dataset","download-images","2026-03-27T02:49:30.150509","2026-04-20T04:04:27.091717",[134,139,144,149,154,159],{"id":135,"question_zh":136,"answer_zh":137,"source_url":138},43133,"如何支持断点续传或增量下载，以便在任务中断后继续？","该功能已实现。您可以使用“增量模式”重新运行任务以获取缺失的分片（shards）。只需在相同的输出文件夹中重新运行命令，工具会自动跳过已完成的部分并继续下载剩余内容。此外，最近还实现了分片重试功能，进一步提高了下载的鲁棒性。详细信息请参阅 README 中的增量模式部分。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F6",{"id":140,"question_zh":141,"answer_zh":142,"source_url":143},43134,"为什么下载完成后统计的成功数量远少于输入数据集中的行数？","这通常是由于部分分片下载失败导致的。现在可以通过“增量模式”重新运行任务来自动获取缺失的分片。此外，项目已实现了分片重试机制（shard retrying），可以在未来版本中自动处理此类问题。建议检查 `*_stats.json` 文件确认成功率，并使用增量模式补全数据。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F159",{"id":145,"question_zh":146,"answer_zh":147,"source_url":148},43135,"下载任务在即将结束时卡住或停滞不前怎么办？","此问题通常由网络波动或个别分片下载失败引起，现已通过新增的“重试功能”解决。请更新到最新版本并重试。如果仍然遇到读取错误（如 \"Unexpected end of file\"），建议在训练时使用带有错误处理机制的数据加载器（例如参考 laion-prepro 项目中的 datalo_pytorch.py 实现），以跳过损坏的文件。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F164",{"id":150,"question_zh":151,"answer_zh":152,"source_url":153},43136,"程序运行时内存占用过高甚至耗尽，且没有输出文件，如何解决？","这可能是因为处理大数据集时默认配置导致的内存泄漏或积压。尝试减少 `--processes_count` 和 `--thread_count` 的值（例如设置为 1 和 12），以降低并发压力。如果问题依旧，可以关注社区关于禁用某些非核心功能（如跳过无效 SSL 验证以减少异常抛出）的讨论，或等待后续版本优化内存管理。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F49",{"id":155,"question_zh":156,"answer_zh":157,"source_url":158},43137,"如何配置分布式下载（如使用 
Spark、Dask 或多进程）以优化内存使用和速度？","目前工具支持多进程模式，也可以通过配置使用 Spark 或 Dask 进行分布式处理。对于 AWS 用户，虽然 EMR on EKS 配置较复杂，但推荐使用原始 EC2 实例配合文档化的 Spark 设置，或者选择不依赖 Spark 的原生多进程方案（需确保实现足够的鲁棒性）。写入 S3 或 HDFS 的功能已正常工作，未来可能还会增加基于 SSH 的分布式策略以简化部署。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F20",{"id":160,"question_zh":161,"answer_zh":162,"source_url":163},43138,"该工具是否尊重网站的 robots.txt 或 opt-out 请求？如何阻止我的网站被抓取？","维护者认为默认应为“允许抓取”，因为限制少数人会影响大多数人的利益和 AI 发展。如果您希望阻止自己的内容被抓取，可以在网站响应头中添加 `X-Robots-Tag` 相关字段，或在 `robots.txt` 文件中声明（部分数据源如 Common Crawl 已支持）。请注意，这些设置仅对此开源工具有效，无法约束其他闭源数据采集行为。社区正在讨论更完善的 robots.txt 支持（见 Issue #48）。","https:\u002F\u002Fgithub.com\u002From1504\u002Fimg2dataset\u002Fissues\u002F293",[165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229,233,237,241],{"id":166,"version":167,"summary_zh":80,"released_at":168},342820,"1.47.0","2025-08-09T22:07:06",{"id":170,"version":171,"summary_zh":80,"released_at":172},342821,"1.46.0","2025-08-09T20:21:32",{"id":174,"version":175,"summary_zh":80,"released_at":176},342822,"1.45.0","2024-01-22T22:08:26",{"id":178,"version":179,"summary_zh":80,"released_at":180},342823,"1.44.1","2024-01-11T23:59:30",{"id":182,"version":183,"summary_zh":80,"released_at":184},342824,"1.44.0","2024-01-11T23:02:26",{"id":186,"version":187,"summary_zh":80,"released_at":188},342825,"1.43.0","2024-01-06T21:27:51",{"id":190,"version":191,"summary_zh":80,"released_at":192},342826,"1.42.0","2023-08-20T21:30:00",{"id":194,"version":195,"summary_zh":80,"released_at":196},342827,"1.41.0","2023-01-07T01:50:31",{"id":198,"version":199,"summary_zh":80,"released_at":200},342828,"1.40.0","2022-12-22T03:58:54",{"id":202,"version":203,"summary_zh":80,"released_at":204},342829,"1.39.0","2022-12-19T14:13:27",{"id":206,"version":207,"summary_zh":80,"released_at":208},342830,"1.38.0","2022-12-17T21:47:42",{"id":210,"version":211,"summary_zh":80,"released_at":212},342831,"1.37.0","2022-12-17T01:49:43",{"id":214,"version":215,"summary_zh":80,"re
leased_at":216},342832,"1.36.0","2022-12-10T15:22:50",{"id":218,"version":219,"summary_zh":80,"released_at":220},342833,"1.35.0","2022-11-26T01:40:31",{"id":222,"version":223,"summary_zh":80,"released_at":224},342834,"1.34.0","2022-11-25T22:41:39",{"id":226,"version":227,"summary_zh":80,"released_at":228},342835,"1.33.0","2022-08-23T09:59:04",{"id":230,"version":231,"summary_zh":80,"released_at":232},342836,"1.32.0","2022-07-24T08:07:17",{"id":234,"version":235,"summary_zh":80,"released_at":236},342837,"1.31.0","2022-06-27T19:44:17",{"id":238,"version":239,"summary_zh":80,"released_at":240},342838,"1.30.2","2022-06-24T10:09:47",{"id":242,"version":243,"summary_zh":80,"released_at":244},342839,"1.30.1","2022-05-26T23:47:22"]