[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-MoonshotAI--Kimi-VL":3,"tool-MoonshotAI--Kimi-VL":62},[4,18,26,36,46,54],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160411,2,"2026-04-18T23:33:24",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":42,"last_commit_at":43,"category_tags":44,"status":17},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,45],"插件",{"id":47,"name":48,"github_repo":49,"description_zh":50,"stars":51,"difficulty_score":32,"last_commit_at":52,"category_tags":53,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 
全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":55,"name":56,"github_repo":57,"description_zh":58,"stars":59,"difficulty_score":32,"last_commit_at":60,"category_tags":61,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[45,13,15,14],{"id":63,"github_repo":64,"name":65,"description_en":66,"description_zh":67,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":78,"owner_url":79,"languages":77,"stars":80,"forks":81,"last_commit_at":82,"license":83,"difficulty_score":10,"env_os":84,"env_gpu":85,"env_ram":84,"env_deps":86,"category_tags":95,"github_topics":77,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":98,"updated_at":99,"faqs":100,"releases":141},9465,"MoonshotAI\u002FKimi-VL","Kimi-VL","Kimi-VL: Mixture-of-Experts Vision-Language Model for Multimodal Reasoning, Long-Context Understanding, and Strong Agent Capabilities","Kimi-VL 是一款高效开源的视觉语言模型，专为多模态推理、长上下文理解及智能体任务打造。它巧妙解决了传统大模型在保持高性能的同时难以兼顾低算力成本的难题，仅需激活 28 亿参数即可在多项基准测试中媲美甚至超越 GPT-4o 等旗舰模型。\n\n这款模型特别适合开发者、研究人员以及需要构建复杂多模态应用的企业用户。无论是处理大学级别的图文视频理解、高精度 OCR 识别，还是执行需要多轮交互的智能体任务（如操作系统自动化），Kimi-VL 都能游刃有余。其独特的技术亮点在于采用了混合专家（MoE）架构与原生分辨率视觉编码器 MoonViT，不仅支持 128K 超长上下文窗口，能精准分析长篇文档与视频，还能直接处理超高分辨率图像而无需压缩失真。\n\n此外，最新推出的 Kimi-VL-Thinking 变体通过强化学习具备了强大的“深度思考”能力，能在减少 token 消耗的同时，显著提升数学推理与复杂逻辑问题的解决准确率，并扩展了对高清视频场景的支持。对于希望在有限资源下部署强大多模态能力的团队而言，Kimi-VL 提供了一个兼具效率与智慧的优质选择。","\u003Cdiv align=\"center\">\n  \u003Ca href=\"Kimi-VL.pdf\">KIMI-VL TECHNICAL REPORT\u003C\u002Fa>\n\u003C\u002Fdiv>\n\n\u003Cdiv align=\"center\">\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.07491\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_3371a2bbd8de.png\" height=\"16\" width=\"16\" style=\"vertical-align:middle\">\u003Cb> Tech Report\u003C\u002Fb>\u003C\u002Fa>  |  \n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fmoonshotai\u002Fkimi-vl-a3b-67f67b6ac91d3b03d382dd85\">\u003Cimg src=\"https:\u002F\u002Fhuggingface.co\u002Ffront\u002Fassets\u002Fhuggingface_logo-noborder.svg\" height=\"16\" width=\"16\" style=\"vertical-align:middle\">\u003Cb> HuggingFace\u003C\u002Fb>\n  \u003C\u002Fa> |\n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking\u002F\">💬\u003Cb>Chat with Latest Kimi-VL (2506)\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fdiv>\n\n\n## 1. 
Introduction\n\nWe present **Kimi-VL**, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers **advanced multimodal reasoning, long-context understanding, and strong agent capabilities**—all while activating only **2.8B** parameters in its language decoder (Kimi-VL-A3B).\n\nKimi-VL demonstrates strong performance across challenging domains:\nas a general-purpose VLM, Kimi-VL excels in multi-turn agent interaction tasks (e.g., OSWorld), achieving state-of-the-art results comparable to flagship models.\nFurthermore, it exhibits remarkable capabilities across diverse challenging vision-language tasks, including college-level image and video comprehension, optical character recognition (OCR), mathematical reasoning, multi-image understanding, and more.\n\nIn comparative evaluations, it effectively competes with cutting-edge efficient VLMs such as GPT-4o-mini, Qwen2.5-VL-7B, and Gemma-3-12B-IT, while surpassing GPT-4o in several specialized domains.\n\nKimi-VL also advances the Pareto frontier of multimodal models in processing long contexts and perceiving clearly: equipped with a 128K extended context window, Kimi-VL can process long and diverse inputs, achieving impressive scores of 64.5 on LongVideoBench and 35.1 on MMLongBench-Doc; its native-resolution vision encoder, MoonViT, further allows it to see and understand ultra-high-resolution visual inputs, achieving 83.2 on InfoVQA and 34.5 on ScreenSpot-Pro, while maintaining lower computational cost on common visual inputs and general tasks.\n\nBuilding on this foundation, we introduce an advanced long-thinking variant: **Kimi-VL-Thinking**. Developed through long chain-of-thought (CoT) supervised fine-tuning (SFT) and reinforcement learning (RL), this model exhibits strong long-horizon reasoning capabilities. It achieves scores of 61.7 on MMMU, 36.8 on MathVision, and 71.3 on MathVista while maintaining the compact 2.8B activated LLM parameter footprint, setting a new standard for efficient yet capable multimodal **thinking** models.\n\n\n\u003Ci>Besides the original model variants, we also provide a new [Kimi-VL-A3B-Thinking-2506](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506) variant with several new or improved abilities:\n- It Thinks Smarter while Consuming Fewer Tokens: The 2506 version reaches better accuracy on multimodal reasoning benchmarks: 56.9 on MathVision (+20.1), 80.1 on MathVista (+8.4), 46.3 on MMMU-Pro (+3.2), 64.0 on MMMU (+2.1), while on average reducing thinking length by 20%.\n- It Sees Clearer with Thinking: Unlike the previous version that specializes in thinking tasks, the 2506 version can also achieve the same or even better ability on general visual perception and understanding, e.g. MMBench-EN-v1.1 (84.4), MMStar (70.4), RealWorldQA (70.0), MMVet (78.4), compared to the original non-thinking version (Kimi-VL-A3B-Instruct).\n- It Extends to Video Scenarios: The new 2506 version also improves on video reasoning and understanding benchmarks. It sets a new state-of-the-art for open-source models on VideoMMMU (65.2), while also retaining good ability on general video understanding (71.9 on Video-MME).\n- It Extends to Higher Resolution: The new 2506 version supports 3.2 million total pixels in a single image (1792x1792), 4X compared to the original release. 
This leads to non-trivial improvements on high-resolution perception and OS-agent grounding benchmarks: 83.2 on V* Benchmark (without extra tools), 52.8 on ScreenSpot-Pro, 52.5 on OSWorld-G (full set with refusal).\n\u003C\u002Fi>\n\n\n\n## 2. Architecture\n\nThe model adopts an MoE language model, a native-resolution visual encoder (MoonViT), and an MLP projector, as illustrated in the following image.\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"90%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_c7b1ef2dd980.png\">\n\u003C\u002Fdiv>\n\n## 3. News\n\n- 2025.06.21: Release of Kimi-VL-A3B-Thinking-2506: [Tech Blog \\& Cookbook](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fmoonshotai\u002Fkimi-vl-a3b-thinking-2506), [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506)\n- 2025.04.15: [vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm) has supported Kimi-VL deployment. See [#16387](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fpull\u002F16387) for details.\n- 2025.04.14: [LLaMA-Factory](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) has supported Kimi-VL finetuning. See [#7719](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory\u002Fpull\u002F7719) for details.\n\n## 4. Model Variants\n\n🤗 For common general multimodal perception and understanding, OCR, long video and long document, video perception, and OS-agent uses, we recommend `Kimi-VL-A3B-Instruct` for efficient inference; meanwhile, our new thinking version, `Kimi-VL-A3B-Thinking-2506` also has excellent multimodal perception, long video and long document and OS-agent grounding abilities while achieving better multimodal reasoning skills. See [this blog](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fmoonshotai\u002Fkimi-vl-a3b-thinking-2506) for more information.\n\n\u003Cdiv align=\"center\">\n\n| **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download Link** |\n| :------------: | :------------: | :------------: | :------------: | :------------: |\n| 🔥Kimi-VL-A3B-Thinking-2506  | 16B | 3B |  128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506)   |\n| Kimi-VL-A3B-Instruct | 16B | 3B | 128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Instruct)   |\n| Kimi-VL-A3B-Thinking (deprecated)  | 16B | 3B |  128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking)   |\n\n\u003C\u002Fdiv>\n\n> [!Note]\n> Recommended parameter settings:\n> - For **Thinking models**, it is recommended to use `Temperature = 0.8`. \n> - For **Instruct models**, it is recommended to use `Temperature = 0.2`. \n\n\n### Hugging Face Demo\n\n> 🤗 We serve our model demo in Hugging Face spaces:\n> - Chat with **Kimi-VL-A3B-Thinking-2506**👀🤔🗺️🎬📖🖥️ (*unifying thinking, general understanding, puzzle solving, agent, video, PDF*) model on \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking\u002F\">Chat Web\u003C\u002Fa>.\n\n## 5. 
Performance\n\n> [!Note]\n> See the performance of Kimi-VL-A3B-Thinking-2506 at [Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506#2-performance).\n\nAs an efficient model, Kimi-VL can robustly handle diverse tasks (fine-grained perception, math, college-level problems, OCR, agent, etc) across a broad spectrum of input forms (single-image, multi-image, video, long-document, etc).\n\nA brief comparison with existing 10B-level dense VLMs and DeepSeek-VL2 (A4.5B):\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"100%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_016c25c3f9ab.png\">\n\u003C\u002Fdiv>\n\nWith effective long-thinking abilities, Kimi-VL-A3B-Thinking (2504 version) can match the performance of 30B\u002F70B frontier open-source VLMs on MathVision benchmark:\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"100%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_9e4378aac536.png\">\n\u003C\u002Fdiv>\n\n\n## 6. Example usage\n\n### Setup\n\n```bash\nconda create -n kimi-vl python=3.10 -y\nconda activate kimi-vl\npip install -r requirements.txt\n```\n\n> [!Note]\n> If you encounter Out-of-Memory or want to speed up inference, please install **flash-attn** with `pip install flash-attn --no-build-isolation`.\n\n### Inference with Hugging Face Transformers \n\nWe introduce how to use our model at inference stage using transformers library. It is recommended to use python=3.10, torch=2.5.1, and transformers=4.51.3 as the development environment. \n\n#### Kimi-VL-A3B-Instruct:\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Instruct\"\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# If flash-attn has been installed, it is recommended to set torch_dtype=torch.bfloat16 and attn_implementation=\"flash_attention_2\"\n# to save memory and speed up inference\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path)\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_path}, {\"type\": \"text\", \"text\": \"What is the dome building in the picture? 
Think step by step.\"}]}\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=image, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\ngenerated_ids = model.generate(**inputs, max_new_tokens=512)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\nprint(response)\n```\n\n#### Kimi-VL-A3B-Thinking-2506:\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Thinking-2506\"\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# If flash-attn has been installed, it is recommended to set torch_dtype=torch.bfloat16 and attn_implementation=\"flash_attention_2\"\n# to save memory and speed up inference\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_paths = [\".\u002Ffigures\u002Fdemo1.png\", \".\u002Ffigures\u002Fdemo2.png\"]\nimages = [Image.open(path) for path in image_paths]\nmessages = [\n    {\n        \"role\": \"user\",\n        \"content\": [\n            {\"type\": \"image\", \"image\": image_path} for image_path in image_paths\n        ] + [{\"type\": \"text\", \"text\": \"Please infer step by step who this manuscript belongs to and what it records\"}],\n    },\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=images, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\ngenerated_ids = model.generate(**inputs, max_new_tokens=32768, temperature=0.8)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\nprint(response)\n```\n\n## 7. Finetuning\n\nCollaborating closely with the open-source community, Kimi-VL now offers seamless support for efficient fine-tuning through the latest version of [LLaMA-Factory](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory). \n\nThe framework enables Single-GPU LoRA fine-tuning with 50GB of VRAM, as well as Multi-GPU full\u002Flora fine-tuning using DeepSpeed ZeRO-2. For more detailed configuration instructions, check out [this PR](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory\u002Fpull\u002F7719#issue-2992644288).\n\n## 8. Deployment\n\n### Using vLLM\n\nThe [vLLM main branch](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm) has supported Kimi-VL deployment. 
You are welcome to deploy Kimi-VL using vLLM.\n\n#### Offline Inference\n\n> [!Note]\n> More usages about `Offline Inference` can be found at [vLLM Offline Inference](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fserving\u002Foffline_inference.html).\n\n```python\nfrom PIL import Image\nfrom transformers import AutoProcessor\nfrom vllm import LLM, SamplingParams\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Instruct\"  # or \"moonshotai\u002FKimi-VL-A3B-Thinking-2506\"\nllm = LLM(\n    model_path,\n    trust_remote_code=True,\n)\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path)\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_path}, {\"type\": \"text\", \"text\": \"What is the dome building in the picture? Think step by step.\"}]}\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\noutputs = llm.generate([{\"prompt\": text, \"multi_modal_data\": {\"image\": image}}], sampling_params = SamplingParams(max_tokens=512))\n\nprint(\"-\" * 50)\nfor o in outputs:\n    generated_text = o.outputs[0].text\n    print(generated_text)\n    print(\"-\" * 50)\n```\n\n#### OpenAI-Compatible Server\n\n> [!Note]\n> More usages about `OpenAI-Compatible Server` can be found at [vLLM OpenAI-Compatible Server](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fserving\u002Fopenai_compatible_server.html#).\n\nServe Kimi-VL with `vllm serve` command:\n\n```bash\n# If you need a longer context window, you can set --max-model-len and --max-num-batched-tokens to 131072\n# If you need more input images, you can set --limit-mm-per-prompt image=256 or 512\n\n# kimi-vl-thinking-2506\nvllm serve moonshotai\u002FKimi-VL-A3B-Thinking-2506 --served-model-name kimi-vl-thinking-2506 --trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 32768 --limit-mm-per-prompt image=64\n\n# kimi-vl-instruct\nvllm serve moonshotai\u002FKimi-VL-A3B-Instruct --served-model-name kimi-vl --trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 32768 --limit-mm-per-prompt image=64\n```\n\nCall the API\n\n```python\nimport base64\nfrom PIL import Image\nfrom io import BytesIO\nfrom openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http:\u002F\u002Flocalhost:8000\u002Fv1\",\n    api_key=\"token-abc123\",\n)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path).convert(\"RGB\")\n\nbuffered = BytesIO()\nimage.save(buffered, format=\"JPEG\")\nimg_b64_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\nbase64_image_url = f\"data:image\u002Fjpeg;base64,{img_b64_str}\"\n\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image_url\", \"image_url\": {\"url\": base64_image_url}}, {\"type\": \"text\", \"text\": \"What is the dome building in the picture? Think step by step.\"}]}\n]\n\ncompletion = client.chat.completions.create(\n  model=\"kimi-vl-thinking-2506\", # or kimi-vl\n  messages=messages\n)\n\nprint(completion.choices[0].message)\n```\n\n## 9. 
Citation\n\n```\n@misc{kimiteam2025kimivltechnicalreport,\n      title={{Kimi-VL} Technical Report}, \n      author={Kimi Team and Angang Du and Bohong Yin and Bowei Xing and Bowen Qu and Bowen Wang and Cheng Chen and Chenlin Zhang and Chenzhuang Du and Chu Wei and Congcong Wang and Dehao Zhang and Dikang Du and Dongliang Wang and Enming Yuan and Enzhe Lu and Fang Li and Flood Sung and Guangda Wei and Guokun Lai and Han Zhu and Hao Ding and Hao Hu and Hao Yang and Hao Zhang and Haoning Wu and Haotian Yao and Haoyu Lu and Heng Wang and Hongcheng Gao and Huabin Zheng and Jiaming Li and Jianlin Su and Jianzhou Wang and Jiaqi Deng and Jiezhong Qiu and Jin Xie and Jinhong Wang and Jingyuan Liu and Junjie Yan and Kun Ouyang and Liang Chen and Lin Sui and Longhui Yu and Mengfan Dong and Mengnan Dong and Nuo Xu and Pengyu Cheng and Qizheng Gu and Runjie Zhou and Shaowei Liu and Sihan Cao and Tao Yu and Tianhui Song and Tongtong Bai and Wei Song and Weiran He and Weixiao Huang and Weixin Xu and Xiaokun Yuan and Xingcheng Yao and Xingzhe Wu and Xinxing Zu and Xinyu Zhou and Xinyuan Wang and Y. Charles and Yan Zhong and Yang Li and Yangyang Hu and Yanru Chen and Yejie Wang and Yibo Liu and Yibo Miao and Yidao Qin and Yimin Chen and Yiping Bao and Yiqin Wang and Yongsheng Kang and Yuanxin Liu and Yulun Du and Yuxin Wu and Yuzhi Wang and Yuzi Yan and Zaida Zhou and Zhaowei Li and Zhejun Jiang and Zheng Zhang and Zhilin Yang and Zhiqi Huang and Zihao Huang and Zijia Zhao and Ziwei Chen},\n      year={2025},\n      eprint={2504.07491},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.07491}, \n}\n```\n\n","\u003Cdiv align=\"center\">\n  \u003Ca href=\"Kimi-VL.pdf\">KIMI-VL 技术报告\u003C\u002Fa>\n\u003C\u002Fdiv>\n\n\u003Cdiv align=\"center\">\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.07491\">\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_3371a2bbd8de.png\" height=\"16\" width=\"16\" style=\"vertical-align:middle\">\u003Cb> 技术报告\u003C\u002Fb>\u003C\u002Fa>  |  \n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Fmoonshotai\u002Fkimi-vl-a3b-67f67b6ac91d3b03d382dd85\">\u003Cimg src=\"https:\u002F\u002Fhuggingface.co\u002Ffront\u002Fassets\u002Fhuggingface_logo-noborder.svg\" height=\"16\" width=\"16\" style=\"vertical-align:middle\">\u003Cb> HuggingFace\u003C\u002Fb>\n  \u003C\u002Fa> |\n  \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking\u002F\">💬\u003Cb>与最新版 Kimi-VL（2506）对话\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fdiv>\n\n\n## 1. 
引言\n\n我们推出了 **Kimi-VL**，这是一款高效的开源专家混合（MoE）视觉语言模型（VLM），具备 **先进的多模态推理能力、长上下文理解能力以及强大的智能体功能**——同时其语言解码器仅激活了 **28亿** 参数（Kimi-VL-A3B）。\n\nKimi-VL 在多个具有挑战性的领域中表现出色：\n作为一款通用的视觉语言模型，Kimi-VL 在多轮智能体交互任务（如 OSWorld）中表现卓越，取得了与旗舰模型相媲美的最先进成果。\n此外，它在各类复杂的视觉语言任务中也展现了非凡的能力，包括大学水平的图像和视频理解、光学字符识别（OCR）、数学推理、多图像理解等。\n\n在对比评估中，Kimi-VL 能够有效与 GPT-4o-mini、Qwen2.5-VL-7B 和 Gemma-3-12B-IT 等前沿高效视觉语言模型竞争，并在多个专业领域超越 GPT-4o。\n\nKimi-VL 还在处理长上下文和清晰感知方面推动了多模态模型的帕累托前沿：配备 128K 的扩展上下文窗口，Kimi-VL 可以处理长篇且多样化的输入，在 LongVideoBench 上获得 64.5 分，在 MMLongBench-Doc 上获得 35.1 分；其原生分辨率视觉编码器 MoonViT 进一步使其能够看清并理解超高分辨率的视觉输入，在 InfoVQA 上获得 83.2 分，在 ScreenSpot-Pro 上获得 34.5 分，同时在常规视觉输入和一般任务中保持较低的计算成本。\n\n在此基础上，我们推出了一个先进的长思维变体：**Kimi-VL-Thinking**。该模型通过长链式思维（CoT）监督微调（SFT）和强化学习（RL）训练而成，展现出强大的长时序推理能力。它在 MMMU 上获得 61.7 分，MathVision 上获得 36.8 分，MathVista 上获得 71.3 分，同时保持紧凑的 28亿激活 LLM 参数规模，为高效而强大的多模态 **思考** 模型树立了新标杆。\n\n\n\u003Ci>除了原始模型变体外，我们还提供了一个新的 [Kimi-VL-A3B-Thinking-2506](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506) 变体，具备多项全新或改进的能力：\n- 思维更智能，消耗更少的令牌：2506 版本在多模态推理基准测试中达到了更高的准确率：MathVision 上为 56.9 分（提升 20.1 分），MathVista 上为 80.1 分（提升 8.4 分），MMMU-Pro 上为 46.3 分（提升 3.2 分），MMMU 上为 64.0 分（提升 2.1 分），同时平均减少了 20% 的思考长度。\n- 思维时也能看得更清楚：与之前专注于思考任务的版本不同，2506 版本在一般的视觉感知和理解方面也能达到相同甚至更好的效果，例如在 MMBench-EN-v1.1 上得分为 84.4 分，MMStar 上得分为 70.4 分，RealWorldQA 上得分为 70.0 分，MMVet 上得分为 78.4 分，均优于非思考版本（Kimi-VL-A3B-Instruct）。\n- 应用于视频场景：新的 2506 版本在视频推理和理解方面的表现也有所提升。它在 VideoMMMU 基准上创下了开源模型的新纪录（65.2 分），同时在一般视频理解方面仍保持良好表现（Video-MME 得分为 71.9 分）。\n- 支持更高分辨率：新的 2506 版本支持单张图像 320 万像素（1792x1792），是原始版本的 4 倍。这带来了高分辨率感知和 OS 智能体定位基准测试中的显著提升：V* 基准测试得分为 83.2 分（无需额外工具），ScreenSpot-Pro 得分为 52.8 分，OSWorld-G 得分为 52.5 分（完整拒绝设置）。\n\u003C\u002Fi>\n\n\n\n## 2. 架构\n\n该模型采用 MoE 语言模型、原生分辨率视觉编码器（MoonViT）以及 MLP 投影器，如下图所示。\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"90%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_c7b1ef2dd980.png\">\n\u003C\u002Fdiv>\n\n## 3. 新闻\n\n- 2025年6月21日：发布 Kimi-VL-A3B-Thinking-2506：[技术博客 & 食谱](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fmoonshotai\u002Fkimi-vl-a3b-thinking-2506)，[🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506)\n- 2025年4月15日：[vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm) 已支持 Kimi-VL 的部署。详情请参阅 [#16387](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm\u002Fpull\u002F16387)。\n- 2025年4月14日：[LLaMA-Factory](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) 已支持 Kimi-VL 的微调。详情请参阅 [#7719](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory\u002Fpull\u002F7719)。\n\n## 4. 
模型变体\n\n🤗 对于常见的多模态感知与理解、OCR、长视频和长文档、视频感知以及 OS 智能体应用，我们推荐使用 `Kimi-VL-A3B-Instruct` 进行高效推理；与此同时，我们的新型思考版本 `Kimi-VL-A3B-Thinking-2506` 也在多模态感知、长视频和长文档以及 OS 智能体定位方面表现出色，同时具备更强的多模态推理能力。更多信息请参阅 [这篇博客](https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fmoonshotai\u002Fkimi-vl-a3b-thinking-2506)。\n\n\u003Cdiv align=\"center\">\n\n| **模型** | **总参数量** | **激活参数量** | **上下文长度** | **下载链接** |\n| :------------: | :------------: | :------------: | :------------: | :------------: |\n| 🔥Kimi-VL-A3B-Thinking-2506  | 160亿 | 30亿 |  128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506)   |\n| Kimi-VL-A3B-Instruct | 160亿 | 30亿 | 128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Instruct)   |\n| Kimi-VL-A3B-Thinking（已弃用）  | 160亿 | 30亿 |  128K   | [🤗 Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking)   |\n\n\u003C\u002Fdiv>\n\n> [!注意]\n> 推荐参数设置：\n> - 对于 **思考模型**，建议将温度设置为 0.8。\n> - 对于 **指令模型**，建议将温度设置为 0.2。\n\n\n### Hugging Face 演示\n\n> 🤗 我们在 Hugging Face 空间中提供了模型演示：\n> - 与 **Kimi-VL-A3B-Thinking-2506** 👀🤔🗺️🎬📖🖥️ 模型（集思考、通用理解、解谜、智能体、视频、PDF 于一体）进行对话，请访问 \u003Ca href=\"https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking\u002F\">聊天网页\u003C\u002Fa>。\n\n## 5. 性能\n\n> [!注]\n> 请参阅 Kimi-VL-A3B-Thinking-2506 在 [Hugging Face](https:\u002F\u002Fhuggingface.co\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking-2506#2-performance) 上的性能表现。\n\n作为一款高效的模型，Kimi-VL 能够稳健地处理多样化的任务（细粒度感知、数学、大学水平问题、OCR、智能体等），并支持广泛的输入形式（单张图像、多张图像、视频、长文档等）。\n\n与现有的 10B 级密集型 VLM 以及 DeepSeek-VL2（A4.5B）的简要对比：\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"100%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_016c25c3f9ab.png\">\n\u003C\u002Fdiv>\n\n凭借强大的长序列思考能力，Kimi-VL-A3B-Thinking（2504 版本）在 MathVision 基准测试上可达到 30B\u002F70B 级前沿开源 VLM 的性能水平：\n\n\u003Cdiv align=\"center\">\n  \u003Cimg width=\"100%\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_readme_9e4378aac536.png\">\n\u003C\u002Fdiv>\n\n\n## 6. 
示例用法\n\n### 环境搭建\n\n```bash\nconda create -n kimi-vl python=3.10 -y\nconda activate kimi-vl\npip install -r requirements.txt\n```\n\n> [!注]\n> 如果遇到显存不足或希望加快推理速度，请使用 `pip install flash-attn --no-build-isolation` 安装 **flash-attn**。\n\n### 使用 Hugging Face Transformers 进行推理\n\n我们介绍如何使用 transformers 库在推理阶段调用我们的模型。建议使用 python=3.10、torch=2.5.1 和 transformers=4.51.3 作为开发环境。\n\n#### Kimi-VL-A3B-Instruct：\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Instruct\"\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# 如果已安装 flash-attn，建议设置 torch_dtype=torch.bfloat16 和 attn_implementation=\"flash_attention_2\"\n# 以节省显存并加速推理\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path)\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_path}, {\"type\": \"text\", \"text\": \"图片中的穹顶建筑是什么？请逐步思考。\"}]}\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=image, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\ngenerated_ids = model.generate(**inputs, max_new_tokens=512)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\nprint(response)\n```\n\n#### Kimi-VL-A3B-Thinking-2506：\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Thinking-2506\"\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# 如果已安装 flash-attn，建议设置 torch_dtype=torch.bfloat16 和 attn_implementation=\"flash_attention_2\"\n# 以节省显存和加速推理\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_paths = [\".\u002Ffigures\u002Fdemo1.png\", \".\u002Ffigures\u002Fdemo2.png\"]\nimages = [Image.open(path) for path in image_paths]\nmessages = [\n    {\n        \"role\": \"user\",\n        \"content\": [\n            {\"type\": \"image\", \"image\": image_path} for image_path in image_paths\n        ] + [{\"type\": \"text\", \"text\": \"请逐步推断这份手稿属于谁，记录了什么内容\"}],\n    },\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=images, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\ngenerated_ids = model.generate(**inputs, max_new_tokens=32768, temperature=0.8)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\nprint(response)\n```\n\n## 7. 微调\n\nKimi-VL 与开源社区紧密合作，现已通过最新版本的 [LLaMA-Factory](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) 提供对高效微调的无缝支持。\n\n该框架支持单 GPU LoRA 微调（需 50GB 显存），以及使用 DeepSpeed ZeRO-2 的多 GPU 全量\u002FLoRA 微调。更多详细配置说明，请参阅 [此 PR](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory\u002Fpull\u002F7719#issue-2992644288)。\n\n## 8. 部署\n\n### 使用 vLLM\n\n[vLLM 主分支](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)已支持 Kimi-VL 的部署。欢迎使用 vLLM 部署 Kimi-VL。\n\n#### 离线推理\n\n> [!注]\n> 关于 `离线推理` 的更多用法，请参阅 [vLLM 离线推理](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fserving\u002Foffline_inference.html)。\n\n```python\nfrom PIL import Image\nfrom transformers import AutoProcessor\nfrom vllm import LLM, SamplingParams\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Instruct\"  # 或 \"moonshotai\u002FKimi-VL-A3B-Thinking-2506\"\nllm = LLM(\n    model_path,\n    trust_remote_code=True,\n)\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path)\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_path}, {\"type\": \"text\", \"text\": \"图片中的穹顶建筑是什么？请逐步思考。\"}]}\n]\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\noutputs = llm.generate([{\"prompt\": text, \"multi_modal_data\": {\"image\": image}}], sampling_params = SamplingParams(max_tokens=512))\n\nprint(\"-\" * 50)\nfor o in outputs:\n    generated_text = o.outputs[0].text\n    print(generated_text)\n    print(\"-\" * 50)\n```\n\n#### OpenAI 兼容服务器\n\n> [!注]\n> 关于 `OpenAI 兼容服务器` 的更多用法，请参阅 [vLLM OpenAI 兼容服务器](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Flatest\u002Fserving\u002Fopenai_compatible_server.html#)。\n\n使用 `vllm serve` 命令部署 Kimi-VL：\n\n```bash\n# 如果需要更长的上下文窗口，可以将 --max-model-len 和 --max-num-batched-tokens 设置为 131072\n# 如果需要处理更多的输入图像，可以将 --limit-mm-per-prompt 设置为 image=256 或 512\n\n# kimi-vl-thinking-2506\nvllm serve moonshotai\u002FKimi-VL-A3B-Thinking-2506 --served-model-name kimi-vl-thinking-2506 --trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 32768 --limit-mm-per-prompt image=64\n\n# kimi-vl-instruct\nvllm serve moonshotai\u002FKimi-VL-A3B-Instruct --served-model-name kimi-vl --trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 32768 --max-model-len 32768 --limit-mm-per-prompt image=64\n```\n\n调用 API\n\n```python\nimport base64\nfrom PIL import Image\nfrom io import BytesIO\nfrom openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http:\u002F\u002Flocalhost:8000\u002Fv1\",\n    api_key=\"token-abc123\",\n)\n\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path).convert(\"RGB\")\n\nbuffered = BytesIO()\nimage.save(buffered, format=\"JPEG\")\nimg_b64_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\nbase64_image_url = f\"data:image\u002Fjpeg;base64,{img_b64_str}\"\n\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image_url\", \"image_url\": {\"url\": base64_image_url}}, {\"type\": \"text\", \"text\": \"图片中的穹顶建筑是什么？请逐步思考。\"}]}\n]\n\ncompletion = client.chat.completions.create(\n  model=\"kimi-vl-thinking-2506\", # 或者 kimi-vl\n  messages=messages\n)\n\nprint(completion.choices[0].message)\n```\n\n## 9. 
引用\n\n```\n@misc{kimiteam2025kimivltechnicalreport,\n      title={{Kimi-VL} 技术报告}, \n      author={Kimi 团队 和 Angang Du 和 Bohong Yin 和 Bowei Xing 和 Bowen Qu 和 Bowen Wang 和 Cheng Chen 和 Chenlin Zhang 和 Chenzhuang Du 和 Chu Wei 和 Congcong Wang 和 Dehao Zhang 和 Dikang Du 和 Dongliang Wang 和 Enming Yuan 和 Enzhe Lu 和 Fang Li 和 Flood Sung 和 Guangda Wei 和 Guokun Lai 和 Han Zhu 和 Hao Ding 和 Hao Hu 和 Hao Yang 和 Hao Zhang 和 Haoning Wu 和 Haotian Yao 和 Haoyu Lu 和 Heng Wang 和 Hongcheng Gao 和 Huabin Zheng 和 Jiaming Li 和 Jianlin Su 和 Jianzhou Wang 和 Jiaqi Deng 和 Jiezhong Qiu 和 Jin Xie 和 Jinhong Wang 和 Jingyuan Liu 和 Junjie Yan 和 Kun Ouyang 和 Liang Chen 和 Lin Sui 和 Longhui Yu 和 Mengfan Dong 和 Mengnan Dong 和 Nuo Xu 和 Pengyu Cheng 和 Qizheng Gu 和 Runjie Zhou 和 Shaowei Liu 和 Sihan Cao 和 Tao Yu 和 Tianhui Song 和 Tongtong Bai 和 Wei Song 和 Weiran He 和 Weixiao Huang 和 Weixin Xu 和 Xiaokun Yuan 和 Xingcheng Yao 和 Xingzhe Wu 和 Xinxing Zu 和 Xinyu Zhou 和 Xinyuan Wang 和 Y. Charles 和 Yan Zhong 和 Yang Li 和 Yangyang Hu 和 Yanru Chen 和 Yejie Wang 和 Yibo Liu 和 Yibo Miao 和 Yidao Qin 和 Yimin Chen 和 Yiping Bao 和 Yiqin Wang 和 Yongsheng Kang 和 Yuanxin Liu 和 Yulun Du 和 Yuxin Wu 和 Yuzhi Wang 和 Yuzi Yan 和 Zaida Zhou 和 Zhaowei Li 和 Zhejun Jiang 和 Zheng Zhang 和 Zhilin Yang 和 Zhiqi Huang 和 Zihao Huang 和 Zijia Zhao 和 Ziwei Chen},\n      year={2025},\n      eprint={2504.07491},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.07491}, \n}\n```","# Kimi-VL 快速上手指南\n\nKimi-VL 是一款高效的开源混合专家（MoE）视觉语言模型，具备强大的多模态推理、长上下文理解及智能体能力。其语言解码器仅激活约 2.8B 参数，却能在多项基准测试中媲美旗舰模型。本指南将帮助您快速部署并使用该模型。\n\n## 1. 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐) 或 macOS\n*   **Python 版本**: 3.10 (官方推荐开发环境)\n*   **硬件要求**:\n    *   支持 CUDA 的 NVIDIA GPU (建议使用显存 ≥ 24GB 以运行全精度或大上下文任务)\n    *   若需加速推理并降低显存占用，强烈建议安装 `flash-attn`\n*   **核心依赖版本参考**:\n    *   `torch`: 2.5.1\n    *   `transformers`: 4.51.3\n\n## 2. 安装步骤\n\n### 2.1 创建虚拟环境\n推荐使用 Conda 创建独立的 Python 环境：\n\n```bash\nconda create -n kimi-vl python=3.10 -y\nconda activate kimi-vl\n```\n\n### 2.2 安装依赖库\n克隆项目仓库（如有 requirements.txt）或直接安装必要库。若遇到显存不足或希望加速推理，请优先安装 `flash-attn`。\n\n**安装 Flash Attention (可选但推荐):**\n```bash\npip install flash-attn --no-build-isolation\n```\n\n**安装核心依赖:**\n```bash\n# 假设已下载源码目录，执行：\npip install -r requirements.txt\n\n# 或者手动安装核心库：\npip install torch==2.5.1 transformers==4.51.3 pillow\n```\n\n> **注意**: 国内用户若下载缓慢，可使用清华源或阿里源加速：\n> `pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n## 3. 
基本使用\n\nKimi-VL 提供两种主要变体，请根据场景选择：\n*   **Kimi-VL-A3B-Instruct**: 适用于通用多模态感知、OCR、长文档\u002F视频理解及智能体任务。推荐温度设置 `Temperature = 0.2`。\n*   **Kimi-VL-A3B-Thinking-2506**: 适用于复杂数学推理、深度逻辑思考任务。支持更高分辨率输入。推荐温度设置 `Temperature = 0.8`。\n\n以下示例基于 `transformers` 库。\n\n### 3.1 使用 Instruct 版本 (通用任务)\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Instruct\"\n\n# 加载模型\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# 若已安装 flash-attn，建议取消下方注释以节省显存并加速\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\n# 准备输入\nimage_path = \".\u002Ffigures\u002Fdemo.png\"\nimage = Image.open(image_path)\nmessages = [\n    {\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_path}, {\"type\": \"text\", \"text\": \"What is the dome building in the picture? Think step by step.\"}]}\n]\n\n# 处理输入\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=image, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\n\n# 生成回复\ngenerated_ids = model.generate(**inputs, max_new_tokens=512)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\n\nprint(response)\n```\n\n### 3.2 使用 Thinking-2506 版本 (复杂推理\u002F多图)\n\n此版本支持多图输入及更长思维链，适合高难度推理任务。\n\n```python\nimport torch\nfrom PIL import Image\nfrom transformers import AutoModelForCausalLM, AutoProcessor\n\nmodel_path = \"moonshotai\u002FKimi-VL-A3B-Thinking-2506\"\n\n# 加载模型\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=\"auto\",\n    device_map=\"auto\",\n    trust_remote_code=True,\n)\n# 若已安装 flash-attn，建议启用以下配置\n# model = AutoModelForCausalLM.from_pretrained(\n#     model_path,\n#     torch_dtype=torch.bfloat16,\n#     device_map=\"auto\",\n#     trust_remote_code=True,\n#     attn_implementation=\"flash_attention_2\"\n# )\n\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\n\n# 准备多图输入\nimage_paths = [\".\u002Ffigures\u002Fdemo1.png\", \".\u002Ffigures\u002Fdemo2.png\"]\nimages = [Image.open(path) for path in image_paths]\nmessages = [\n    {\n        \"role\": \"user\",\n        \"content\": [\n            {\"type\": \"image\", \"image\": image_path} for image_path in image_paths\n        ] + [{\"type\": \"text\", \"text\": \"Please infer step by step who this manuscript belongs to and what it records\"}],\n    },\n]\n\n# 处理输入\ntext = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\")\ninputs = processor(images=images, text=text, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\n\n# 生成回复 (Thinking 模型建议设置 temperature=0.8 且允许更多 token)\ngenerated_ids = model.generate(**inputs, max_new_tokens=32768, temperature=0.8)\ngenerated_ids_trimmed = [\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n]\nresponse = processor.batch_decode(\n    generated_ids_trimmed, 
skip_special_tokens=True, clean_up_tokenization_spaces=False\n)[0]\n\nprint(response)\n```\n\n### 3.3 微调与部署提示\n*   **微调**: 官方已支持通过 [LLaMA-Factory](https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory) 进行高效微调（支持单卡 LoRA 及多卡 DeepSpeed ZeRO-2）。\n*   **高速部署**: 生产环境推荐使用 [vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm) 进行部署，已获得官方支持。","某金融科技团队需要每日自动解析数百页包含复杂图表、公式及长篇幅文字说明的上市公司财报 PDF，并提取关键风险指标生成分析报告。\n\n### 没有 Kimi-VL 时\n- **长文档理解断裂**：传统模型受限于上下文窗口，处理几十页的财报时经常“遗忘”前文信息，导致跨章节的数据关联分析出错。\n- **高清图表识别模糊**：面对财报中高分辨率的复杂趋势图或精细表格，普通视觉模型因压缩输入分辨率而丢失细节，无法准确读取微小数值。\n- **复杂推理能力不足**：对于涉及多步计算的财务公式推导或隐含逻辑判断，现有工具只能做简单的文字摘录，无法进行深度的数学推理。\n- **人工复核成本高**：由于自动化准确率不稳定，分析师必须花费大量时间人工二次核对机器提取的结果，效率提升有限。\n\n### 使用 Kimi-VL 后\n- **超长上下文精准掌控**：借助 Kimi-VL 原生的 128K 上下文窗口，模型能一次性完整消化整本百页财报，精准定位并关联分散在不同章节的风险线索。\n- **原生高清视觉感知**：利用 MoonViT 编码器，Kimi-VL 直接处理超高清晰度图像，无损识别报表中微小的数据点和复杂的工程图纸细节。\n- **深度思维链推理**：调用 Kimi-VL-Thinking 变体，模型能通过长思维链自主拆解复杂的财务计算逻辑，准确完成从数据提取到趋势预测的多步推理。\n- **端到端自动化闭环**：凭借在专业基准测试中的卓越表现，输出结果可直接用于决策支持，将人工复核工作量降低了 90%，真正实现无人值守分析。\n\nKimi-VL 通过结合超长上下文理解、高清视觉感知与深度推理能力，将复杂的 multimodal 文档分析从“辅助阅读”升级为“自主决策”。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FMoonshotAI_Kimi-VL_016c25c3.png","MoonshotAI","Moonshot AI","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FMoonshotAI_691dfd9a.jpg","",null,"https:\u002F\u002Fmoonshot.ai","https:\u002F\u002Fgithub.com\u002FMoonshotAI",1175,76,"2026-04-09T10:17:01","MIT","未说明","需要 NVIDIA GPU（隐含，因依赖 flash-attn 和 CUDA），推荐显存 50GB+ 用于全量微调，单卡 LoRA 微调需 50GB VRAM；推理建议使用支持 bfloat16 的显卡并安装 flash-attn 以节省显存",{"notes":87,"python":88,"dependencies":89},"模型总参数量 16B，激活参数约 3B，支持 128K 上下文窗口。推理时强烈建议安装 flash-attn 并设置 dtype 为 bfloat16 以优化显存和速度。微调支持单卡 LoRA（需 50GB 显存）或多卡 DeepSpeed ZeRO-2。Thinking 版本推荐 Temperature=0.8，Instruct 版本推荐 Temperature=0.2。","3.10",[90,91,92,93,94],"torch==2.5.1","transformers==4.51.3","flash-attn","Pillow","accelerate",[35,15,96,13,97],"视频","其他","2026-03-27T02:49:30.150509","2026-04-19T15:38:19.857106",[101,106,111,116,121,126,131,136],{"id":102,"question_zh":103,"answer_zh":104,"source_url":105},42457,"运行官方演示代码时提示 \"kimi_vl\" model_type 不受支持，如何解决？","通常是因为导入模型的路径写错或 transformers 版本问题。请确保使用 pip 安装最新版本的 transformers（无需安装开发版），并检查模型路径是否正确。如果问题依旧，尝试升级 transformers 到 4.48.2 或更高版本。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F8",{"id":107,"question_zh":108,"answer_zh":109,"source_url":110},42458,"加载模型时显存溢出（CUDA out of memory），即使使用多张高端显卡也无法启动，怎么办？","这通常是因为未启用 Flash Attention 导致显存占用过高。请在加载模型时设置 `attn_implementation=\"flash_attention_2\"`。示例代码如下：\n```python\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    torch_dtype=torch.bfloat16,\n    device_map=\"auto\",\n    trust_remote_code=True,\n    attn_implementation=\"flash_attention_2\"\n)\n```\n同时请参考官方 README 中的用法说明开启 Flash Attention。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F51",{"id":112,"question_zh":113,"answer_zh":114,"source_url":115},42459,"Kimi-VL-A3B-Thinking 模型在多轮对话中表现异常或输出随机内容，如何解决？","在进行多轮对话时，必须从所有历史记录中移除 \"think\" 部分（即 `\u003Cthink>` 和 `\u003C\u002Fthink>` 之间的内容）。如果不移除思考过程，会导致模型行为异常。具体逻辑可参考 Hugging Face Demo 的代码实现：https:\u002F\u002Fhuggingface.co\u002Fspaces\u002Fmoonshotai\u002FKimi-VL-A3B-Thinking\u002Fblob\u002Fmain\u002Fkimi_vl\u002Fserve\u002Finference.py#L116","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F50",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},42460,"模型出现随机回答、输出无关内容或无限生成无法停止的情况，如何处理？","这可能是由于本地缓存的模型文件过旧或损坏导致的。请尝试重新从 Hugging Face 
下载最新的模型权重文件覆盖本地缓存，然后再次运行。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F40",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},42461,"是否有 Kimi-VL 模型的微调（Finetuning）示例代码？","有的。你可以参考项目中的 PR #23 获取微调示例，或者参考 LLaMA-Factory 项目中关于 Kimi-VL 的微调实现（Issue #7680）。相关链接：\n1. https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fpull\u002F23\n2. https:\u002F\u002Fgithub.com\u002Fhiyouga\u002FLLaMA-Factory\u002Fissues\u002F7680","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F16",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},42462,"运行 Demo 时报错，提示 transformers 版本相关错误，该如何修复？","请将 transformers 库升级到 4.48.2 版本或以上，该版本修复了相关的兼容性问题。可以使用命令 `pip install --upgrade transformers` 进行升级。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F14",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},42463,"如何使用 vllm 部署 Kimi-VL 模型？遇到无输出或 GPU 利用率低的问题怎么办？","如果遇到部署后无输出或 GPU 利用率低的情况，请首先确认是否按照官方文档正确配置了 OpenAI 兼容的服务端参数。建议参考官方 README 中关于 \"OpenAI Compatible Server\" 的章节进行排查：https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL?tab=readme-ov-file#openai-compatible-server。同时确保已正确安装并启用了 flash-attention。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F30",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},42464,"Kimi-VL 模型支持视频推理吗？有没有最佳实践或帧率（fps）建议？","目前官方正在完善视频推理的相关文档和示例（Cookbooks）。在加载模型时，建议强制使用 `flash_attention_2` 以支持更多帧数的上下文。虽然理想情况下希望能支持 32 或 64 帧，但具体支持的帧数和最佳 fps 取决于显存大小和分辨率，建议先尝试降低分辨率或少量帧数进行测试。","https:\u002F\u002Fgithub.com\u002FMoonshotAI\u002FKimi-VL\u002Fissues\u002F7",[]]