[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-google--gemma.cpp":3,"tool-google--gemma.cpp":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",142651,2,"2026-04-06T23:34:12",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 
助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":81,"stars":107,"forks":108,"last_commit_at":109,"license":110,"difficulty_score":111,"env_os":112,"env_gpu":113,"env_ram":114,"env_deps":115,"category_tags":122,"github_topics":76,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":123,"updated_at":124,"faqs":125,"releases":154},4742,"google\u002Fgemma.cpp","gemma.cpp","lightweight, standalone C++ inference engine for Google's Gemma models.","gemma.cpp 是谷歌推出的一个轻量级、独立的 C++ 推理引擎，专为运行 Gemma 系列大语言模型而设计。它旨在填补传统部署型运行时与高度抽象的 Python 研究框架之间的空白，为开发者提供一个极简且透明的代码环境。\n\n对于希望深入理解模型底层运作或进行算法协同设计的研究人员和开发者而言，gemma.cpp 解决了现有工具要么过于复杂难以修改、要么过度封装无法触及底层计算的痛点。其核心代码仅约 2000 行，依赖极少，非常便于嵌入其他项目或直接进行二次开发。值得注意的是，该项目主要面向实验与研究场景，而非直接的生产环境部署。\n\n在技术亮点方面，gemma.cpp 利用 Google Highway 库实现了可移植的 SIMD 加速，支持在 CPU 上高效运行混合精度计算（如 
fp8、bf16），并集成了自定义的权重压缩技术。此外，它不仅支持前向推理，还独特地提供了反向传播（VJP）和 Adam 优化器功能，使其成为探索模型训练机制的理想平台。无论是想尝试大模型底层优化的工程师，还是需要进行快速原型验证的科研人员，gemma.cpp 都是一个简洁而强大的选择。","# gemma.cpp\n\ngemma.cpp is a lightweight, standalone C++ inference engine for the Gemma\nfoundation models from Google.\n\nFor additional information about Gemma, see\n[ai.google.dev\u002Fgemma](https:\u002F\u002Fai.google.dev\u002Fgemma). Model weights, including\ngemma.cpp specific artifacts, are\n[available on kaggle](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma-2).\n\n## Who is this project for?\n\nModern LLM inference engines are sophisticated systems, often with bespoke\ncapabilities extending beyond traditional neural network runtimes. With this\ncomes opportunities for research and innovation through co-design of high level\nalgorithms and low-level computation. However, there is a gap between\ndeployment-oriented C++ inference runtimes, which are not designed for\nexperimentation, and Python-centric ML research frameworks, which abstract away\nlow-level computation through compilation.\n\ngemma.cpp provides a minimalist implementation of Gemma-2, Gemma-3, and\nPaliGemma-2 models, focusing on simplicity and directness rather than full\ngenerality. This is inspired by vertically-integrated model implementations such\nas [ggml](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fggml),\n[llama.c](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002Fllama2.c), and\n[llama.rs](https:\u002F\u002Fgithub.com\u002Fsrush\u002Fllama2.rs).\n\ngemma.cpp targets experimentation and research use cases. It is intended to be\nstraightforward to embed in other projects with minimal dependencies and also\neasily modifiable with a small ~2K LoC core implementation (along with ~4K LoC\nof supporting utilities). 
We use the [Google\nHighway](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fhighway) Library to take advantage of\nportable SIMD for CPU inference.\n\nFor production-oriented edge deployments we recommend standard deployment\npathways using Python frameworks like JAX, Keras, PyTorch, and Transformers\n([all model variations here](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma)).\n\n## Contributing\n\nCommunity contributions large and small are welcome. See\n[DEVELOPERS.md](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fblob\u002Fmain\u002FDEVELOPERS.md)\nfor additional notes contributing developers and [join the discord by following\nthis invite link](https:\u002F\u002Fdiscord.gg\u002FH5jCBAWxAe). This project follows\n[Google's Open Source Community\nGuidelines](https:\u002F\u002Fopensource.google.com\u002Fconduct\u002F).\n\n> [!NOTE] Active development is currently done on the `dev` branch. Please open\n> pull requests targeting `dev` branch instead of `main`, which is intended to\n> be more stable.\n\n## What's inside?\n\n-   LLM\n\n    -   CPU-only inference for: Gemma 2-3, PaliGemma 2.\n    -   Sampling with TopK and temperature.\n    -   Backward pass (VJP) and Adam optimizer for Gemma research.\n\n-   Optimizations\n\n    -   Mixed-precision (fp8, bf16, fp32, fp64 bit) GEMM:\n        -   Designed for BF16 instructions, can efficiently emulate them.\n        -   Automatic runtime autotuning 7 parameters per matrix shape.\n    -   Weight compression integrated directly into GEMM:\n        -   Custom fp8 format with 2..3 mantissa bits; tensor scaling.\n        -   Also bf16, f32 and non-uniform 4-bit (NUQ); easy to add new formats.\n\n-   Infrastructure\n\n    -   SIMD: single implementation via Highway. 
Chooses ISA at runtime.\n    -   Tensor parallelism: CCX-aware, multi-socket thread pool.\n    -   Disk I\u002FO: memory map or parallel read (heuristic with user override).\n    -   Custom format with forward\u002Fbackward-compatible metadata serialization.\n    -   Model conversion from Safetensors, not yet open sourced.\n    -   Portability: Linux, Windows\u002FOS X supported. CMake\u002FBazel. 'Any' CPU.\n\n-   Frontends\n\n    -   C++ APIs with streaming for single query and batched inference.\n    -   Basic interactive command-line app.\n    -   Basic Python bindings (pybind11).\n\n## Quick Start\n\n### System requirements\n\nBefore starting, you should have installed:\n\n- [CMake](https:\u002F\u002Fcmake.org\u002F)\n- [Clang C++ compiler](https:\u002F\u002Fclang.llvm.org\u002Fget_started.html), supporting at\n  least C++17.\n- `tar` for extracting archives from Kaggle.\n\nBuilding natively on Windows requires the Visual Studio 2012 Build Tools with the\noptional Clang\u002FLLVM C++ frontend (`clang-cl`). This can be installed from the\ncommand line with\n[`winget`](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fwindows\u002Fpackage-manager\u002Fwinget\u002F):\n\n```sh\nwinget install --id Kitware.CMake\nwinget install --id Microsoft.VisualStudio.2022.BuildTools --force --override \"--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset\"\n```\n\n### Step 1: Obtain model weights and tokenizer from Kaggle or Hugging Face Hub\n\nVisit the\n[Kaggle page for Gemma-2](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma-2\u002FgemmaCpp)\nand select `Model Variations |> Gemma C++`.\n\nOn this tab, the `Variation` dropdown includes the options below. Note bfloat16\nweights are higher fidelity, while 8-bit switched floating point weights enable\nfaster inference. 
In general, we recommend starting with the `-sfp` checkpoints.\n\n> [!NOTE] **Important**: We strongly recommend starting off with the\n> `gemma2-2b-it-sfp` model to get up and running.\n\nGemma 2 models are named `gemma2-2b-it` for 2B and `9b-it` or `27b-it`. See the\n`ModelPrefix` function in `configs.cc`.\n\n### Step 2: Extract Files\n\nAfter filling out the consent form, the download should proceed to retrieve a\ntar archive file `archive.tar.gz`. Extract files from `archive.tar.gz` (this can\ntake a few minutes):\n\n```\ntar -xf archive.tar.gz\n```\n\nThis should produce a file containing model weights such as `2b-it-sfp.sbs` and\na tokenizer file (`tokenizer.spm`). You may want to move these files to a\nconvenient directory location (e.g. the `build\u002F` directory in this repo).\n\n### Step 3: Build\n\nThe build system uses [CMake](https:\u002F\u002Fcmake.org\u002F). To build the gemma inference\nruntime, create a build directory and generate the build files using `cmake`\nfrom the top-level project directory. Note if you previous ran `cmake` and are\nre-running with a different setting, be sure to delete all files in the `build\u002F`\ndirectory with `rm -rf build\u002F*`.\n\n#### Unix-like Platforms\n```sh\ncmake -B build\n```\n\nAfter running `cmake`, you can enter the `build\u002F` directory and run `make` to\nbuild the `.\u002Fgemma` executable:\n\n```sh\n# Configure `build` directory\ncmake --preset make\n\n# Build project using make\ncmake --build --preset make -j [number of parallel threads to use]\n```\n\nReplace `[number of parallel threads to use]` with a number - the number of\ncores available on your system is a reasonable heuristic. For example, `make -j4\ngemma` will build using 4 threads. 
If the `nproc` command is available, you can\nuse `make -j$(nproc) gemma` as a reasonable default for the number of threads.\n\nIf you aren't sure of the right value for the `-j` flag, you can simply run\n`make gemma` instead and it should still build the `.\u002Fgemma` executable.\n\n> [!NOTE]\n> On Windows Subsystem for Linux (WSL) users should set the number of\n> parallel threads to 1. Using a larger number may result in errors.\n\nIf the build is successful, you should now have a `gemma` executable in the\n`build\u002F` directory.\n\n#### Windows\n\n```sh\n# Configure `build` directory\ncmake --preset windows\n\n# Build project using Visual Studio Build Tools\ncmake --build --preset windows -j [number of parallel threads to use]\n```\n\nIf the build is successful, you should now have a `gemma.exe` executable in the\n`build\u002F` directory.\n\n#### Bazel\n\n```sh\nbazel build -c opt --cxxopt=-std=c++20 :gemma\n```\n\nIf the build is successful, you should now have a `gemma` executable in the\n`bazel-bin\u002F` directory.\n\n#### Make\n\nIf you prefer Makefiles, @jart has made one available here:\n\nhttps:\u002F\u002Fgithub.com\u002Fjart\u002Fgemma3\u002Fblob\u002Fmain\u002FMakefile\n\n### Step 4: Run\n\nYou can now run `gemma` from inside the `build\u002F` directory.\n\n`gemma` has the following required arguments:\n\nArgument      | Description                  | Example value\n------------- | ---------------------------- | ---------------\n`--weights`   | The compressed weights file. | `2b-it-sfp.sbs`\n`--tokenizer` | The tokenizer file.          
| `tokenizer.spm`\n\nExample invocation for the following configuration:\n\n-   weights file `gemma2-2b-it-sfp.sbs` (Gemma2 2B instruction-tuned model,\n    8-bit switched floating point).\n-   Tokenizer file `tokenizer.spm` (can omit for single-format weights files\n    created after 2025-05-06, or output by migrate_weights.cc).\n\n```sh\n.\u002Fgemma \\\n--tokenizer tokenizer.spm --weights gemma2-2b-it-sfp.sbs\n```\n\n### PaliGemma Vision-Language Model\n\nThis repository includes a version of the PaliGemma 2 VLM\n([paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.03555)). We provide a C++ implementation of\nthe PaliGemma 2 model here.\n\nTo use the version of PaliGemma included in this repository, build the gemma\nbinary as noted above in Step 3. Download the compressed weights and tokenizer\nfrom\n[Kaggle](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fpaligemma-2\u002FgemmaCpp\u002Fpaligemma2-3b-mix-224)\nand run the binary as follows:\n\n```sh\n.\u002Fgemma \\\n--tokenizer paligemma_tokenizer.model \\\n--weights paligemma2-3b-mix-224-sfp.sbs \\\n--image_file paligemma\u002Ftestdata\u002Fimage.ppm\n```\n\nNote that the image reading code is very basic to avoid depending on an image\nprocessing library for now. We currently only support reading binary PPMs (P6).\nSo use a tool like `convert` to first convert your images into that format, e.g.\n\n`convert image.jpeg -resize 224x224^ image.ppm`\n\n(As the image will be resized for processing anyway, we can already resize at\nthis stage for slightly faster loading.)\n\nThe interaction with the image (using the mix-224 checkpoint) may then look\nsomething like this:\n\n```\n> Describe the image briefly\nA large building with two towers in the middle of a city.\n> What type of building is it?\nchurch\n> What color is the church?\ngray\n> caption image\nA large building with two towers stands tall on the water's edge. The building\nhas a brown roof and a window on the side. 
A tree stands in front of the\nbuilding, and a flag waves proudly from its top. The water is calm and blue,\nreflecting the sky above. A bridge crosses the water, and a red and white boat\nrests on its surface. The building has a window on the side, and a flag on top.\nA tall tree stands in front of the building, and a window on the building is\nvisible from the water. The water is green, and the sky is blue.\n```\n\n### Migrating to single-file format\n\nThere is now a new format for the weights file, which is a single file that\nallows to contain the tokenizer (and the model type) directly. A tool to migrate\nfrom the multi-file format to the single-file format is available.\n\n```sh\nio\u002Fmigrate_weights \\\n  --tokenizer ...\u002Ftokenizer.spm --weights ...\u002Fgemma2-2b-it-sfp.sbs \\\n  --output_weights ...\u002Fgemma2-2b-it-sfp-single.sbs\n```\n\nAfter migration, you can omit the tokenizer argument like this:\n\n```sh\n.\u002Fgemma --weights ...\u002Fgemma2-2b-it-sfp-single.sbs\n```\n\n### Troubleshooting and FAQs\n\n**Problems building in Windows \u002F Visual Studio**\n\nCurrently if you're using Windows, we recommend building in WSL (Windows\nSubsystem for Linux). We are exploring options to enable other build\nconfigurations, see issues for active discussion.\n\n**Model does not respond to instructions and produces strange output**\n\nA common issue is that you are using a pre-trained model, which is not\ninstruction-tuned and thus does not respond to instructions. Make sure you are\nusing an instruction-tuned model (`gemma2-2b-it-sfp`) and not a pre-trained\nmodel (any model with a `-pt` suffix).\n\n**What sequence lengths are supported?**\n\nSee `max_seq_len` in `configs.cc` and `InferenceArgs.seq_len`. For the Gemma 3\nmodels larger than 1B, this is typically 32K but 128K would also work given\nenough RAM. 
Note that long sequences will be slow due to the quadratic cost of\nattention.\n\n**How do I convert my fine-tune to a `.sbs` compressed model file?**\n\nFor PaliGemma 2 checkpoints, you can use python\u002Fconvert_from_safetensors.py to\nconvert from safetensors format (tested with building via bazel). For an adapter\nmodel, you will likely need to call merge_and_unload() to convert the adapter\nmodel to a single-file format before converting it.\n\nHere is how to use it using a bazel build of the compression library assuming\nlocally installed (venv) torch, numpy, safetensors, absl-py, etc.:\n\n```sh\nbazel build \u002F\u002Fcompression\u002Fpython:compression\nBAZEL_OUTPUT_DIR=\"${PWD}\u002Fbazel-bin\u002Fcompression\"\npython3 -c \"import site; print(site.getsitepackages())\"\n# Use your sites-packages file here:\nln -s $BAZEL_OUTPUT_DIR [...]\u002Fsite-packages\u002Fcompression\npython3 python\u002Fconvert_from_safetensors.py --load_path [...].safetensors.index.json\n```\n\n**What are some easy ways to make the model run faster?**\n\n1.  Make sure you are using the 8-bit switched floating point `-sfp` models.\n    These are half the size of bf16 and thus use less memory bandwidth and cache\n    space.\n2.  Due to auto-tuning, the second and especially third query will be faster.\n3.  If you're on a laptop, make sure power mode is set to maximize performance\n    and saving mode is **off**. For most laptops, the power saving modes get\n    activated automatically if the computer is not plugged in.\n4.  Close other unused cpu-intensive applications.\n5.  
On macs, anecdotally we observe a \"warm-up\" ramp-up in speed as performance\n    cores get engaged.\n\nWe're also working on algorithmic and optimization approaches for faster\ninference, stay tuned.\n\n## Usage\n\n`gemma` has different usage modes, controlled by the verbosity flag.\n\nAll usage modes are currently interactive, triggering text generation upon\nnewline input.\n\n| Verbosity       | Usage mode | Details                                       |\n| --------------- | ---------- | --------------------------------------------- |\n| `--verbosity 0` | Minimal | Only prints generation output. Suitable as a CLI tool. |\n| `--verbosity 1` | Default | Standard user-facing terminal UI. |\n| `--verbosity 2` | Detailed | Shows additional developer and debug info. |\n\n### Interactive Terminal App\n\nBy default, verbosity is set to 1, bringing up a terminal-based interactive\ninterface when `gemma` is invoked:\n\n```sh\n$ .\u002Fgemma [...]\n  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n \u002F _` |\u002F _ \\ '_ ` _ \\| '_ ` _ \\ \u002F _` | \u002F __| '_ \\| '_ \\\n| (_| |  __\u002F | | | | | | | | | | (_| || (__| |_) | |_) |\n \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__\u002F| .__\u002F\n  __\u002F |                                    | |   | |\n |___\u002F                                     |_|   |_|\n\n...\n\n*Usage*\n  Enter an instruction and press enter (%C reset conversation, %Q quits).\n\n*Examples*\n  - Write an email to grandma thanking her for the cookies.\n  - What are some historical attractions to visit around Massachusetts?\n  - Compute the nth fibonacci number in javascript.\n  - Write a standup comedy bit about WebGPU programming.\n\n> What are some outdoorsy places to visit around Boston?\n\n[ Reading prompt ] .....................\n\n\n**Boston Harbor and Islands:**\n\n* **Boston Harbor Islands National and State Park:** Explore pristine beaches, wildlife, and maritime history.\n* **Charles River Esplanade:** Enjoy scenic 
views of the harbor and city skyline.\n* **Boston Harbor Cruise Company:** Take a relaxing harbor cruise and admire the city from a different perspective.\n* **Seaport Village:** Visit a charming waterfront area with shops, restaurants, and a seaport museum.\n\n**Forest and Nature:**\n\n* **Forest Park:** Hike through a scenic forest with diverse wildlife.\n* **Quabbin Reservoir:** Enjoy boating, fishing, and hiking in a scenic setting.\n* **Mount Forest:** Explore a mountain with breathtaking views of the city and surrounding landscape.\n\n...\n```\n\n### Usage as a Command Line Tool\n\nFor using the `gemma` executable as a command line tool, it may be useful to\ncreate an alias for gemma.cpp with arguments fully specified:\n\n```sh\nalias gemma2b=\"~\u002Fgemma.cpp\u002Fbuild\u002Fgemma -- --tokenizer ~\u002Fgemma.cpp\u002Fbuild\u002Ftokenizer.spm --weights ~\u002Fgemma.cpp\u002Fbuild\u002Fgemma2-2b-it-sfp.sbs --verbosity 0\"\n```\n\nReplace the above paths with your own paths to the model and tokenizer paths\nfrom the download.\n\nHere is an example of prompting `gemma` with a truncated input\nfile (using a `gemma2b` alias like defined above):\n\n```sh\ncat configs.h | tail -n 35 | tr '\\n' ' ' | xargs -0 echo \"What does this C++ code do: \" | gemma2b\n```\n\n> [!NOTE]\n> CLI usage of gemma.cpp is experimental and should take context length\n> limitations into account.\n\nThe output of the above command should look like:\n\n```sh\n[ Reading prompt ] [...]\nThis C++ code snippet defines a set of **constants** used in a large language model (LLM) implementation, likely related to the **attention mechanism**.\n\nLet's break down the code:\n[...]\n```\n\n### Incorporating gemma.cpp as a Library in your Project\n\nThe easiest way to incorporate gemma.cpp in your own project is to pull in\ngemma.cpp and dependencies using `FetchContent`. 
You can add the following to\nyour CMakeLists.txt:\n\n```\ninclude(FetchContent)\n\nFetchContent_Declare(sentencepiece GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fsentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)\nFetchContent_MakeAvailable(sentencepiece)\n\nFetchContent_Declare(gemma GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp GIT_TAG origin\u002Fmain)\nFetchContent_MakeAvailable(gemma)\n\nFetchContent_Declare(highway GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fhighway.git GIT_TAG 2a16a50ff61071bb25ddef0ce35d92b0e2b9c579)\nFetchContent_MakeAvailable(highway)\n```\n\nNote for the gemma.cpp `GIT_TAG`, you may replace `origin\u002Fmain` for a specific\ncommit hash if you would like to pin the library version.\n\nAfter your executable is defined (substitute your executable name for\n`[Executable Name]` below):\n\n```\ntarget_link_libraries([Executable Name] libgemma hwy hwy_contrib sentencepiece)\nFetchContent_GetProperties(gemma)\nFetchContent_GetProperties(sentencepiece)\ntarget_include_directories([Executable Name] PRIVATE ${gemma_SOURCE_DIR})\ntarget_include_directories([Executable Name] PRIVATE ${sentencepiece_SOURCE_DIR})\n```\n\n### Building gemma.cpp as a Library\n\ngemma.cpp can also be used as a library dependency in your own project. The\nshared library artifact can be built by modifying the make invocation to build\nthe `libgemma` target instead of `gemma`.\n\n> [!NOTE]\n> If you are using gemma.cpp in your own project with the `FetchContent` steps\n> in the previous section, building the library is done automatically by `cmake`\n> and this section can be skipped.\n\nFirst, run `cmake`:\n\n```sh\ncmake -B build\n```\n\nThen, run `make` with the `libgemma` target:\n\n```sh\ncd build\nmake -j [number of parallel threads to use] libgemma\n```\n\nIf this is successful, you should now have a `libgemma` library file in the\n`build\u002F` directory. 
On Unix platforms, the filename is `libgemma.a`.\n\n## Independent Projects Using gemma.cpp\n\nSome independent projects using gemma.cpp:\n\n- [gemma-cpp-python - Python bindings](https:\u002F\u002Fgithub.com\u002Fnamtranase\u002Fgemma-cpp-python)\n- [lua-cgemma - Lua bindings](https:\u002F\u002Fgithub.com\u002Fufownl\u002Flua-cgemma)\n- [Godot engine demo project](https:\u002F\u002Fgithub.com\u002FRliop913\u002FGemma-godot-demo-project)\n\nIf you would like to have your project included, feel free to get in touch or\nsubmit a PR with a `README.md` edit.\n\n## Acknowledgements and Contacts\n\ngemma.cpp was started in fall 2023 by\n[Austin Huang](mailto:austinvhuang@google.com) and\n[Jan Wassenberg](mailto:janwas@google.com), and subsequently released February\n2024 thanks to contributions from Phil Culliton, Paul Chang, and Dan Zheng.\n\nGriffin support was implemented in April 2024 thanks to contributions by Andrey\nMikhaylov, Eugene Kliuchnikov, Jan Wassenberg, Jyrki Alakuijala, Lode\nVandevenne, Luca Versari, Martin Bruse, Phil Culliton, Sami Boukortt, Thomas\nFischbacher and Zoltan Szabadka. 
It was removed in 2025-09.\n\nGemma-2 support was implemented in June\u002FJuly 2024 with the help of several\npeople.\n\nPaliGemma support was implemented in September 2024 with contributions from\nDaniel Keysers.\n\n[Jan Wassenberg](mailto:janwas@google.com) has continued to contribute many\nimprovements, including major gains in efficiency, since the initial release.\n\nThis is not an officially supported Google product.\n","# gemma.cpp\n\ngemma.cpp 是一个轻量级的、独立的 C++ 推理引擎，用于运行来自 Google 的 Gemma 基础模型。\n\n有关 Gemma 的更多信息，请参阅 [ai.google.dev\u002Fgemma](https:\u002F\u002Fai.google.dev\u002Fgemma)。模型权重，包括 gemma.cpp 特有的文件，可在 [kaggle](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma-2) 上获取。\n\n## 本项目适合哪些人群？\n\n现代大型语言模型（LLM）推理引擎是高度复杂的系统，通常具备超越传统神经网络运行时的定制化功能。这为通过高层算法与底层计算的协同设计进行研究和创新提供了机会。然而，面向部署的 C++ 推理运行时并不适合实验性开发；而以 Python 为中心的机器学习研究框架则通过编译抽象了底层计算细节，两者之间存在一定的差距。\n\ngemma.cpp 提供了 Gemma-2、Gemma-3 和 PaliGemma-2 模型的极简实现，专注于简洁性和直接性，而非全面通用性。这一设计灵感来源于垂直整合的模型实现，例如 [ggml](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fggml)、[llama.c](https:\u002F\u002Fgithub.com\u002Fkarpathy\u002Fllama2.c) 和 [llama.rs](https:\u002F\u002Fgithub.com\u002Fsrush\u002Fllama2.rs) 等项目。\n\ngemma.cpp 主要面向实验和研究场景。它易于嵌入到其他项目中，依赖性极低，并且核心实现代码量较小（约 2K 行），加上约 4K 行的支持工具代码，便于修改和扩展。我们使用 [Google Highway](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fhighway) 库来利用可移植的 SIMD 指令进行 CPU 推理。\n\n对于生产环境下的边缘部署，我们建议采用标准的 Python 框架（如 JAX、Keras、PyTorch 和 Transformers）来进行部署，这些框架支持所有版本的 Gemma 模型（详见 [kaggle](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma)）。\n\n## 贡献说明\n\n欢迎社区成员以各种方式参与贡献。更多关于开发者贡献的信息，请参阅 [DEVELOPERS.md](https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fblob\u002Fmain\u002FDEVELOPERS.md)，并可通过此邀请链接加入我们的 Discord 社区：[discord.gg\u002FH5jCBAWxAe]。本项目遵循 [Google 开源社区行为准则](https:\u002F\u002Fopensource.google.com\u002Fconduct\u002F)。\n\n> [!注意] 目前的活跃开发工作在 `dev` 分支上进行。请将拉取请求提交至 `dev` 分支，而不是 `main` 分支，后者旨在保持更高的稳定性。\n\n## 项目内容概览\n\n- **LLM**\n  - 仅限 CPU 的推理：Gemma 
2-3、PaliGemma 2。\n  - 使用 TopK 和温度采样。\n  - 支持反向传播（VJP）和 Adam 优化器，适用于 Gemma 的研究用途。\n\n- **优化技术**\n  - 混合精度（fp8、bf16、fp32、fp64）矩阵乘法：\n    - 针对 BF16 指令设计，能够高效模拟其行为。\n    - 自动运行时调优，每种矩阵形状有 7 个参数可供调整。\n  - 权重压缩直接集成到 GEMM 中：\n    - 自定义 fp8 格式，具有 2–3 位尾数；张量缩放。\n    - 同时支持 bf16、f32 和非均匀 4 位量化（NUQ）；易于添加新格式。\n\n- **基础设施**\n  - SIMD：通过 Highway 实现单一接口，在运行时选择合适的指令集。\n  - 张量并行：支持 CCX 感知和多插槽线程池。\n  - 磁盘 I\u002FO：支持内存映射或并行读取，并提供用户可覆盖的启发式策略。\n  - 自定义格式，支持正向和反向兼容的元数据序列化。\n  - 模型转换功能基于 Safetensors，但尚未开源。\n  - 跨平台支持：Linux、Windows 和 OS X 均可运行。使用 CMake 或 Bazel 构建，兼容任意 CPU。\n\n- **前端接口**\n  - C++ API，支持单次查询和批量推理的流式处理。\n  - 基本的交互式命令行应用。\n  - 基本的 Python 绑定（使用 pybind11）。\n\n## 快速入门\n\n### 系统要求\n\n在开始之前，您需要安装以下工具：\n\n- [CMake](https:\u002F\u002Fcmake.org\u002F)\n- [Clang C++ 编译器](https:\u002F\u002Fclang.llvm.org\u002Fget_started.html)，至少支持 C++17。\n- `tar` 工具，用于解压 Kaggle 下载的归档文件。\n\n在 Windows 上原生构建需要 Visual Studio 2012 Build Tools，并安装可选的 Clang\u002FLLVM C++ 前端 (`clang-cl`)。可以通过命令行使用 [winget](https:\u002F\u002Flearn.microsoft.com\u002Fen-us\u002Fwindows\u002Fpackage-manager\u002Fwinget\u002F) 安装：\n\n```sh\nwinget install --id Kitware.CMake\nwinget install --id Microsoft.VisualStudio.2022.BuildTools --force --override \"--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset\"\n```\n\n### 第一步：从 Kaggle 或 Hugging Face Hub 获取模型权重和分词器\n\n访问 [Gemma-2 的 Kaggle 页面](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma-2\u002FgemmaCpp)，选择“Model Variations”中的“Gemma C++”。\n\n在该选项卡中，“Variation”下拉菜单包含以下选项。请注意，bfloat16 权重具有更高的保真度，而 8 位浮点权重则能加速推理。总体而言，我们建议从 `-sfp` 检查点开始。\n\n> [!注意] **重要提示**：强烈建议从 `gemma2-2b-it-sfp` 模型开始，以便快速上手。\n\nGemma 2 模型的命名规则为 `gemma2-2b-it` 对应 2B 参数量，`9b-it` 或 `27b-it` 则分别对应 9B 和 27B 参数量。具体命名规则请参考 `configs.cc` 文件中的 `ModelPrefix` 函数。\n\n### 第二步：解压文件\n\n填写同意书后，下载将生成一个名为 `archive.tar.gz` 的归档文件。使用以下命令解压该文件（可能需要几分钟）：\n\n```sh\ntar -xf 
archive.tar.gz\n```\n\n解压后会得到包含模型权重的文件，例如 `2b-it-sfp.sbs`，以及分词器文件 (`tokenizer.spm`)。您可以将这些文件移动到方便的位置，例如本项目的 `build\u002F` 目录。\n\n### 第三步：构建\n\n构建系统使用 CMake。要构建 gemma 推理运行时，首先创建一个构建目录，并从项目根目录运行 `cmake` 生成构建文件。请注意，如果您之前已经运行过 `cmake` 并希望使用不同的配置重新构建，请务必先删除 `build\u002F` 目录下的所有文件，命令如下：\n\n```sh\nrm -rf build\u002F*\n```\n\n#### 类 Unix 平台\n```sh\ncmake -B build\n```\n\n运行 `cmake` 后，进入 `build\u002F` 目录并执行 `make`，即可编译出 `.\u002Fgemma` 可执行文件：\n\n```sh\n# 配置 build 目录\ncmake --preset make\n\n# 使用 make 构建项目\ncmake --build --preset make -j [要使用的并行线程数]\n```\n\n将 `[要使用的并行线程数]` 替换为一个数字——系统可用的核心数是一个合理的参考值。例如，`make -j4 gemma` 将使用 4 个线程进行构建。如果 `nproc` 命令可用，可以使用 `make -j$(nproc) gemma` 作为线程数的合理默认值。\n\n如果你不确定 `-j` 标志的合适值，也可以直接运行 `make gemma`，它仍然会构建出 `.\u002Fgemma` 可执行文件。\n\n> [!NOTE]\n> 在 Windows Subsystem for Linux (WSL) 上的用户应将并行线程数设置为 1。使用更大的线程数可能会导致错误。\n\n如果构建成功，你现在应该在 `build\u002F` 目录下有一个 `gemma` 可执行文件。\n\n#### Windows\n\n```sh\n# 配置 `build` 目录\ncmake --preset windows\n\n# 使用 Visual Studio Build Tools 构建项目\ncmake --build --preset windows -j [要使用的并行线程数]\n```\n\n如果构建成功，你现在应该在 `build\u002F` 目录下有一个 `gemma.exe` 可执行文件。\n\n#### Bazel\n\n```sh\nbazel build -c opt --cxxopt=-std=c++20 :gemma\n```\n\n如果构建成功，你现在应该在 `bazel-bin\u002F` 目录下有一个 `gemma` 可执行文件。\n\n#### Make\n\n如果你更喜欢 Makefile，@jart 在这里提供了一个：\n\nhttps:\u002F\u002Fgithub.com\u002Fjart\u002Fgemma3\u002Fblob\u002Fmain\u002FMakefile\n\n### 第 4 步：运行\n\n现在你可以从 `build\u002F` 目录内运行 `gemma`。\n\n`gemma` 有以下必需参数：\n\n参数      | 描述                  | 示例值\n------------- | ---------------------------- | ---------------\n`--weights`   | 压缩的权重文件。 | `2b-it-sfp.sbs`\n`--tokenizer` | 分词器文件。          | `tokenizer.spm`\n\n以下配置的示例调用：\n\n-   权重文件 `gemma2-2b-it-sfp.sbs`（Gemma2 2B 指令微调模型，8 位切换浮点）。\n-   分词器文件 `tokenizer.spm`（对于 2025 年 5 月 6 日之后创建的单格式权重文件，或由 migrate_weights.cc 输出的文件，可以省略）。\n\n```sh\n.\u002Fgemma \\\n--tokenizer tokenizer.spm --weights gemma2-2b-it-sfp.sbs\n```\n\n### PaliGemma 视觉语言模型\n\n本仓库包含 PaliGemma 2 VLM 
的版本（[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.03555)）。我们在此提供了 PaliGemma 2 模型的 C++ 实现。\n\n要使用本仓库中包含的 PaliGemma 版本，请按照上述第 3 步构建 gemma 二进制文件。从 Kaggle 下载压缩的权重和分词器：\n\n[Kaggle](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fpaligemma-2\u002FgemmaCpp\u002Fpaligemma2-3b-mix-224)\n\n然后按如下方式运行二进制文件：\n\n```sh\n.\u002Fgemma \\\n--tokenizer paligemma_tokenizer.model \\\n--weights paligemma2-3b-mix-224-sfp.sbs \\\n--image_file paligemma\u002Ftestdata\u002Fimage.ppm\n```\n\n请注意，图像读取代码非常简单，暂时避免依赖图像处理库。目前我们仅支持读取二进制 PPM（P6）格式。因此，可以使用 `convert` 等工具先将图像转换为该格式，例如：\n\n`convert image.jpeg -resize 224x224^ image.ppm`\n\n（由于图像最终仍会被调整大小以供处理，因此在此阶段提前调整大小可以略微加快加载速度。）\n\n与图像交互（使用 mix-224 检查点）可能看起来像这样：\n\n```\n> 简要描述一下这张图片\n城市中央有一座带有两座塔的大楼。\n> 这是什么类型的建筑？\n教堂\n> 教堂是什么颜色的？\n灰色\n> 图片说明\n一座带有两座塔的大楼矗立在水边。大楼有一个棕色的屋顶和一侧的窗户。大楼前面有一棵树，顶部飘扬着一面旗帜。水面平静而湛蓝，映照着天空。一座桥横跨水面，一艘红白相间的船停在水面上。大楼侧面有一扇窗，顶部插着一面旗子。大楼前方有一棵高大的树，从水面上可以看到大楼的一侧窗户。水面呈绿色，天空是蓝色的。\n```\n\n### 迁移到单文件格式\n\n现在有一种新的权重文件格式，即单文件格式，可以直接包含分词器（以及模型类型）。有一个工具可用于将多文件格式迁移到单文件格式。\n\n```sh\nio\u002Fmigrate_weights \\\n  --tokenizer ...\u002Ftokenizer.spm --weights ...\u002Fgemma2-2b-it-sfp.sbs \\\n  --output_weights ...\u002Fgemma2-2b-it-sfp-single.sbs\n```\n\n迁移完成后，你可以省略分词器参数，如下所示：\n\n```sh\n.\u002Fgemma --weights ...\u002Fgemma2-2b-it-sfp-single.sbs\n```\n\n### 故障排除与常见问题解答\n\n**在 Windows \u002F Visual Studio 中构建时出现问题**\n\n目前，如果你使用的是 Windows，我们建议在 WSL（Windows Subsystem for Linux）中进行构建。我们正在探索其他构建配置的可能性，相关讨论请参见 issues。\n\n**模型对指令无反应并产生奇怪的输出**\n\n常见问题之一是你使用的是预训练模型，而不是经过指令微调的模型，因此无法响应指令。请确保你使用的是指令微调模型（`gemma2-2b-it-sfp`），而不是预训练模型（任何带有 `-pt` 后缀的模型）。\n\n**支持哪些序列长度？**\n\n请查看 `configs.cc` 和 `InferenceArgs.seq_len` 中的 `max_seq_len`。对于大于 1B 的 Gemma 3 模型，通常为 32K，但如果内存充足，128K 也可以工作。请注意，长序列由于注意力机制的二次复杂度，处理速度会较慢。\n\n**如何将我的微调模型转换为 `.sbs` 压缩模型文件？**\n\n对于 PaliGemma 2 检查点，你可以使用 python\u002Fconvert_from_safetensors.py 从 safetensors 格式转换（已通过 bazel 构建测试）。对于适配器模型，你可能需要调用 `merge_and_unload()` 将适配器模型转换为单文件格式，然后再进行转换。\n\n以下是使用本地安装了 (venv) 
torch、numpy、safetensors、absl-py 等库的情况下，通过 bazel 构建压缩库来使用的方法：\n\n```sh\nbazel build \u002F\u002Fcompression\u002Fpython:compression\nBAZEL_OUTPUT_DIR=\"${PWD}\u002Fbazel-bin\u002Fcompression\"\npython3 -c \"import site; print(site.getsitepackages())\"\n\n# 在此处使用你的 sites-packages 文件：\nln -s $BAZEL_OUTPUT_DIR [...]\u002Fsite-packages\u002Fcompression\npython3 python\u002Fconvert_from_safetensors.py --load_path [...].safetensors.index.json\n```\n\n**有哪些简单的方法可以让模型运行得更快呢？**\n\n1. 确保你使用的是 8 位切换浮点 `-sfp` 模型。\n   这些模型的大小是 bf16 的一半，因此占用更少的内存带宽和缓存空间。\n2. 由于自动调优机制，第二次及尤其是第三次查询会更快。\n3. 如果你在笔记本电脑上运行，请确保电源模式设置为最大化性能，\n   并且节能模式已关闭。大多数笔记本电脑在未插电时会自动启用节能模式。\n4. 关闭其他未使用的、占用大量 CPU 资源的应用程序。\n5. 根据经验，在 Mac 上，我们观察到随着性能核心被激活，速度会有一个“预热”提升的过程。\n\n我们也在研究算法和优化方法以提高推理速度，敬请期待。\n\n## 使用方法\n\n`gemma` 有不同的使用模式，由详细程度标志控制。\n\n目前所有使用模式都是交互式的，会在用户输入换行符后触发文本生成。\n\n| 详细程度       | 使用模式 | 说明                                       |\n| --------------- | ---------- | --------------------------------------------- |\n| `--verbosity 0` | 极简模式 | 只打印生成的文本内容。适合作为命令行工具。 |\n| `--verbosity 1` | 默认模式 | 标准的面向用户的终端界面。                 |\n| `--verbosity 2` | 详细模式 | 显示额外的开发者和调试信息。               |\n\n### 交互式终端应用\n\n默认情况下，详细程度设置为 1，调用 `gemma` 时会启动基于终端的交互界面：\n\n```sh\n$ .\u002Fgemma [...]\n  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n \u002F _` |\u002F _ \\ '_ ` _ \\| '_ ` _ \\ \u002F _` | \u002F __| '_ \\| '_ \\\n| (_| |  __\u002F | | | | | | | | | | (_| || (__| |_) | |_) |\n \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__\u002F| .__\u002F\n  __\u002F |                                    | |   | |\n |___\u002F                                     |_|   |_|\n\n...\n\n*使用说明*\n  输入指令并按回车键（%C 重置对话，%Q 退出）。\n\n*示例*\n  - 给奶奶写封邮件，感谢她送来的饼干。\n  - 马萨诸塞州周边有哪些值得参观的历史景点？\n  - 用 JavaScript 计算第 n 个斐波那契数。\n  - 写一段关于 WebGPU 编程的单口喜剧段子。\n\n> 波士顿周边有哪些适合户外活动的地方？\n\n[ 正在读取提示 ] .....................\n\n\n**波士顿港与群岛：**\n\n* **波士顿港国家与州立公园：** 探索原始海滩、野生动物和航海历史。\n* **查尔斯河滨步道：** 欣赏港口和城市天际线的迷人景色。\n* **波士顿港游轮公司：** 乘坐轻松的港口游轮，从不同角度欣赏城市风光。\n* 
**海港村：** 参观一个迷人的海滨区域，那里有商店、餐厅和海港博物馆。\n\n**森林与自然：**\n\n* **森林公园：** 徒步穿越风景优美的森林，观赏多样化的野生动物。\n* **夸宾水库：** 在风景如画的环境中划船、钓鱼和徒步旅行。\n* **森林山：** 探索一座可以俯瞰城市及周边景观的山峰。\n\n...\n```\n\n### 作为命令行工具使用\n\n如果想将 `gemma` 可执行文件用作命令行工具，可以为 gemma.cpp 创建一个带有完整参数的别名：\n\n```sh\nalias gemma2b=\"~\u002Fgemma.cpp\u002Fbuild\u002Fgemma -- --tokenizer ~\u002Fgemma.cpp\u002Fbuild\u002Ftokenizer.spm --weights ~\u002Fgemma.cpp\u002Fbuild\u002Fgemma2-2b-it-sfp.sbs --verbosity 0\"\n```\n\n请将上述路径替换为你自己下载的模型和分词器路径。\n\n以下是一个使用 `gemma2b` 别名（如上定义）来调用 `gemma` 并传入截断输入文件的示例：\n\n```sh\ncat configs.h | tail -n 35 | tr '\\n' ' ' | xargs -0 echo \"这段 C++ 代码的作用是什么：\" | gemma2b\n```\n\n> [!注意]\n> gemma.cpp 的命令行使用仍处于实验阶段，需考虑上下文长度的限制。\n\n上述命令的输出应如下所示：\n\n```sh\n[ 正在读取提示 ] [...]\n这段 C++ 代码片段定义了一组用于大型语言模型（LLM）实现的**常量**，很可能与**注意力机制**相关。\n\n让我们逐行解析这段代码：\n[...]\n```\n\n### 将 gemma.cpp 作为库集成到你的项目中\n\n将 gemma.cpp 集成到你自己的项目中最简单的方式是使用 `FetchContent` 引入 gemma.cpp 及其依赖项。你可以在 CMakeLists.txt 中添加以下内容：\n\n```\ninclude(FetchContent)\n\nFetchContent_Declare(sentencepiece GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fsentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)\nFetchContent_MakeAvailable(sentencepiece)\n\nFetchContent_Declare(gemma GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp GIT_TAG origin\u002Fmain)\nFetchContent_MakeAvailable(gemma)\n\nFetchContent_Declare(highway GIT_REPOSITORY https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fhighway.git GIT_TAG 2a16a50ff61071bb25ddef0ce35d92b0e2b9c579)\nFetchContent_MakeAvailable(highway)\n```\n\n请注意，对于 gemma.cpp 的 `GIT_TAG`，你可以将其替换为特定的提交哈希值，以便固定库的版本。\n\n在定义好你的可执行文件后（将 `[Executable Name]` 替换为你自己的可执行文件名称）：\n\n```\ntarget_link_libraries([Executable Name] libgemma hwy hwy_contrib sentencepiece)\nFetchContent_GetProperties(gemma)\nFetchContent_GetProperties(sentencepiece)\ntarget_include_directories([Executable Name] PRIVATE ${gemma_SOURCE_DIR})\ntarget_include_directories([Executable Name] PRIVATE ${sentencepiece_SOURCE_DIR})\n```\n\n### 将 
gemma.cpp 构建为库\n\ngemma.cpp 也可以作为库依赖项用于你自己的项目。只需修改构建命令，将目标改为 `libgemma` 而不是 `gemma` 即可构建共享库文件。\n\n> [!注意]\n> 如果你已经在前一节中通过 `FetchContent` 步骤将 gemma.cpp 集成到你的项目中，那么库的构建会由 `cmake` 自动完成，本节可以跳过。\n\n首先运行 `cmake`：\n\n```sh\ncmake -B build\n```\n\n然后使用 `libgemma` 目标运行 `make`：\n\n```sh\ncd build\nmake -j [要使用的并行线程数] libgemma\n```\n\n如果成功，你应该会在 `build\u002F` 目录下得到一个 `libgemma` 库文件。在 Unix 系统上，文件名为 `libgemma.a`。\n\n## 使用 gemma.cpp 的独立项目\n\n一些使用 gemma.cpp 的独立项目：\n\n- [gemma-cpp-python - Python 绑定](https:\u002F\u002Fgithub.com\u002Fnamtranase\u002Fgemma-cpp-python)\n- [lua-cgemma - Lua 绑定](https:\u002F\u002Fgithub.com\u002Fufownl\u002Flua-cgemma)\n- [Godot 引擎演示项目](https:\u002F\u002Fgithub.com\u002FRliop913\u002FGemma-godot-demo-project)\n\n如果你希望你的项目也被收录，请随时联系我们或提交包含 `README.md` 修改的 PR。\n\n## 致谢与联系方式\n\ngemma.cpp 项目于 2023 年秋季由 [Austin Huang](mailto:austinvhuang@google.com) 和 [Jan Wassenberg](mailto:janwas@google.com) 启动，并在 Phil Culliton、Paul Chang 和 Dan Zheng 等人的贡献下，于 2024 年 2 月正式发布。\n\nGriffin 支持于 2024 年 4 月实现，这得益于 Andrey Mikhaylov、Eugene Kliuchnikov、Jan Wassenberg、Jyrki Alakuijala、Lode Vandevenne、Luca Versari、Martin Bruse、Phil Culliton、Sami Boukortt、Thomas Fischbacher 和 Zoltan Szabadka 等人的贡献。该支持已于 2025 年 9 月移除。\n\nGemma-2 支持于 2024 年 6 月至 7 月期间，在多位人士的帮助下得以实现。\n\nPaliGemma 支持则于 2024 年 9 月实现，Daniel Keysers 为此做出了重要贡献。\n\n自首次发布以来，[Jan Wassenberg](mailto:janwas@google.com) 持续贡献了多项改进，其中包括显著提升效率。\n\n本项目并非 Google 官方支持的产品。","# gemma.cpp 快速上手指南\n\ngemma.cpp 是 Google Gemma 基础模型的轻量级独立 C++ 推理引擎，专为实验和研究场景设计。它支持 Gemma-2、Gemma-3 和 PaliGemma-2 模型，具有依赖少、代码简洁、易于嵌入等特点。\n\n## 环境准备\n\n在开始之前，请确保您的系统已安装以下依赖：\n\n*   **CMake**: 构建系统工具。\n*   **C++ 编译器**: 推荐使用 **Clang**，需支持至少 C++17 标准。\n*   **tar**: 用于解压从 Kaggle 下载的模型归档文件。\n\n### Windows 用户特别说明\n若在 Windows 原生环境构建，需要安装 Visual Studio 2022 Build Tools 并包含 Clang\u002FLLVM C++ 前端 (`clang-cl`)。可通过 `winget` 快速安装：\n\n```sh\nwinget install --id Kitware.CMake\nwinget install --id Microsoft.VisualStudio.2022.BuildTools --force --override \"--passive --wait --add 
Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset\"\n```\n\n> **建议**：Windows 用户推荐使用 **WSL (Windows Subsystem for Linux)** 进行构建，以获得更稳定的体验。\n\n## 安装步骤\n\n### 第一步：获取模型权重和分词器\n\n访问 [Kaggle Gemma-2 页面](https:\u002F\u002Fwww.kaggle.com\u002Fmodels\u002Fgoogle\u002Fgemma-2\u002FgemmaCpp) 或 Hugging Face Hub。\n1. 选择 `Model Variations` > `Gemma C++`。\n2. 在 `Variation` 下拉菜单中选择模型版本。\n   *   **推荐新手**：选择 `gemma2-2b-it-sfp`（8-bit switched floating point），推理速度更快且易于上手。\n   *   注意：带 `-it` 后缀的是指令微调模型，带 `-pt` 的是预训练模型（不支持指令交互）。\n3. 同意协议后下载 `archive.tar.gz` 文件。\n\n### 第二步：解压文件\n\n在下载目录执行以下命令解压（可能需要几分钟）：\n\n```sh\ntar -xf archive.tar.gz\n```\n\n解压后将得到权重文件（如 `2b-it-sfp.sbs`）和分词器文件（`tokenizer.spm`）。建议将它们移动到项目构建目录或方便访问的位置。\n\n### 第三步：编译项目\n\n在项目根目录下创建构建目录并生成构建文件。\n\n#### Linux \u002F macOS (Unix-like)\n\n```sh\n# 配置构建目录\ncmake -B build\n\n# 在项目根目录使用预设进行配置（预设需从根目录读取，请勿先 cd build）\ncmake --preset make\n# 使用多核编译，将 \u003C线程数> 替换为 CPU 核心数（如 4, 8），或进入 build\u002F 目录后运行 make gemma\ncmake --build --preset make -j \u003C线程数>\n```\n\n> **WSL 用户注意**：如果在 WSL 中遇到错误，请尝试将线程数设置为 1 (`-j 1`)。\n\n#### Windows (Visual Studio)\n\n```sh\n# 配置构建目录\ncmake --preset windows\n\n# 使用 Visual Studio Build Tools 编译\ncmake --build --preset windows -j \u003C线程数>\n```\n\n#### Bazel (可选)\n\n```sh\nbazel build -c opt --cxxopt=-std=c++20 :gemma\n```\n\n编译成功后，您将在 `build\u002F` (CMake) 或 `bazel-bin\u002F` (Bazel) 目录下找到 `gemma` (或 `gemma.exe`) 可执行文件。\n\n## 基本使用\n\n进入包含可执行文件的目录，即可运行推理。\n\n### 运行文本模型\n\n您需要指定权重文件和分词器文件。以下以 Gemma-2 2B 指令模型为例：\n\n```sh\n.\u002Fgemma \\\n--tokenizer tokenizer.spm \\\n--weights gemma2-2b-it-sfp.sbs\n```\n\n运行后进入交互式命令行，您可以直接输入提示词与模型对话。\n\n> **提示**：如果您使用了迁移后的单文件格式（包含分词器），可以省略 `--tokenizer` 参数：\n> `.\u002Fgemma --weights gemma2-2b-it-sfp-single.sbs`\n\n### 运行 PaliGemma 视觉语言模型\n\ngemma.cpp 也支持 PaliGemma-2 模型。首先从 Kaggle 下载对应的权重和分词器，并将图片转换为二进制 PPM 格式（P6）。\n\n1. 
**转换图片格式** (需安装 ImageMagick):\n   ```sh\n   convert image.jpeg -resize 224x224^ image.ppm\n   ```\n\n2. **运行推理**:\n   ```sh\n   .\u002Fgemma \\\n   --tokenizer paligemma_tokenizer.model \\\n   --weights paligemma2-3b-mix-224-sfp.sbs \\\n   --image_file paligemma\u002Ftestdata\u002Fimage.ppm\n   ```\n\n随后即可针对图片内容进行提问（例如：\"Describe the image briefly\"）。","某嵌入式 AI 研发团队需要在资源受限的工业边缘设备上，快速验证针对 Google Gemma-2 模型的自定义量化算法与微调策略。\n\n### 没有 gemma.cpp 时\n- **环境依赖沉重**：必须部署完整的 Python 深度学习栈（如 PyTorch\u002FJAX），在内存有限的边缘设备上极易因显存或内存不足导致崩溃。\n- **黑盒调试困难**：现有推理引擎将底层计算高度封装，研究人员无法直接修改矩阵乘法（GEMM）或反向传播逻辑，难以验证新型 fp8 量化格式的效果。\n- **跨平台移植复杂**：不同 CPU 架构（如 x86 与 ARM）需要编写特定的 SIMD 指令代码，适配过程耗时且容易出错，阻碍了算法的快速迭代。\n- **实验周期漫长**：从算法构思到在真实硬件上运行，往往需要数天时间进行环境配置和代码重构，严重拖慢研发节奏。\n\n### 使用 gemma.cpp 后\n- **轻量独立运行**：仅需一个约 2000 行核心代码的 C++ 单文件引擎，无需庞大 Python 环境即可直接在边缘设备 CPU 上高效运行 Gemma-2 模型。\n- **白盒深度定制**：直接开放底层 GEMM 实现与反向传播接口，团队可轻松嵌入自定义的 4-bit 非均匀量化逻辑并实时观察训练效果。\n- **自动指令优化**：内置 Google Highway 库自动识别 CPU 指令集并调用最优 SIMD 方案，一次编译即可在 Linux、Windows 及 macOS 多种架构上流畅运行。\n- **即时实验反馈**：借助极简的 C++ API 与 Python 绑定，研究人员能在几小时内完成从算法修改到端侧部署的全流程，大幅加速创新验证。\n\ngemma.cpp 通过极致的轻量化与代码透明度，填补了高层算法研究与底层高性能推理之间的鸿沟，让边缘侧的大模型创新变得触手可及。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgoogle_gemma.cpp_6b083bee.png","google","Google","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fgoogle_c4bedcda.png","Google ❤️ Open 
Source",null,"opensource@google.com","GoogleOSS","https:\u002F\u002Fopensource.google\u002F","https:\u002F\u002Fgithub.com\u002Fgoogle",[82,86,90,93,97,101,104],{"name":83,"color":84,"percentage":85},"C++","#f34b7d",92.3,{"name":87,"color":88,"percentage":89},"Starlark","#76d275",2.3,{"name":91,"color":92,"percentage":32},"Python","#3572A5",{"name":94,"color":95,"percentage":96},"C#","#178600",1.4,{"name":98,"color":99,"percentage":100},"C","#555555",0.7,{"name":102,"color":103,"percentage":100},"Shell","#89e051",{"name":105,"color":106,"percentage":100},"CMake","#DA3434",6810,613,"2026-04-06T18:37:03","Apache-2.0",4,"Linux, macOS, Windows","不需要 GPU，仅支持 CPU 推理（使用 SIMD 指令集）","未说明（取决于模型大小，2B\u002F3B 模型建议至少 8GB，更大模型需更多内存以支持长序列）",{"notes":116,"python":117,"dependencies":118},"该项目是纯 C++ 实现的轻量级推理引擎，专为 CPU 优化，利用 Google Highway 库实现可移植的 SIMD 加速。Windows 用户建议使用 WSL 进行构建，若原生构建需安装 Visual Studio Build Tools 及 Clang\u002FLLVM 前端。模型权重需从 Kaggle 下载（支持 fp8\u002Fbf16 等压缩格式），图像输入仅支持 PPM (P6) 格式。","可选（仅提供基础 Python 绑定，核心功能为 C++）",[105,119,120,121],"Clang C++ 编译器 (支持 C++17+)","Google Highway Library","tar",[35,14],"2026-03-27T02:49:30.150509","2026-04-07T09:46:49.145901",[126,131,136,141,145,150],{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},21542,"运行程序时遇到 AddressSanitizer 报错或内存访问错误（SEGV），特别是在处理长文本或大输入时，该如何解决？","这通常是由于配置参数与模型权重不匹配导致的。请检查 `configs.h` 文件，确保 `kVocabSize` 和 `kKVHeads` 的值与当前使用的模型权重版本一致。例如，对于当前的 Kaggle 2b-it 权重，需要将 `kVocabSize` 修改为 256128，将 `kKVHeads` 修改为 8。维护者正在更新权重以匹配默认配置，但在更新前需手动调整代码配置。","https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fissues\u002F99",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},21543,"在长上下文（Long Context）模式下，生成的令牌重复或无意义，疑似 KV Cache 损坏，这是什么原因？","这个问题可能出现在特定的提交版本（如 \"MatPtr-ify KV\"）之后。如果在短对话（少于三轮）中表现正常，但在多轮对话或特定提示下出现异常，可能是 KV Cache 逻辑的回归错误。建议尝试切换到 `RelWithDebInfo` 构建类型并启用 
ASAN（地址消毒剂）来复现断言失败，从而定位具体问题。如果确认是此问题，可能需要回退到之前的稳定版本或等待官方修复。","https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fissues\u002F608",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},21544,"如何在调试模式（Debug Mode）下排查分段错误（Segmentation Fault）？","当在调试模式下遇到分段错误时，首先注意 `allocator.h` 中的警告信息。可以通过使用 `-g1` 编译标志来获取更精确的堆栈跟踪信息，这将帮助定位到具体的函数（如 `TwoOfsMatVecLoop`）。此外，启用 MSAN（内存消毒剂）也有助于复现和诊断内存相关的崩溃问题。","https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fissues\u002F508",{"id":142,"question_zh":143,"answer_zh":144,"source_url":135},21545,"为什么现有的单元测试通过了，但实际运行中却发现了 KV Cache 的问题？","这是因为现有的 gemma\u002Fpaligemma 单元测试主要覆盖较短的上下文场景（通常少于三轮对话）。在这些短上下文中，输出可能只有细微差别而没有明显的崩溃或错误，因此测试能够通过。然而，在实际的多轮长对话应用中，KV Cache 的累积误差或逻辑缺陷会被放大，导致生成内容不符合预期。这表明需要增加针对长上下文和多轮对话的测试用例。",{"id":146,"question_zh":147,"answer_zh":148,"source_url":149},21546,"Google 是否会开源 Gemini Flash 模型？","目前 Google 尚未宣布开源 Gemini Flash 的具体计划。虽然社区强烈呼吁开放旧版本（如 1.5 系列）以促进研究和历史保存，类似于 Gemma 系列的发布策略，但截至当前，Gemini Flash 仍为专有模型。用户可以关注官方路线图，但目前无法获取其权重用于本地部署。","https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fissues\u002F221",{"id":151,"question_zh":152,"answer_zh":153,"source_url":130},21547,"在处理超大输入导致崩溃时，调用方如何获知错误并停止生成？","在旧版本中，当发生内存访问错误时，调用方可能无法感知并继续调用 `GenerateGemma()`，从而导致崩溃。修复后的版本（dev 分支）改进了错误通知机制。建议升级到最新的开发版本，以便在底层检测到错误时能够正确终止调用流程，避免无效的 `Attention()` 或 `Prefill()` 调用。",[155,160,165,170,175],{"id":156,"version":157,"summary_zh":158,"released_at":159},127533,"v0.1.4","## 变更内容\n* 由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F520 中重构 Gemma 构造函数，并改进池的 NUMA 支持\n* 由 @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F523 中修复 gemma3-1b 的提示换行问题\n* 由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F521 中添加关于注意力长度和 SFP 的说明\n* 由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F525 中新增对辅助 EOS 标记的支持\n* 由 @copybara-service 在 
https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F526 中更新应用程序参数文档\n* 由 @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F527 中为 Gemma2 设置辅助 EOS 标记\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fcompare\u002Fv0.1.3...v0.1.4","2025-03-25T18:04:03",{"id":161,"version":162,"summary_zh":163,"released_at":164},127534,"v0.1.3","- 支持 PaliGemma 2 和 Gemma 3。\n- 对 MatMul 及其相关操作进行了重大更新，代码库多个部分的性能显著提升。\n- 在许多领域对代码库进行了简化和重构。\n- 修复了若干 bug。\n\n## 变更内容\n* 添加更多算子：Sigmoid、(Two)MatVecAdd。优化了 TwoMatVec 的速度，由 @veluca93 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F129 中完成。\n* 改进权重处理方式，由 @veluca93 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F130 中完成。\n* 移除未使用的头文件包含，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F132 中完成。\n* 添加基准测试和额外的测试用例，由 @veluca93 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F131 中完成。\n* 引入 Griffin 实现，由 @pculliton 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F136 中完成。\n* 将配置中的 `NumGemmaLayers` 和 `NumGriffinLayers` 改为常量，由 @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F139 中完成。\n* 提及由 @jart 贡献的 Makefile，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F141 中完成。\n* 重构数据结构以减少内存占用，由 @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F142 中完成。\n* 增加存储各层激活输出的功能，由 @atorero 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F145 中完成。\n* 进一步优化 I\u002FO 操作，无需使用 -D 标志即可启用多种后端，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F148 中完成。\n* 使用 lambda 表达式拆分函数，并修复 stream_token 可能导致预填充中断的问题，由 @zeerd 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F156 中完成。\n* 简化预填充提前退出逻辑（原合并自 #156），由 @copybara-service 在 
https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F158 中完成。\n* 修复 NUQ ClusterCost() 中的下溢问题，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F162 中完成。\n* 为 Python 绑定添加错误检查，并补充缺失的头文件及 hwasan 检查，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F163 中完成。\n* 简化线程管理：移除 inner_pool 的使用，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F167 中完成。\n* 在 MQA 模式下的 QKV 投影中使用更多并行度，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F170 中完成。\n* 修复 MHA 配置下的 kv 偏移计算问题，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F172 中完成。\n* 在注意力模块的最终输出中使用更多并行度，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F175 中完成。\n* 在 MHA 模块的 QKV 投影中使用更多并行度，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F176 中完成。\n* 将 MatVecs 中 bf16 向量的去交错处理提取出来，由 @samkaufman 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F166 中完成。\n* 在预填充模式下，进一步提高注意力模块的并行度，由 @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F177 中完成。\n* 与 CMake 安装配合工作，由 @xinpingwang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F169 中完成。\n* 在 AVX3_DL+ 上，SFP 解码速度提升 2 倍（整体提升 1.4 倍），由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F178 中完成。\n* 支持额外的缩放功能，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma 中完成。","2025-03-14T16:15:04",{"id":166,"version":167,"summary_zh":168,"released_at":169},127535,"v0.1.2","- MQA 实现\n- 运算模块的重构与优化\n- 修复若干 bug\n- 模型导出脚本（`util\u002Fconvert_weights.py`）\n\n*重要提示*：随着 MQA 的实现，旧版 2B 模型的权重文件需要更新。请从 Kaggle 重新下载权重，并确保使用最新版本（-mqa 或版本 3）。\n\n## 变更内容\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F102 中清理了面向开发者的文档。\n* @ufownl 在 
https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F114 中为 2B 模型实现了 MQA。\n* @enum-class 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F105 中增强了 ops.h 中的工具函数。\n* @villesundell 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F115 中添加了 app.h 中缺失的一个空格。\n* @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F120 中修复了当 `HWY_COMPILER_GCC_ACTUAL \u003C 1300` 时的编译错误。\n* .bazelversion：Bazel 7.1.1，由 @LINKIWI 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F122 中指定。\n* @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F125 中添加了一个用于压缩权重的独立工具。\n* 性能提升 1.07 倍：按照 @veluca93 的建议，将 MQA 的并行部分合并，由 @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F126 中完成。\n* @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F127 中修复了生成代码和令牌流回调中的越界错误。\n\n## 新贡献者\n* @villesundell 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F115 中完成了首次贡献。\n* @LINKIWI 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F122 中完成了首次贡献。\n* @szabadka 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F125 中完成了首次贡献。\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fcompare\u002Fv0.1.1...v0.1.2","2024-04-05T02:52:10",{"id":171,"version":172,"summary_zh":173,"released_at":174},127536,"v0.1.1","- 重构库接口\n- 修复以支持 Android 和 Windows 构建，并对构建流程进行总体改进\n- Bazel 构建支持\n- CI 自动化\n- 允许使用 Hugging Face 或 Kaggle 下载模型文件（原仅支持 Kaggle）\n- 自 0.1.0 初始版本以来的大量小修复和体验优化\n\n## 变更内容\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F24 中将 Dev 分支同步至 Main 分支\n* @eltociear 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F22 中更新了 build.yml 文件\n* @shirayu 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F32 中修复了拼写错误\n* @dcoles 在 
https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F6 中实现了使用 `clang-cl` 工具链在 Windows 上构建的功能\n* @traversaro 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F3 中修改了 Release 构建，不再显式传递 `-O2` 编译器标志\n* @dan-zheng 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F35 中修复了构建问题\n* @kishida 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F34 中实现了会话重置功能\n* @dan-zheng 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F36 中将 BUILD 文件重命名为 BUILD.bazel\n* @shirayu 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F33 中添加了 `--eot_line` 选项\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F58 中清理了提交 `129e66ada2b4e461bdf28b88b70cd2465cb213e4` 后的代码格式\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F61 中修复了未使用的成员变量、类型转换及未使用的函数等警告\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F66 中改进了命令行参数、README 文档并进行了代码清理\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F63 中修复了 Android 平台 32 位 `off_t` 类型的问题，解决了 #62 问题\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F71 中添加了关于如何将 gemma 作为库使用的开发者说明\n* @enum-class 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F65 中引入了 clang-tidy 检查，修复了窄化问题和常量性问题\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F75 中增加了对 Bazel 构建的支持，解决了 #16 问题\n* @osanseviero 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F74 中添加了从 Hugging Face Hub 下载模型的说明\n* @ufownl 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F81 中将 KV 缓存与 GemmaImpl 分离\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F85 中避免在较旧的 Android 设备上使用 `fadvise`，解决了 #84 问题\n* @enum-class 在 
https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F78 中使用 HWY\u002FSIMD 进行 RMSNorm(f, bf, f) 计算\n* @enum-class 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F77 中使用 HWY SIMD 进行 SquaredL2 计算\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F92 中检测并打印构建类型，参考了 #88 问题\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F82 中重构了 libgemma API，使其与交互式 REPL 示例解耦，并添加了使用 libgemma 的“Hello World”示例\n* @austinvhuang 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F87 中进一步清理了 libgemma 重构后的代码\n* @copybara-service 在 https:\u002F\u002Fgithub.com\u002Fgoogle\u002Fgemma.cpp\u002Fpull\u002F92 中使用 bf16 舍入后的平方根来缩放嵌入向量，以匹配 Gemma 的表现","2024-03-15T18:18:27",{"id":176,"version":177,"summary_zh":178,"released_at":179},127537,"v0.1.0","初始发布。","2024-02-22T19:11:21"]