[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-abetlen--llama-cpp-python":3,"tool-abetlen--llama-cpp-python":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":79,"owner_location":80,"owner_email":79,"owner_twitter":76,"owner_website":79,"owner_url":81,"languages":82,"stars":103,"forks":104,"last_commit_at":105,"license":106,"difficulty_score":10,"env_os":107,"env_gpu":108,"env_ram":109,"env_deps":110,"category_tags":116,"github_topics":79,"view_count":23,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":117,"updated_at":118,"faqs":119,"releases":152},1406,"abetlen\u002Fllama-cpp-python","llama-cpp-python","Python bindings for llama.cpp","llama-cpp-python 是著名高性能推理引擎 llama.cpp 的官方 Python 接口，旨在让开发者能轻松在 Python 环境中调用本地大语言模型。它解决了直接在 Python 中高效运行大型模型的技术门槛问题，无需依赖昂贵的云端 GPU 服务，即可在普通电脑甚至树莓派上流畅运行量化后的模型。\n\n这款工具非常适合 AI 开发者、研究人员以及希望构建本地化智能应用的工程师使用。它不仅提供了底层的 C API 访问能力，还封装了易用的高级 Python 接口，支持类似 OpenAI 的调用格式，并能无缝对接 LangChain 和 LlamaIndex 等主流开发框架。此外，它还内置了一个兼容 OpenAI 标准的 Web 服务器，支持代码补全、函数调用及多模态视觉处理等进阶功能。\n\n其独特亮点在于灵活的硬件加速配置，用户可通过简单的环境变量或安装参数，轻松开启对 CPU、GPU（如 CUDA、Metal）等多种后端的支持，实现推理速度的最大化。无论是想快速原型验证，还是部署私有化知识库助手，llama-cpp-python 都能提供稳定且高效的底层支撑，是让大模型“落地","llama-cpp-python 是著名高性能推理引擎 llama.cpp 的官方 Python 接口，旨在让开发者能轻松在 Python 环境中调用本地大语言模型。它解决了直接在 Python 中高效运行大型模型的技术门槛问题，无需依赖昂贵的云端 GPU 服务，即可在普通电脑甚至树莓派上流畅运行量化后的模型。\n\n这款工具非常适合 AI 开发者、研究人员以及希望构建本地化智能应用的工程师使用。它不仅提供了底层的 C API 访问能力，还封装了易用的高级 Python 接口，支持类似 OpenAI 的调用格式，并能无缝对接 LangChain 和 LlamaIndex 等主流开发框架。此外，它还内置了一个兼容 OpenAI 标准的 Web 服务器，支持代码补全、函数调用及多模态视觉处理等进阶功能。\n\n其独特亮点在于灵活的硬件加速配置，用户可通过简单的环境变量或安装参数，轻松开启对 CPU、GPU（如 CUDA、Metal）等多种后端的支持，实现推理速度的最大化。无论是想快速原型验证，还是部署私有化知识库助手，llama-cpp-python 都能提供稳定且高效的底层支撑，是让大模型“落地”到本地的得力帮手。","\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Fraw.githubusercontent.com\u002Fabetlen\u002Fllama-cpp-python\u002Fmain\u002Fdocs\u002Ficon.svg\" style=\"height: 5rem; width: 5rem\">\n\u003C\u002Fp>\n\n#  Python Bindings for [`llama.cpp`](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp)\n\n[![Documentation 
Status](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fabetlen_llama-cpp-python_readme_6bf48b3e9a6d.png)](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002F?badge=latest)\n[![Tests](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Factions\u002Fworkflows\u002Ftest.yaml\u002Fbadge.svg?branch=main)](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Factions\u002Fworkflows\u002Ftest.yaml)\n[![PyPI](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - Python Version](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fpyversions\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - License](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fl\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - Downloads](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fabetlen_llama-cpp-python_readme_c86ac11b3594.png)](https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fllama-cpp-python)\n[![Github All Releases](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fdownloads\u002Fabetlen\u002Fllama-cpp-python\u002Ftotal.svg?label=Github%20Downloads)]()\n\nSimple Python bindings for **@ggerganov's** [`llama.cpp`](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) library.\nThis package provides:\n\n- Low-level access to C API via `ctypes` interface.\n- High-level Python API for text completion\n    - OpenAI-like API\n    - [LangChain compatibility](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fllms\u002Fllamacpp)\n    - [LlamaIndex compatibility](https:\u002F\u002Fdocs.llamaindex.ai\u002Fen\u002Fstable\u002Fexamples\u002Fllm\u002Fllama_2_llama_cpp.html)\n- OpenAI compatible web server\n    - [Local Copilot replacement](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#code-completion)\n    - [Function Calling support](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#function-calling)\n    - [Vision API support](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#multimodal-models)\n    - [Multiple Models](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#configuration-and-multi-model-support)\n\nDocumentation is available at [https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest).\n\n## Installation\n\nRequirements:\n\n  - Python 3.8+\n  - C compiler\n      - Linux: gcc or clang\n      - Windows: Visual Studio or MinGW\n      - MacOS: Xcode\n\nTo install the package, run:\n\n```bash\npip install llama-cpp-python\n```\n\nThis will also build `llama.cpp` from source and install it alongside this python package.\n\nIf this fails, add `--verbose` to the `pip install` command to see the full cmake build log.\n\n**Pre-built Wheel (New)**\n\nIt is also possible to install a pre-built wheel with basic CPU support.\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fcpu\n```\n\n### Installation Configuration\n\n`llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. 
See the [llama.cpp README](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp#build) for a full list.\n\nAll `llama.cpp` cmake build options can be set via the `CMAKE_ARGS` environment variable or via the `--config-settings \u002F -C` cli flag during installation.\n\n\u003Cdetails open>\n\u003Csummary>Environment Variables\u003C\u002Fsummary>\n\n```bash\n# Linux and Mac\nCMAKE_ARGS=\"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\" \\\n  pip install llama-cpp-python\n```\n\n```powershell\n# Windows\n$env:CMAKE_ARGS = \"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\"\npip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>CLI \u002F requirements.txt\u003C\u002Fsummary>\n\nThey can also be set via `pip install -C \u002F --config-settings` command and saved to a `requirements.txt` file:\n\n```bash\npip install --upgrade pip # ensure pip is up to date\npip install llama-cpp-python \\\n  -C cmake.args=\"-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS\"\n```\n\n```txt\n# requirements.txt\n\nllama-cpp-python -C cmake.args=\"-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS\"\n```\n\n\u003C\u002Fdetails>\n\n### Supported Backends\n\nBelow are some common backends, their build commands and any additional environment variables required.\n\n\u003Cdetails open>\n\u003Csummary>OpenBLAS (CPU)\u003C\u002Fsummary>\n\nTo install with OpenBLAS, set the `GGML_BLAS` and `GGML_BLAS_VENDOR` environment variables before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>CUDA\u003C\u002Fsummary>\n\nTo install with CUDA support, set the `GGML_CUDA=on` environment variable before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_CUDA=on\" pip install llama-cpp-python\n```\n\n**Pre-built Wheel (New)**\n\nIt is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:\n\n- CUDA Version is 12.1, 12.2, 12.3, 12.4 or 12.5\n- Python Version is 3.10, 3.11 or 3.12\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002F\u003Ccuda-version>\n```\n\nWhere `\u003Ccuda-version>` is one of the following:\n- `cu121`: CUDA 12.1\n- `cu122`: CUDA 12.2\n- `cu123`: CUDA 12.3\n- `cu124`: CUDA 12.4\n- `cu125`: CUDA 12.5\n\nFor example, to install the CUDA 12.1 wheel:\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fcu121\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>Metal\u003C\u002Fsummary>\n\nTo install with Metal (MPS), set the `GGML_METAL=on` environment variable before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_METAL=on\" pip install llama-cpp-python\n```\n\n**Pre-built Wheel (New)**\n\nIt is also possible to install a pre-built wheel with Metal support. 
Your system must meet the following requirements:\n\n- MacOS Version is 11.0 or later\n- Python Version is 3.10, 3.11 or 3.12\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fmetal\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>hipBLAS (ROCm)\u003C\u002Fsummary>\n\nTo install with hipBLAS \u002F ROCm support for AMD cards, set the `GGML_HIPBLAS=on` environment variable before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_HIPBLAS=on\" pip install llama-cpp-python\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>Vulkan\u003C\u002Fsummary>\n\nTo install with Vulkan support, set the `GGML_VULKAN=on` environment variable before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_VULKAN=on\" pip install llama-cpp-python\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>SYCL\u003C\u002Fsummary>\n\nTo install with SYCL support, set the `GGML_SYCL=on` environment variable before installing:\n\n```bash\nsource \u002Fopt\u002Fintel\u002Foneapi\u002Fsetvars.sh\nCMAKE_ARGS=\"-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>RPC\u003C\u002Fsummary>\n\nTo install with RPC support, set the `GGML_RPC=on` environment variable before installing:\n\n```bash\nCMAKE_ARGS=\"-DGGML_RPC=on\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\n### Windows Notes\n\n\u003Cdetails>\n\u003Csummary>Error: Can't find 'nmake' or 'CMAKE_C_COMPILER'\u003C\u002Fsummary>\n\nIf you run into issues where it complains it can't find `'nmake'` or `'CMAKE_C_COMPILER'`, you can extract w64devkit as [mentioned in the llama.cpp repo](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp#openblas) and add those manually to `CMAKE_ARGS` before running `pip install`:\n\n```ps\n$env:CMAKE_GENERATOR = \"MinGW Makefiles\"\n$env:CMAKE_ARGS = \"-DGGML_OPENBLAS=on -DCMAKE_C_COMPILER=C:\u002Fw64devkit\u002Fbin\u002Fgcc.exe -DCMAKE_CXX_COMPILER=C:\u002Fw64devkit\u002Fbin\u002Fg++.exe\"\n```\n\nSee the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.\n\u003C\u002Fdetails>\n\n### MacOS Notes\n\nDetailed MacOS Metal GPU install documentation is available at [docs\u002Finstall\u002Fmacos.md](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Finstall\u002Fmacos\u002F)\n\n\u003Cdetails>\n\u003Csummary>M1 Mac Performance Issue\u003C\u002Fsummary>\n\nNote: If you are using an Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports the arm64 architecture. 
For example:\n\n```bash\nwget https:\u002F\u002Fgithub.com\u002Fconda-forge\u002Fminiforge\u002Freleases\u002Flatest\u002Fdownload\u002FMiniforge3-MacOSX-arm64.sh\nbash Miniforge3-MacOSX-arm64.sh\n```\n\nOtherwise, the install will build the x86 version of llama.cpp, which will be 10x slower on an Apple Silicon (M1) Mac.\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`\u003C\u002Fsummary>\n\nTry installing with\n\n```bash\nCMAKE_ARGS=\"-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on\" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n### Upgrading and Reinstalling\n\nTo upgrade and rebuild `llama-cpp-python`, add the `--upgrade --force-reinstall --no-cache-dir` flags to the `pip install` command to ensure the package is rebuilt from source.\n\n## High-level API\n\n[API Reference](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#high-level-api)\n\nThe high-level API provides a simple managed interface through the [`Llama`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama) class.\n\nBelow is a short example demonstrating how to use the high-level API for basic text completion:\n\n```python\nfrom llama_cpp import Llama\n\nllm = Llama(\n      model_path=\".\u002Fmodels\u002F7B\u002Fllama-model.gguf\",\n      # n_gpu_layers=-1, # Uncomment to use GPU acceleration\n      # seed=1337, # Uncomment to set a specific seed\n      # n_ctx=2048, # Uncomment to increase the context window\n)\noutput = llm(\n      \"Q: Name the planets in the solar system? A: \", # Prompt\n      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window\n      stop=[\"Q:\", \"\\n\"], # Stop generating just before the model would generate a new question\n      echo=True # Echo the prompt back in the output\n) # Generate a completion, can also call create_completion\nprint(output)\n```\n\nBy default `llama-cpp-python` generates completions in an OpenAI compatible format:\n\n```python\n{\n  \"id\": \"cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\",\n  \"object\": \"text_completion\",\n  \"created\": 1679561337,\n  \"model\": \".\u002Fmodels\u002F7B\u002Fllama-model.gguf\",\n  \"choices\": [\n    {\n      \"text\": \"Q: Name the planets in the solar system? 
A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.\",\n      \"index\": 0,\n      \"logprobs\": None,\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 14,\n    \"completion_tokens\": 28,\n    \"total_tokens\": 42\n  }\n}\n```\n\nText completion is available through the [`__call__`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.__call__) and [`create_completion`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_completion) methods of the [`Llama`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama) class.\n\n### Pulling models from Hugging Face Hub\n\nYou can download `Llama` models in `gguf` format directly from Hugging Face using the [`from_pretrained`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.from_pretrained) method.\nYou'll need to install the `huggingface-hub` package to use this feature (`pip install huggingface-hub`).\n\n```python\nllm = Llama.from_pretrained(\n    repo_id=\"lmstudio-community\u002FQwen3.5-0.8B-GGUF\",\n    filename=\"*Q8_0.gguf\",\n    verbose=False\n)\n```\n\nBy default [`from_pretrained`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`hf`](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fen\u002Fguides\u002Fcli) tool.\n\n### Chat Completion\n\nThe high-level API also provides a simple interface for chat completion.\n\nChat completion requires that the model knows how to format the messages into a single prompt.\nThe `Llama` class does this using pre-registered chat formats (ie. 
`chatml`, `llama-2`, `gemma`, etc) or by providing a custom chat handler object.\n\nThe model will format the messages into a single prompt using the following order of precedence:\n  - Use the `chat_handler` if provided\n  - Use the `chat_format` if provided\n  - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)\n  - else, fallback to the `llama-2` chat format\n\nSet `verbose=True` to see the selected chat format.\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(\n      model_path=\"path\u002Fto\u002Fllama-2\u002Fllama-model.gguf\",\n      chat_format=\"llama-2\"\n)\nllm.create_chat_completion(\n      messages = [\n          {\"role\": \"system\", \"content\": \"You are an assistant who perfectly describes images.\"},\n          {\n              \"role\": \"user\",\n              \"content\": \"Describe this image in detail please.\"\n          }\n      ]\n)\n```\n\nChat completion is available through the [`create_chat_completion`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama) class.\n\nFor OpenAI API v1 compatibility, you can use the [`create_chat_completion_openai_v1`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_chat_completion_openai_v1) method, which will return pydantic models instead of dicts.\n\n\n### JSON and JSON Schema Mode\n\nTo constrain chat responses to only valid JSON or a specific JSON Schema, use the `response_format` argument in [`create_chat_completion`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_chat_completion).\n\n#### JSON Mode\n\nThe following example will constrain the response to valid JSON strings only.\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fmodel.gguf\", chat_format=\"chatml\")\nllm.create_chat_completion(\n    messages=[\n        {\n            \"role\": \"system\",\n            \"content\": \"You are a helpful assistant that outputs in JSON.\",\n        },\n        {\"role\": \"user\", \"content\": \"Who won the world series in 2020\"},\n    ],\n    response_format={\n        \"type\": \"json_object\",\n    },\n    temperature=0.7,\n)\n```\n\n#### JSON Schema Mode\n\nTo constrain the response further to a specific JSON Schema, add the schema to the `schema` property of the `response_format` argument.\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fmodel.gguf\", chat_format=\"chatml\")\nllm.create_chat_completion(\n    messages=[\n        {\n            \"role\": \"system\",\n            \"content\": \"You are a helpful assistant that outputs in JSON.\",\n        },\n        {\"role\": \"user\", \"content\": \"Who won the world series in 2020\"},\n    ],\n    response_format={\n        \"type\": \"json_object\",\n        \"schema\": {\n            \"type\": \"object\",\n            \"properties\": {\"team_name\": {\"type\": \"string\"}},\n            \"required\": [\"team_name\"],\n        },\n    },\n    temperature=0.7,\n)\n```\n\n### Function Calling\n\nThe high-level API supports OpenAI compatible function and tool calling. 
This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fchatml\u002Fllama-model.gguf\", chat_format=\"chatml-function-calling\")\nllm.create_chat_completion(\n      messages = [\n        {\n          \"role\": \"system\",\n          \"content\": \"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary\"\n        },\n        {\n          \"role\": \"user\",\n          \"content\": \"Extract Jason is 25 years old\"\n        }\n      ],\n      tools=[{\n        \"type\": \"function\",\n        \"function\": {\n          \"name\": \"UserDetail\",\n          \"parameters\": {\n            \"type\": \"object\",\n            \"title\": \"UserDetail\",\n            \"properties\": {\n              \"name\": {\n                \"title\": \"Name\",\n                \"type\": \"string\"\n              },\n              \"age\": {\n                \"title\": \"Age\",\n                \"type\": \"integer\"\n              }\n            },\n            \"required\": [ \"name\", \"age\" ]\n          }\n        }\n      }],\n      tool_choice={\n        \"type\": \"function\",\n        \"function\": {\n          \"name\": \"UserDetail\"\n        }\n      }\n)\n```\n\n\u003Cdetails>\n\u003Csummary>Functionary v2\u003C\u002Fsummary>\n\nThe various gguf-converted files for this set of models can be found [here](https:\u002F\u002Fhuggingface.co\u002Fmeetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary support **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class.\n\nDue to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in the Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_tokenizer import LlamaHFTokenizer\nllm = Llama.from_pretrained(\n  repo_id=\"meetkai\u002Ffunctionary-small-v2.2-GGUF\",\n  filename=\"functionary-small-v2.2.q4_0.gguf\",\n  chat_format=\"functionary-v2\",\n  tokenizer=LlamaHFTokenizer.from_pretrained(\"meetkai\u002Ffunctionary-small-v2.2-GGUF\")\n)\n```\n\n**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. 
Thus, the messages should contain just the chat messages and\u002For system messages that provide additional context for the model (e.g.: datetime, etc.).\n\u003C\u002Fdetails>\n\n### Multi-modal Models\n\n`llama-cpp-python` supports multi-modal models such as llava1.5, which allow the language model to read information from both text and images.\n\nBelow are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API).\n\n| Model | `LlamaChatHandler` | `chat_format` |\n|:--- |:--- |:--- |\n| [llava-v1.5-7b](https:\u002F\u002Fhuggingface.co\u002Fmys\u002Fggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |\n| [llava-v1.5-13b](https:\u002F\u002Fhuggingface.co\u002Fmys\u002Fggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |\n| [llava-v1.6-34b](https:\u002F\u002Fhuggingface.co\u002Fcjpais\u002Fllava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |\n| [moondream2](https:\u002F\u002Fhuggingface.co\u002Fvikhyatk\u002Fmoondream2) | `MoondreamChatHandler` | `moondream2` |\n| [nanollava](https:\u002F\u002Fhuggingface.co\u002Fabetlen\u002Fnanollava-gguf) | `NanollavaChatHandler` | `nanollava` |\n| [llama-3-vision-alpha](https:\u002F\u002Fhuggingface.co\u002Fabetlen\u002Fllama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |\n| [minicpm-v-2.6](https:\u002F\u002Fhuggingface.co\u002Fopenbmb\u002FMiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |\n| [qwen2.5-vl](https:\u002F\u002Fhuggingface.co\u002Funsloth\u002FQwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |\n\nThen you'll need to use a custom chat handler to load the clip model and process the chat messages and images.\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_chat_format import Llava15ChatHandler\nchat_handler = Llava15ChatHandler(clip_model_path=\"path\u002Fto\u002Fllava\u002Fmmproj.bin\")\nllm = Llama(\n  model_path=\".\u002Fpath\u002Fto\u002Fllava\u002Fllama-model.gguf\",\n  chat_handler=chat_handler,\n  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding\n)\nllm.create_chat_completion(\n    messages = [\n        {\"role\": \"system\", \"content\": \"You are an assistant who perfectly describes images.\"},\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\" : \"text\", \"text\": \"What's in this image?\"},\n                {\"type\": \"image_url\", \"image_url\": {\"url\": \"https:\u002F\u002Fupload.wikimedia.org\u002Fwikipedia\u002Fcommons\u002Fthumb\u002Fd\u002Fdd\u002FGfp-wisconsin-madison-the-nature-boardwalk.jpg\u002F2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\" } }\n            ]\n        }\n    ]\n)\n```\n\nYou can also pull the model from the Hugging Face Hub using the `from_pretrained` method.\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_chat_format import MoondreamChatHandler\n\nchat_handler = MoondreamChatHandler.from_pretrained(\n  repo_id=\"vikhyatk\u002Fmoondream2\",\n  filename=\"*mmproj*\",\n)\n\nllm = Llama.from_pretrained(\n  repo_id=\"vikhyatk\u002Fmoondream2\",\n  filename=\"*text-model*\",\n  chat_handler=chat_handler,\n  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding\n)\n\nresponse = llm.create_chat_completion(\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\" : \"text\", \"text\": \"What's in this image?\"},\n                {\"type\": \"image_url\", \"image_url\": {\"url\": 
\"https:\u002F\u002Fupload.wikimedia.org\u002Fwikipedia\u002Fcommons\u002Fthumb\u002Fd\u002Fdd\u002FGfp-wisconsin-madison-the-nature-boardwalk.jpg\u002F2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\" } }\n\n            ]\n        }\n    ]\n)\nprint(response[\"choices\"][0][\"text\"])\n```\n\n**Note**: Multi-modal models also support tool calling and JSON mode.\n\n\u003Cdetails>\n\u003Csummary>Loading a Local Image\u003C\u002Fsummary>\n\nImages can be passed as base64 encoded data URIs. The following example demonstrates how to do this.\n\n```python\nimport base64\n\ndef image_to_base64_data_uri(file_path):\n    with open(file_path, \"rb\") as img_file:\n        base64_data = base64.b64encode(img_file.read()).decode('utf-8')\n        return f\"data:image\u002Fpng;base64,{base64_data}\"\n\n# Replace 'file_path.png' with the actual path to your PNG file\nfile_path = 'file_path.png'\ndata_uri = image_to_base64_data_uri(file_path)\n\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are an assistant who perfectly describes images.\"},\n    {\n        \"role\": \"user\",\n        \"content\": [\n            {\"type\": \"image_url\", \"image_url\": {\"url\": data_uri }},\n            {\"type\" : \"text\", \"text\": \"Describe this image in detail please.\"}\n        ]\n    }\n]\n\n```\n\n\u003C\u002Fdetails>\n\n### Speculative Decoding\n\n`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.\n\nThe fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.\n\nJust pass this as a draft model to the `Llama` class during initialization.\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_speculative import LlamaPromptLookupDecoding\n\nllama = Llama(\n    model_path=\"path\u002Fto\u002Fmodel.gguf\",\n    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.\n)\n```\n\n### Embeddings\n\nTo generate text embeddings use [`create_embedding`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_embedding) or [`embed`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.\n\n```python\nimport llama_cpp\n\nllm = llama_cpp.Llama(model_path=\"path\u002Fto\u002Fmodel.gguf\", embedding=True)\n\nembeddings = llm.create_embedding(\"Hello, world!\")\n\n# or create multiple embeddings at once\n\nembeddings = llm.create_embedding([\"Hello, world!\", \"Goodbye, world!\"])\n```\n\nThere are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by \"pooling\" token level embeddings together, usually by averaging them or using the first token.\n\nModels that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings.\n\nIt is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. 
You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation oriented model to yield sequence level embeddings, is currently not possible, but you can always do the pooling manually.\n\n### Adjusting the Context Window\n\nThe context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but it can be adjusted based on your requirements.\n\nFor instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object:\n\n```python\nllm = Llama(model_path=\".\u002Fmodels\u002F7B\u002Fllama-model.gguf\", n_ctx=2048)\n```\n\n## OpenAI Compatible Web Server\n\n`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.\nThis allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc).\n\nTo install the server package and get started:\n\n```bash\npip install 'llama-cpp-python[server]'\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf\n```\n\nSimilar to the Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:\n\n```bash\nCMAKE_ARGS=\"-DGGML_CUDA=on\" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf --n_gpu_layers 35\n```\n\nNavigate to [http:\u002F\u002Flocalhost:8000\u002Fdocs](http:\u002F\u002Flocalhost:8000\u002Fdocs) to see the OpenAPI documentation.\n\nTo bind to `0.0.0.0` to enable remote connections, use `python3 -m llama_cpp.server --host 0.0.0.0`.\nSimilarly, to change the port (default is 8000), use `--port`.\n\nYou probably also want to set the prompt format. For chatml, use:\n\n```bash\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf --chat_format chatml\n```\n\nThat will format the prompt according to how the model expects it. You can find the prompt format in the model card.\nFor possible options, see [llama_cpp\u002Fllama_chat_format.py](llama_cpp\u002Fllama_chat_format.py) and look for lines starting with \"@register_chat_format\".\n\nIf you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.\n\n```bash\npython3 -m llama_cpp.server --hf_model_repo_id lmstudio-community\u002FQwen3.5-0.8B-GGUF --model '*Q8_0.gguf'\n```\n\n### Web Server Features\n\n- [Local Copilot replacement](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#code-completion)\n- [Function Calling support](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#function-calling)\n- [Vision API support](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#multimodal-models)\n- [Multiple Models](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#configuration-and-multi-model-support)\n\n
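Once the server is running, any OpenAI compatible client can talk to it. Below is a minimal sketch using the official `openai` Python package; the `base_url` and port match the defaults shown above, the `api_key` is a placeholder the client requires, and the `model` value is a hypothetical name (a single-model server serves its loaded model regardless):\n\n```python\nfrom openai import OpenAI  # pip install openai\n\n# Sketch: point the standard OpenAI client at the local llama_cpp.server instance.\nclient = OpenAI(base_url=\"http:\u002F\u002Flocalhost:8000\u002Fv1\", api_key=\"sk-no-key-required\")\nresponse = client.chat.completions.create(\n    model=\"llama\",  # hypothetical; adjust if you run the server in multi-model mode\n    messages=[{\"role\": \"user\", \"content\": \"Name the planets in the solar system.\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n## Docker image\n\nA Docker image is available on [GHCR](https:\u002F\u002Fghcr.io\u002Fabetlen\u002Fllama-cpp-python). 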
To run the server:\n\n```bash\ndocker run --rm -it -p 8000:8000 -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels -e MODEL=\u002Fmodels\u002Fllama-model.gguf ghcr.io\u002Fabetlen\u002Fllama-cpp-python:latest\n```\n\n[Docker on termux (requires root)](https:\u002F\u002Fgist.github.com\u002FFreddieOliveira\u002Fefe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F389)\n\n## Low-level API\n\n[API Reference](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#low-level-api)\n\nThe low-level API is a direct [`ctypes`](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fctypes.html) binding to the C API provided by `llama.cpp`.\nThe entire low-level API can be found in [llama_cpp\u002Fllama_cpp.py](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fblob\u002Fmaster\u002Fllama_cpp\u002Fllama_cpp.py) and directly mirrors the C API in [llama.h](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp\u002Fblob\u002Fmaster\u002Fllama.h).\n\nBelow is a short example demonstrating how to use the low-level API to tokenize a prompt:\n\n```python\nimport llama_cpp\nimport ctypes\nllama_cpp.llama_backend_init(False) # Must be called once at the start of each program\nparams = llama_cpp.llama_context_default_params()\n# use bytes for char * params\nmodel = llama_cpp.llama_load_model_from_file(b\".\u002Fmodels\u002F7b\u002Fllama-model.gguf\", params)\nctx = llama_cpp.llama_new_context_with_model(model, params)\nmax_tokens = params.n_ctx\n# use ctypes arrays for array params\ntokens = (llama_cpp.llama_token * int(max_tokens))()\nn_tokens = llama_cpp.llama_tokenize(ctx, b\"Q: Name the planets in the solar system? 
A: \", tokens, max_tokens, llama_cpp.c_bool(True))\nllama_cpp.llama_free(ctx)\n```\n\nCheck out the [examples folder](examples\u002Flow_level_api) for more examples of using the low-level API.\n\n## Documentation\n\nDocumentation is available via [https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002F](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002F).\nIf you find any issues with the documentation, please open an issue or submit a PR.\n\n## Development\n\nThis package is under active development and I welcome any contributions.\n\nTo get started, clone the repository and install the package in editable \u002F development mode:\n\n```bash\ngit clone --recurse-submodules https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python.git\ncd llama-cpp-python\n\n# Upgrade pip (required for editable mode)\npip install --upgrade pip\n\n# Install with pip\npip install -e .\n\n# install development tooling (tests, docs, ruff)\npip install -e '.[dev]'\n\n# if you want to use the fastapi \u002F openapi server\npip install -e '.[server]'\n\n# to install all optional dependencies\npip install -e '.[all]'\n\n# to clear the local build cache\nmake clean\n```\n\nNow try running the tests\n\n```bash\npytest\n```\n\nAnd check formatting \u002F linting before opening a PR:\n\n```bash\npython -m ruff check llama_cpp tests\npython -m ruff format --check llama_cpp tests\n\n# or use the Makefile targets\nmake lint\nmake format\n```\n\nThere's a `Makefile` available with useful targets.\nA typical workflow would look like this:\n\n```bash\nmake build\nmake test\n```\n\nYou can also test out specific commits of `llama.cpp` by checking out the desired commit in the `vendor\u002Fllama.cpp` submodule and then running `make clean` and `pip install -e .` again. Any changes in the `llama.h` API will require\nchanges to the `llama_cpp\u002Fllama_cpp.py` file to match the new API (additional changes may be required elsewhere).\n\n## FAQ\n\n### Are there pre-built binaries \u002F binary wheels available?\n\nThe recommended installation method is to install from source as described above.\nThe reason for this is that `llama.cpp` is built with compiler optimizations that are specific to your system.\nUsing pre-built binaries would require disabling these optimizations or supporting a large number of pre-built binaries for each platform.\n\nThat being said there are some pre-built binaries available through the Releases as well as some community provided wheels.\n\nIn the future, I would like to provide pre-built binaries and wheels for common platforms and I'm happy to accept any useful contributions in this area.\nThis is currently being tracked in [#741](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F741)\n\n### How does this compare to other Python bindings of `llama.cpp`?\n\nI originally wrote this package for my own use with two goals in mind:\n\n- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python\n- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp`\n\nAny contributions and changes to this package will be made with these goals in mind.\n\n## License\n\nThis project is licensed under the terms of the MIT license.\n","\u003Cp align=\"center\">\n  \u003Cimg src=\"https:\u002F\u002Fraw.githubusercontent.com\u002Fabetlen\u002Fllama-cpp-python\u002Fmain\u002Fdocs\u002Ficon.svg\" style=\"height: 5rem; width: 5rem\">\n\u003C\u002Fp>\n\n# 
Python 绑定：针对 [`llama.cpp`](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) 的封装\n\n[![文档状态](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fabetlen_llama-cpp-python_readme_6bf48b3e9a6d.png)](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002F?badge=latest)\n[![测试](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Factions\u002Fworkflows\u002Ftest.yaml\u002Fbadge.svg?branch=main)](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Factions\u002Fworkflows\u002Ftest.yaml)\n[![PyPI](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fv\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - Python 版本](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fpyversions\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - 许可证](https:\u002F\u002Fimg.shields.io\u002Fpypi\u002Fl\u002Fllama-cpp-python)](https:\u002F\u002Fpypi.org\u002Fproject\u002Fllama-cpp-python\u002F)\n[![PyPI - 下载量](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fabetlen_llama-cpp-python_readme_c86ac11b3594.png)](https:\u002F\u002Fpepy.tech\u002Fprojects\u002Fllama-cpp-python)\n[![GitHub 全部版本](https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fdownloads\u002Fabetlen\u002Fllama-cpp-python\u002Ftotal.svg?label=GitHub%20Downloads)]()\n\n为 **@ggerganov** 的 [`llama.cpp`](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp) 库提供了一套简单的 Python 绑定。\n该包提供了以下功能：\n\n- 通过 `ctypes` 接口对 C API 进行低级访问。\n- 高级 Python API，用于文本补全\n    - 类似 OpenAI 的 API\n    - [LangChain 兼容性](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fllms\u002Fllamacpp)\n    - [LlamaIndex 兼容性](https:\u002F\u002Fdocs.llamaindex.ai\u002Fen\u002Fstable\u002Fexamples\u002Fllm\u002Fllama_2_llama_cpp.html)\n- OpenAI 兼容的 Web 服务器\n    - [本地 Copilot 替代方案](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#code-completion)\n    - [函数调用支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#function-calling)\n    - [视觉 API 支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#multimodal-models)\n    - [多模型支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#configuration-and-multi-model-support)\n\n相关文档可在 [https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest) 中找到。\n\n## 安装\n\n所需条件：\n\n- Python 3.8 及以上版本\n- C 编译器\n    - Linux：gcc 或 clang\n    - Windows：Visual Studio 或 MinGW\n    - macOS：Xcode\n\n要安装该包，请运行：\n\n```bash\npip install llama-cpp-python\n```\n\n此操作还将从源代码构建 `llama.cpp`，并将其与本 Python 包一同安装。\n\n若安装失败，请在 `pip install` 命令中添加 `--verbose` 参数，以查看完整的 CMake 构建日志。\n\n**预编译 Wheel（新版本）**\n\n您也可以安装一个预编译的 Wheel，以支持基本的 CPU 加速。\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fcpu\n```\n\n### 安装配置\n\n`llama.cpp` 支持多种硬件加速后端，以提升推理速度，并且还提供了针对不同后端的特定选项。如需了解完整列表，请参阅 [llama.cpp 的 README](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp#build)。\n\n所有 `llama.cpp` 的 CMake 构建选项均可通过环境变量 `CMAKE_ARGS` 设置，或在安装时通过命令行参数 `--config-settings \u002F -C` 进行设置。\n\n\u003Cdetails open>\n\u003Csummary>环境变量\u003C\u002Fsummary>\n\n```bash\n# Linux 和 Mac\nCMAKE_ARGS=\"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\" \\\n  pip install llama-cpp-python\n```\n\n```powershell\n# Windows\n$env:CMAKE_ARGS = 
\"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\"\npip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>命令行\u002Frequirements.txt\u003C\u002Fsummary>\n\n您也可以通过 `pip install -C \u002F --config-settings` 命令进行设置，并将配置保存到 `requirements.txt` 文件中：\n\n```bash\npip install --upgrade pip # 确保 pip 已更新\npip install llama-cpp-python \\\n  -C cmake.args=\"-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS\"\n```\n\n```txt\n# requirements.txt\n\nllama-cpp-python -C cmake.args=\"-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS\"\n```\n\n\u003C\u002Fdetails>\n\n### 支持的后端\n\n以下是几种常见的后端及其构建命令和所需的额外环境变量。\n\n\u003Cdetails open>\n\u003Csummary>OpenBLAS（CPU）\u003C\u002Fsummary>\n\n要使用 OpenBLAS 安装，请在安装前设置 `GGML_BLAS` 和 `GGML_BLAS_VENDOR` 环境变量：\n\n```bash\nCMAKE_ARGS=\"-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>CUDA\u003C\u002Fsummary>\n\n要安装支持 CUDA 的版本，请在安装前设置 `GGML_CUDA=on` 环境变量：\n\n```bash\nCMAKE_ARGS=\"-DGGML_CUDA=on\" pip install llama-cpp-python\n```\n\n**预编译 Wheel（新版本）**\n\n您还可以安装一个带有 CUDA 支持的预编译 Wheel。只要您的系统满足以下条件：\n\n- CUDA 版本为 12.1、12.2、12.3、12.4 或 12.5\n- Python 版本为 3.10、3.11 或 3.12\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002F\u003Ccuda-version>\n```\n\n其中 `\u003Ccuda-version>` 可以是以下任一版本：\n- `cu121`：CUDA 12.1\n- `cu122`：CUDA 12.2\n- `cu123`：CUDA 12.3\n- `cu124`：CUDA 12.4\n- `cu125`：CUDA 12.5\n\n例如，要安装 CUDA 12.1 的 Wheel：\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fcu121\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>Metal\u003C\u002Fsummary>\n\n要使用 Metal（MPS）安装，请在安装前设置 `GGML_METAL=on` 环境变量：\n\n```bash\nCMAKE_ARGS=\"-DGGML_METAL=on\" pip install llama-cpp-python\n```\n\n**预编译 Wheel（新版本）**\n\n您还可以安装一个带有 Metal 支持的预编译 Wheel。只要您的系统满足以下条件：\n\n- macOS 版本为 11.0 或更高\n- Python 版本为 3.10、3.11 或 3.12\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fmetal\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>hipBLAS（ROCm）\u003C\u002Fsummary>\n\n要安装支持 hipBLAS \u002F ROCm 的 AMD 显卡版本，请在安装前设置 `GGML_HIPBLAS=on` 环境变量：\n\n```bash\nCMAKE_ARGS=\"-DGGML_HIPBLAS=on\" pip install llama-cpp-python\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>Vulkan\u003C\u002Fsummary>\n\n要安装支持 Vulkan 的版本，请在安装前设置 `GGML_VULKAN=on` 环境变量：\n\n```bash\nCMAKE_ARGS=\"-DGGML_VULKAN=on\" pip install llama-cpp-python\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>SYCL\u003C\u002Fsummary>\n\n要安装支持 SYCL 的版本，请在安装前设置 `GGML_SYCL=on` 环境变量：\n\n```bash\nsource \u002Fopt\u002Fintel\u002Foneapi\u002Fsetvars.sh   \nCMAKE_ARGS=\"-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>RPC\u003C\u002Fsummary>\n\n要安装支持 RPC 的版本，请在安装前设置 `GGML_RPC=on` 环境变量：\n\n```bash\nsource \u002Fopt\u002Fintel\u002Foneapi\u002Fsetvars.sh   \nCMAKE_ARGS=\"-DGGML_RPC=on\" pip install llama-cpp-python\n```\n\u003C\u002Fdetails>\n\n### Windows 使用说明\n\n\u003Cdetails>\n\u003Csummary>错误提示：无法找到 ‘nmake’ 或 ‘CMAKE_C_COMPILER’\u003C\u002Fsummary>\n\n如果您在运行过程中遇到“无法找到 ‘nmake’”、“无法找到 ‘?’”或 CMAKE_C_COMPILER”的报错信息，您可以按照 [Llama.cpp 仓库中的说明](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp#openblas) 的步骤，将 w64devkit 解压并手动将其添加到 CMAKE_ARGS 中，然后再运行 `pip` 
安装：\n\n```powershell\n$env:CMAKE_GENERATOR = \"MinGW Makefiles\"\n$env:CMAKE_ARGS = \"-DGGML_OPENBLAS=on -DCMAKE_C_COMPILER=C:\u002Fw64devkit\u002Fbin\u002Fgcc.exe -DCMAKE_CXX_COMPILER=C:\u002Fw64devkit\u002Fbin\u002Fg++.exe\"\n```\n\n请参考上述说明，并将 `CMAKE_ARGS` 设置为您想要使用的 BLAS 后端。\n\u003C\u002Fdetails>\n\n### macOS 使用说明\n\n有关详细的 macOS Metal GPU 安装文档，请访问 [docs\u002Finstall\u002Fmacos.md](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Finstall\u002Fmacos\u002F)。\n\n\u003Cdetails>\n\u003Csummary>M1 Mac 性能问题\u003C\u002Fsummary>\n\n注意：如果您使用的是 Apple Silicon（M1）Mac，请确保已安装支持 arm64 架构的 Python 版本。例如：\n\n```bash\nwget https:\u002F\u002Fgithub.com\u002Fconda-forge\u002Fminiforge\u002Freleases\u002Flatest\u002Fdownload\u002FMiniforge3-MacOSX-arm64.sh\nbash Miniforge3-MacOSX-arm64.sh\n```\n\n否则，在安装时，系统会默认构建 x86 版本的 llama.cpp，而该版本在 Apple Silicon（M1）Mac 上的运行速度会慢 10 倍。\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>M Series Mac 错误：`(mach-o 文件，但架构不兼容（需为 ‘x86_64’，需为 ‘arm64’）)`\u003C\u002Fsummary>\n\n尝试通过以下命令进行安装：\n\n```bash\nCMAKE_ARGS=\"-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on\" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python\n```\n\n\u003C\u002Fdetails>\n\n### 升级与重新安装\n\n要升级并重新构建 `llama-cpp-python`，您可以在 `pip install` 命令中添加 `--upgrade --force-reinstall --no-cache-dir` 标志，以确保包从源代码重新构建。\n\n## 高级 API\n\n[API 参考](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#high-level-api)\n\n高级 API 提供了一个简单、易用的管理接口，可通过 `Llama` 类实现。\n\n以下是一个简短示例，演示如何使用高级 API 进行基础文本补全：\n\n```python\nfrom llama_cpp import Llama\n\nllm = Llama(\n      model_path=\".\u002Fmodels\u002F7B\u002Fllama-model.gguf\",\n      # n_gpu_layers=-1, # 如需启用 GPU 加速，请取消注释\n      # seed=1337, # 如需设置特定种子，请取消注释\n      # n_ctx=2048, # 如需扩大上下文窗口大小，请取消注释\n)\noutput = llm(\n      \"Q: 请说出太阳系中的行星名称？A: \", # 提示语\n      max_tokens=32, # 生成最多 32 个 token，若设为 None，则可生成至上下文窗口的末尾\n      stop=[\"Q:\", \"\\n\"], # 在模型生成新问题之前停止生成\n      echo=True # 将提示语回传至输出中\n) # 生成补全结果，也可调用 create_completion\nprint(output)\n```\n\n默认情况下，`llama-cpp-python` 会以 OpenAI 兼容的格式生成补全结果：\n\n```python\n{\n  \"id\": \"cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\",\n  \"object\": \"text_completion\",\n  \"created\": 1679561337,\n  \"model\": \".\u002Fmodels\u002F7B\u002Fllama-model.gguf\",\n  \"choices\": [\n    {\n      \"text\": \"Q: 请说出太阳系中的行星名称？A: 水星、金星、地球、火星、木星、土星、天王星、海王星和冥王星。\",\n      \"index\": 0,\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 14,\n    \"completion_tokens\": 28,\n    \"total_tokens\": 42\n  }\n}\n```\n\n文本补全功能可通过 `Llama` 类的 `__call__` 方法以及 `create_completion` 方法实现。\n\n### 从 Hugging Face Hub 下载模型\n\n您可以通过 `from_pretrained` 方法，直接从 Hugging Face 下载 `Llama` 的 `gguf` 格式模型。\n\n要使用此功能，您需要先安装 `huggingface-hub` 包（`pip install huggingface-hub`）。\n\n```python\nllm = Llama.from_pretrained(\n    repo_id=\"lmstudio-community\u002FQwen3.5-0.8B-GGUF\",\n    filename=\"*Q8_0.gguf\",\n    verbose=False\n)\n```\n\n默认情况下，`from_pretrained` 方法会将模型下载至 Hugging Face 缓存目录中；您随后可以使用 `hf` 工具来管理已安装的模型文件。\n\n### 聊天补全\n\n高级 API 也提供了一套简单的聊天补全接口。\n\n聊天补全要求模型能够将消息格式化为单个提示语。`Llama` 类会通过预先注册的聊天格式（如 `chatml`、`llama-2`、`gemma` 等）来完成这一任务，或者通过提供自定义的聊天处理对象来实现。\n\n模型会按照以下优先级顺序对消息进行格式化，以生成单个提示语：\n- 如果提供了 `chat_handler`，则使用该方法\n- 如果提供了 `chat_format`，则使用该方法\n- 如果 `gguf` 模型元数据中包含 `tokenizer.chat_template`，则使用该方法（适用于大多数新模型；旧型号可能未配备此功能）\n- 若以上方法均未生效，则退而采用 `llama-2` 的聊天格式\n\n若要查看所选的聊天格式，请将 `verbose` 参数设置为 
`True`。\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(\n      model_path=\"path\u002Fto\u002Fllama-2\u002Fllama-model.gguf\",\n      chat_format=\"llama-2\"\n)\nllm.create_chat_completion(\n      messages = [\n          {\"role\": \"system\", \"content\": \"您是一位能够完美描述图像的助手。\"},\n          {\n              \"role\": \"user\",\n              \"content\": \"请详细描述这张图片。\"\n          }\n      ]\n)\n```\n\n聊天补全功能可通过 `Llama` 类的 `create_chat_completion` 方法实现。\n\n为了兼容 OpenAI API v1，您可以使用 `create_chat_completion_openai_v1` 方法，该方法会返回 Pydantic 模型，而非字典形式的结果。\n\n### JSON 与 JSON Schema 模式\n\n要将聊天回复严格限制为仅包含有效的 JSON 或特定的 JSON Schema，请在 [`create_chat_completion`](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#llama_cpp.Llama.create_chat_completion) 中使用 `response_format` 参数。\n\n#### JSON 模式\n以下示例会将回复严格限定为仅包含有效的 JSON 字符串。\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fmodel.gguf\", chat_format=\"chatml\")\nllm.create_chat_completion(\n    messages=[\n        {\n            \"role\": \"system\",\n            \"content\": \"您是一位乐于助人的助手，能够以 JSON 格式输出结果。\"\n        },\n        {\"role\": \"user\", \"content\": \"2020 年世界大赛的冠军是谁？\"}\n    ],\n    response_format={\n        \"type\": \"json_object\"\n    },\n    temperature=0.7\n)\n```\n\n#### JSON Schema 模式\n若要进一步将回复严格限制为特定的 JSON Schema，只需在 `response_format` 参数的 `schema` 属性中添加该 Schema 即可。\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fmodel.gguf\", chat_format=\"chatml\")\nllm.create_chat_completion(\n    messages=[\n        {\n            \"role\": \"system\",\n            \"content\": \"您是一位乐于助人的助手，能够以 JSON 格式输出结果。\"\n        },\n        {\"role\": \"user\", \"content\": \"2020 年世界大赛的冠军是谁？\"}\n    ],\n    response_format={\n        \"type\": \"json_object\",\n        \"schema\": {\n            \"type\": \"object\",\n            \"properties\": {\"team_name\": {\"type\": \"string\"}}\n        }\n    },\n    temperature=0.7\n)\n```\n\n### 函数调用\n高级 API 支持与 OpenAI 兼容的函数和工具调用。可通过 `functionary` 预训练模型的聊天格式，或通过通用的 `chatml-function-calling` 聊天格式实现这一功能。\n\n```python\nfrom llama_cpp import Llama\nllm = Llama(model_path=\"path\u002Fto\u002Fchatml\u002Fllama-model.gguf\", chat_format=\"chatml-function-calling\")\nllm.create_chat_completion(\n      messages = [\n        {\n          \"role\": \"system\",\n          \"content\": \"一场好奇的用户与人工智能助手之间的对话。助手会为用户提供有帮助、详尽且礼貌的回答。必要时，助手会调用相关函数并提供恰当的输入。\"\n        },\n        {\n          \"role\": \"user\",\n          \"content\": \"请提取 Jason 的年龄\"\n        }\n      ],\n      tools=[{\n        \"type\": \"function\",\n        \"function\": {\n          \"name\": \"UserDetail\",\n          \"parameters\": {\n            \"type\": \"object\",\n            \"title\": \"UserDetail\",\n            \"properties\": {\n              \"name\": {\n                \"title\": \"姓名\",\n                \"type\": \"string\"\n              },\n              \"age\": {\n                \"title\": \"年龄\",\n                \"type\": \"integer\"\n              }\n            },\n            \"required\": [ \"name\", \"age\" ]\n          }\n        }\n      }],\n      tool_choice={\n        \"type\": \"function\",\n        \"function\": {\n          \"name\": \"UserDetail\"\n        }\n      }\n)\n```\n\n\u003Cdetails>\n\u003Csummary>Functionary v2\u003C\u002Fsummary>\n\n该系列模型的多种 gguf 转换文件可在此处找到 [链接](https:\u002F\u002Fhuggingface.co\u002Fmeetkai)。Functionary 能够智能地调用函数，并能分析任何提供的函数输出，从而生成连贯的回复。所有 Functionary v2 版本的模型均支持 
**并行函数调用**。在初始化 Llama 类时，您可以选择使用 `functionary-v1` 或 `functionary-v2` 作为 `chat_format`。\n\n由于 llama.cpp 与 Hugging Face 的分词器之间存在差异，因此在使用 Functionary 时，必须提供 Hugging Face 分词器。可以通过初始化 `LlamaHFTokenizer` 类，并将其传递给 Llama 类来实现。这将覆盖 Llama 类中默认使用的 llama.cpp 分词器。分词器文件已内置于托管 gguf 文件的相应 Hugging Face 仓库中。\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_tokenizer import LlamaHFTokenizer\nllm = Llama.from_pretrained(\n  repo_id=\"meetkai\u002Ffunctionary-small-v2.2-GGUF\",\n  filename=\"functionary-small-v2.2.q4_0.gguf\",\n  chat_format=\"functionary-v2\",\n  tokenizer=LlamaHFTokenizer.from_pretrained(\"meetkai\u002Ffunctionary-small-v2.2-GGUF\")\n)\n```\n\n**注意**：无需提供 Functionary 默认的系统消息，因为这些消息已在 Functionary 聊天处理程序中自动添加。因此，消息应仅包含聊天消息和\u002F或系统消息，以为模型提供额外的上下文信息（例如：日期时间等）。\n\u003C\u002Fdetails>\n\n### 多模态模型\n\n`llama-cpp-python` 支持多种模型，例如 llava1.5，该模型能够同时从文本和图像中读取信息，用于语言模型的处理。\n\n以下是支持的多模态模型及其对应的聊天处理程序（Python API）和聊天格式（Server API）：\n\n| 模型 | `LlamaChatHandler` | `chat_format` |\n|:--- |:--- |:--- |\n| [llava-v1.5-7b](https:\u002F\u002Fhuggingface.co\u002Fmys\u002Fggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |\n| [llava-v1.5-13b](https:\u002F\u002Fhuggingface.co\u002Fmys\u002Fggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |\n| [llava-v1.6-34b](https:\u002F\u002Fhuggingface.co\u002Fcjpais\u002Fllava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |\n| [moondream2](https:\u002F\u002Fhuggingface.co\u002Fvikhyatk\u002Fmoondream2) | `MoondreamChatHandler` | `moondream2` |\n| [nanollava](https:\u002F\u002Fhuggingface.co\u002Fabetlen\u002Fnanollava-gguf) | `NanollavaChatHandler` | `nanollava` |\n| [llama-3-vision-alpha](https:\u002F\u002Fhuggingface.co\u002Fabetlen\u002Fllama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |\n| [minicpm-v-2.6](https:\u002F\u002Fhuggingface.co\u002Fopenbmb\u002FMiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |\n| [qwen2.5-vl](https:\u002F\u002Fhuggingface.co\u002Funsloth\u002FQwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |\n\n接下来，您需要使用自定义的聊天处理程序来加载剪辑模型，并对聊天消息和图像进行处理。\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_chat_format import Llava15ChatHandler\nchat_handler = Llava15ChatHandler(clip_model_path=\"path\u002Fto\u002Fllava\u002Fmmproj.bin\")\nllm = Llama(\n  model_path=\".\u002Fpath\u002Fto\u002Fllava\u002Fllama-model.gguf\",\n  chat_handler=chat_handler,\n  n_ctx=2048, # n_ctx 应该增加以适应图像嵌入\n)\nllm.create_chat_completion(\n    messages = [\n        {\"role\": \"system\", \"content\": \"您是一位能完美描述图像的助手。\"},\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\": \"text\", \"text\": \"这张图片里有什么？\"},\n                {\"type\": \"image_url\", \"image_url\": {\"url\": \"https:\u002F\u002Fupload.wikimedia.org\u002Fwikipedia\u002Fcommons\u002Fthumb\u002Fd\u002Fdd\u002FGfp-wisconsin-madison-the-nature-boardwalk.jpg\u002F2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\" } }\n            ]\n        }\n    ]\n)\n```\n\n您也可以通过 `from_pretrained` 方法，从 Hugging Face Hub 中拉取模型。\n\n```python\nfrom llama_cpp import Llama\nfrom llama_cpp.llama_chat_format import MoondreamChatHandler\n\nchat_handler = MoondreamChatHandler.from_pretrained(\n  repo_id=\"vikhyatk\u002Fmoondream2\",\n  filename=\"*mmproj*\"\n)\n\nllm = Llama.from_pretrained(\n  repo_id=\"vikhyatk\u002Fmoondream2\",\n  filename=\"*text-model*\",\n  chat_handler=chat_handler,\n  n_ctx=2048, # n_ctx 应该增加以适应图像嵌入\n)\n\nresponse = llm.create_chat_completion(\n    
\n\n### 调整上下文窗口\n\nLlama 模型的上下文窗口决定了一次可以处理的最大 token 数量。默认情况下，该窗口大小设置为 512 个 token，但您可根据实际需求进行调整。\n\n例如，如果您希望处理更大的上下文，可以在初始化 Llama 对象时，通过设置 `n_ctx` 参数来扩展上下文窗口：\n\n```python\nllm = Llama(model_path=\".\u002Fmodels\u002F7B\u002Fllama-model.gguf\", n_ctx=2048)\n```\n\n## 与 OpenAI 兼容的 Web 服务器\n\n`llama-cpp-python` 提供了一个 Web 服务器，旨在作为 OpenAI API 的即插即用替代品。\n通过使用该服务器，您可以将兼容 llama.cpp 的模型与任何兼容 OpenAI 的客户端（如语言库、服务等）配合使用。\n\n要安装服务器软件包并开始使用：\n\n```bash\npip install 'llama-cpp-python[server]'\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf\n```\n\n与上述“硬件加速”部分类似，您也可以通过支持 GPU（cuBLAS）的版本进行安装，如下所示：\n\n```bash\nCMAKE_ARGS=\"-DGGML_CUDA=on\" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf --n_gpu_layers 35\n```\n\n访问 [http:\u002F\u002Flocalhost:8000\u002Fdocs](http:\u002F\u002Flocalhost:8000\u002Fdocs) 即可查看 OpenAPI 文档。\n\n若要绑定到 `0.0.0.0` 以启用远程连接，请使用 `python3 -m llama_cpp.server --host 0.0.0.0`。\n同样地，若要更改端口（默认为 8000），请使用 `--port` 参数。\n\n您可能还希望设置提示格式。对于 chatml 格式，可以使用：\n\n```bash\npython3 -m llama_cpp.server --model models\u002F7B\u002Fllama-model.gguf --chat_format chatml\n```\n\n该参数会按照模型期望的方式对提示进行格式化，具体的提示格式可在模型详情页中找到。\n有关可用选项，请参阅 [llama_cpp\u002Fllama_chat_format.py](llama_cpp\u002Fllama_chat_format.py)，并查找以 `@register_chat_format` 开头的行。\n\n如果您已安装 `huggingface-hub`，还可以使用 `--hf_model_repo_id` 标志从 Hugging Face Hub 加载模型。\n\n```bash\npython3 -m llama_cpp.server --hf_model_repo_id lmstudio-community\u002FQwen3.5-0.8B-GGUF --model '*Q8_0.gguf'\n```\n\n### Web 服务器功能\n\n- [本地 Copilot 替代方案](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#code-completion)\n- [函数调用支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#function-calling)\n- [视觉 API 支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#multimodal-models)\n- [多模型支持](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fserver\u002F#configuration-and-multi-model-support)
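\n\n服务器启动后，可以用任何兼容 OpenAI 的客户端进行调用。以下是使用官方 `openai` Python 客户端的简要示意（非官方示例）：假设服务器运行在本机默认的 8000 端口；`api_key` 为必填字段但不会被校验，`model` 名称为占位值，实际取决于服务器配置。\n\n```python\nfrom openai import OpenAI\n\n# 指向本地 llama-cpp-python 服务器\nclient = OpenAI(base_url=\"http:\u002F\u002Flocalhost:8000\u002Fv1\", api_key=\"sk-no-key-required\")\n\nresponse = client.chat.completions.create(\n    model=\"llama-model\",  # 占位模型名，视服务器配置而定\n    messages=[\n        {\"role\": \"system\", \"content\": \"您是一位乐于助人的助手。\"},\n        {\"role\": \"user\", \"content\": \"太阳系有哪些行星？\"}\n    ]\n)\nprint(response.choices[0].message.content)\n```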
\n\n## Docker 镜像\n\nDocker 镜像已在 [GHCR](https:\u002F\u002Fghcr.io\u002Fabetlen\u002Fllama-cpp-python) 上提供。要运行服务器：\n\n```bash\ndocker run --rm -it -p 8000:8000 -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels -e MODEL=\u002Fmodels\u002Fllama-model.gguf ghcr.io\u002Fabetlen\u002Fllama-cpp-python:latest\n```\n\n目前在手机上运行该服务的唯一已知方式是在 Termux 中使用 Docker（需要 root 权限）。有关更多信息，请参阅 [Termux 支持问题](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F389)。\n\n## 低级 API\n\n[API 参考](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002Fen\u002Flatest\u002Fapi-reference\u002F#low-level-api)\n\n低级 API 是对 `llama.cpp` 提供的 C API 的直接 `ctypes` 绑定。整个低级 API 可在 [llama_cpp\u002Fllama_cpp.py](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fblob\u002Fmaster\u002Fllama_cpp\u002Fllama_cpp.py) 中找到，并与 [llama.h](https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fllama.cpp\u002Fblob\u002Fmaster\u002Fllama.h) 中的 C API 结构一一对应。\n\n以下是一个简短示例，演示如何使用低级 API 对提示进行分词：\n\n```python\nimport llama_cpp\nimport ctypes\nllama_cpp.llama_backend_init(False) # 必须在每个程序启动时调用一次\nparams = llama_cpp.llama_context_default_params()\n# 使用字节类型表示 char * params\nmodel = llama_cpp.llama_load_model_from_file(b\".\u002Fmodels\u002F7b\u002Fllama-model.gguf\", params)\nctx = llama_cpp.llama_new_context_with_model(model, params)\nmax_tokens = params.n_ctx\n# 使用 ctypes 数组来存储数组参数；注意 bytes 字面量只能包含 ASCII 字符\ntokens = (llama_cpp.llama_token * int(max_tokens))()\nn_tokens = llama_cpp.llama_tokenize(ctx, b\"Q: Name the planets in the solar system? A: \", tokens, max_tokens, llama_cpp.c_bool(True))\nllama_cpp.llama_free(ctx)\n```\n\n请查看 [examples\u002Flow_level_api](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Ftree\u002Fmaster\u002Fexamples\u002Flow_level_api) 文件夹，了解更多关于低级 API 的使用示例。\n\n## 文档\n\n文档可通过 [https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002F](https:\u002F\u002Fllama-cpp-python.readthedocs.io\u002F) 获取。\n如果您发现文档有任何问题，请随时提交 issue 或创建 PR。\n\n## 开发\n\n本软件包正处于积极开发阶段，我们欢迎任何贡献。\n\n要开始使用，请克隆仓库并以可编辑\u002F开发模式安装软件包：\n\n```bash\ngit clone --recurse-submodules https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python.git\ncd llama-cpp-python\n\n# 升级 pip（可编辑模式需要较新版本的 pip）\npip install --upgrade pip\n\n# 通过 pip 安装\npip install -e .\n\n# 安装开发工具（测试、文档、ruff）\npip install -e '.[dev]'\n\n# 如果您想使用 fastapi \u002F openapi 服务器\npip install -e '.[server]'\n\n# 若需安装所有可选依赖项\npip install -e '.[all]'\n\n# 清理本地构建缓存\nmake clean\n```\n\n现在尝试运行测试：\n\n```bash\npytest\n```\n\n在提交 PR 前，请检查格式化和代码检查：\n\n```bash\npython -m ruff check llama_cpp tests\npython -m ruff format --check llama_cpp tests\n\n# 或者使用 Makefile 目标\nmake lint\nmake format\n```\n\nMakefile 中包含了一些实用的目标。典型的流程如下：\n\n```bash\nmake build\nmake test\n```\n\n您还可以通过在 `vendor\u002Fllama.cpp` 子模块中检出所需的提交，然后再次运行 `make clean` 和 `pip install -e .`，来测试 `llama.cpp` 的特定提交。如果 `llama.h` API 发生变更，您需要对 `llama_cpp\u002Fllama_cpp.py` 文件进行相应修改，以匹配新的 API（其他地方也可能需要进行额外的调整）。
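\n\n典型操作示意如下（`\u003Ccommit>` 为占位符，请替换为实际要测试的提交哈希）：\n\n```bash\n# 在子模块中检出想要测试的 llama.cpp 提交\ncd vendor\u002Fllama.cpp\ngit checkout \u003Ccommit>\ncd ..\u002F..\n\n# 清理构建缓存并重新编译安装\nmake clean\npip install -e .\n```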
\n\n## 常见问题\n\n### 是否有预编译的二进制文件或二进制轮子可供使用？\n\n推荐的安装方式是按照上述说明从源码编译安装。原因在于，`llama.cpp` 的构建会利用针对您的系统的编译器优化。若使用预编译的二进制文件，就必须放弃这些优化，或者为每个平台提供大量预编译的二进制文件。\n\n不过，您仍然可以通过发布版以及社区提供的轮子获取一些预编译的二进制文件。\n\n未来，我希望能为常见平台提供预编译的二进制文件和轮子，并乐于接受任何在这方面的有用贡献。目前，这一工作正在 [#741](https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F741) 中进行跟踪。\n\n### 与 `llama.cpp` 的其他 Python 绑定相比，它有何不同？\n\n我最初编写这个软件包是为了满足自己的需求，主要有两个目标：\n\n- 提供一种简单便捷的方式，用于安装 `llama.cpp` 并从 Python 调用 `llama.h` 中的完整 C API。\n- 提供一个高级别的 Python API，可作为 OpenAI API 的即插即用替代品，从而轻松将现有应用迁移到使用 `llama.cpp` 的环境中。\n\n任何对该软件包的贡献和修改，都将围绕以上目标展开。\n\n## 许可证\n\n本项目采用 MIT 许可证授权。","# llama-cpp-python 快速上手指南\n\n`llama-cpp-python` 是 `llama.cpp` 的 Python 绑定库，允许你在本地高效运行量化后的 LLM 模型（GGUF 格式）。它提供了从底层 C API 访问到高层 OpenAI 兼容接口的完整支持。\n\n## 环境准备\n\n在开始之前，请确保你的系统满足以下要求：\n\n*   **Python 版本**：3.8 或更高版本。\n*   **编译器依赖**：安装过程中需要从源码编译 `llama.cpp`，因此需要安装 C 编译器。\n    *   **Linux**: `gcc` 或 `clang`\n    *   **Windows**: Visual Studio Build Tools 或 MinGW\n    *   **macOS**: Xcode Command Line Tools\n*   **硬件加速（可选）**：如需 GPU 加速，需提前安装对应的驱动和工具包（如 NVIDIA CUDA Toolkit, macOS Metal 等）。\n\n> **注意**：如果你使用的是 Apple Silicon (M1\u002FM2\u002FM3) Mac，请务必安装支持 `arm64` 架构的 Python 版本（推荐使用 Miniforge 或官方 arm64 安装包），否则性能会大幅下降。\n\n## 安装步骤\n\n### 1. 基础安装（CPU 版本）\n\n最简单的安装方式是使用 pip。这将自动下载并编译 `llama.cpp`。\n\n```bash\npip install llama-cpp-python\n```\n\n如果网络较慢导致下载失败，可以使用国内镜像源加速：\n\n```bash\npip install llama-cpp-python -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n### 2. 启用硬件加速（推荐）\n\n为了获得更好的推理速度，建议根据硬件配置设置环境变量进行安装。\n\n#### **NVIDIA GPU (CUDA)**\n确保已安装 CUDA Toolkit (版本 12.1-12.5)。\n\n```bash\nCMAKE_ARGS=\"-DGGML_CUDA=on\" pip install llama-cpp-python\n```\n\n或者安装预编译的 CUDA Wheel（需 Python 3.10+ 且 CUDA 版本匹配）：\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fcu124\n```\n\n#### **macOS (Metal\u002FMPS)**\n适用于 M 系列芯片 Mac，启用 Metal 加速。\n\n```bash\nCMAKE_ARGS=\"-DGGML_METAL=on\" pip install llama-cpp-python\n```\n\n或者安装预编译的 Metal Wheel：\n\n```bash\npip install llama-cpp-python \\\n  --extra-index-url https:\u002F\u002Fabetlen.github.io\u002Fllama-cpp-python\u002Fwhl\u002Fmetal\n```\n\n#### **AMD GPU (ROCm)**\n```bash\nCMAKE_ARGS=\"-DGGML_HIPBLAS=on\" pip install llama-cpp-python\n```\n\n> **提示**：如果安装失败，请添加 `--verbose` 参数查看详细的编译日志。若需重新编译，请加上 `--upgrade --force-reinstall --no-cache-dir` 参数。\n\n## 基本使用\n\n### 1. 加载模型并生成文本\n\n首先，你需要一个 `.gguf` 格式的模型文件。以下是最简单的高层 API 使用示例：\n\n```python\nfrom llama_cpp import Llama\n\n# 初始化模型\nllm = Llama(\n      model_path=\".\u002Fmodels\u002F7B\u002Fllama-model.gguf\",\n      # n_gpu_layers=-1, # 取消注释以启用 GPU 加速（将所有层卸载到 GPU）\n      # n_ctx=2048,      # 取消注释以设置上下文窗口大小\n)\n\n# 生成文本\noutput = llm(\n      \"Q: Name the planets in the solar system? 
A: \", # 提示词\n      max_tokens=32,     # 最大生成 token 数\n      stop=[\"Q:\", \"\\n\"], # 停止生成的条件\n      echo=True          # 是否在输出中包含提示词\n)\n\nprint(output)\n```\n\n输出结果为 OpenAI 兼容的字典格式，包含生成的文本、token 用量等信息。\n\n### 2. 直接从 Hugging Face 加载模型\n\n你可以无需手动下载模型，直接使用 `from_pretrained` 方法从 Hugging Face Hub 加载 GGUF 模型（需先安装 `huggingface-hub`）：\n\n```bash\npip install huggingface-hub\n```\n\n```python\nfrom llama_cpp import Llama\n\nllm = Llama.from_pretrained(\n    repo_id=\"lmstudio-community\u002FQwen3.5-0.8B-GGUF\",\n    filename=\"*Q8_0.gguf\", # 支持通配符匹配文件名\n    verbose=False\n)\n\nresponse = llm(\"你好，请介绍一下你自己。\", max_tokens=100)\nprint(response[\"choices\"][0][\"text\"])\n```\n\n### 3. 开启聊天模式\n\n对于对话类模型，可以使用 `create_chat_completion` 方法，它会自动处理消息格式化：\n\n```python\nmessages = [\n    {\"role\": \"system\", \"content\": \"你是一个乐于助人的助手。\"},\n    {\"role\": \"user\", \"content\": \"太阳系有哪些行星？\"}\n]\n\noutput = llm.create_chat_completion(\n      messages=messages,\n      max_tokens=100\n)\n\nprint(output[\"choices\"][0][\"message\"][\"content\"])\n```","某初创团队希望在本地笔记本上部署一个支持代码补全和函数调用的私有 AI 助手，以保护敏感业务数据且避免高昂的云端 API 费用。\n\n### 没有 llama-cpp-python 时\n- **硬件门槛高**：必须依赖昂贵的云端 GPU 实例或高性能服务器才能运行大模型，本地普通 CPU 几乎无法启动推理。\n- **集成复杂度高**：若想对接现有的 LangChain 应用或模仿 OpenAI 接口，需要自行编写复杂的底层 C++ 绑定代码或寻找不稳定的第三方桥接方案。\n- **隐私与成本焦虑**：每次测试新功能都需将代码上传至公有云，既担心核心算法泄露，又因按 Token 计费导致开发测试成本不可控。\n- **功能扩展困难**：想要添加多模态（视觉）识别或自定义函数调用逻辑，往往需要修改模型源码或等待官方云端功能更新，响应极慢。\n\n### 使用 llama-cpp-python 后\n- **轻量本地运行**：直接利用笔记本 CPU 甚至集成显卡即可流畅运行量化后的 Llama 模型，无需任何云端依赖，实现真正的离线开发。\n- **无缝生态对接**：通过其内置的 OpenAI 兼容服务端和 LangChain 适配器，仅需几行配置即可让旧项目平滑切换至本地模型，保留原有调用逻辑。\n- **数据完全自主**：所有推理过程均在本地内存完成，敏感代码和业务数据不出内网，同时彻底消除了云端 API 的调用费用。\n- **灵活功能定制**：原生支持函数调用、多模型热切换及视觉输入，开发者可像搭积木一样快速构建具备代码补全能力的本地 Copilot 替代品。\n\nllama-cpp-python 让开发者能以最低的成本和最简单的代码，将强大的大模型能力安全、高效地嵌入到任何本地 Python 应用中。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fabetlen_llama-cpp-python_4b831423.png","abetlen","Andrei","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fabetlen_6c078dce.jpg",null,"San Francisco, California","https:\u002F\u002Fgithub.com\u002Fabetlen",[83,87,91,95,99],{"name":84,"color":85,"percentage":86},"Python","#3572A5",96.8,{"name":88,"color":89,"percentage":90},"CMake","#DA3434",1.2,{"name":92,"color":93,"percentage":94},"Shell","#89e051",0.9,{"name":96,"color":97,"percentage":98},"Dockerfile","#384d54",0.6,{"name":100,"color":101,"percentage":102},"Makefile","#427819",0.4,10137,1352,"2026-04-05T10:34:00","MIT","Linux, macOS, Windows","非必需（支持纯 CPU 运行）。若需加速，支持 NVIDIA GPU (CUDA 12.1-12.5)、AMD GPU (ROCm\u002FhipBLAS)、Intel GPU (SYCL\u002FVulkan) 或 Apple Silicon (Metal\u002FMPS)。预编译 Wheel 要求 CUDA 版本为 12.1, 12.2, 12.3, 12.4 或 12.5。显存大小取决于模型参数量，文中未指定具体数值。","未说明",{"notes":111,"python":112,"dependencies":113},"1. 安装时会自动从源码构建 llama.cpp，若失败可添加 --verbose 查看日志。2. 支持通过 CMAKE_ARGS 环境变量或 pip config-settings 配置后端加速（如 CUDA, Metal, OpenBLAS 等）。3. macOS M1\u002FM2 用户务必安装 arm64 架构的 Python，否则性能会下降 10 倍；若遇架构错误需强制指定 CMAKE_OSX_ARCHITECTURES=arm64 重新安装。4. 提供预编译 Wheel 以简化 CPU、CUDA 和 Metal 环境的安装。5. 
升级或重装时需添加 --upgrade --force-reinstall --no-cache-dir 参数以确保重新构建。","3.8+",[114,115],"C 编译器 (Linux: gcc\u002Fclang, Windows: Visual Studio\u002FMinGW, macOS: Xcode)","huggingface-hub (可选，用于从 HF 下载模型)",[26,13],"2026-03-27T02:49:30.150509","2026-04-06T05:19:39.461155",[120,125,130,134,139,143,148],{"id":121,"question_zh":122,"answer_zh":123,"source_url":124},6446,"在 Windows 上安装时提示需要 Visual Studio 或找不到 C 编译器怎么办？","这是因为构建 wheel 需要编译环境。解决方法是下载并安装 Visual Studio Build Tools（包含 C++ 构建工具）：\n访问 https:\u002F\u002Fvisualstudio.microsoft.com\u002Fvs\u002Ffeatures\u002Fcplusplus\u002F 下载并安装。\n安装完成后重新运行 pip install 命令即可。如果不想自行编译，也可以尝试直接安装预编译的 Linux wheel（仅限 Linux 环境）。","https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F40",{"id":126,"question_zh":127,"answer_zh":128,"source_url":129},6447,"在 macOS (Apple Silicon\u002FM1) 上安装时出现 libpython3.11.a 文件未找到的错误如何解决？","该问题通常发生在 Conda 环境中。请确保你使用的是支持 arm64 架构的 Python 版本（如通过 miniforge3 安装）。\n如果是启用 Metal 加速安装，请使用以下命令：\nCMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir\n如果仍然报错，检查是否安装了完整的 Xcode 命令行工具，并确保环境变量配置正确。","https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F714",{"id":131,"question_zh":132,"answer_zh":133,"source_url":129},6448,"如何正确克隆项目以避免子模块缺失导致的安装失败？","该项目包含子模块，如果使用 git clone 必须加上 --recurse-submodules 参数，否则会导致编译失败。\n正确的克隆命令为：\ngit clone --recurse-submodules https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python.git\n克隆后进入目录执行：\ncd llama-cpp-python\npip install .",{"id":135,"question_zh":136,"answer_zh":137,"source_url":138},6449,"是否有预编译的 Wheel 包可以避免用户自行构建？","目前官方尚未提供所有平台的预编译 Wheel（特别是带 CUDA 或 Metal 支持的版本），用户通常需要本地构建。\n对于高级用户，可以通过 qemu 或 Docker 在 GitHub Actions 中交叉编译 arm64 或其他架构的 Wheel，但这需要大量磁盘空间和调试时间。\n临时方案是直接安装特定版本的预编译 Wheel（如果存在），例如：\npython -m pip install https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Freleases\u002Fdownload\u002Fv0.1.36\u002Fllama_cpp_python-0.1.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl","https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F243",{"id":140,"question_zh":141,"answer_zh":142,"source_url":129},6450,"如何在 Windows 上解决加载共享库时的 ctypes 错误？","在 Windows 平台上，加载 DLL 时可能需要指定 winmode 参数。\n可以尝试修改源码 llama_cpp\u002Fllama_cpp.py 第 55 行附近：\n将 return ctypes.CDLL(str(_lib_path))\n改为 return ctypes.CDLL(str(_lib_path), winmode=0)\n此外，确保已安装 CUDA Toolkit 并设置了 CUDA_PATH 环境变量，这对某些硬件加速功能是必需的。",{"id":144,"question_zh":145,"answer_zh":146,"source_url":147},6451,"如何减少重复调用 chat_completion API 时的延迟？","库已内置提示词缓存机制。当多次调用具有相同前缀的 prompt 时，模型会自动复用已计算的 KV 缓存，从而跳过重复计算部分。\n无需额外配置，只需确保传入的 prompt 包含之前已处理过的前缀内容即可生效。这对于聊天场景中的多轮对话优化非常有效。","https:\u002F\u002Fgithub.com\u002Fabetlen\u002Fllama-cpp-python\u002Fissues\u002F44",{"id":149,"question_zh":150,"answer_zh":151,"source_url":124},6452,"在 Linux 上安装时缺少 build-essential 导致编译失败怎么办？","在 Debian\u002FUbuntu 系统上，需要先安装基础构建工具：\nsudo apt install build-essential\n安装完成后再运行 pip install llama-cpp-python 
即可正常编译安装。",[153,157,161,165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229],{"id":154,"version":155,"summary_zh":79,"released_at":156},106014,"v0.3.20-metal","2026-04-03T07:00:02",{"id":158,"version":159,"summary_zh":79,"released_at":160},106015,"v0.3.20-cu124","2026-04-03T08:10:29",{"id":162,"version":163,"summary_zh":79,"released_at":164},106016,"v0.3.20-cu123","2026-04-03T08:19:29",{"id":166,"version":167,"summary_zh":79,"released_at":168},106017,"v0.3.20-cu122","2026-04-03T08:11:28",{"id":170,"version":171,"summary_zh":79,"released_at":172},106018,"v0.3.20-cu121","2026-04-03T08:09:31",{"id":174,"version":175,"summary_zh":79,"released_at":176},106019,"v0.3.19-metal","2026-03-25T22:51:28",{"id":178,"version":179,"summary_zh":79,"released_at":180},106020,"v0.3.19-cu124","2026-03-25T23:39:36",{"id":182,"version":183,"summary_zh":79,"released_at":184},106021,"v0.3.19-cu123","2026-03-25T23:42:29",{"id":186,"version":187,"summary_zh":79,"released_at":188},106022,"v0.3.19-cu122","2026-03-25T23:43:36",{"id":190,"version":191,"summary_zh":79,"released_at":192},106023,"v0.3.19-cu121","2026-03-25T23:41:09",{"id":194,"version":195,"summary_zh":79,"released_at":196},106024,"v0.3.19","2026-03-26T00:05:53",{"id":198,"version":199,"summary_zh":79,"released_at":200},106025,"v0.3.18-cu124","2026-03-25T10:09:13",{"id":202,"version":203,"summary_zh":79,"released_at":204},106026,"v0.3.18-cu123","2026-03-25T10:12:46",{"id":206,"version":207,"summary_zh":79,"released_at":208},106027,"v0.3.18-cu122","2026-03-25T10:09:44",{"id":210,"version":211,"summary_zh":79,"released_at":212},106028,"v0.3.18-cu121","2026-03-25T10:10:43",{"id":214,"version":215,"summary_zh":79,"released_at":216},106029,"v0.3.18-metal","2026-03-24T10:25:25",{"id":218,"version":219,"summary_zh":79,"released_at":220},106030,"v0.3.18","2026-03-24T11:32:06",{"id":222,"version":223,"summary_zh":79,"released_at":224},106031,"v0.3.17-metal","2026-03-23T06:44:08",{"id":226,"version":227,"summary_zh":79,"released_at":228},106032,"v0.3.16-metal","2025-08-15T05:04:17",{"id":230,"version":231,"summary_zh":79,"released_at":232},106033,"v0.3.16-cu124","2025-08-15T06:19:09"]