[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-varunvasudeva1--llm-server-docs":3,"tool-varunvasudeva1--llm-server-docs":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",146793,2,"2026-04-08T23:32:35",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108111,"2026-04-08T11:23:26",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 
恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":75,"owner_location":75,"owner_email":75,"owner_twitter":75,"owner_website":75,"owner_url":76,"languages":75,"stars":77,"forks":78,"last_commit_at":79,"license":80,"difficulty_score":81,"env_os":82,"env_gpu":83,"env_ram":84,"env_deps":85,"category_tags":98,"github_topics":99,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":113,"updated_at":114,"faqs":115,"releases":136},5731,"varunvasudeva1\u002Fllm-server-docs","llm-server-docs","End-to-end documentation to set up your own local & fully private LLM server on Debian. Equipped with chat, web search, RAG, model management, MCP servers, image generation, and TTS.","llm-server-docs 是一套详尽的端到端指南，旨在帮助用户在 Debian 系统上搭建完全本地化且隐私安全的个人大语言模型（LLM）服务器。它解决了用户依赖云端服务导致的数据隐私泄露、高昂 API 费用以及网络延迟等痛点，让所有数据处理均在本地完成。\n\n这套文档特别适合希望掌握数据主权的技术爱好者、开发者及研究人员。通过跟随指南，用户可以整合多种开源组件，构建功能完备的 AI 工作台：不仅支持基于 Ollama、llama.cpp 或 vLLM 的模型推理和 Open WebUI 聊天界面，还集成了联网搜索（SearXNG）、知识库检索（RAG）、图像生成（ComfyUI）以及语音合成（Kokoro）等高级功能。\n\n其技术亮点在于提供了模块化的软件栈架构，并详细涵盖了从 Docker 容器加固、SSH 安全配置到利用 Tailscale 实现安全远程访问的全流程。此外，它还引入了 MCP（模型上下文协议）代理服务，增强了模型与外部工具的交互能力。无论是想部署私有知识库，还是构建多功能的本地 AI 助手，llm-server-docs 都能提供清晰、可操作的技术路径。","# Local LLaMA Server Setup Documentation\n\n_TL;DR_: End-to-end documentation to set up your own local & fully private LLM server on Debian. 
Equipped with chat, web search, RAG, model management, MCP servers, image generation, and TTS, along with steps for configuring SSH, firewall, and secure remote access via Tailscale.

Software Stack:

- Inference Engine ([Ollama](https://github.com/ollama/ollama), [llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://github.com/vllm-project/vllm))
- Search Engine ([SearXNG](https://github.com/searxng/searxng))
- Model Server ([llama-swap](https://github.com/mostlygeek/llama-swap), `systemd` service)
- Chat Platform ([Open WebUI](https://github.com/open-webui/open-webui))
- MCP Proxy Server ([mcp-proxy](https://github.com/sparfenyuk/mcp-proxy), [MCPJungle](https://github.com/mcpjungle/MCPJungle))
- Text-to-Speech Server ([Kokoro FastAPI](https://github.com/remsky/Kokoro-FastAPI))
- Image Generation Server ([ComfyUI](https://github.com/comfyanonymous/ComfyUI))

![Software Stack Architectural Diagram](https://oss.gittoolsai.com/images/varunvasudeva1_llm-server-docs_readme_1432289eb875.png)

## Table of Contents

- [Local LLaMA Server Setup Documentation](#local-llama-server-setup-documentation)
  - [Table of Contents](#table-of-contents)
  - [About](#about)
  - [Priorities](#priorities)
  - [Prerequisites](#prerequisites)
  - [General](#general)
    - [Allow `sudo` Permissions](#allow-sudo-permissions)
    - [Update System Packages](#update-system-packages)
    - [Schedule Startup Script](#schedule-startup-script)
    - [Configure Script Permissions](#configure-script-permissions)
    - [Configure Auto-Login (optional)](#configure-auto-login-optional)
  - [Docker](#docker)
    - [Add User to Docker Group](#add-user-to-docker-group)
    - [Nvidia Container Toolkit](#nvidia-container-toolkit)
    - [Create a Network](#create-a-network)
    - [Harden Docker Containers](#harden-docker-containers)
    - [Helpful Commands](#helpful-commands)
  - [HuggingFace CLI](#huggingface-cli)
    - [Manage Models](#manage-models)
    - [Download Models](#download-models)
    - [Delete Models](#delete-models)
  - [Search Engine](#search-engine)
    - [SearXNG](#searxng)
    - [Open WebUI Integration](#open-webui-integration)
  - [Inference Engine](#inference-engine)
    - [Ollama](#ollama)
    - [llama.cpp](#llamacpp)
    - [vLLM](#vllm)
    - [Open WebUI Integration](#open-webui-integration-1)
    - [Ollama vs. llama.cpp](#ollama-vs-llamacpp)
    - [vLLM vs. Ollama/llama.cpp](#vllm-vs-ollamallamacpp)
  - [Model Server](#model-server)
    - [llama-swap](#llama-swap)
    - [`systemd` Service](#systemd-service)
    - [Open WebUI Integration](#open-webui-integration-2)
      - [llama-swap](#llama-swap-1)
      - [`systemd` Service](#systemd-service-1)
  - [Chat Platform](#chat-platform)
    - [Open WebUI](#open-webui)
  - [MCP Proxy Server](#mcp-proxy-server)
    - [mcp-proxy](#mcp-proxy)
    - [MCPJungle](#mcpjungle)
    - [Comparison](#comparison)
    - [Open WebUI Integration](#open-webui-integration-3)
      - [mcp-proxy](#mcp-proxy-1)
      - [MCPJungle](#mcpjungle-1)
    - [VS Code/Claude Desktop Integration](#vs-codeclaude-desktop-integration)
  - [Text-to-Speech Server](#text-to-speech-server)
    - [Kokoro FastAPI](#kokoro-fastapi)
    - [Open WebUI Integration](#open-webui-integration-4)
  - [Image Generation Server](#image-generation-server)
    - [ComfyUI](#comfyui)
    - [Open WebUI Integration](#open-webui-integration-5)
  - [SSH](#ssh)
  - [Firewall](#firewall)
  - [Remote Access](#remote-access)
    - [Tailscale](#tailscale)
      - [Installation](#installation)
      - [Exit Nodes](#exit-nodes)
      - [Local DNS](#local-dns)
      - [Third-Party VPN Integration](#third-party-vpn-integration)
  - [Updating](#updating)
    - [General](#general-1)
    - [Nvidia Drivers \& CUDA](#nvidia-drivers--cuda)
    - [Ollama](#ollama-1)
    - [llama.cpp](#llamacpp-1)
    - [vLLM](#vllm-1)
    - [llama-swap](#llama-swap-2)
    - [Open WebUI](#open-webui-1)
    - [mcp-proxy/MCPJungle](#mcp-proxymcpjungle)
    - [Kokoro FastAPI](#kokoro-fastapi-1)
    - [ComfyUI](#comfyui-1)
  - [Troubleshooting](#troubleshooting)
    - [Docker](#docker-1)
    - [`ssh`](#ssh-1)
    - [Nvidia Drivers](#nvidia-drivers)
    - [Ollama](#ollama-2)
    - [vLLM](#vllm-2)
    - [Open WebUI](#open-webui-2)
  - [Monitoring](#monitoring)
  - [Notes](#notes)
    - [Software](#software)
    - [Hardware](#hardware)
  - [References](#references)
  - [Acknowledgements](#acknowledgements)

## About

This repository outlines the steps to set up a server for running local language models. It uses Debian specifically, but most Linux distros should follow a very similar process. It aims to be a guide for Linux beginners like me who are setting up a server for the first time.

The process involves installing the requisite drivers, setting the GPU power limit, setting up auto-login, and scheduling the `init.bash` script to run at boot. All these settings are based on my ideal setup for a language model server that runs most of the day, but a lot can be customized to suit your needs.

> [!IMPORTANT]
> No part of this guide was written using AI - any hallucinations are the good old human kind. While I've done my absolute best to ensure correctness in every step/command, check **everything** you execute in a terminal. Enjoy!
## Priorities

- **Simplicity**: It should be relatively straightforward to set up the components of the solution.
- **Stability**: The components should be stable and capable of running for weeks at a time without any intervention necessary.
- **Security**: The components should be able to be tightly secured and limited in their capability to damage the system in case of a known vulnerability affecting any of the components.
- **Maintainability**: The components and their interactions should be uncomplicated enough that you know enough to maintain them as they evolve (because they *will* evolve).
- **Aesthetics**: The result should be as close to a cloud provider's chat platform as possible. A homelab solution doesn't necessarily need to feel like it was cobbled together haphazardly.
- **Modularity**: Components in the setup should be able to be swapped out for newer/more performant/better maintained alternatives easily. Standard protocols (OpenAI-compatibility, MCPs, etc.) help with this a lot and, in this guide, they are always preferred over bundled solutions.
- **Open source**: The code should be able to be verified by a community of engineers. Chat platforms and LLMs involve large amounts of personal data conveyed in natural language, and it's important to know that data isn't going outside your machine.

## Prerequisites

Any modern CPU and GPU combination should work for this guide. Previously, compatibility with AMD GPUs was an issue, but the latest releases of Ollama have worked through this and [AMD GPUs are now supported natively](https://ollama.com/blog/amd-preview).

For reference, this guide was built around the following system:
- **CPU**: Intel Core i5-12600KF
- **Memory**: 96GB 3200MHz DDR4 RAM
- **Storage**: 1TB M.2 NVMe SSD
- **GPU**: 2x Nvidia RTX 3090 (24GB)

> [!NOTE]
> **AMD GPUs**: Power limiting is skipped for AMD GPUs as [AMD has recently made it difficult to set power limits on their GPUs](https://www.reddit.com/r/linux_gaming/comments/1b6l1tz/no_more_power_limiting_for_amd_gpus_because_it_is/). Naturally, skip any steps involving `nvidia-smi` or `nvidia-persistenced` and the power limit in the `init.bash` script.
>
> **CPU-only**: You can skip the GPU driver installation and power limiting steps. The rest of the guide should work as expected.

> [!NOTE]
> This guide uses `~/` (or `/home/<your_username>`) as the base directory. If you're working in a different directory, please modify all your commands accordingly.

To begin the process of setting up your server, you will need the following:

- Fresh install of Debian
- Internet connection
- Basic understanding of the Linux terminal
- Peripherals like a monitor, keyboard, and mouse

To install Debian on your newly built server hardware:

- Download the [Debian ISO](https://www.debian.org/distrib/) from the official website.
- Create a bootable USB using a tool like [Rufus](https://rufus.ie/en/) for Windows or [Balena Etcher](https://etcher.balena.io) for macOS.
- Boot into the USB and install Debian.

For a more detailed guide on installing Debian, refer to the [official documentation](https://www.debian.org/releases/buster/amd64/).
For those who aren't yet experienced with Linux, I recommend using the graphical installer - you will be given an option between the text-based installer and graphical installer.

I also recommend installing a lightweight desktop environment like XFCE for ease of use. Other options like GNOME or KDE are also available - GNOME may be a better option for those using their server as a primary workstation, as it is more feature-rich (and, as such, heavier) than XFCE.

## General

### Allow `sudo` Permissions

To do a bunch of things in this guide, we need to be root. But we don't have the ability to act as root until root allows our user to. First, we'll switch to root and grant our user permission to run commands with `sudo`.

> [!TIP]
> `sudo` stands for "superuser do" - in Linux, it signals to the OS that you want the command you're running to run on behalf of the root user. This should be used sparingly on a highly secure system (we want user-specific permissions for most processes) and carefully, since it can affect the system with the same power as the root user. Don't overthink its use or worry when using it - just know that it **can** be dangerous if used incorrectly.

- Switch to root:
    ```bash
    su root
    ```

- Run the following command to add your user to the `sudo` group (which has the permissions we're looking for):
    ```bash
    sudo usermod -a -G sudo <username>
    ```
    > Replace `<username>` with your username.

- Close your existing terminal and open a new session. This is required to see the changes.
- (Optional) Test your new permissions by running `ls` (command to view the contents of a directory) with `sudo`:
    ```bash
    sudo ls
    ```

### Update System Packages

- Update the system by running the following commands:
    ```
    sudo apt update
    sudo apt upgrade
    ```

Now, we'll install the required GPU drivers that allow programs to utilize their compute capabilities.

**Nvidia GPUs**
- Follow Nvidia's [guide on downloading CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian). The instructions are specific to your machine and the website will lead you to them interactively.
- Run the following commands:
    ```bash
    sudo apt install linux-headers-amd64
    sudo apt install nvidia-driver firmware-misc-nonfree
    ```
- Reboot the server.
- Run the following command to verify the installation:
    ```bash
    nvidia-smi
    ```

**AMD GPUs**
- Add the following line to `/etc/apt/sources.list`:
    ```
    deb http://deb.debian.org/debian bookworm main contrib non-free-firmware
    ```
- Update and install the drivers:
    ```bash
    sudo apt update
    sudo apt install firmware-amd-graphics libgl1-mesa-dri libglx-mesa0 mesa-vulkan-drivers xserver-xorg-video-all
    ```
- Reboot the server.

We'll also install some packages that are not installed on Debian by default but may be required later:
```
sudo apt install libcurl4-openssl-dev cmake
```

### Schedule Startup Script

In this step, we'll create a script called `init.bash`. This script will be run at boot to set the GPU power limit and start the server using Ollama. We set the GPU power limit lower because testing and inference have shown only a 5-15% performance decrease for a 30% reduction in power consumption. This is especially important for servers that are running 24/7.
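Before choosing a power limit, it's worth checking what your card actually allows. `nvidia-smi` can report the current, default, and min/max enforceable power limits:

```bash
# Shows current/default/min/max power limits per GPU
nvidia-smi -q -d POWER
```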
- Run the following commands:
    ```bash
    touch init.bash
    nano init.bash
    ```
- Add the following lines to the script:
    ```bash
    #!/bin/bash
    sudo nvidia-smi -pm 1
    sudo nvidia-smi -pl <power_limit>
    ```
    > Replace `<power_limit>` with the desired power limit in watts. For example, `sudo nvidia-smi -pl 250`.

    For multiple GPUs, modify the script to set the power limit for each GPU:
    ```bash
    sudo nvidia-smi -i 0 -pl <power_limit>
    sudo nvidia-smi -i 1 -pl <power_limit>
    ```
- Save and exit the script.
- Make the script executable:
    ```bash
    chmod +x init.bash
    ```
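Putting those pieces together, a complete `init.bash` for the dual-GPU reference build might look like this (250W is only an example - pick a value within the range reported by the power query above):

```bash
#!/bin/bash
# Enable persistence mode so the power settings stick
sudo nvidia-smi -pm 1
# Cap each GPU's power draw at 250W
sudo nvidia-smi -i 0 -pl 250
sudo nvidia-smi -i 1 -pl 250
```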
Adding the `init.bash` script to the crontab will schedule it to run at boot.

- Run the following command:
    ```bash
    crontab -e
    ```
- Add the following line to the file:
    ```bash
    @reboot /path/to/init.bash
    ```
    > Replace `/path/to/init.bash` with the path to the `init.bash` script.

- (Optional) Add the following line to shut down the server at 12am:
    ```bash
    0 0 * * * /sbin/shutdown -h now
    ```
- Save and exit the file.

### Configure Script Permissions

We want `init.bash` to run the `nvidia-smi` commands without having to enter a password. This is done by giving `nvidia-persistenced` and `nvidia-smi` passwordless `sudo` permissions, and can be achieved by editing the `sudoers` file.

AMD users can skip this step as power limiting is not supported on AMD GPUs.

- Run the following command to edit the sudoers file:
    ```bash
    sudo visudo
    ```
- Add the following lines to the file:
    ```
    <username> ALL=(ALL) NOPASSWD: /usr/bin/nvidia-persistenced
    <username> ALL=(ALL) NOPASSWD: /usr/bin/nvidia-smi
    ```
    > Replace `<username>` with your username.
- Save and exit the file.

> [!IMPORTANT]
> Ensure that you add these lines AFTER `%sudo ALL=(ALL:ALL) ALL`. The order of the lines in the file matters - the last matching line will be used, so if you add these lines before `%sudo ALL=(ALL:ALL) ALL`, they will be ignored.

### Configure Auto-Login (optional)

When the server boots up, we may want it to automatically log in to a user account and run the `init.bash` script. This is done by configuring the `lightdm` display manager.

- Run the following command:
    ```bash
    sudo nano /etc/lightdm/lightdm.conf
    ```
- Find the following commented line. It should be in the `[Seat:*]` section.
    ```
    # autologin-user=
    ```
- Uncomment the line and add your username:
    ```
    autologin-user=<username>
    ```
    > Replace `<username>` with your username.
- Save and exit the file.

## Docker

📖 [**Documentation**](https://docs.docker.com/engine/)

Docker is a containerization platform that allows you to run applications in isolated environments. This subsection follows [Docker's guide](https://docs.docker.com/engine/install/debian/) to install Docker Engine on Debian. The commands are listed below, but visiting the guide is recommended in case instructions have changed.

- If you already have a Docker installation on your system, it's a good idea to re-install so there are no broken/out-of-date dependencies. The command below will iterate through your system's installed packages and remove the ones associated with Docker.
    ```bash
    for pkg in docker.io docker-doc docker-compose podman-docker containerd runc; do sudo apt purge $pkg; done
    ```

- Run the following commands:
    ```bash
    # Add Docker's official GPG key:
    sudo apt update
    sudo apt install ca-certificates curl
    sudo install -m 0755 -d /etc/apt/keyrings
    sudo curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc
    sudo chmod a+r /etc/apt/keyrings/docker.asc

    # Add the repository to Apt sources:
    sudo tee /etc/apt/sources.list.d/docker.sources <<EOF
    Types: deb
    URIs: https://download.docker.com/linux/debian
    Suites: $(. /etc/os-release && echo "$VERSION_CODENAME")
    Components: stable
    Signed-By: /etc/apt/keyrings/docker.asc
    EOF

    sudo apt update
    ```
- Install the Docker packages:
    ```bash
    sudo apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
    ```
- Verify the installation (optional):
    ```bash
    sudo systemctl status docker
    ```

    If it hasn't started, try manually starting the daemon:
    ```bash
    sudo systemctl start docker
    ```
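For an end-to-end smoke test, you can also run Docker's own `hello-world` test image (`--rm` cleans up the exited container afterwards):

```bash
sudo docker run --rm hello-world
```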
### Add User to Docker Group

So that we can forgo the use of `sudo` for Docker commands and run them directly under our user, we will add our user to the `docker` group. This isn't strictly necessary but convenient. This sub-section uses [Docker's own post-install documentation](https://docs.docker.com/engine/install/linux-postinstall/) as a reference - have a look just in case things have changed since the time of writing.

> [!CAUTION]
> Being in the `docker` group grants significant system access - effectively root-equivalent privileges to the Docker daemon. Only add trusted users to this group.

1. Add your user to the Docker group:
    ```bash
    sudo usermod -aG docker $USER
    ```

2. Apply the changes:
    ```bash
    newgrp docker
    ```

3. Verify your new permissions by checking running containers:
    ```bash
    docker ps -a
    ```

If you encounter "permission denied" and Docker commands still require sudo, log out and log back in.

> [!NOTE]
> If you decide not to do this, you will need to prepend `sudo` to any `docker` commands.

### Nvidia Container Toolkit

You will most likely want to use GPUs via Docker - this will require Nvidia Container Toolkit, which allows Docker to allocate/de-allocate memory on Nvidia GPUs. The steps for installing this are listed below, but it is recommended to reference [Nvidia's documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) for the most up-to-date commands.

1. Configure the repository:
    ```bash
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
    ```

2. Update packages:
    ```bash
    sudo apt update
    ```

3. Install Nvidia Container Toolkit packages:
    ```bash
    sudo apt install -y nvidia-container-toolkit
    ```

4. Configure the container runtime (per Nvidia's documentation) and restart the Docker daemon:
    ```bash
    sudo nvidia-ctk runtime configure --runtime=docker
    sudo systemctl restart docker
    ```
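To confirm that containers can actually reach the GPUs, run `nvidia-smi` from inside a CUDA base image (the tag below is an example - any recent `nvidia/cuda` base tag should work):

```bash
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```

If this prints the same table as `nvidia-smi` on the host, the toolkit is working.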
### Create a Network

We'll be running most services via Docker containers. To allow multiple containers to communicate with each other, we can open up ports via UFW (which we'll [configure later](#firewall)), but this is less optimal than creating a Docker network. This way, all containers on the network can securely talk to each other without needing to open ports for the many services we could run via UFW, inherently creating a more secure setup.

We'll call this network `app-net`: you can call it anything you like, just be sure to update the commands that use it later.

Run the following:

```bash
docker network create app-net
```

That's it! Now, when we create containers, we can reference it as follows:

**Docker Run**
```bash
docker run --network app-net <container>
```

**Docker Compose**
```yaml
services:
  <container>:
    # add this
    networks:
      - app-net

# add this
networks:
  app-net:
    external: true
```

> Replace `<container>` with the actual service - don't forget to add other parameters too.

With this configured, we can now call containers by name and port. Let's pretend we're calling the `/health` endpoint in `llama-swap` from `open-webui` (two actual containers we'll be creating later on) to ensure that the containers can see and speak to each other. Run (`CTRL+C` to quit):

```bash
docker exec -i open-webui curl http://llama-swap:8080/health
```

You could also run it the other way to be extra sure:

```bash
docker exec -it llama-swap curl http://open-webui:8080
```

> [!IMPORTANT]
> The port is always the **internal port** the container is running on. If a container runs on 1111:8080, for example, 1111 is the port on the host (where you might access it, like `http://localhost:1111` or `http://<server_ip>:1111`) and 8080 is the internal port the container is running on. Thus, trying to access the container on 1111 via `app-net` will not work. Remembering this when specifying URLs in services will save you a lot of unnecessary "why isn't this working?" pains.

### Harden Docker Containers

To "harden" something in software terms is to make it more secure and more resilient to cyberattacks. Usually, this involves reducing the surface area by which an attacker can gain access to the system, but it also includes mitigating what an attacker can do assuming they have successfully gained access. Essentially, we will set up both prevention and cure - although, as they say, an ounce of prevention is better than a pound of cure. This sub-section leverages knowledge from [this Reddit comment](https://www.reddit.com/r/selfhosted/comments/1pr74r4/comment/nv07sp4/), courtesy of [u/arnedam](https://www.reddit.com/user/arnedam/). Another good reference is the [OWASP Docker Security Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/Docker_Security_Cheat_Sheet.html).

To a large extent, the security objectives we're aiming towards can be achieved by applying the [Principle of Least Privilege (PoLP)](https://en.wikipedia.org/wiki/Principle_of_least_privilege) - in other words, giving our containers exactly and **only** what they need to perform their function. For each guideline, I've provided an existing risk and a mitigation strategy (along with how it fixes the problem); while not strictly necessary, learning *how* things are at risk is beneficial to start thinking securely about the services you host.

> [!NOTE]
> This sub-section is **critical** for public-facing services. However, even if you only intend to access your services via a private network (**preferred if possible**), doing this is quite literally never a bad idea because it limits how much damage can be done to your server and its services if it ever were to become compromised.

1. Run the service under your user, not root:
    ```yaml
    user: "<your_user_id>:<your_group_id>"
    ```
    To find `<your_user_id>`, run `id -u`. To find `<your_group_id>`, run `id -g`.

    **RISK**: Unless specified otherwise, containers run under the root user. Compromised containers running under root can do all sorts of damage, ranging anywhere from executing malicious code to prying on/deleting system files.

    **MITIGATION**: By specifying a user that the container runs under, the container's privilege is limited to the user's privilege, which is almost definitively lower than the root user's privilege to modify the system.

2. Make the file system read-only:
    ```yaml
    read_only: true
    ```

    **RISK**: By default, containers are allowed to read and write the files they have access to. This means an attacker who has successfully compromised your container could write bogus files/malicious code and spy on/delete critical information.

    **MITIGATION**: Adding this will prevent attackers from being able to write malicious code files or delete important data from your system that are accessible to the container.

> [!WARNING]
> Apply the `read_only: true` directive with care: if your service leverages file writes, like the many services that write to `/tmp`, those subprocesses will fail. If those subprocesses are critical, you may even see the container fail to start. Expect this **not** to work in an all-encompassing way for all containers - you will most likely need to surgically apply the `:ro` directive to volume mounts in most places.

3. Limit the physical resources your container can access:
    ```yaml
    # maximum number of processes the container can execute
    pids_limit: 512
    # maximum memory for the container
    mem_limit: 3g
    # maximum number of CPU cores the container can utilize - can be fractional
    cpus: 3
    ```
    **RISK**: Containers are, by default, given unrestricted physical resource allocation capabilities. This means that if a container were to be compromised, it could devote as much of your CPU clock cycles, RAM capacity, and VRAM capacity as it likes to executing malicious code.

    **MITIGATION**: Allocating fixed limits to resources will prevent attackers from using the entirety of your server's physical resources to do damage if your container were to be compromised, e.g. running a botnet via your machine.

    **Docker Swarm**

    If you're using Docker Swarm (not used in this guide), you need to format it slightly differently:

    ```yaml
    deploy:
      resources:
        limits:
          pids: 512
          memory: 3g
          cpus: 3
    ```

4. Disable `tty` and `stdin` in the container:
    ```yaml
    tty: false
    stdin_open: false
    ```

    **RISK**: `stdin` gives the container the ability for commands to be written to it (input injection) and `tty` gives the container an active shell environment - together, they grant the container complete interactive shell capability such that potentially malicious commands can be run from it.

    **MITIGATION**: Disabling these will prevent attackers from being able to execute code via the containers, severely minimizing [arbitrary code execution](https://en.wikipedia.org/wiki/Arbitrary_code_execution) (ACE) vulnerabilities.

5. Disallow the container from elevating its own privileges:
    ```yaml
    security_opt:
      - "no-new-privileges=true"
    ```

    **RISK**: Containers are able to elevate their own access given an interactive shell environment. This could even override the `user` directive provided earlier, giving root-level access to the system via the container.

    **MITIGATION**: Adding this line will prevent attackers from being able to override the `user` directive we provided earlier and stop them from being able to grant the now-compromised container root-level permissions.

6. Drop default container capabilities:
    ```yaml
    cap_drop:
      - ALL
    ```

    **RISK**: Containers are given a lot of permissions by default. Most of these permissions are not required by most containers. Giving extra access increases the attack vector surface area in case a container was to become compromised - especially since these capabilities can impact the host kernel.

    **MITIGATION**: Adding this will drop the widely-permissive default permissions granted to containers. This ensures that we only give containers the system capabilities to do the things they are required to do.

    `cap_drop` with a value of `ALL` can be too aggressive for containers that require multiple capabilities. In this case, right after the above block, consider reviewing the following commonly-used capabilities. They are **not all required and should not all be added** - they are just a starting point for some capabilities sometimes required by containers.
    ```yaml
    cap_add:
      # LOWER RISK
      # Make arbitrary changes to file UIDs and GIDs (see chown(2))
      - CHOWN
      # Make arbitrary manipulations of process GIDs and supplementary GID list
      - SETGID
      # Make arbitrary manipulations of process UIDs
      - SETUID
      # Bind a socket to internet domain privileged ports (port numbers less than 1024)
      - NET_BIND_SERVICE

      # HIGHER RISK - CAN AFFECT IMPORTANT SYSTEM-LEVEL CONFIGURATIONS
      # Perform various network-related operations
      - NET_ADMIN
      # Use RAW and PACKET sockets
      - NET_RAW
      # Perform a range of system administration operations
      - SYS_ADMIN
    ```

    This will, in order, drop all capabilities and then surgically re-add the ones that are required for the functionality the container achieves. A complete list of capabilities can be found [here](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).

    Containers with inadequate capabilities will fail to run. If you drop one that is required, you could:
    1. Ideally, take a look at the list of capabilities referenced above, and perform some trial-and-error addition of capabilities. LLMs can help greatly with this endeavor, especially when equipped with tools that can make them capable of reading documentation or the source code of the service you're looking to run. You can skip this directive until you have everything configured and come back to it later.
    2. Less ideally, give up and remove the `cap_drop` directive entirely. I wouldn't recommend it but, with private services, this is hardly the most insecure setup in the world if you do everything else.

7. Prevent excessive logging:
    ```yaml
    logging:
      driver: "json-file"
      options:
        max-file: "10"
        max-size: "20m"
    ```

    **RISK**: Much like a [zip bomb](https://en.wikipedia.org/wiki/Zip_bomb), a logging bomb can recursively clutter and eventually render the system unusable by overwhelming it with an ever-increasing number of large log files.

    **MITIGATION**: By capping the maximum number of files (10) and the sizes of the individual files (20MB), we can limit the effectiveness of an attack like this.

    Feel free to change the limits (reasonably) as you wish if you run into issues.

8. Limit the temporary file directory's privileges:
    ```yaml
    tmpfs:
      - /tmp:rw,noexec,nosuid,nodev,size=512m
    ```

    **RISK**: If allowed to download files, compromised containers could both install malicious files to the filesystem and execute them.

    **MITIGATION**: Setting up a temporary file area with very limited permissions stops this potential attack vector. Containers are limited to downloading files here, in a little sandbox with a fixed small size and without script execution privileges.
9. Mount read-only volumes:
    ```yaml
    volumes:
      - /path/to/mount1:/mount1:ro
      - /path/to/mount2:/mount2:ro
    ```

    **RISK**: To access the file system of the host, Docker containers need directories "mounted" as volumes. This lets them read and write from the host as if the container was the host itself. Allowing write access to every mount is usually not necessary and can allow a compromised container to delete or overwrite important information in a cyber-attack.

    **MITIGATION**: Mounting volumes as read-only, wherever possible, eliminates the container's ability to destroy the data in those volumes via writes. This wouldn't prevent spyware, but it would limit data from being lost in an attack.

    > Replace `/path/to/mount1` and `/path/to/mount2` with actual directory paths.

> [!IMPORTANT]
> For containers where free allocation of CPU cores and memory is crucial - llama-swap, for example - you will not want to limit the maximum resources the container can access as shown in step 3.

Here's a combined chunk to copy + paste into your existing services' Compose files:

```yaml
services:
  <service_name>:
    # 1
    user: "<your_user_id>:<your_group_id>"
    # 2 - Uncomment only if the container does not write
    # read_only: true
    # 3
    pids_limit: 512
    mem_limit: 3g
    cpus: 3
    # 4
    tty: false
    stdin_open: false
    # 5
    security_opt:
      - "no-new-privileges=true"
    # 6
    cap_drop:
      - ALL
    # Add cap_add section if required
    # 7
    logging:
      driver: "json-file"
      options:
        max-file: "10"
        max-size: "20m"
    # 8
    tmpfs:
      - /tmp:rw,noexec,nosuid,nodev,size=512m
    # 9
    volumes:
      - /path/to/mount1:/mount1:ro
      - /path/to/mount2:/mount2:ro
```
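To spot-check that the hardening took effect, you can inspect a running container (standard Docker commands; `<service_name>` as in the snippet above):

```bash
# Should print your uid/gid, not root's uid=0(root)
docker exec <service_name> id
# Should print [ALL] if all capabilities were dropped
docker inspect --format '{{.HostConfig.CapDrop}}' <service_name>
# Should print true if the root filesystem is read-only
docker inspect --format '{{.HostConfig.ReadonlyRootfs}}' <service_name>
```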
### Helpful Commands

In the process of setting up this server (or anywhere down the rabbit hole of setting up services), you're likely to use Docker often. For the uninitiated, here are helpful commands that will make navigating and troubleshooting containers easier:

- See available/running containers: `docker ps -a`
- Restart a container: `docker restart <container_name>`
- View a container's logs (live): `docker logs -f <container_name>` (`CTRL+C` to quit)
- Rename a container: `docker rename <container_name> <new_container_name>`
- Sometimes, a single service will spin up multiple containers, e.g. `xyz-server` and `xyz-db`. To restart both simultaneously, run the following command **from inside the directory containing the Compose file**: `docker compose restart`
- Recreate a service: `docker compose down && docker compose up -d`
- Validate a Compose file: `docker compose config --quiet`
- View a service's resolved configuration: `docker compose config`
- List Docker networks: `docker network ls`

> [!TIP]
> There are no set rules when it comes to how you set up your Docker containers/services. However, here are my two cents:
> It's cleanest to use Docker Compose (`docker compose up -d` with a `docker-compose.yaml` file as opposed to `docker run -d <image_name>`). Unless you take copious notes on your homelab and its setup, this method is almost self-documenting and keeps a neat trail of the services you run via their compose files. One compose file per directory is standard.

> [!TIP]
> Occasionally, restarting a container is not enough to persist changes to configurations. If ever you find that changes you made to a service are not showing up, recreate the service altogether before continuing to other troubleshooting.

## HuggingFace CLI

📖 [**Documentation**](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli)

HuggingFace is the leading open-source ML/AI platform - it hosts models (including LLMs), datasets, and demo apps that can be used to test models. For the purpose of this guide, we'll be using HuggingFace to download popular open-source LLMs.

> [!NOTE]
> Only needed for llama.cpp/vLLM.

- Create a new virtual environment:
    ```bash
    python3 -m venv hf-env
    source hf-env/bin/activate
    ```
- Install the `huggingface_hub` package using `pip`:
    ```bash
    pip install -U "huggingface_hub[cli]"
    ```
- Create an authentication token on https://huggingface.co
- Log in to HF Hub:
    ```bash
    hf auth login
    ```
- Enter your token when prompted.
- Run the following to verify your login:
    ```bash
    hf auth whoami
    ```

    The result should be your username.

### Manage Models

Models can be downloaded either to the default location (`.cache/huggingface/hub`) or to any local directory you specify. Where the model is stored can be defined using the `--local-dir` command line flag. Not specifying this will result in the model being stored in the default location. Storing the model in the folder where the packages for the inference engine are stored is good practice - this way, everything you need to run inference on a model is stored in the same place. However, if you use the same models with multiple backends frequently (e.g. using Qwen_QwQ-32B-Q4_K_M.gguf with both llama.cpp and vLLM), either set a common model directory or use the default HF option without specifying this flag.

First, activate the virtual environment that contains `huggingface_hub`:
```
source hf-env/bin/activate
```

### Download Models

Models are downloaded using their HuggingFace tag. Here, we'll use bartowski/Qwen_QwQ-32B-GGUF as an example. To download a model, run:
```
hf download bartowski/Qwen_QwQ-32B-GGUF Qwen_QwQ-32B-Q4_K_M.gguf --local-dir models
```
Ensure that you are in the correct directory when you run this.
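If you plan to serve a model with vLLM, you'll usually want the full safetensors repository rather than a single GGUF file. Omitting the filename downloads the entire repo snapshot (the tag below is the same example model used in the llama-swap section later):
```
hf download Qwen/Qwen3-4B-Instruct-2507 --local-dir models/Qwen3-4B-Instruct-2507
```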
### Delete Models

To delete a model in the specified location, run:
```
rm <model_name>
```

To delete a model in the default location, run:
```
hf delete-cache
```

This will start an interactive session where you can remove models from the HuggingFace directory. In case you've been saving models in a different location than `.cache/huggingface`, deleting models from there will free up space, but the metadata will remain in the HF cache until it is deleted properly. This can be done via the above command, but you can also simply delete the model directory from `.cache/huggingface/hub`.

## Search Engine

> [!NOTE]
> This step is optional but highly recommended for grounding LLMs with relevant search results from reputable sources. Targeted web searches via MCP tool calls make reports generated by LLMs much less prone to random hallucinations.

### SearXNG

🌟 [**GitHub**](https://github.com/searxng/searxng)  
📖 [**Documentation**](https://docs.searxng.org)  

To power our search-based workflows, we don't want to rely on a search provider that can monitor searches. While using any search engine has this problem, metasearch engines like SearXNG mitigate it to a decent degree. SearXNG aggregates results from over 245 [search services](https://docs.searxng.org/user/configured_engines.html#configured-engines) and does not track/profile users. You can use a hosted instance on the Internet but, considering the priorities of this guide and how trivial it is to set one up, we'll be spinning up our own instance on port 5050.

1. Start the container:
    ```bash
    docker pull searxng/searxng
    export PORT=5050
    docker run \
        -d -p ${PORT}:8080 \
        --name searxng \
        --network app-net \
        -v "${PWD}/searxng:/etc/searxng" \
        -e "BASE_URL=http://0.0.0.0:$PORT/" \
        -e "INSTANCE_NAME=searxng" \
        --restart unless-stopped \
        searxng/searxng
    ```

2. Edit `settings.yml` to include JSON format support:
    ```bash
    sudo nano searxng/settings.yml
    ```

    Add the following:
    ```yaml
    search:
      # ...other parameters here...
      formats:
        - html
        - json      # add this line
    ```

3. Restart the container with `docker restart searxng`.
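To verify that the JSON format is enabled, query the instance from the host (port 5050 as configured above; the search term is arbitrary):

```bash
curl "http://localhost:5050/search?q=debian&format=json"
```

A JSON payload of search results means SearXNG is ready for programmatic use.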
### Open WebUI Integration

In case you want a simple web search workflow and want to skip MCP servers/agentic setups, Open WebUI supports web search functionality natively. Navigate to `Admin Panel > Settings > Web Search` and set the following values:

- Enable `Web Search`
- Web Search Engine: `searxng`
- Searxng Query URL: `http://searxng:8080/search?q=<query>`
- API Key: `anything-you-like`

## Inference Engine

The inference engine is one of the primary components of this setup. It is code that takes model files containing weights and makes it possible to get useful outputs from them. This guide allows a choice between llama.cpp, vLLM, and Ollama - all popular inference engines with different priorities and strengths (note: Ollama uses llama.cpp under the hood and is simply a CLI wrapper). It can be daunting to jump straight into the deep end with command line arguments in llama.cpp and vLLM. If you're a power user and enjoy the flexibility afforded by tight control over serving parameters, using either llama.cpp or vLLM will be a wonderful experience, and the choice really comes down to the quantization format you decide on. However, if you're a beginner or aren't yet comfortable with this, Ollama can be a convenient stopgap while you build the skills you need - or the very end of the line if you decide your current level of knowledge is enough!

### Ollama

🌟 [**GitHub**](https://github.com/ollama/ollama)  
📖 [**Documentation**](https://github.com/ollama/ollama/tree/main/docs)  
🔧 [**Engine Arguments**](https://github.com/ollama/ollama/blob/main/docs/modelfile.md)

Ollama will be installed as a service, so it runs automatically at boot.

- Download Ollama from the official repository:
    ```
    curl -fsSL https://ollama.com/install.sh | sh
    ```

We want our API endpoint to be reachable by the rest of the LAN. For Ollama, this means setting `OLLAMA_HOST=0.0.0.0` in `ollama.service`.

- Run the following command to edit the service:
    ```
    sudo systemctl edit ollama.service
    ```
- Find the `[Service]` section and add `Environment="OLLAMA_HOST=0.0.0.0"` under it. It should look like this:
    ```
    [Service]
    Environment="OLLAMA_HOST=0.0.0.0"
    ```
- Save and exit.
- Reload the environment:
    ```
    sudo systemctl daemon-reload
    sudo systemctl restart ollama
    ```

> [!TIP]
> If you installed Ollama manually or don't use it as a service, remember to run `ollama serve` to properly start the server. Refer to [Ollama's troubleshooting steps](#ollama-2) if you encounter an error.
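Ollama listens on port 11434 by default, so a quick way to confirm the endpoint is reachable over the LAN is to query the API root from another machine (replace `<server_ip>` with your server's address) - it should respond with "Ollama is running":

```bash
curl http://<server_ip>:11434
```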
### llama.cpp

🌟 [**GitHub**](https://github.com/ggml-org/llama.cpp)  
📖 [**Documentation**](https://github.com/ggml-org/llama.cpp/tree/master/docs)  
🔧 [**Engine Arguments**](https://github.com/ggml-org/llama.cpp/tree/master/examples/server)

- Clone the llama.cpp GitHub repository:
    ```
    git clone https://github.com/ggml-org/llama.cpp.git
    cd llama.cpp
    ```
- Build the binary:

    **CPU**
    ```
    cmake -B build
    cmake --build build --config Release
    ```

    **CUDA**
    ```
    cmake -B build -DGGML_CUDA=ON
    cmake --build build --config Release
    ```
    For other systems looking to use Metal, Vulkan, and other low-level graphics APIs, view the complete [llama.cpp build documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to leverage accelerated inference.
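Once built, the server binary sits at `build/bin/llama-server`. As a quick sketch (the model path and port are examples - the same GGUF and port reappear in the llama-swap section later), you can serve an OpenAI-compatible endpoint directly:

```bash
./build/bin/llama-server \
    -m models/Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf \
    --host 0.0.0.0 --port 7000
```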
### vLLM

🌟 [**GitHub**](https://github.com/vllm-project/vllm)  
📖 [**Documentation**](https://docs.vllm.ai/en/stable/index.html)  
🔧 [**Engine Arguments**](https://docs.vllm.ai/en/stable/serving/engine_args.html)

vLLM comes with its own OpenAI-compatible API that we can use just like Ollama's. Where Ollama runs GGUF model files, vLLM can run AWQ, GPTQ, GGUF, BitsAndBytes, and safetensors (the default release type) natively.

**Manual Installation (Recommended)**

- Create a directory and virtual environment for vLLM:
    ```
    mkdir vllm
    cd vllm
    python3 -m venv .venv
    source .venv/bin/activate
    ```

- Install vLLM using `pip`:
    ```
    pip install vllm
    ```

- Serve with your desired flags. vLLM uses port 8000 by default, but I'm using port 8556 here so it doesn't conflict with any other services:
    ```
    vllm serve <model> --port 8556
    ```

- To use as a service, add the following block to `init.bash` to serve vLLM on startup:
    ```
    source .venv/bin/activate
    vllm serve <model> --port 8556
    ```
    > Replace `<model>` with your desired model tag, copied from HuggingFace.

**Docker Installation**

- Run:
    ```
    docker run --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<your_hf_hub_token>" \
    -p 8556:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model <model>
    ```
    > Replace `<your_hf_hub_token>` with your HuggingFace Hub token and `<model>` with your desired model tag, copied from HuggingFace.

To serve a different model:

- First, stop the existing container:
    ```
    docker ps -a
    docker stop <vllm_container_ID>
    ```

- If you want to run the exact same setup again in the future, skip this step. Otherwise, run the following to delete the container and not clutter your Docker container environment:
    ```
    docker rm <vllm_container_ID>
    ```

- Rerun the Docker command from the installation with the desired model:
    ```
    docker run --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<your_hf_hub_token>" \
    -p 8556:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model <model>
    ```

### Open WebUI Integration

> [!NOTE]
> Only needed for llama.cpp/vLLM.

Navigate to `Admin Panel > Settings > Connections` and set the following values:

- Enable `OpenAI API`
- API Base URL: `http://host.docker.internal:<port>/v1`
- API Key: `anything-you-like`

> [!NOTE]
> `host.docker.internal` is a magic hostname that resolves to the internal IP address assigned to the host by Docker. This allows containers to communicate with services running on the host, such as databases or web servers, without needing to know the host's IP address. It simplifies communication between containers and host-based services, making it easier to develop and deploy applications.

### Ollama vs. llama.cpp

| **Aspect**                 | **Ollama (Wrapper)**                                          | **llama.cpp (Vanilla)**                                                                   |
| -------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------ |
| **Installation/Setup**     | One-click install & CLI model management                      | Requires manual setup/configuration                                                        |
| **Open WebUI Integration** | First-class citizen                                           | Requires OpenAI-compatible endpoint setup                                                  |
| **Model Switching**        | Native model-switching via server                             | Requires manual port management or [llama-swap](https://github.com/mostlygeek/llama-swap)  |
| **Customizability**        | Limited: Modelfiles are cumbersome                            | Full control over parameters via CLI                                                       |
| **Transparency**           | Defaults may override model parameters (e.g., context length) | Full transparency in parameter settings                                                    |
| **GGUF Support**           | Inherits llama.cpp's best-in-class implementation             | Best GGUF implementation                                                                   |
| **GPU-CPU Splitting**      | Inherits llama.cpp's efficient splitting                      | Trivial GPU-CPU splitting out-of-the-box                                                   |

---

### vLLM vs. Ollama/llama.cpp

| **Feature**             | **vLLM**                                     | **Ollama/llama.cpp**                                                                   |
| ----------------------- | -------------------------------------------- | --------------------------------------------------------------------------------------- |
| **Vision Models**       | Supports Qwen 2.5 VL, Llama 3.2 Vision, etc. | Ollama supports some vision models, llama.cpp does not support any (via llama-server)    |
| **Quantization**        | Supports AWQ, GPTQ, BnB, etc.                | Only supports GGUF                                                                       |
| **Multi-GPU Inference** | Yes                                          | Yes                                                                                      |
| **Tensor Parallelism**  | Yes                                          | No                                                                                       |

In summary,

- **Ollama**: Best for those who want an "it just works" experience.
- **llama.cpp**: Best for those who want total control over their inference servers and are familiar with engine arguments.
- **vLLM**: Best for those who want (i) to run non-GGUF quantizations of models, (ii) multi-GPU inference using tensor parallelism, or (iii) to use vision models.

Using Ollama as a service offers no degradation in experience because unused models are offloaded from VRAM after some time. Using vLLM or llama.cpp as a service keeps a model in memory, so I wouldn't use either alongside Ollama in an automated, always-on fashion unless it was your primary inference engine. Essentially,
|\n| -------------- | ---------------- | ------------------ |\n| Ollama         | llama.cpp\u002FvLLM   | No                 |\n| llama.cpp\u002FvLLM | Ollama           | Yes                |\n\n## Model Server\n\n> [!NOTE]\n> Only needed for manual installations of llama.cpp\u002FvLLM. Ollama handles model management via its own CLI.\n\nWhile the above steps will help you get up and running with an OpenAI-compatible LLM server, they will not help with this server persisting after you close your terminal window or restart your physical server. They also won't allow a chat platform to reliably reference and swap between the various models you have available - a likely use-case in a landscape where different models specialize in different tasks. Running the inference engine via Docker can achieve this persistence with the `-d` (for \"detach\") flag but (i) services like llama.cpp and vLLM are usually configured without Docker and (ii) it can't swap models on-demand. This necessitates a server that can manage loading\u002Funloading, swapping, and listing available models.\n\n### llama-swap\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap)  \n📖 [**Documentation**](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fwiki)\n\n> [!TIP]\n> This is my recommended way to run llama.cpp\u002FvLLM models.\n\nllama-swap is a lightweight proxy server for LLMs that solves the pain points above. It's an extremely configurable tool that allows a single point of entry for models from various backends. Models can be set up in groups, listed\u002Funlisted easily, configured with customized hyperparameters, and monitored using streamed logs in the llama-swap web UI.\n\nIn the installation below, we'll use `Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf` for llama.cpp and `Qwen\u002FQwen3-4B-Instruct-2507` for vLLM. We'll serve the models on port 7000.\n\n1. Create a new directory with the `config.yaml` file:\n    ```bash\n    sudo mkdir llama-swap\n    cd llama-swap\n    sudo nano config.yaml\n    ```\n\n2. Enter the following and save:\n\n    **llama.cpp**\n    ```yaml\n    models:\n        \"qwen3-4b\":\n            proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n            cmd: |\n                \u002Fapp\u002Fllama-server\n                -m \u002Fmodels\u002FQwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf\n                # or use `-hf unsloth\u002FQwen3-4B-Instruct-2507-GGUF:Q4_K_XL` for HuggingFace\n                --port 7000\n    ```\n\n    **vLLM (Docker)**\n    ```yaml\n    models:\n        \"qwen3-4b\":\n            proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n            cmd: |\n                docker run --name qwen-vllm\n                --init --rm -p 7000:8000\n                --ipc=host\n                vllm\u002Fvllm-openai:latest\n                --model \u002Fmodels\u002FQwen\u002FQwen3-4B-Instruct-2507\n            cmdStop: docker stop qwen-vllm\n    ```\n\n    **vLLM (local)**\n    ```yaml\n    models:\n        \"qwen3-4b\":\n            proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n            cmd: |\n                source \u002Fapp\u002Fvllm\u002F.venv\u002Fbin\u002Factivate &&\n                \u002Fapp\u002Fvllm\u002F.venv\u002Fbin\u002Fvllm serve\n                \u002Fmodels\u002FQwen\u002FQwen3-4B-Instruct-2507\n                --port 7000\n                --host 0.0.0.0\n            cmdStop: pkill -f \"vllm serve\"\n    ```\n\n3. Install the container:\n    \n    We use the `cuda` tag here, but llama-swap offers `cpu`, `intel`, `vulkan`, and `musa` tags as well. 
Releases can be found [here](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fpkgs\u002Fcontainer\u002Fllama-swap).\n\n    **llama.cpp**\n    ```bash\n    docker run -d --gpus all --restart unless-stopped --network app-net --pull=always --name llama-swap -p 9292:8080 \\\n    -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama-swap\u002Fconfig.yaml:\u002Fapp\u002Fconfig.yaml \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002Fllama-server:\u002Fapp\u002Fllama-server \\\n    ghcr.io\u002Fmostlygeek\u002Fllama-swap:cuda\n    ```\n\n    **vLLM (Docker\u002Flocal)**\n    ```bash\n    docker run -d --gpus all --restart unless-stopped --network app-net --pull=always --name llama-swap -p 9292:8080 \\\n    -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fvllm:\u002Fapp\u002Fvllm \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama-swap\u002Fconfig.yaml:\u002Fapp\u002Fconfig.yaml \\\n    ghcr.io\u002Fmostlygeek\u002Fllama-swap:cuda\n    ```\n\n    > Replace \u003Cyour_username> with your actual username and `\u002Fpath\u002Fto\u002Fmodels` with the path to your model files.\n\n> [!NOTE]\n> llama-swap prefers Docker-based vLLM due to cleanliness of environments and adherence to SIGTERM signals sent by the server. I've written out both options here.\n\nThis should result in a functioning llama-swap instance running at `http:\u002F\u002Flocalhost:9292`, which can be confirmed by running `curl http:\u002F\u002Flocalhost:9292\u002Fhealth`. It is **highly recommended** that you read the [configuration documentation](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fwiki\u002FConfiguration). llama-swap is thoroughly documented and highly configurable - utilizing its capabilities will result in a tailored setup ready to deploy as you need it.\n\n### `systemd` Service\n\nThe other way to persist a model across system reboots is to start the inference engine in a `.service` file that will run alongside the Linux operating system when booting, ensuring that it is available whenever the server is on. If you're willing to live with the relative compromise of not being able to swap models\u002Fbackends and are satisfied with running one model, this is the lowest overhead solution and works great.\n\nLet's call the service we're about to build `llm-server.service`. We'll assume all models are in the `models` child directory - you can change this as you need to.\n\n1. Create the `systemd` service file:\n    ```bash\n    sudo nano \u002Fetc\u002Fsystemd\u002Fsystem\u002Fllm-server.service\n    ```\n\n2. 
Configure the service file:\n\n    **llama.cpp**\n    ```ini\n    [Unit]\n    Description=LLM Server Service\n    After=network.target\n\n    [Service]\n    User=\u003Cuser>\n    Group=\u003Cuser>\n    WorkingDirectory=\u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002F\n    ExecStart=\u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002Fllama-server \\\n        --port \u003Cport> \\\n        --host 0.0.0.0 \\\n        -m \u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fmodels\u002F\u003Cmodel> \\\n        --no-webui\n    Restart=always\n    RestartSec=10s\n\n    [Install]\n    WantedBy=multi-user.target\n    ```\n\n    **vLLM**\n    ```ini\n    [Unit]\n    Description=LLM Server Service\n    After=network.target\n\n    [Service]\n    User=\u003Cuser>\n    Group=\u003Cuser>\n    WorkingDirectory=\u002Fhome\u002F\u003Cuser>\u002Fvllm\u002F\n    ExecStart=\u002Fbin\u002Fbash -c 'source .venv\u002Fbin\u002Factivate && vllm serve \u002Fhome\u002F\u003Cuser>\u002Fvllm\u002Fmodels\u002F\u003Cmodel> --port \u003Cport> --host 0.0.0.0'\n    Restart=always\n    RestartSec=10s\n\n    [Install]\n    WantedBy=multi-user.target\n    ```\n    > Replace `\u003Cuser>`, `\u003Cport>`, and `\u003Cmodel>` with your Linux username, desired port for serving, and desired model respectively. Append any other engine arguments after `--no-webui` - note that `systemd` does not treat a mid-line `#` in `ExecStart` as a comment, so don't leave one there.\n\n3. Reload the `systemd` daemon:\n    ```bash\n    sudo systemctl daemon-reload\n    ```\n4. Run the service:\n\n    If `llm-server.service` was just created:\n    ```bash\n    sudo systemctl enable llm-server.service\n    sudo systemctl start llm-server\n    ```\n\n    If `llm-server.service` already existed:\n    ```bash\n    sudo systemctl restart llm-server\n    ```\n5. (Optional) Check the service's status:\n    ```bash\n    sudo systemctl status llm-server\n    ```\n\n### Open WebUI Integration\n\n#### llama-swap\n\nNavigate to `Admin Panel > Settings > Connections` and set the following values:\n\n- Enable OpenAI API\n- API Base URL: `http:\u002F\u002Fllama-swap:8080\u002Fv1`\n- API Key: `anything-you-like`\n\n#### `systemd` Service\n\nFollow the same steps as above.\n\n- Enable OpenAI API\n- API Base URL: `http:\u002F\u002Flocalhost:\u003Cport>\u002Fv1`\n- API Key: `anything-you-like`\n\n> Replace `\u003Cport>` with your desired port.
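\n\nBefore pointing a client at either setup, it's worth confirming the endpoint responds from the command line. Here's a quick smoke test against any OpenAI-compatible server - substitute 9292 with your `systemd` service's port if applicable, and use a model name your server actually exposes (`qwen3-4b` here is just the example name from the llama-swap config above):\n\n```bash\n# List the models the server exposes\ncurl http:\u002F\u002Flocalhost:9292\u002Fv1\u002Fmodels\n\n# Request a short completion - the model name must match one from the list above\ncurl http:\u002F\u002Flocalhost:9292\u002Fv1\u002Fchat\u002Fcompletions \\\n  -H \"Content-Type: application\u002Fjson\" \\\n  -d '{\"model\": \"qwen3-4b\", \"messages\": [{\"role\": \"user\", \"content\": \"Say hi.\"}]}'\n```\n\n## Chat Platform\n\n### Open WebUI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)  \n📖 [**Documentation**](https:\u002F\u002Fdocs.openwebui.com)\n\nOpen WebUI is a web-based interface for managing models and chats, and provides a beautiful, performant UI for communicating with your models. You will want to install it if you want to access your models from a web interface. 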
If you're fine with using the command line or want to consume models through a plugin\u002Fextension, you can skip this step.\n\nTo install without Nvidia GPU support, run the following command:\n```bash\ndocker run -d -p 3000:8080 --network app-net --add-host=host.docker.internal:host-gateway -v open-webui:\u002Fapp\u002Fbackend\u002Fdata --name open-webui --restart always ghcr.io\u002Fopen-webui\u002Fopen-webui:main\n```\n\nFor Nvidia GPUs, run the following command:\n```bash\ndocker run -d -p 3000:8080 --network app-net --gpus all --add-host=host.docker.internal:host-gateway -v open-webui:\u002Fapp\u002Fbackend\u002Fdata --name open-webui --restart always ghcr.io\u002Fopen-webui\u002Fopen-webui:cuda\n```\n\nYou can access it by navigating to `http:\u002F\u002Flocalhost:3000` in your browser or `http:\u002F\u002F\u003Cserver_ip>:3000` from another device on the same network. There's no need to add this to the `init.bash` script as Open WebUI will start automatically at boot via Docker Engine.\n\nRead more about Open WebUI [here](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui).\n\n## MCP Proxy Server\n\nModel Context Protocol (MCP) is a protocol that exposes tools (functions\u002Fscripts written in code) to LLMs in a standardized way. Generally, models are being trained more and more with the ability to natively call tools in order to power agentic tasks - think along the lines of having a model use sequential thinking to formulate multiple thoughts, execute multiple targeted web searches, and provide a response leveraging real-time information. MCP also, probably more importantly for most people, enables models to call third-party tools like those for GitHub, Azure, etc. A complete list, curated and maintained by Anthropic, can be found [here](https:\u002F\u002Fgithub.com\u002Fmodelcontextprotocol\u002Fservers).\n\nMost guides on the Internet concerning MCP will have you spin up an MCP server via a client like VS Code or Cline (since most agentic use is for coding), or via Claude Desktop, which is a proprietary app by Anthropic and not at all what we're aiming for in this guide with respect to privacy. There *are* other chat clients that support MCP server management from the UI itself (LobeChat, Cherry Studio, etc.), but we want to be able to manage MCP servers in a central and modular way. That way, (i) they aren't tied to a specific client and are available to every client you may use them with and (ii) if you switch chat platforms in the future, your MCP servers require zero changes because they run as a decoupled service - a little more maintenance for a lot more flexibility down the line. We can do this by setting up an MCP proxy server.\n\nThis proxy server will take the MCP servers running via stdio (standard IO) protocol (that can only be accessed by an application running on that device) and make them compatible with streamable HTTP. Any MCP-enabled client can use streamable HTTP so they'll also be able to use all the servers we install on our physical server. This centralizes the management of your MCP servers: create\u002Fedit\u002Fdelete servers in one place, use any of them from your various clients (Open WebUI, VS Code, etc.).\n\nWe'll use the [fetch](https:\u002F\u002Fgithub.com\u002Fzcaceres\u002Ffetch-mcp), [sequential-thinking](https:\u002F\u002Fgithub.com\u002Farben-adm\u002Fmcp-sequential-thinking), and [searxng](https:\u002F\u002Fgithub.com\u002Fihor-sokoliuk\u002FMCP-searxng) MCP servers to get started. 
The process for adding more servers will be identical.\n\n### mcp-proxy\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fsparfenyuk\u002Fmcp-proxy)  \n\nmcp-proxy is a server proxy that allows switching between transports (stdio to streamable HTTP and vice versa). I'll be using port 3131 to avoid conflicts - feel free to change this as you need to. I will also be extending mcp-proxy to include `uv`: most MCP servers use either `npx` or `uv`, and setting mcp-proxy up without `uv` is going to hamper your ability to run the MCP servers you'd like. If you don't require `uv`, (i) replace the `build` section in the compose file with `image: ghcr.io\u002Fsparfenyuk\u002Fmcp-proxy:latest` and (ii) skip step 4.\n\n1. Create a compose file:\n    ```bash\n    mkdir mcp-proxy\n    cd mcp-proxy\n    sudo nano docker-compose.yaml\n    ```\n\n2. Enter the following:\n    ```yaml\n    services:\n      mcp-proxy:\n        container_name: mcp-proxy\n        build:\n            context: .\n            dockerfile: Dockerfile\n        networks:\n        - app-net\n        volumes:\n        - .:\u002Fconfig\n        - \u002F:\u002F\u003Cserver_hostname>:ro\n        restart: unless-stopped\n        ports:\n        - 3131:3131\n        command: \"--pass-environment --port=3131 --host 0.0.0.0 --transport streamablehttp --named-server-config \u002Fconfig\u002Fservers.json\"\n\n    networks:\n      app-net:\n        external: true\n    ```\n\n    > Replace `\u003Cserver_hostname>` with your actual server's hostname (or whatever else). This is primarily useful when adding `filesystem` or similar MCP servers that read from and write files to the file system. Feel free to skip if that isn't your goal.\n\n3. Create a `servers.json` file:\n    ```json\n    {\n        \"mcpServers\": {\n            \"fetch\": {\n                \"disabled\": false,\n                \"timeout\": 60,\n                \"command\": \"uvx\",\n                \"args\": [\n                    \"mcp-server-fetch\"\n                ],\n                \"transportType\": \"stdio\"\n            },\n            \"sequential-thinking\": {\n                \"command\": \"npx\",\n                \"args\": [\n                    \"-y\",\n                    \"@modelcontextprotocol\u002Fserver-sequential-thinking\"\n                ]\n            },\n            \"searxng\": {\n                \"command\": \"npx\",\n                \"args\": [\"-y\", \"mcp-searxng\"],\n                \"env\": {\n                    \"SEARXNG_URL\": \"http:\u002F\u002Fsearxng:8080\u002Fsearch?q=\u003Cquery>\"\n                }\n            }\n        }\n    }\n    ```\n\n4. Create a `Dockerfile`:\n    ```bash\n    sudo nano Dockerfile\n    ```\n    Enter the following:\n    ```Dockerfile\n    FROM ghcr.io\u002Fsparfenyuk\u002Fmcp-proxy:latest\n\n    # Install Node.js and npm (needed for `npx`-based MCP servers)\n    RUN apk add --update npm\n\n    # Install the 'uv' package\n    RUN python3 -m ensurepip && pip install --no-cache-dir uv\n\n    ENV PATH=\"\u002Fusr\u002Flocal\u002Fbin:\u002Fusr\u002Fbin:$PATH\" \\\n        UV_PYTHON_PREFERENCE=only-system\n\n    ENTRYPOINT [\"catatonit\", \"--\", \"mcp-proxy\"]\n    ```\n\n5. Start the container with `docker compose up -d`\n\nYour mcp-proxy container should be up and running! 
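\n\nBefore wiring the proxy into a client, you can sanity-check it from the host. This is a rough smoke test, not a full MCP handshake - each named server from `servers.json` is exposed at `\u002Fservers\u002F\u003Cname>\u002Fmcp`, and even an HTTP error response confirms the route is live:\n\n```bash\n# Watch the proxy logs to confirm each named server started without errors\ndocker logs -f mcp-proxy\n\n# Poke a named server's endpoint (a JSON-RPC error reply still means it's up)\ncurl -i http:\u002F\u002Flocalhost:3131\u002Fservers\u002Ffetch\u002Fmcp\n```\n\n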
Adding servers is simple: add the relevant server to `servers.json` (you can use the same configuration that the MCP server's developer provides for VS Code, it's identical) and then restart the container with `docker restart mcp-proxy`.\n\n### MCPJungle\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fmcpjungle\u002FMCPJungle?tab=readme-ov-file)  \n\nMCPJungle is another MCP proxy server with a different focus: it aims to provide more of a \"production-grade\" experience, much of which is disabled by default in the application's development mode. We'll use the standard development version of the container here on port 4141.\n\n1. Create a compose file:\n    ```bash\n    mkdir mcpjungle\n    cd mcpjungle\n    sudo nano docker-compose.yaml\n    ```\n\n   Enter the following and save:\n\n    ```yaml\n    # MCPJungle Docker Compose configuration for individual users.\n    # Use this compose file if you want to run MCPJungle locally for your personal MCP management & Gateway.\n    # The mcpjungle server runs in development mode.\n    services:\n      db:\n        image: postgres:latest\n        container_name: mcpjungle-db\n        environment:\n            POSTGRES_USER: mcpjungle\n            POSTGRES_PASSWORD: mcpjungle\n            POSTGRES_DB: mcpjungle\n        ports:\n        - \"5432:5432\"\n        networks:\n        - app-net\n        volumes:\n        - db_data:\u002Fvar\u002Flib\u002Fpostgresql\u002Fdata\n        healthcheck:\n            test: [\"CMD-SHELL\", \"PGPASSWORD=mcpjungle pg_isready -U mcpjungle\"]\n            interval: 10s\n            timeout: 5s\n            retries: 5\n        restart: unless-stopped\n\n      mcpjungle:\n        image: mcpjungle\u002Fmcpjungle:${MCPJUNGLE_IMAGE_TAG:-latest-stdio}\n        container_name: mcpjungle-server\n        environment:\n            DATABASE_URL: postgres:\u002F\u002Fmcpjungle:mcpjungle@db:5432\u002Fmcpjungle\n            SERVER_MODE: ${SERVER_MODE:-development}\n            OTEL_ENABLED: ${OTEL_ENABLED:-false}\n        ports:\n        - \"4141:8080\"\n        networks:\n        - app-net\n        volumes:\n        # Mount host filesystem current directory to enable filesystem MCP server access\n        - .:\u002Fhost\u002Fproject:ro\n        - \u002Fhome\u002F\u003Cyour_username>:\u002Fhost:ro\n        # Other options:\n        # - ${HOME}:\u002Fhost\u002Fhome:ro\n        # - \u002Ftmp:\u002Fhost\u002Ftmp:rw\n        depends_on:\n            db:\n                condition: service_healthy\n        restart: always\n\n    volumes:\n        db_data:\n\n    networks:\n      app-net:\n        external: true\n    ```\n\n2. Start the container with `docker compose up -d`\n\n3. Create a tool file:\n    ```bash\n    sudo nano fetch.json\n    ```\n\n    Enter the following and save:\n    ```json\n    {\n        \"name\": \"fetch\",\n        \"transport\": \"stdio\",\n        \"command\": \"npx\",\n        \"args\": [\"mcp-server-fetch\"]\n    }\n    ```\n\n4. Register the tool:\n    ```bash\n    docker exec -i mcpjungle-server \u002Fmcpjungle register -c \u002Fhost\u002Fproject\u002Ffetch.json\n    ```\n\nRepeat steps 3 and 4 for every tool mentioned. 
Commands for `sequential-thinking` and `searxng` can be found below.\n\n**sequential-thinking**\n```json\n{\n    \"name\": \"sequential-thinking\",\n    \"transport\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\"-y\", \"@modelcontextprotocol\u002Fserver-sequential-thinking\"]\n}\n```\n\n**searxng**\n```json\n{\n    \"name\": \"searxng\",\n    \"transport\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\"-y\", \"mcp-searxng\"],\n    \"env\": {\n        \"SEARXNG_URL\": \"http:\u002F\u002Fsearxng:8080\u002Fsearch?q=\u003Cquery>\"\n    }\n}\n```\n\n### Comparison\n\nThe choice between the two services is yours entirely: I use mcp-proxy because I find the workflow slightly less cumbersome than MCPJungle. Here's a comparison with the strengths of each service.\n\n**mcp-proxy > MCPJungle**\n\n- Servers can just be added to `servers.json` and will be registered automatically on container restart - MCPJungle requires manual registration of tools via the CLI\n- Uses the standard MCP syntax that most clients accept for configuration\n- Lighter footprint in that it doesn't need to spin up a separate database container\n- Uses stateful connections - MCPJungle spins up a new connection per tool call, which can lead to some performance overhead\n\n**MCPJungle > mcp-proxy**\n\n- Combines all tools under one endpoint, making it very easy to integrate into a chat frontend\n- Capable of creating a very configurable setup with tool groups, access control, selective tool enabling\u002Fdisabling\n- Supports enterprise features like telemetry\n\n### Open WebUI Integration\n\nOpen WebUI recently added support for streamable HTTP: where you once may have had to use [mcpo](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fmcpo), Open WebUI's tool for automatically generating an OpenAPI-compatible HTTP server from an MCP server, you can now use the MCP servers you've set up as-is, with no changes.\n\n#### mcp-proxy\n\nNavigate to `Admin Panel > Settings > External Tools`. Click the `+` button to add a new tool and enter the following information:\n\n- URL: `http:\u002F\u002Fmcp-proxy:\u003Cport>\u002Fservers\u002F\u003Ctool_name>\u002Fmcp`\n- API Key: `anything-you-like`\n- ID: `\u003Ctool_name>`\n- Name: `\u003Ctool_name>`\n\n> Replace `\u003Cport>` with the port of the MCP service and `\u003Ctool_name>` with the specific tool you're adding.\n\n#### MCPJungle\n\nFollow the same steps as above. By design, MCPJungle exposes all tools via one endpoint, so you should only have to add it once:\n\n- URL: `http:\u002F\u002Fmcpjungle-server:8080\u002Fmcp`\n- API Key: `anything-you-like`\n- ID: `\u003Ctool_name>`\n- Name: `\u003Ctool_name>`\n\n> [!IMPORTANT]\n> When configuring models in Open WebUI (via `Admin Panel > Settings > Models > my-cool-model > Advanced Params`), change the `Function Calling` parameter from `Default` to `Native`. This step will allow the model to use multiple tool calls to formulate a single response instead of just one.
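\n\nOutside of a chat client, you can also smoke-test either proxy's streamable HTTP endpoint directly with a bare JSON-RPC `initialize` request. This is a hedged sketch of just the first step of the MCP handshake (shown against MCPJungle's single endpoint; for mcp-proxy, use `http:\u002F\u002Flocalhost:3131\u002Fservers\u002F\u003Ctool_name>\u002Fmcp` instead):\n\n```bash\n# Minimal MCP initialize request - a JSON (or SSE) result confirms the endpoint speaks MCP\ncurl -s http:\u002F\u002Flocalhost:4141\u002Fmcp \\\n  -H \"Content-Type: application\u002Fjson\" \\\n  -H \"Accept: application\u002Fjson, text\u002Fevent-stream\" \\\n  -d '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"initialize\",\"params\":{\"protocolVersion\":\"2025-03-26\",\"capabilities\":{},\"clientInfo\":{\"name\":\"smoke-test\",\"version\":\"0.0.1\"}}}'\n```\n\n### VS Code\u002FClaude Desktop Integration\n\nThe steps for integrating your MCP proxy server in another client such as VS Code (Claude Desktop, Zed, etc.) 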
will be similar, if not exactly the same.\n\nAdd the following key and value to your `mcp.json` file:\n\n```json\n\"your-mcp-proxy-name\": {\n    \"timeout\": 60,\n    \"type\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\n    \"mcp-remote\",\n    \"http:\u002F\u002F\u003Cyour-server-url>\u002Fmcp\",\n    \"--allow-http\"\n    ]\n}\n```\n\n## Text-to-Speech Server\n\n### Kokoro FastAPI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fremsky\u002FKokoro-FastAPI)\n\nKokoro FastAPI is a text-to-speech server that wraps [Kokoro-82M](https:\u002F\u002Fhuggingface.co\u002Fhexgrad\u002FKokoro-82M), a state-of-the-art TTS model, and provides OpenAI-compatible API inference for it. The documentation for this project is fantastic and covers most, if not all, of the use cases for the project itself.\n\nTo install Kokoro-FastAPI, run:\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fremsky\u002FKokoro-FastAPI.git\ncd Kokoro-FastAPI\ndocker compose up --build\n```\n\nThe server can be used in two ways: an API and a UI. By default, the API is served on port 8880 and the UI is served on port 7860.\n\n### Open WebUI Integration\n\nNavigate to `Admin Panel > Settings > Audio` and set the following values:\n\n- Text-to-Speech Engine: `OpenAI`\n- API Base URL: `http:\u002F\u002Fhost.docker.internal:8880\u002Fv1`\n- API Key: `anything-you-like`\n- Set Model: `kokoro`\n- Response Splitting: None (this is crucial - Kokoro uses a novel audio splitting system)\n\n## Image Generation Server\n\n### ComfyUI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI)  \n📖 [**Documentation**](https:\u002F\u002Fdocs.comfy.org)\n\nComfyUI is a popular open-source graph-based tool for generating images using image generation models such as Stable Diffusion XL, Stable Diffusion 3, and the Flux family of models.\n\n- Clone and navigate to the repository:\n    ```\n    git clone https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI\n    cd ComfyUI\n    ```\n- Set up a new virtual environment:\n    ```\n    python3 -m venv comfyui-env\n    source comfyui-env\u002Fbin\u002Factivate\n    ```\n- Download the platform-specific dependencies:\n  - Nvidia GPUs\n    ```\n    pip install torch torchvision torchaudio --extra-index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu121\n    ```\n  - AMD GPUs\n    ```\n    pip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Frocm6.0\n    ```\n  - Intel GPUs\n  \n    Read the installation instructions from [ComfyUI's GitHub](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI?tab=readme-ov-file#intel-gpus).\n    \n- Download the general dependencies:\n    ```\n    pip install -r requirements.txt\n    ```\n\nNow, we have to download and load a model. Here, we'll use FLUX.1 [dev], a new, state-of-the-art medium-tier model by Black Forest Labs that fits well on an RTX 3090 24GB. Since we want this to be set up as easily as possible, we'll use a complete checkpoint that can be loaded directly into ComfyUI.
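\n\nAs a concrete example, the checkpoint can be pulled straight into ComfyUI's checkpoints folder with the HuggingFace CLI. The repo and filename below are taken from the FLUX.1 [dev] link in the note that follows - adjust the paths to your install:\n\n```bash\n# Download the FP8 FLUX.1 [dev] checkpoint directly into ComfyUI\nhuggingface-cli download Comfy-Org\u002Fflux1-dev flux1-dev-fp8.safetensors \\\n  --local-dir ~\u002FComfyUI\u002Fmodels\u002Fcheckpoints\n```\n\nFor a completely customized workflow, CLIPs, VAEs, and models can be downloaded separately. 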
Follow [this guide](https:\u002F\u002Fcomfyanonymous.github.io\u002FComfyUI_examples\u002Fflux\u002F#simple-to-use-fp8-checkpoint-version) by ComfyUI's creator to install the FLUX.1 models in a fully customizable way.\n\n> [!NOTE]\n> [FLUX.1 [schnell] HuggingFace](https:\u002F\u002Fhuggingface.co\u002FComfy-Org\u002Fflux1-schnell\u002Fblob\u002Fmain\u002Fflux1-schnell-fp8.safetensors) (smaller, ideal for \u003C24GB VRAM)\n> \n> [FLUX.1 [dev] HuggingFace](https:\u002F\u002Fhuggingface.co\u002FComfy-Org\u002Fflux1-dev\u002Fblob\u002Fmain\u002Fflux1-dev-fp8.safetensors) (larger, ideal for 24GB VRAM)\n\n- Download your desired model into ComfyUI's `models\u002Fcheckpoints` directory.\n\n- If you want ComfyUI to be served at boot and effectively run as a service, add the following lines to `init.bash`:\n    ```\n    cd \u002Fpath\u002Fto\u002Fcomfyui\n    source comfyui-env\u002Fbin\u002Factivate\n    python main.py --listen\n    ```\n    > Replace `\u002Fpath\u002Fto\u002Fcomfyui` with the path to your ComfyUI directory, relative to where `init.bash` runs.\n\n    Otherwise, to run it just once, simply execute the above lines in a terminal window.\n\n### Open WebUI Integration\n\nNavigate to `Admin Panel > Settings > Images` and set the following values:\n\n- Image Generation Engine: `ComfyUI`\n- API Base URL: `http:\u002F\u002Flocalhost:8188`\n\n> [!TIP]\n> You'll either need more than 24GB of VRAM or to use a small language model mostly on CPU to use Open WebUI with FLUX.1 [dev]. FLUX.1 [schnell] and a small language model, however, should fit cleanly in 24GB of VRAM, making for a faster experience if you intend to regularly use both text and image generation together.\n\n## SSH\n\nEnabling SSH allows you to connect to the server remotely. After configuring SSH, you can connect to the server from another device on the same network using an SSH client like PuTTY or the terminal. This lets you run your server headlessly without needing a monitor, keyboard, or mouse after the initial setup.\n\nOn the server:\n- Run the following command:\n    ```\n    sudo apt install openssh-server\n    ```\n- Start the SSH service:\n    ```\n    sudo systemctl start ssh\n    ```\n- Enable the SSH service to start at boot:\n    ```\n    sudo systemctl enable ssh\n    ```\n- Find the server's IP address:\n    ```\n    ip a\n    ```\n\nOn the client:\n- Connect to the server using SSH:\n    ```\n    ssh \u003Cusername>@\u003Cip_address>\n    ```\n    > Replace `\u003Cusername>` with your username and `\u003Cip_address>` with the server's IP address.\n\n> [!NOTE]\n> If you expect to tunnel into your server often, I highly recommend following [this guide](https:\u002F\u002Fwww.raspberrypi.com\u002Fdocumentation\u002Fcomputers\u002Fremote-access.html#configure-ssh-without-a-password) to enable passwordless SSH using `ssh-keygen` and `ssh-copy-id`. It worked perfectly on my Debian system despite having been written for Raspberry Pi OS.
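\n\nIn short, the passwordless flow looks like this - a sketch run on the client, assuming an Ed25519 key (any key type `ssh-copy-id` supports will do):\n\n```bash\n# Generate a key pair if you don't already have one\nssh-keygen -t ed25519\n\n# Copy the public key to the server; subsequent logins skip the password prompt\nssh-copy-id \u003Cusername>@\u003Cip_address>\n```\n\n## Firewall\n\nSetting up a firewall is essential for securing your server. The Uncomplicated Firewall (UFW) is a simple and easy-to-use firewall for Linux. 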
You can use UFW to allow or deny incoming and outgoing traffic to and from your server.\n\n- Install UFW:\n    ```bash\n    sudo apt install ufw\n    ```\n\n- Allow SSH, HTTPS, and HTTP to your local network:\n    ```bash\n    # Allow all \u003Cip_range> hosts access to port \u003Cport>\n    sudo ufw allow from \u003Cip_range> to any port \u003Cport> proto tcp\n    # e.g., to allow SSH from a typical 192.168.1.0\u002F24 home network:\n    # sudo ufw allow from 192.168.1.0\u002F24 to any port 22 proto tcp\n    ```\n    \n    Run the above command once each for ports 22 (SSH), 80 (HTTP), and 443 (HTTPS) to open them to your local network. Since we use the `app-net` Docker network for our containers, there's no need to open anything else up. Open ports up carefully, ideally only to specific IPs or to your local network. To allow a port for a specific IP, you can replace the IP range with a single IP and it'll work exactly the same way.\n\n> [!TIP]\n> You can find your local network's IP address range by running `ip route show`. The result will be something like this:\n> ```\n> me@my-cool-server:~$ ip route show\n> default via \u003Crouter_ip> dev enp3s0 proto dhcp src \u003Cserver_ip> metric 100\n> \u003Cnetwork_ip_range> dev enp3s0 proto kernel scope link src \u003Cserver_ip> metric 100\n> # more routes\n> ```\n\n- Enable UFW:\n    ```bash\n    sudo ufw enable\n    ```\n\n- Check the status of UFW:\n    ```bash\n    sudo ufw status verbose\n    ```\n\n> [!WARNING]\n> Enabling UFW without allowing access to port 22 will disrupt your existing SSH connections. If you run a headless setup, this means connecting a monitor to your server and then allowing SSH access through UFW. Be careful to ensure that this port is allowed when making changes to UFW's configuration.\n\nRefer to [this guide](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10) for more information on setting up UFW.\n\n## Remote Access\n\nRemote access refers to the ability to access your server outside of your home network. For example, when you leave the house, you aren't going to be able to access `http:\u002F\u002F\u003Cyour_server_ip>`, because your network has changed from your home network to some other network (either your mobile carrier's or a local network in some other place). This means that you won't be able to access the services running on your server. There are many solutions on the web that solve this problem and we'll explore some of the easiest-to-use here.\n\n### Tailscale\n\nTailscale is a peer-to-peer VPN service that combines many services into one. Its most common use-case is to bind many different devices of many different kinds (Windows, Linux, macOS, iOS, Android, etc.) onto one virtual network. This way, all these devices can be connected to different networks but still be able to communicate with each other as if they were all on the same local network. Tailscale is not completely open source (its GUI is proprietary), but it is based on the [Wireguard](https:\u002F\u002Fwww.wireguard.com) VPN protocol and the remainder of the actual service is open source. Comprehensive documentation on the service can be found [here](https:\u002F\u002Ftailscale.com\u002Fkb) and goes into many topics not mentioned here - I would recommend reading it to get the most out of the service.\n\nOn Tailscale, networks are referred to as tailnets. Creating and managing tailnets requires creating an account with Tailscale (an expected scenario with a VPN service) but connections are peer-to-peer and happen without any routing to Tailscale servers. 
This connection being based on Wireguard means 100% of your traffic is encrypted and cannot be accessed by anyone but the devices on your tailnet.\n\n#### Installation\n\nFirst, create a tailnet through the Admin Console on Tailscale. Download the Tailscale app on any client you want to access your tailnet from. For Windows, macOS, iOS, and Android, the apps can be found on their respective OS app stores. After signing in, your device will be added to the tailnet.\n\nFor Linux, the steps required are as follows.\n\n1) Install Tailscale:\n    ```\n    curl -fsSL https:\u002F\u002Ftailscale.com\u002Finstall.sh | sh\n    ```\n\n2) Start the service:\n    ```\n    sudo tailscale up\n    ```\n\nFor SSH, run `sudo tailscale up --ssh`.\n\n#### Exit Nodes\n\nAn exit node allows access to a different network while still being on your tailnet. For example, you can use this to allow a server on your network to act as a tunnel for other devices. This way, you can not only access that device (by virtue of your tailnet) but also all the devices on the host network it's on. This is useful to access non-Tailscale devices on a network.\n\nTo advertise a device as an exit node, run `sudo tailscale up --advertise-exit-node`. To allow access to the local network via this device, add the `--exit-node-allow-lan-access` flag.\n\n#### Local DNS\n\nIf one of the devices on your tailnet runs a [DNS-sinkhole](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FDNS_sinkhole) service like [Pi-hole](https:\u002F\u002Fpi-hole.net), you'll probably want other devices to use it as their DNS server. Assume this device is named `poplar`. This means every networking request made by any device on your tailnet will be sent to `poplar`, which will in turn decide whether that request will be answered or rejected according to your Pi-hole configuration. However, since `poplar` is also one of the devices on your tailnet, it will send networking requests to itself in accordance with this rule and not to somewhere that will actually resolve the request. Thus, we don't want such devices to accept the DNS settings according to the tailnet but follow their otherwise preconfigured rules.\n\nTo reject the tailnet's DNS settings, run `sudo tailscale up --accept-dns=false`.\n\n#### Third-Party VPN Integration\n\nTailscale offers a [Mullvad VPN](https:\u002F\u002Fmullvad.net\u002Fen) exit node add-on with their service. This add-on allows for a traditional VPN experience that will route your requests through a proxy server in some other location, effectively masking your IP and allowing the circumvention of geolocation restrictions on web services. Assigned devices can be configured from the Admin Console. Mullvad VPN has [proven their no-log policy](https:\u002F\u002Fmullvad.net\u002Fen\u002Fblog\u002F2023\u002F4\u002F20\u002Fmullvad-vpn-was-subject-to-a-search-warrant-customer-data-not-compromised) and offers a fixed $5\u002Fmonth price no matter what duration you choose to pay for.\n\nTo use a Mullvad exit node on one of your devices, first find the exit node you want to use by running `sudo tailscale exit-node list`. Note the IP and run `sudo tailscale up --exit-node=\u003Cyour_chosen_exit_node_ip>`.\n\n> [!WARNING]\n> Ensure the device is allowed to use the Mullvad add-on through the Admin Console first.
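\n\nOnce a few devices are joined, two quick checks confirm the tailnet works end-to-end (both commands are part of the standard Tailscale CLI):\n\n```bash\n# List the devices on your tailnet and their connection state\ntailscale status\n\n# Print this machine's tailnet IPv4 address\ntailscale ip -4\n```\n\nFrom any other device on the tailnet, you should then be able to reach the server via its tailnet IP (or MagicDNS name, if enabled) as if it were on your local network - e.g., `ssh \u003Cusername>@\u003Ctailnet_ip>`.\n\n## Updating\n\nUpdating your system is a good idea to keep software running optimally and with the latest security patches. 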
Updates to Ollama allow for inference from new model architectures and updates to Open WebUI enable new features like voice calling, function calling, pipelines, and more.\n\nI've compiled steps to update these \"primary function\" installations in a standalone section because I think it'd be easier to come back to one section instead of hunting for update instructions in multiple subsections.\n\n### General\n\nUpgrade Debian packages by running the following commands:\n```\nsudo apt update\nsudo apt upgrade\n```\n\n### Nvidia Drivers & CUDA\n\nFollow Nvidia's guide [here](https:\u002F\u002Fdeveloper.nvidia.com\u002Fcuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian) to install the latest CUDA drivers.\n\n> [!WARNING]\n> Don't skip this step. Not installing the latest drivers after upgrading Debian packages will throw your installations out of sync, leading to broken functionality. When updating, target everything important at once. Also, rebooting after this step is a good idea to ensure that your system is operating as expected after upgrading these crucial drivers.\n\n### Ollama\n\nRerun the command that installs Ollama - it acts as an updater too:\n```\ncurl -fsSL https:\u002F\u002Follama.com\u002Finstall.sh | sh\n```\n\n### llama.cpp\n\nEnter your llama.cpp folder and run the following commands:\n```\ncd llama.cpp\ngit pull\n# Rebuild according to your setup - uncomment `-DGGML_CUDA=ON` for CUDA support\ncmake -B build # -DGGML_CUDA=ON\ncmake --build build --config Release\n```\n\n### vLLM\n\nFor a manual installation, enter your virtual environment and update via `pip`:\n```\nsource vllm\u002F.venv\u002Fbin\u002Factivate\npip install vllm --upgrade\n```\n\nFor a Docker installation, you're good to go when you re-run your Docker command, because it pulls the latest Docker image for vLLM.\n\n### llama-swap\n\nDelete the current container:\n```bash\ndocker stop llama-swap\ndocker rm llama-swap\n```\n\nRe-run the container command from the [llama-swap section](#llama-swap).\n\n### Open WebUI\n\nTo update Open WebUI once, run the following command:\n```\ndocker run --rm --volume \u002Fvar\u002Frun\u002Fdocker.sock:\u002Fvar\u002Frun\u002Fdocker.sock containrrr\u002Fwatchtower --run-once open-webui\n```\n\nTo keep it updated automatically, run the following command:\n```\ndocker run -d --name watchtower --volume \u002Fvar\u002Frun\u002Fdocker.sock:\u002Fvar\u002Frun\u002Fdocker.sock containrrr\u002Fwatchtower open-webui\n```\n\n### mcp-proxy\u002FMCPJungle\n\nNavigate to the directory and pull the latest container image:\n```bash\ncd mcp-proxy # or mcpjungle\ndocker compose down\ndocker compose pull\ndocker compose up -d\n```\n\n### Kokoro FastAPI\n\nNavigate to the directory and pull the latest container image:\n```\ncd Kokoro-FastAPI\ndocker compose pull\ndocker compose up -d\n```\n\n### ComfyUI\n\nNavigate to the directory, pull the latest changes, and update dependencies:\n```\ncd ComfyUI\ngit pull\nsource comfyui-env\u002Fbin\u002Factivate\npip install -r requirements.txt\n```\n\n## Troubleshooting\n\n### Docker\n\n- Since we aim to use our user instead of root to run services, you may encounter permissions issues when trying to mount a volume that is not technically owned by our user. 
To fix this, you can do one of two things:\n  - Change the ownership of the directory:\n    ```bash\n    sudo chown -R $(id -u):$(id -g) \u002Fpath\u002Fto\u002Fvolume\n    ```\n\n  - Grant access to our user with access control lists (ACLs, in short):\n    ```bash\n    # Grant read\u002Fwrite to specific user\n    sudo setfacl -R -m u:$(id -u):rw \u002Fpath\u002Fto\u002Fvolume\n\n    # Grant to group\n    sudo setfacl -R -m g:$(id -g):rw \u002Fpath\u002Fto\u002Fvolume\n    ```\n\n    In case you don't have the `acl` package, install with `sudo apt update && sudo apt install acl`. The official package page can be found [here](https:\u002F\u002Fpackages.debian.org\u002Fbookworm\u002Facl).\n\n    > Replace `\u002Fpath\u002Fto\u002Fvolume` with the desired path.\n\n    I like ACLs better - it solves the access issue cleanly without changing ownership unnecessarily and, in my experience, tends to break things less. However, if the resource is clearly meant to be owned by a specific user, changing ownership is preferred purely for separation of concerns. Gauge for yourself, both are completely acceptable options. Avoid changing permissions if your container is already up and running, though.\n\n### `ssh`\n- If you encounter an issue using `ssh-copy-id` to set up passwordless SSH, try running `ssh-keygen -t rsa` on the client before running `ssh-copy-id`. This generates the RSA key pair that `ssh-copy-id` needs to copy to the server.\n\n### Nvidia Drivers\n- Disable Secure Boot in the BIOS if you're having trouble with the Nvidia drivers not working. For me, all packages were at the latest versions and `nvidia-detect` was able to find my GPU correctly, but `nvidia-smi` kept returning the `NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver` error. [Disabling Secure Boot](https:\u002F\u002Faskubuntu.com\u002Fa\u002F927470) fixed this for me. Better practice than disabling Secure Boot is to sign the Nvidia drivers yourself but I didn't want to go through that process for a non-critical server that can afford to have Secure Boot disabled.\n- If you run into `docker: Error response from daemon: unknown or invalid runtime name: nvidia.`, you probably have `--runtime nvidia` in your Docker statement. This is meant for `nvidia-docker`, [which is deprecated now](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F52865988\u002Fnvidia-docker-unknown-runtime-specified-nvidia). Removing this flag from your command should get rid of this error.\n\n### Ollama\n- If you receive the `could not connect to ollama app, is it running?` error, your Ollama instance wasn't served properly. This could be because of a manual installation or the desire to use it at-will and not as a service. To run the Ollama server once, run:\n    ```\n    ollama serve\n    ```\n    Then, **in a new terminal**, you should be able to access your models regularly by running:\n    ```\n    ollama run \u003Cmodel>\n    ```\n    For detailed instructions on _manually_ configuring Ollama to run as a service (to run automatically at boot), read the official documentation [here](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Flinux.md). 
You shouldn't need to do this unless your system faces restrictions using Ollama's automated installer.\n    \n- If you receive the `Failed to open \"\u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\u002F.#override.confb927ee3c846beff8\": Permission denied` error from Ollama after running `systemctl edit ollama.service`, simply creating the file works to eliminate it. Use the following steps to edit the file. \n  - Run:\n    ```\n    sudo mkdir -p \u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\n    sudo nano \u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\u002Foverride.conf\n    ```\n  - Retry the remaining steps.\n- If you still can't connect to your API endpoint, check your firewall settings. [This guide to UFW (Uncomplicated Firewall) on Debian](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10) is a good resource.\n\n### vLLM\n- If you encounter ```RuntimeError: An error occurred while downloading using `hf_transfer`. Consider disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling.```, add `HF_HUB_ENABLE_HF_TRANSFER=0` to the `--env` flag after your HuggingFace Hub token. If this still doesn't fix the issue -\n  - Ensure your user has all the requisite permissions for HuggingFace to be able to write to the cache. To give read+write access over the HF cache to your user (and, thus, `huggingface-cli`), run:\n    ```\n    sudo chmod 777 ~\u002F.cache\u002Fhuggingface\n    sudo chmod 777 ~\u002F.cache\u002Fhuggingface\u002Fhub\n    ```\n  - Manually download a model via the HuggingFace CLI and specify `--download-dir=~\u002F.cache\u002Fhuggingface\u002Fhub` in the engine arguments. If your `.cache\u002Fhuggingface` directory is being troublesome, specify another directory to the `--download-dir` in the engine arguments and remember to do the same with the `--local-dir` flag in any `huggingface-cli` commands.\n\n### Open WebUI\n- If you encounter `Ollama: llama runner process has terminated: signal: killed`, check your `Advanced Parameters`, under `Settings > General > Advanced Parameters`. For me, bumping the context length past what certain models could handle was breaking the Ollama server. Leave it to the default (or higher, but make sure it's still under the limit for the model you're using) to fix this issue.\n\n## Monitoring\n\nTo monitor GPU usage, power draw, and temperature, you can use the `nvidia-smi` command. To monitor GPU usage, run:\n```\nwatch -n 1 nvidia-smi\n```\nThis will update the GPU usage every second without cluttering the terminal environment. Press `Ctrl+C` to exit.\n\n## Notes\n\nThis is my first foray into setting up a server and ever working with Linux so there may be better ways to do some of the steps. I will update this repository as I learn more.\n\n### Software\n\n- I chose Debian because it is, apparently, one of the most stable Linux distros. I also went with an XFCE desktop environment because it is lightweight and I wasn't yet comfortable going full command line.\n- Use a user for auto-login, don't log in as root unless for a specific reason.\n- To switch to root in the command line without switching users, run `sudo -i`.\n- If something using a Docker container doesn't work, try running `docker ps -a` to see if the container is running. If it isn't, try running `docker compose up -d` again. 
If it is and isn't working, try running `docker restart \u003Ccontainer_id>` to restart the container.\n- If something isn't working no matter what you do, try rebooting the server. It's a common solution to many problems. Try this before spending hours troubleshooting. Sigh.\n- While it takes some time to get comfortable with, using an inference engine like llama.cpp and vLLM (as compared to Ollama) is really the way to go to squeeze the maximum performance out of your hardware. If you're reading this guide in the first place and haven't already thrown up your hands and used a cloud provider, it's a safe assumption that you care about the ethos of hosting all this stuff locally. Thus, get your experience as close to a cloud provider as it can be by optimizing your server.\n\n### Hardware\n\n- The power draw of my EVGA FTW3 Ultra RTX 3090 was 350W at stock settings. I set the power limit to 250W and the performance decrease was negligible for my use case, which is primarily code completion in VS Code and the Q&A via chat. \n- Using a power monitor, I measured the power draw of my server for multiple days - the running average is ~60W. The power can spike to 350W during prompt processing and token generation, but this only lasts for a few seconds. For the remainder of the generation time, it tended to stay at the 250W power limit and dropped back to the average power draw after the model wasn't in use for about 20 seconds. \n- Ensure your power supply has enough headroom for transient spikes (particularly in multi GPU setups) or you may face random shutdowns. Your GPU can blow past its rated power draw and also any software limit you set for it based on the chip's actual draw. I usually aim for +50% of my setup's estimated total power draw.\n\n## References\n\nAdding user to `sudo` group:\n- https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F168280\u002Fhow-do-i-grant-sudo-privileges-to-an-existing-user\n\nDownloading Nvidia drivers:\n- https:\u002F\u002Fdeveloper.nvidia.com\u002Fcuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian\n- https:\u002F\u002Fwiki.debian.org\u002FNvidiaGraphicsDrivers\n\nDownloading AMD drivers:\n- https:\u002F\u002Fwiki.debian.org\u002FAtiHowTo\n\nSecure Boot:\n- https:\u002F\u002Faskubuntu.com\u002Fa\u002F927470\n\nMonitoring GPU usage, power draw: \n- https:\u002F\u002Funix.stackexchange.com\u002Fquestions\u002F38560\u002Fgpu-usage-monitoring-cuda\u002F78203#78203\n\nPasswordless `sudo`:\n- https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F25215604\u002Fuse-sudo-without-password-inside-a-script\n- https:\u002F\u002Fwww.reddit.com\u002Fr\u002FFedora\u002Fcomments\u002F11lh9nn\u002Fset_nvidia_gpu_power_and_temp_limit_on_boot\u002F\n- https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F100051\u002Fwhy-is-sudoers-nopasswd-option-not-working\n\nAuto-login:\n- https:\u002F\u002Fforums.debian.net\u002Fviewtopic.php?t=149849\n- https:\u002F\u002Fwiki.archlinux.org\u002Ftitle\u002FLightDM#Enabling_autologin\n\nExpose Ollama to LAN:\n- https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Ffaq.md#setting-environment-variables-on-linux\n- https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fissues\u002F703\n\nFirewall:\n- https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10\n\nPasswordless `ssh`:\n- https:\u002F\u002Fwww.raspberrypi.com\u002Fdocumentation\u002Fcomputers\u002Fremote-access.html#configure-ssh-without-a-password\n\nAdding CUDA to 
PATH:\n- https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F885610\u002Fnvcc-version-command-says-nvcc-is-not-installed\n\nDocs:\n\n- [Debian](https:\u002F\u002Fwww.debian.org\u002Freleases\u002Fbuster\u002Famd64\u002F)\n- [Docker](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstall\u002Fdebian\u002F)\n- [Ollama](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Fapi.md)\n- [vLLM](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Fstable\u002Findex.html)\n- [Open WebUI](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)\n- [ComfyUI](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI)\n\n## Acknowledgements\n\nCheers to all the fantastic work done by the open-source community. This guide wouldn't exist without the effort of the many contributors to the projects and guides referenced here. To stay up-to-date on the latest developments in the field of machine learning, LLMs, and other vision\u002Fspeech models, check out [r\u002FLocalLLaMA](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FLocalLLaMA\u002F). To stay in touch with (or fall down the rabbit hole of) the world of self-hosted apps out there for your new server, check out [r\u002Fselfhosted](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fselfhosted).\n\n> [!NOTE]\n> Please star any projects you find useful and consider contributing to them if you can. Stars on this guide would also be appreciated if you found it helpful, as it helps others find it too. \n","# 本地 LLaMA 服务器搭建文档\n\n_简而言之_：在 Debian 上搭建您自己的本地且完全私有的大语言模型服务器的端到端指南。该服务器配备聊天功能、网页搜索、RAG、模型管理、MCP 服务器、图像生成和 TTS，并提供配置 SSH、防火墙以及通过 Tailscale 实现安全远程访问的步骤。\n\n软件栈：\n\n- 推理引擎（[Ollama](https:\u002F\u002Fgithub.com\u002Follama\u002Follama)、[llama.cpp](https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp)、[vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)）\n- 搜索引擎（[SearXNG](https:\u002F\u002Fgithub.com\u002Fsearxng\u002Fsearxng)）\n- 模型服务器（[llama-swap](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap)、`systemd` 服务）\n- 聊天平台（[Open WebUI](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)）\n- MCP 代理服务器（[mcp-proxy](https:\u002F\u002Fgithub.com\u002Fsparfenyuk\u002Fmcp-proxy)、[MCPJungle](https:\u002F\u002Fgithub.com\u002Fmcpjungle\u002FMCPJungle)）\n- 文本转语音服务器（[Kokoro FastAPI](https:\u002F\u002Fgithub.com\u002Fremsky\u002FKokoro-FastAPI)）\n- 图像生成服务器（[ComfyUI](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI)）\n\n![软件栈架构图](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fvarunvasudeva1_llm-server-docs_readme_1432289eb875.png)\n\n## 目录\n\n- [本地 LLaMA 服务器搭建文档](#local-llama-server-setup-documentation)\n  - [目录](#table-of-contents)\n  - [简介](#about)\n  - [优先级](#priorities)\n  - [前提条件](#prerequisites)\n  - [通用](#general)\n    - [允许 `sudo` 权限](#allow-sudo-permissions)\n    - [更新系统软件包](#update-system-packages)\n    - [计划启动脚本](#schedule-startup-script)\n    - [配置脚本权限](#configure-script-permissions)\n    - [配置自动登录（可选）](#configure-auto-login-optional)\n  - [Docker](#docker)\n    - [将用户添加到 Docker 组](#add-user-to-docker-group)\n    - [Nvidia 容器工具包](#nvidia-container-toolkit)\n    - [创建网络](#create-a-network)\n    - [加固 Docker 容器](#harden-docker-containers)\n    - [实用命令](#helpful-commands)\n  - [HuggingFace CLI](#huggingface-cli)\n    - [管理模型](#manage-models)\n    - [下载模型](#download-models)\n    - [删除模型](#delete-models)\n  - [搜索引擎](#search-engine)\n    - [SearXNG](#searxng)\n    - [与 Open WebUI 集成](#open-webui-integration)\n  - [推理引擎](#inference-engine)\n    - [Ollama](#ollama)\n    - 
[llama.cpp](#llamacpp)\n    - [vLLM](#vllm)\n    - [与 Open WebUI 集成](#open-webui-integration-1)\n    - [Ollama 与 llama.cpp 对比](#ollama-vs-llamacpp)\n    - [vLLM 与 Ollama\u002Fllama.cpp 对比](#vllm-vs-ollamallamacpp)\n  - [模型服务器](#model-server)\n    - [llama-swap](#llama-swap)\n    - [`systemd` 服务](#systemd-service)\n    - [与 Open WebUI 集成](#open-webui-integration-2)\n      - [llama-swap](#llama-swap-1)\n      - [`systemd` 服务](#systemd-service-1)\n  - [聊天平台](#chat-platform)\n    - [Open WebUI](#open-webui)\n  - [MCP 代理服务器](#mcp-proxy-server)\n    - [mcp-proxy](#mcp-proxy)\n    - [MCPJungle](#mcpjungle)\n    - [对比](#comparison)\n    - [与 Open WebUI 集成](#open-webui-integration-3)\n      - [mcp-proxy](#mcp-proxy-1)\n      - [MCPJungle](#mcpjungle-1)\n    - [VS Code\u002FClaude Desktop 集成](#vs-codeclaude-desktop-integration)\n  - [文本转语音服务器](#text-to-speech-server)\n    - [Kokoro FastAPI](#kokoro-fastapi)\n    - [与 Open WebUI 集成](#open-webui-integration-4)\n  - [图像生成服务器](#image-generation-server)\n    - [ComfyUI](#comfyui)\n    - [与 Open WebUI 集成](#open-webui-integration-5)\n  - [SSH](#ssh)\n  - [防火墙](#firewall)\n  - [远程访问](#remote-access)\n    - [Tailscale](#tailscale)\n      - [安装](#installation)\n      - [出口节点](#exit-nodes)\n      - [本地 DNS](#local-dns)\n      - [第三方 VPN 集成](#third-party-vpn-integration)\n  - [更新](#updating)\n    - [通用](#general-1)\n    - [Nvidia 驱动与 CUDA](#nvidia-drivers--cuda)\n    - [Ollama](#ollama-1)\n    - [llama.cpp](#llamacpp-1)\n    - [vLLM](#vllm-1)\n    - [llama-swap](#llama-swap-2)\n    - [Open WebUI](#open-webui-1)\n    - [mcp-proxy\u002FMCPJungle](#mcp-proxymcpjungle)\n    - [Kokoro FastAPI](#kokoro-fastapi-1)\n    - [ComfyUI](#comfyui-1)\n  - [故障排除](#troubleshooting)\n    - [Docker](#docker-1)\n    - [`ssh`](#ssh-1)\n    - [Nvidia 驱动](#nvidia-drivers)\n    - [Ollama](#ollama-2)\n    - [vLLM](#vllm-2)\n    - [Open WebUI](#open-webui-2)\n  - [监控](#monitoring)\n  - [注释](#notes)\n    - [软件](#software)\n    - [硬件](#hardware)\n  - [参考文献](#references)\n  - [致谢](#acknowledgements)\n\n## 简介\n\n本仓库概述了运行本地语言模型服务器的步骤。虽然特别针对 Debian，但大多数 Linux 发行版也应遵循非常相似的流程。它旨在为像我这样首次搭建服务器的 Linux 初学者提供指导。\n\n整个过程包括安装必要的驱动程序、设置 GPU 功率限制、配置自动登录，以及安排 `init.bash` 脚本在系统启动时运行。所有这些设置都基于我对语言模型服务器的理想配置——该服务器可以全天候运行，但许多部分都可以根据您的需求进行自定义。\n\n> [!重要提示]\n> 本指南的任何部分均未使用 AI 编写——如果出现“幻觉”，那也是人类固有的特性。尽管我已尽最大努力确保每一步骤和每个命令的正确性，请务必在终端中执行前仔细检查**一切**内容。祝您使用愉快！\n\n## 优先级\n\n- **简单性**：解决方案的各个组件应相对容易设置。\n- **稳定性**：组件应稳定可靠，能够在无需干预的情况下连续运行数周。\n- **安全性**：组件应能够被严格保护，并在其存在已知漏洞时限制其对系统的潜在破坏能力。\n- **可维护性**：组件及其交互应足够简单，以便您能够随着它们的发展（因为它们*一定会*发展）对其进行维护。\n- **美观性**：最终结果应尽可能接近云服务商的聊天平台。家庭实验室解决方案不必给人一种随意拼凑的感觉。\n- **模块化**：设置中的组件应能轻松替换为更新、性能更强或维护更好的替代方案。标准协议（如 OpenAI 兼容性、MCP 等）在这方面有很大帮助，在本指南中，始终优先考虑这些标准协议而非捆绑式解决方案。\n- **开源**：代码应能被工程师社区验证。聊天平台和大语言模型涉及大量以自然语言传递的个人数据，因此了解这些数据不会离开您的机器非常重要。\n\n## 前置条件\n\n本指南适用于任何现代的 CPU 和 GPU 组合。此前，与 AMD 显卡的兼容性曾是个问题，但 Ollama 的最新版本已解决了这一问题，[AMD 显卡现在已得到原生支持](https:\u002F\u002Follama.com\u002Fblog\u002Famd-preview)。\n\n作为参考，本指南是在以下系统上构建的：\n- **CPU**：Intel Core i5-12600KF\n- **内存**：96GB 3200MHz DDR4 RAM\n- **存储**：1TB M.2 NVMe SSD\n- **GPU**：2x Nvidia RTX 3090（24GB）\n\n> [!NOTE]\n> **AMD 显卡**：由于 [AMD 最近使得在其显卡上设置功耗限制变得困难](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Flinux_gaming\u002Fcomments\u002F1b6l1tz\u002Fno_more_power_limiting_for_amd_gpus_because_it_is\u002F)，因此会跳过针对 AMD 显卡的功耗限制步骤。自然地，请跳过所有涉及 `nvidia-smi` 或 `nvidia-persistenced` 的步骤，以及 `init.bash` 脚本中的功耗限制部分。\n> \n> **仅 CPU 系统**：您可以跳过 GPU 驱动程序的安装和功耗限制步骤。指南的其余部分应能按预期工作。\n\n> [!NOTE]\n> 本指南使用 `~\u002F`（或 
`\u002Fhome\u002F\u003Cyour_username>`）作为基础目录。如果您在其他目录下操作，请相应地修改所有命令。\n\n要开始设置您的服务器，您需要以下内容：\n\n- Debian 的全新安装\n- 互联网连接\n- 对 Linux 终端的基本了解\n- 显示器、键盘和鼠标等外设\n\n要在您新搭建的服务器硬件上安装 Debian：\n\n- 从官方网站下载 [Debian ISO](https:\u002F\u002Fwww.debian.org\u002Fdistrib\u002F)。\n- 使用 Rufus（Windows）或 Balena Etcher（MacOS）等工具创建可启动的 USB 启动盘。\n- 从 USB 启动并安装 Debian。\n\n有关 Debian 安装的更详细指南，请参阅 [官方文档](https:\u002F\u002Fwww.debian.org\u002Freleases\u002Fbuster\u002Famd64\u002F)。对于尚不熟悉 Linux 的用户，建议使用图形化安装程序——安装过程中会提供文本模式和图形化两种选择。\n\n此外，我建议安装一个轻量级的桌面环境，如 XFCE，以方便使用。GNOME 或 KDE 也是可选的；如果将服务器用作主要工作站，GNOME 可能是更好的选择，因为它功能更丰富（但也更占用资源），而 XFCE 则更为轻便。\n\n## 通用步骤\n\n### 允许 `sudo` 权限\n\n为了完成本指南中的许多操作，我们需要以 root 用户身份执行。但在 root 用户允许我们使用之前，我们无法获得 root 权限。首先，我们将切换到 root 用户，并授予当前用户使用 `sudo` 执行命令的权限。\n\n> [!TIP]\n> `sudo` 是“超级用户执行”的缩写——在 Linux 中，它向操作系统表明您希望以 root 用户的身份运行所执行的命令。在高度安全的系统中，应谨慎使用 `sudo`（大多数进程应使用特定用户的权限），并且由于其具有与 root 用户相同的权限，稍有不慎就可能对系统造成影响。不必过度担心它的使用，只需知道如果使用不当，它确实可能带来危险。\n\n- 切换到 root 用户：\n    ```bash\n    su root\n    ```\n\n- 运行以下命令，将您的用户添加到 `sudo` 组（该组拥有我们所需的权限）：\n    ```bash\n    sudo usermod -a -G sudo \u003Cusername>\n    ```\n    > 将 `\u003Cusername>` 替换为您的用户名。\n\n    保存并退出（`Ctrl+X`）。\n- 关闭当前终端并打开一个新的会话。这是使更改生效的必要步骤。\n- （可选）通过运行带有 `sudo` 的 `ls` 命令来测试新权限：\n    ```bash\n    sudo ls\n    ```\n\n### 更新系统软件包\n\n- 运行以下命令更新系统：\n    ```\n    sudo apt update\n    sudo apt upgrade\n    ```\n\n接下来，我们将安装必要的 GPU 驱动程序，以便程序能够利用 GPU 的计算能力。\n\n**Nvidia 显卡**\n- 按照 Nvidia 的 [CUDA Toolkit 下载指南](https:\u002F\u002Fdeveloper.nvidia.com\u002Fcuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian)，该网站会根据您的机器配置交互式地引导您完成下载。\n- 运行以下命令：\n    ```bash\n    sudo apt install linux-headers-amd64\n    sudo apt install nvidia-driver firmware-misc-nonfree\n    ```\n- 重启服务器。\n- 运行以下命令验证安装是否成功：\n    ```bash\n    nvidia-smi\n    ```\n  \n**AMD 显卡**\n- 运行以下命令：\n    ```bash\n    deb http:\u002F\u002Fdeb.debian.org\u002Fdebian bookworm main contrib non-free-firmware\n    apt install firmware-amd-graphics libgl1-mesa-dri libglx-mesa0 mesa-vulkan-drivers xserver-xorg-video-all\n    ```\n- 重启服务器。\n\n我们还将安装一些 Debian 默认未安装但后续可能需要的软件包：\n```\nsudo apt install libcurl cmake\n```\n\n### 设置启动脚本\n\n在此步骤中，我们将创建一个名为 `init.bash` 的脚本。该脚本将在系统启动时运行，用于设置 GPU 功耗限制，并使用 Ollama 启动服务器。我们将 GPU 的功耗限制调低，因为测试和推理表明，即使功耗降低 30%，性能也只会下降 5% 至 15%。这一点对于 24 小时运行的服务器尤为重要。\n\n- 运行以下命令：\n    ```bash\n    touch init.bash\n    nano init.bash\n    ```\n- 在脚本中添加以下内容：\n    ```bash\n    #!\u002Fbin\u002Fbash\n    sudo nvidia-smi -pm 1\n    sudo nvidia-smi -pl \u003Cpower_limit>\n    ```\n    > 将 `\u003Cpower_limit>` 替换为您希望设置的功耗限制值（单位：瓦）。例如，`sudo nvidia-smi -pl 250`。\n\n    如果有多块 GPU，则需修改脚本以分别设置每块 GPU 的功耗限制：\n    ```bash\n    sudo nvidia-smi -i 0 -pl \u003Cpower_limit>\n    sudo nvidia-smi -i 1 -pl \u003Cpower_limit>\n    ```\n- 保存并退出脚本。\n- 将脚本设置为可执行：\n    ```bash\n    chmod +x init.bash\n    ```\n\n将 `init.bash` 脚本添加到 crontab 中，即可安排它在系统启动时自动运行。\n\n- 运行以下命令：\n    ```bash\n    crontab -e\n    ```\n- 在文件中添加以下行：\n    ```bash\n    @reboot \u002Fpath\u002Fto\u002Finit.bash\n    ```\n    > 将 `\u002Fpath\u002Fto\u002Finit.bash` 替换为 `init.bash` 脚本的实际路径。\n\n- （可选）添加以下行以在午夜关闭服务器：\n    ```bash\n    0 0 * * * \u002Fsbin\u002Fshutdown -h now\n    ```\n- 保存并退出文件。\n\n### 配置脚本权限\n\n我们希望 `init.bash` 脚本能无需输入密码即可执行 `nvidia-smi` 命令。这可以通过为 `nvidia-persistenced` 和 `nvidia-smi` 授予无密码的 `sudo` 权限来实现，具体方法是编辑 `sudoers` 文件。\n\nAMD 用户可以跳过此步骤，因为 AMD 显卡不支持功率限制功能。\n\n- 运行以下命令以编辑 sudoers 文件：\n    ```bash\n    sudo visudo\n    ```\n- 在文件中添加以下两行：\n    ```\n    \u003Cusername> ALL=(ALL) NOPASSWD: 
\u002Fusr\u002Fbin\u002Fnvidia-persistenced\n    \u003Cusername> ALL=(ALL) NOPASSWD: \u002Fusr\u002Fbin\u002Fnvidia-smi\n    ```\n    > 请将 `\u003Cusername>` 替换为您的用户名。\n- 保存并退出文件。\n\n> [!IMPORTANT]\n> 请确保将这些行添加到 `%sudo ALL=(ALL:ALL) ALL` 之后。文件中行的顺序很重要——系统会使用最后匹配的一行，因此如果将这些行添加到 `%sudo ALL=(ALL:ALL) ALL` 之前，它们将被忽略。\n\n### 配置自动登录（可选）\n\n当服务器启动时，我们可能希望它自动登录到某个用户账户并运行 `init.bash` 脚本。这可以通过配置 `lightdm` 显示管理器来实现。\n\n- 运行以下命令：\n    ```bash\n    sudo nano \u002Fetc\u002Flightdm\u002Flightdm.conf\n    ```\n- 找到以下被注释掉的行。它应该位于 `[Seat:*]` 部分：\n    ```\n    # autologin-user=\n    ```\n- 取消该行的注释，并添加您的用户名：\n    ```\n    autologin-user=\u003Cusername>\n    ```\n    > 请将 `\u003Cusername>` 替换为您的用户名。\n- 保存并退出文件。\n\n## Docker\n\n📖 [**文档**](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002F)\n\nDocker 是一个容器化平台，允许您在隔离的环境中运行应用程序。本小节遵循 [Docker 官方指南](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstall\u002Fdebian\u002F)，介绍如何在 Debian 系统上安装 Docker 引擎。以下是相关命令，但建议您访问该指南，以防说明有所更新。\n\n- 如果您的系统上已经安装了 Docker，最好重新安装一次，以避免出现损坏或过时的依赖项。以下命令会遍历系统已安装的软件包，并移除与 Docker 相关的软件包：\n    ```bash\n    for pkg in docker.io docker-doc docker-compose podman-docker containerd runc; do sudo apt purge $pkg; done\n    ```\n\n- 运行以下命令：\n    ```bash\n    # 添加 Docker 官方 GPG 密钥：\n    sudo apt update\n    sudo apt install ca-certificates curl\n    sudo install -m 0755 -d \u002Fetc\u002Fapt\u002Fkeyrings\n    sudo curl -fsSL https:\u002F\u002Fdownload.docker.com\u002Flinux\u002Fdebian\u002Fgpg -o \u002Fetc\u002Fapt\u002Fkeyrings\u002Fdocker.asc\n    sudo chmod a+r \u002Fetc\u002Fapt\u002Fkeyrings\u002Fdocker.asc\n\n    # 将仓库添加到 Apt 源：\n    sudo tee \u002Fetc\u002Fapt\u002Fsources.list.d\u002Fdocker.sources \u003C\u003CEOF\n    Types: deb\n    URIs: https:\u002F\u002Fdownload.docker.com\u002Flinux\u002Fdebian\n    Suites: $(. \u002Fetc\u002Fos-release && echo \"$VERSION_CODENAME\")\n    Components: stable\n    Signed-By: \u002Fetc\u002Fapt\u002Fkeyrings\u002Fdocker.asc\n    EOF\n\n    sudo apt update\n    ```\n- 安装 Docker 相关软件包：\n    ```bash\n    sudo apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin\n    ```\n- 验证安装（可选）：\n    ```bash\n    sudo systemctl status docker\n    ```\n\n    如果服务未启动，请尝试手动启动 Docker 守护进程：\n    ```bash\n    sudo systemctl start docker\n    ```\n\n### 将用户加入 Docker 组\n\n为了让我们在使用 Docker 命令时无需再输入 `sudo`，而是可以直接以当前用户身份运行，我们需要将当前用户加入 `docker` 组。虽然这不是必须的，但这样做非常方便。本小节参考了 [Docker 的安装后配置文档](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstall\u002Flinux-postinstall\u002F)——建议您查看一下，以防自撰写以来内容有所变化。\n\n> [!WARNING]\n> 加入 `docker` 组意味着拥有对系统的高度访问权限，实际上等同于 Docker 守护进程的 root 权限。请仅将可信用户加入此组。\n\n1. 将当前用户加入 Docker 组：\n    ```bash\n    sudo usermod -aG docker $USER\n    ```\n\n2. 应用更改：\n    ```bash\n    newgrp docker\n    ```\n    \n3. 验证新权限：检查正在运行的容器：\n    ```bash\n    docker ps -a\n    ```\n\n如果您仍然遇到“权限拒绝”问题，且 Docker 命令仍需要使用 `sudo`，请注销并重新登录。\n\n> [!NOTE]\n> 如果您决定不执行此操作，则每次使用 Docker 命令时都需要在前面加上 `sudo`。
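\n\n如果想进一步确认免 `sudo` 的 Docker 环境一切正常，可以跑一个最小的测试容器（假设网络可以访问 Docker Hub）：\n\n```bash\n# 以当前用户（不加 sudo）拉取并运行官方测试镜像，运行结束后自动删除容器\ndocker run --rm hello-world\n```\n\n看到 “Hello from Docker!” 输出即说明安装与用户组配置都已生效。\n\n### Nvidia Container Toolkit\n\n您很可能会希望通过 Docker 使用 GPU——这就需要安装 Nvidia Container Toolkit，它可以让 Docker 在 Nvidia 显卡上分配和释放显存。以下是安装步骤，但建议您参考 [Nvidia 官方文档](https:\u002F\u002Fdocs.nvidia.com\u002Fdatacenter\u002Fcloud-native\u002Fcontainer-toolkit\u002Flatest\u002Finstall-guide.html)，以获取最新的命令。\n\n1. 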
配置软件源：\n    ```bash\n    curl -fsSL https:\u002F\u002Fnvidia.github.io\u002Flibnvidia-container\u002Fgpgkey | sudo gpg --dearmor -o \u002Fusr\u002Fshare\u002Fkeyrings\u002Fnvidia-container-toolkit-keyring.gpg \\\n    && curl -s -L https:\u002F\u002Fnvidia.github.io\u002Flibnvidia-container\u002Fstable\u002Fdeb\u002Fnvidia-container-toolkit.list | \\\n        sed 's#deb https:\u002F\u002F#deb [signed-by=\u002Fusr\u002Fshare\u002Fkeyrings\u002Fnvidia-container-toolkit-keyring.gpg] https:\u002F\u002F#g' | \\\n        sudo tee \u002Fetc\u002Fapt\u002Fsources.list.d\u002Fnvidia-container-toolkit.list\n    ```\n\n2. 更新软件包列表：\n    ```bash\n    sudo apt update\n    ```\n\n3. 安装 Nvidia Container Toolkit 软件包：\n    ```bash\n    sudo apt install -y nvidia-container-toolkit\n    ```\n\n4. 重启 Docker 守护进程：\n    ```bash\n    sudo systemctl restart docker\n    ```
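\n\n安装完成后，可以用一个一次性容器验证 GPU 在容器内可见（一个假设性的冒烟测试；CUDA 镜像标签仅作示意，请以 Docker Hub 上实际可用的标签为准）：\n\n```bash\n# 在一次性容器内运行 nvidia-smi，能列出显卡即说明 Toolkit 配置成功\ndocker run --rm --gpus all nvidia\u002Fcuda:12.4.1-base-ubuntu22.04 nvidia-smi\n```\n\n如果输出与宿主机上的 `nvidia-smi` 一致，容器化的服务就可以使用 GPU 了。\n\n### 创建网络\n\n我们将通过 Docker 容器运行大多数服务。为了让多个容器相互通信，我们可以通过 UFW 开放端口（稍后会进行配置），但这种方式不如创建 Docker 网络高效。通过创建 Docker 网络，网络中的所有容器都可以安全地相互通信，而无需为每个服务单独开放端口，从而构建更安全的环境。\n\n我们将这个网络命名为 `app-net`：您可以根据需要命名，只需确保后续使用该网络的相关命令也相应更新。\n\n运行以下命令：\n\n```bash\ndocker network create app-net\n```\n\n完成！现在，当我们创建容器时，可以这样引用该网络：\n\n**Docker Run**\n```bash\ndocker run --network app-net \u003Ccontainer>\n```\n\n**Docker Compose**\n```yaml\nservices:\n  \u003Ccontainer>:\n    # 添加这一行\n    networks:\n      - app-net\n\n# 添加这一行\nnetworks:\n  app-net:\n    external: true\n```\n\n> 将 `\u003Ccontainer>` 替换为实际的服务名称——别忘了同时添加其他参数。\n\n配置完成后，我们现在可以通过容器名和端口来调用它们。假设我们要从 `open-webui` 调用 `llama-swap` 中的 `\u002Fhealth` 端点（这两个是我们稍后会创建的实际容器），以确保容器之间能够互相访问和通信。运行以下命令（按 `CTRL+C` 退出）：\n\n```bash\ndocker exec -i open-webui curl http:\u002F\u002Fllama-swap:8080\u002Fhealth\n```\n\n你也可以反过来执行一次，以进一步确认：\n\n```bash\ndocker exec -it llama-swap curl http:\u002F\u002Fopen-webui:8080\n```\n\n> [!IMPORTANT]\n> 这里的端口始终是容器内部运行的端口。例如，如果一个容器以 1111:8080 的形式运行，那么 1111 是宿主机上的端口（你可以通过 `http:\u002F\u002Flocalhost:1111` 或 `http:\u002F\u002F\u003Cserver_ip>:1111` 访问它），而 8080 则是容器内部实际运行的端口。因此，试图通过 `app-net` 网络使用 1111 端口访问该容器是无效的。在服务中指定 URL 时记住这一点，可以避免许多“为什么不起作用？”的困扰。\n\n### 加固 Docker 容器\n\n在软件领域，“加固”是指提高系统的安全性及对网络攻击的抵御能力。通常，这包括减少攻击者可能利用的入侵途径，同时也涉及在攻击者成功入侵后，限制其可执行的操作。本质上，我们将同时采取预防和应对措施——尽管常说“预防胜于治疗”。本小节参考了 Reddit 上的一条评论[链接](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fselfhosted\u002Fcomments\u002F1pr74r4\u002Fcomment\u002Fnv07sp4\u002F)，由用户 [u\u002Farnedam](https:\u002F\u002Fwww.reddit.com\u002Fuser\u002Farnedam\u002F) 提供。另一个有用的参考资料是 OWASP 的 Docker 安全备忘录[链接](https:\u002F\u002Fcheatsheetseries.owasp.org\u002Fcheatsheets\u002FDocker_Security_Cheat_Sheet.html)。\n\n在很大程度上，我们所追求的安全目标可以通过应用“最小权限原则”（PoLP）来实现——也就是说，只赋予容器完成其功能所需的精确权限，不多也不少。对于每一条建议，我都列出了潜在的风险以及相应的缓解措施（并说明其如何解决问题）；虽然并非严格必要，但了解系统面临哪些风险，有助于你在部署服务时更安全地思考。\n\n> [!NOTE]\n> 本小节对于面向公众的服务至关重要。然而，即使你仅打算通过私有网络访问你的服务（**推荐方式**），这样做也绝不会多余——因为它能够在服务器或服务遭到入侵时，限制攻击者所能造成的损害范围。\n\n1. 以普通用户而非 root 用户运行服务：\n    ```yaml\n    user: \"\u003Cyour_user_id>:\u003Cyour_group_id>\"\n    ```\n    使用 `id -u` 查找 `\u003Cyour_user_id>`，使用 `id -g` 查找 `\u003Cyour_group_id>`。\n\n    **风险**：若未特别指定，容器默认以 root 用户身份运行。被攻陷的 root 权限容器可能造成各种破坏，从执行恶意代码到窥探或删除系统文件等。\n\n    **缓解措施**：通过指定容器运行的用户，可以将容器的权限限制在该用户的权限范围内，而普通用户的权限几乎总是低于 root 用户。\n\n2. 将文件系统设置为只读：\n    ```yaml\n    read_only: true\n    ```\n\n    **风险**：默认情况下，容器被允许读取和写入其有权访问的文件。这意味着一旦容器被攻陷，攻击者就可能写入恶意文件或代码，并窃取或删除关键信息。\n\n    **缓解措施**：启用只读模式后，攻击者将无法向容器可访问的系统区域写入恶意代码或删除重要数据。\n\n> [!WARNING]\n> 使用 `read_only: true` 指令时需谨慎：如果服务依赖于文件写操作，比如许多会写入 `\u002Ftmp` 目录的服务，这些子进程将会失败。若这些子进程至关重要，容器甚至可能无法启动。请预期此设置并不能完全适用于所有容器——你很可能需要在大多数挂载卷上单独添加 `:ro` 选项。\n\n3. 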
限制容器可使用的物理资源：\n    ```yaml\n    # 容器最多可运行的进程数\n    pids_limit: 512\n    # 容器的最大内存限制\n    mem_limit: 3g\n    # 容器最多可使用的 CPU 核心数——可以是小数\n    cpus: 3\n    ```\n\n    **风险**：默认情况下，容器被赋予无限制的资源分配能力。这意味着一旦容器被攻陷，它就能占用大量 CPU、内存和显存来执行恶意代码。\n\n    **缓解措施**：为资源设置固定上限，可以在容器被攻陷时防止攻击者耗尽服务器的所有物理资源进行破坏，例如利用你的机器运行僵尸网络。\n\n    **Docker Swarm**\n\n    如果你使用 Docker Swarm（本指南未涉及），则需要采用略有不同的格式：\n\n    ```yaml\n    deploy:\n      resources:\n        limits:\n          pids: 512\n          memory: 3g\n          cpus: 3\n    ```\n\n4. 禁用容器中的 `tty` 和 `stdin`：\n    ```yaml\n    tty: false\n    stdin_open: false\n    ```\n\n    **风险**：`stdin` 允许向容器输入命令（输入注入），而 `tty` 则提供了一个活跃的 Shell 环境——两者结合使容器具备完整的交互式 Shell 功能，从而可能执行潜在的恶意命令。\n\n    **缓解措施**：禁用这些功能后，攻击者将无法通过容器执行代码，从而大大降低任意代码执行（ACE）漏洞的风险。\n\n5. 禁止容器提升自身权限：\n    ```yaml\n    security_opt:\n      - \"no-new-privileges=true\"\n    ```\n\n    **风险**：在拥有交互式 Shell 环境的情况下，容器可以提升自身的访问权限。这甚至可能覆盖之前设置的 `user` 指令，从而使容器获得对系统的 root 权限。\n\n    **缓解措施**：添加此行后，攻击者将无法覆盖我们先前设置的 `user` 指令，也无法授予已被攻陷的容器 root 权限。\n\n6. 移除容器的默认能力：\n    ```yaml\n    cap_drop:\n      - ALL\n    ```\n\n    **风险**：容器在默认情况下会被授予大量权限。然而，大多数容器并不需要这些权限。额外的访问权限会增加攻击面，一旦容器被攻破，尤其是当这些能力能够影响主机内核时，风险将进一步扩大。\n\n    **缓解措施**：添加此配置将取消容器默认的宽松权限，确保只授予容器完成其功能所需的系统能力。\n\n    `cap_drop` 设为 `ALL` 对于需要多种能力的容器来说可能过于激进。在此情况下，在上述配置之后，建议审查以下常见的能力。它们并非全部必要，也不应全部添加——仅作为某些容器可能需要的能力的起点。\n    ```yaml\n    cap_add:\n      # 低风险\n      # 随意更改文件的 UID 和 GID（参见 chown(2)）\n      - CHOWN\n      # 随意操作进程的 GID 及辅助 GID 列表\n      - SETGID\n      # 随意操作进程的 UID\n      - SETUID\n      # 将套接字绑定到 Internet 域的特权端口（端口号小于 1024）\n      - NET_BIND_SERVICE\n\n      # 高风险——可能影响重要的系统级配置\n      # 执行各种网络相关操作\n      - NET_ADMIN\n      # 使用 RAW 和 PACKET 套接字\n      - NET_RAW\n      # 执行一系列系统管理操作\n      - SYS_ADMIN\n    ```\n\n    这段配置将首先移除所有能力，然后精准地重新添加容器运作所需的能力。完整的能力清单可参考[这里](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Fcontainers\u002Frun\u002F#runtime-privilege-and-linux-capabilities)。\n\n    如果容器缺乏必要的能力，将无法正常运行。若不小心移除了某项必需的能力，您可以：\n    1. 理想的做法是查看上述能力列表，并通过试错的方式逐步添加所需能力。大型语言模型可以在此过程中提供很大帮助，特别是当它们具备阅读文档或服务源代码的能力时。您也可以暂时忽略此指令，待所有配置都完成后再回头处理。\n    2. 不太理想的选择是直接放弃并完全移除 `cap_drop` 指令。虽然不推荐这样做，但在私有服务中，只要其他安全措施到位，这种做法也未必是最不安全的配置。\n\n7. 防止过度日志记录：\n    ```yaml\n    logging:\n      driver: \"json-file\"\n      options:\n        max-file: \"10\"\n        max-size: \"20m\"\n    ```\n\n    **风险**：如同 [zip 炸弹](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FZip_bomb)，日志炸弹也可能通过递归生成大量日志文件来使系统不堪重负，最终导致系统无法运作。\n\n    **缓解措施**：通过限制日志文件的最大数量（10 个）和单个文件的最大大小（20MB），可以有效遏制此类攻击。\n\n    如遇问题，您可以根据实际情况合理调整这些限制。\n\n8. 限制临时文件目录的权限：\n    ```yaml\n    tmpfs:\n      - \u002Ftmp:rw,noexec,nosuid,nodev,size=512m\n    ```\n\n    **风险**：如果被允许下载文件，已被入侵的容器可能会将恶意文件植入文件系统并执行它。\n\n    **缓解措施**：设置一个权限极为受限的临时文件区域，可以阻断这一潜在的攻击途径。容器只能在此小型沙盒中下载文件，且文件大小固定、无脚本执行权限。\n\n9. 
挂载只读卷：\n    ```yaml\n    volumes:\n      - \u002Fpath\u002Fto\u002Fmount1:\u002Fmount1:ro\n      - \u002Fpath\u002Fto\u002Fmount2:\u002Fmount2:ro\n    ```\n\n    **风险**：为了访问主机的文件系统，Docker 容器需要将目录以“卷”的形式挂载。这样容器就能像主机本身一样读取和写入数据。通常并不需要为每个挂载点都开放写入权限，否则一旦容器被攻破，就可能删除或覆盖重要信息，造成严重的网络攻击。\n\n    **缓解措施**：尽可能将卷挂载为只读模式，从而消除容器通过写入操作破坏这些卷中数据的可能性。这虽然无法阻止间谍软件的活动，但能有效减少攻击造成的数据损失。\n\n    > 请将 `\u002Fpath\u002Fto\u002Fmount1` 和 `\u002Fpath\u002Fto\u002Fmount2` 替换为实际的目录路径。\n\n> [!IMPORTANT]\n> 对于那些对 CPU 核心和内存资源自由分配至关重要的容器——例如 llama-swap——您不应按照步骤 3 中所示的方式限制容器可使用的最大资源量。\n\n以下是一个整合好的片段，可供您复制并粘贴到现有服务的 Compose 文件中：\n\n```yaml\nservices:\n  \u003Cservice_name>:\n    # 1\n    user: \"\u003Cyour_user_id>:\u003Cyour_group_id>\"\n    # 2 - 仅在容器不进行写操作时取消注释\n    # read_only: true\n    # 3\n    pids_limit: 512\n    mem_limit: 3g\n    cpus: 3\n    # 4\n    tty: false\n    stdin_open: false\n    # 5\n    security_opt:\n      - \"no-new-privileges=true\"\n    # 6\n    cap_drop:\n      - ALL\n    # 如有需要，请添加 cap_add 部分\n    # 7\n    logging:\n      driver: \"json-file\"\n      options:\n        max-file: \"10\"\n        max-size: \"20m\"\n    # 8\n    tmpfs:\n      - \u002Ftmp:rw,noexec,nosuid,nodev,size=512m\n    # 9\n    volumes:\n      - \u002Fpath\u002Fto\u002Fmount1:\u002Fmount1:ro\n      - \u002Fpath\u002Fto\u002Fmount2:\u002Fmount2:ro\n```\n\n### 有用的命令\n\n在设置这台服务器的过程中（或者在深入服务部署的复杂流程中），你很可能会频繁使用 Docker。对于不熟悉 Docker 的用户，以下是一些有助于更轻松地导航和排查容器问题的命令：\n\n- 查看所有可用或正在运行的容器：`docker ps -a`\n- 重启某个容器：`docker restart \u003Ccontainer_name>`\n- 实时查看容器日志：`docker logs -f \u003Ccontainer_name>`（按 `CTRL+C` 退出）\n- 重命名容器：`docker rename \u003Ccontainer_name> \u003Cnew_container_name>`\n- 有时，一个服务会启动多个容器，例如 `xyz-server` 和 `xyz-db`。要同时重启这两个容器，请**从包含 Compose 文件的目录内**执行以下命令：`docker compose restart`\n- 重新创建服务：`docker compose down && docker compose up -d`\n- 测试容器配置：`docker compose config --quiet`\n- 查看容器的解析后配置：`docker compose config`\n- 列出 Docker 网络：`docker network ls`\n\n> [!TIP]\n> 在设置 Docker 容器或服务时，并没有固定的规则。不过，我的建议是：  \n> 使用 Docker Compose（通过 `docker-compose.yaml` 文件运行 `docker compose up -d`，而不是直接使用 `docker run -d \u003Cimage_name>`）是最整洁的方式。每个服务一个目录、每个目录一个 Compose 文件是最常见的做法——即使你没有为家庭实验室另做详细记录，这种方式本身也近乎自动地留下一份清晰的服务清单。\n\n> [!TIP]\n> 有时，仅仅重启容器并不能使配置更改生效。如果你发现对服务所做的更改没有生效，在继续其他故障排除之前，可以先尝试完全重建该服务。\n\n## HuggingFace CLI\n\n📖 [**文档**](https:\u002F\u002Fhuggingface.co\u002Fdocs\u002Fhuggingface_hub\u002Fmain\u002Fen\u002Fguides\u002Fcli)\n\nHuggingFace 是领先的开源机器学习\u002F人工智能平台，它托管了模型（包括大型语言模型）、数据集以及可用于测试模型的演示应用。在本指南中，我们将使用 HuggingFace 下载流行的开源大型语言模型。\n\n> [!NOTE]\n> 仅适用于 llama.cpp\u002FvLLM。\n\n- 创建一个新的虚拟环境：\n    ```bash\n    python3 -m venv hf-env\n    source hf-env\u002Fbin\u002Factivate\n    ```\n- 使用 `pip` 安装 `huggingface_hub` 包：\n    ```bash\n    pip install -U \"huggingface_hub[cli]\"\n    ```\n- 在 https:\u002F\u002Fhuggingface.co 上创建一个认证令牌。\n- 登录到 HF Hub：\n    ```bash\n    hf auth login\n    ```\n- 按提示输入你的令牌。\n- 运行以下命令以验证登录：\n    ```bash\n    hf auth whoami\n    ```\n\n    输出应为你的用户名。\n\n### 管理模型\n\n模型可以下载到默认位置（`~\u002F.cache\u002Fhuggingface\u002Fhub`）或你指定的任何本地目录。可以通过 `--local-dir` 命令行参数来定义模型存储位置。如果不指定此参数，则模型将被存储在默认位置。将模型存储在推理引擎相关包所在的文件夹中是一种良好的实践——这样，运行模型推理所需的一切都集中在一个地方。然而，如果你经常在多个后端中使用相同的模型（例如，同时用 llama.cpp 和 vLLM 使用 Qwen_QwQ-32B-Q4_K_M.gguf），则要么设置一个通用的模型目录，要么直接使用 HF 的默认选项而不指定该参数。\n\n首先，激活包含 `huggingface_hub` 的虚拟环境：\n```\nsource hf-env\u002Fbin\u002Factivate\n```\n\n### 下载模型\n\n模型通过其 HuggingFace 标签进行下载。这里我们以 bartowski\u002FQwen_QwQ-32B-GGUF 为例。要下载模型，请运行：\n```\nhf download bartowski\u002FQwen_QwQ-32B-GGUF Qwen_QwQ-32B-Q4_K_M.gguf --local-dir 
models\n```\n请确保在运行此命令时位于正确的目录中。\n\n### 删除模型\n\n要删除指定位置的模型，请运行：\n```\nrm \u003Cmodel_name>\n```\n\n要删除默认位置的模型，请运行：\n```\nhf delete-cache\n```\n\n这将启动一个交互式会话，你可以从中移除 HuggingFace 目录中的模型。如果你一直将模型保存在不同于 `~\u002F.cache\u002Fhuggingface` 的位置，那么从那里删除模型虽然能释放空间，但元数据仍会保留在 HF 缓存中，直到被正确删除。这可以通过上述命令完成，也可以直接删除 `~\u002F.cache\u002Fhuggingface\u002Fhub` 中的模型目录。\n\n## 搜索引擎\n\n> [!NOTE]\n> 此步骤是可选的，但强烈推荐用于通过来自权威来源的相关搜索结果为大型语言模型提供上下文信息。通过 MCP 工具调用进行有针对性的网络搜索，可以使 LLM 生成的报告更不容易出现随机幻觉。\n\n### SearXNG\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fsearxng\u002Fsearxng)  \n📖 [**文档**](https:\u002F\u002Fdocs.searxng.org)  \n\n为了支持基于搜索的工作流，我们不希望依赖可能监控搜索的搜索引擎提供商。虽然使用任何搜索引擎都存在这个问题，但像 SearXNG 这样的元搜索引擎可以在一定程度上缓解这一问题。SearXNG 会聚合超过 245 个[搜索服务](https:\u002F\u002Fdocs.searxng.org\u002Fuser\u002Fconfigured_engines.html#configured-engines)的结果，并且不会跟踪或分析用户行为。你可以在互联网上使用托管实例，但考虑到本指南的重点以及搭建过程非常简单，我们将自己在端口 5050 上启动一个实例。\n\n1. 启动容器：\n    ```bash\n    docker pull searxng\u002Fsearxng\n    export PORT=5050\n    docker run \\\n        -d -p ${PORT}:8080 \\\n        --name searxng \\\n        --network app-net \\\n        -v \"${PWD}\u002Fsearxng:\u002Fetc\u002Fsearxng\" \\\n        -e \"BASE_URL=http:\u002F\u002F0.0.0.0:$PORT\u002F\" \\\n        -e \"INSTANCE_NAME=searxng\" \\\n        --restart unless-stopped \\\n        searxng\u002Fsearxng\n    ```\n\n2. 编辑 `settings.yml` 以支持 JSON 格式：\n    ```bash\n    sudo nano searxng\u002Fsettings.yml\n    ```\n\n    添加以下内容：\n    ```yaml\n    search:\n      # ...其他参数...\n      formats:\n        - html\n        - json      # 添加这一行\n    ```\n\n3. 使用 `docker restart searxng` 重启容器。\n\n### Open WebUI 集成\n\n如果你想要一个简单的网页搜索工作流，并跳过 MCP 服务器或代理型架构的搭建，Open WebUI 原生支持网页搜索功能。前往 `管理面板 > 设置 > 网页搜索`，并设置以下值：\n\n- 启用 `网页搜索`\n- 网页搜索引擎：`searxng`\n- SearXNG 查询 URL：`http:\u002F\u002Fsearxng:8080\u002Fsearch?q=\u003Cquery>`\n- API 密钥：`任意你喜欢的值`\n\n## 推理引擎\n\n推理引擎是此设置的主要组件之一。它是一段代码，能够读取包含权重的模型文件，并从中生成有用的输出。本指南提供了 llama.cpp、vLLM 和 Ollama 三种选择——这些都是流行的推理引擎，各有侧重和优势（注意：Ollama 内部使用 llama.cpp，只是一个 CLI 封装）。对于初学者来说，直接上手 llama.cpp 和 vLLM 的命令行参数可能会有些困难。如果你是高级用户，并且喜欢通过精细控制服务参数来获得灵活性，那么使用 llama.cpp 或 vLLM 将会是非常棒的体验，最终的选择主要取决于你决定使用的量化格式。然而，如果你是新手或还不太熟悉这些工具，Ollama 可以作为一个便捷的过渡方案，帮助你逐步掌握所需技能；或者，如果你觉得当前的知识水平已经足够，它也可以作为你的最终选择！\n\n### Ollama\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Follama\u002Follama)  \n📖 [**文档**](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Ftree\u002Fmain\u002Fdocs)  \n🔧 [**引擎参数**](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Fmodelfile.md)\n\nOllama 将被安装为一个服务，因此会在系统启动时自动运行。\n\n- 从官方仓库下载 Ollama：\n    ```\n    curl -fsSL https:\u002F\u002Follama.com\u002Finstall.sh | sh\n    ```\n\n我们希望 LAN 中的其他设备能够访问到 API 端点。对于 Ollama 而言，这意味着需要在 `ollama.service` 中设置 `OLLAMA_HOST=0.0.0.0`。\n\n- 运行以下命令编辑服务配置：\n    ```\n    systemctl edit ollama.service\n    ```\n- 在打开的空白覆盖文件（override）中添加一个 `[Service]` 部分，并在其下添加 `Environment=\"OLLAMA_HOST=0.0.0.0\"`。配置应如下所示：\n    ```\n    [Service]\n    Environment=\"OLLAMA_HOST=0.0.0.0\"\n    ```\n- 保存并退出。\n- 重新加载配置：\n    ```\n    systemctl daemon-reload\n    systemctl restart ollama\n    ```\n\n> [!TIP]\n> 如果你是手动安装 Ollama，或者没有将其作为服务运行，请记得执行 `ollama serve` 来正确启动服务器。如果遇到问题，可以参考 [Ollama 的故障排除步骤](#ollama-2)。
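\n\n安装完成后，可以先用一个小模型做冒烟测试，并从局域网内的另一台设备确认 API 已经按 `OLLAMA_HOST=0.0.0.0` 对外开放（模型名仅作示例，可替换为任意你想用的模型）：\n\n```bash\n# 拉取并运行一个小模型，确认推理正常\nollama pull qwen3:4b\nollama run qwen3:4b \"你好，简单介绍一下你自己\"\n\n# 在局域网内的另一台设备上验证 API 端点（替换 \u003Cserver_ip>）\ncurl http:\u002F\u002F\u003Cserver_ip>:11434\u002Fapi\u002Ftags\n```\n\n### llama.cpp\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp)  \n📖 [**文档**](https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp\u002Ftree\u002Fmaster\u002Fdocs)  \n🔧 [**引擎参数**](https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp\u002Ftree\u002Fmaster\u002Fexamples\u002Fserver)\n\n- 克隆 llama.cpp 的 GitHub 仓库：\n    ```\n 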
   git clone https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp.git\n    cd llama.cpp\n    ```\n- 构建二进制文件：\n\n    **CPU**\n    ```\n    cmake -B build\n    cmake --build build --config Release\n    ```\n\n    **CUDA**\n    ```\n    cmake -B build -DGGML_CUDA=ON\n    cmake --build build --config Release\n    ```\n    对于希望使用 Metal、Vulkan 等底层图形 API 的系统，请参阅完整的 [llama.cpp 构建文档](https:\u002F\u002Fgithub.com\u002Fggml-org\u002Fllama.cpp\u002Fblob\u002Fmaster\u002Fdocs\u002Fbuild.md)，以利用加速推理功能。\n\n### vLLM\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)  \n📖 [**文档**](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Fstable\u002Findex.html)  \n🔧 [**引擎参数**](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Fstable\u002Fserving\u002Fengine_args.html)\n\nvLLM 自带一个与 OpenAI 兼容的 API，我们可以像使用 Ollama 一样直接调用。与 Ollama 只能运行 GGUF 模型文件不同，vLLM 原生支持 AWQ、GPTQ、GGUF、BitsAndBytes 以及 safetensors（默认发布格式）。\n\n**手动安装（推荐）**\n\n- 创建用于 vLLM 的目录和虚拟环境：\n    ```\n    mkdir vllm\n    cd vllm\n    python3 -m venv .venv\n    source .venv\u002Fbin\u002Factivate\n    ```\n\n- 使用 `pip` 安装 vLLM：\n    ```\n    pip install vllm\n    ```\n\n- 使用所需的参数启动服务。默认端口是 8000，但这里我使用 8556 端口，以免与其他服务冲突：\n    ```\n    vllm serve \u003Cmodel> --port 8556\n    ```\n\n- 若要将其作为服务运行，可将以下内容添加到 `init.bash` 文件中，以便在系统启动时自动运行 vLLM：\n    ```\n    source .venv\u002Fbin\u002Factivate\n    vllm serve \u003Cmodel> --port 8556\n    ```\n    > 请将 `\u003Cmodel>` 替换为你从 HuggingFace 复制的所需模型标签。\n\n**Docker 安装**\n\n- 运行以下命令：\n    ```\n    docker run --gpus all \\\n    -v ~\u002F.cache\u002Fhuggingface:\u002Froot\u002F.cache\u002Fhuggingface \\\n    --env \"HUGGING_FACE_HUB_TOKEN=\u003Cyour_hf_hub_token>\" \\\n    -p 8556:8000 \\\n    --ipc=host \\\n    vllm\u002Fvllm-openai:latest \\\n    --model \u003Cmodel>\n    ```\n    > 请将 `\u003Cyour_hf_hub_token>` 替换为你自己的 HuggingFace Hub 令牌，并将 `\u003Cmodel>` 替换为你从 HuggingFace 复制的所需模型标签。\n\n若要运行不同的模型：\n\n- 首先停止现有容器：\n    ```\n    docker ps -a\n    docker stop \u003Cvllm_container_ID>\n    ```\n\n- 如果你希望未来再次运行完全相同的配置，可以跳过此步骤。否则，为了不使 Docker 容器环境变得杂乱，请执行以下命令删除容器：\n    ```\n    docker rm \u003Cvllm_container_ID>\n    ```\n\n- 然后使用安装时的 Docker 命令，替换为所需的模型重新运行：\n    ```\n    docker run --gpus all \\\n    -v ~\u002F.cache\u002Fhuggingface:\u002Froot\u002F.cache\u002Fhuggingface \\\n    --env \"HUGGING_FACE_HUB_TOKEN=\u003Cyour_hf_hub_token>\" \\\n    -p 8556:8000 \\\n    --ipc=host \\\n    vllm\u002Fvllm-openai:latest \\\n    --model \u003Cmodel>\n    ```\n\n### Open WebUI 集成\n> [!NOTE]\n> 仅适用于 llama.cpp\u002FvLLM。\n\n前往 `管理面板 > 设置 > 连接`，设置以下值：\n\n- 启用 `OpenAI API`\n- API 基础 URL：`http:\u002F\u002Fhost.docker.internal:\u003Cport>\u002Fv1`\n- API 密钥：`任意值`\n\n> [!NOTE]\n> `host.docker.internal` 是一个特殊的主机名，它会解析为 Docker 分配给宿主机的内部 IP 地址。这使得容器能够与运行在宿主机上的服务（如数据库或 Web 服务器）进行通信，而无需知道宿主机的具体 IP 地址。它简化了容器与宿主机服务之间的通信，从而更容易开发和部署应用程序。\n\n### Ollama 与 llama.cpp\n\n| **方面**                 | **Ollama（封装层）**                                          | **llama.cpp（原生实现）**                                                                   |\n| -------------------------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |\n| **安装\u002F配置**     | 一键安装及 CLI 模型管理                                      | 需手动设置和配置                                                       |\n| **Open WebUI 集成** | 一级支持                                                   | 需配置 OpenAI 兼容的 API 端点                                                 |\n| **模型切换**        | 
服务器端原生支持模型切换                                     | 需手动管理端口或使用 [llama-swap](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap) 工具 |\n| **可定制性**        | 有限：模型文件操作较为繁琐                                   | 可通过 CLI 完全控制参数                                                                  |\n| **透明度**          | 默认设置可能覆盖模型参数（如上下文长度）                     | 参数设置完全透明                                                                         |\n| **GGUF 支持**       | 继承 llama.cpp 的业界最佳实现                                | GGUF 实现最为出色                                                                        |\n| **GPU-CPU 分离**    | 继承 llama.cpp 的高效分离机制                                | 开箱即用地实现 GPU-CPU 分离                                                              |\n\n---\n\n### vLLM 与 Ollama\u002Fllama.cpp\n| **特性**             | **vLLM**                                     | **Ollama\u002Fllama.cpp**                                                                  |\n| ----------------------- | -------------------------------------------- | ------------------------------------------------------------------------------------- |\n| **视觉模型**       | 支持 Qwen 2.5 VL、Llama 3.2 Vision 等         | Ollama 支持部分视觉模型，llama.cpp 不支持任何视觉模型（需通过 llama-server） |\n| **量化**            | 支持 AWQ、GPTQ、BnB 等                       | 仅支持 GGUF                                                                    |\n| **多 GPU 推理**    | 是                                           | 是                                                                                   |\n| **张量并行**        | 是                                           | 否                                                                                    |\n\n综上所述：\n\n- **Ollama**：最适合追求“开箱即用”体验的用户。\n- **llama.cpp**：最适合希望完全掌控推理服务，并熟悉引擎参数配置的用户。\n- **vLLM**：最适合以下场景的用户：(i) 运行非 GGUF 格式的量化模型；(ii) 使用张量并行进行多 GPU 推理；(iii) 使用视觉模型。\n\n将 Ollama 作为服务运行时，不会导致体验下降，因为未使用的模型会在一段时间后从显存中卸载。而使用 vLLM 或 llama.cpp 作为服务时，模型会一直驻留在内存中，因此除非它们是你的主要推理引擎，否则不建议将其与 Ollama 一起以自动化、持续运行的方式部署。简而言之：\n\n| 主要引擎 | 次要引擎 | 是否将次要引擎作为服务运行？ |\n| ---------- | ---------- | ------------------------------ |\n| Ollama     | llama.cpp\u002FvLLM   | 否                             |\n| llama.cpp\u002FvLLM | Ollama           | 是                             |\n\n## 模型服务器\n\n> [!NOTE]\n> 仅在手动安装 llama.cpp\u002FvLLM 时需要。Ollama 通过其 CLI 自动管理模型加载与卸载。\n\n虽然上述步骤可以帮助你快速搭建一个兼容 OpenAI 的 LLM 服务器，但它们无法保证在关闭终端窗口或重启物理服务器后，该服务仍能持续运行。此外，这些方法也无法让聊天平台可靠地引用和动态切换不同模型——而在实际应用中，不同模型往往擅长处理不同的任务。通过 Docker 运行推理引擎可以借助 `-d`（detach）标志实现持久化，但 (i) llama.cpp 和 vLLM 通常并未针对 Docker 进行优化；(ii) Docker 无法按需切换模型。因此，我们需要一个专门的服务器来管理模型的加载、卸载、切换以及列出可用模型的功能。\n\n### llama-swap\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap)  \n📖 [**文档**](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fwiki)\n\n> [!TIP]\n> 这是我推荐运行 llama.cpp\u002FvLLM 模型的方式。\n\nllama-swap 是一个轻量级的 LLM 代理服务器，能够解决我们前面提到的问题。它是一个高度可配置的工具，允许通过单一入口访问来自不同后端的模型。模型可以分组设置，轻松地进行上线或下线操作，使用自定义超参数进行配置，并且可以通过 llama-swap 的 Web UI 中的流式日志进行监控。\n\n在下面的安装步骤中，我们将使用 `Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf` 作为 llama.cpp 的模型，而 vLLM 则使用 `Qwen\u002FQwen3-4B-Instruct-2507`。我们还将使用 7000 端口来提供这些模型的服务。\n\n1. 创建一个新的目录，并在其中创建 `config.yaml` 文件：\n    ```bash\n    sudo mkdir llama-swap\n    cd llama-swap\n    sudo nano config.yaml\n    ```\n\n2. 
输入以下内容并保存：\n\n    **llama.cpp**\n    ```yaml\n    models:\n      \"qwen3-4b\":\n        proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n        cmd: |\n          \u002Fapp\u002Fllama-server\n          -m \u002Fmodels\u002FQwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf\n          # 或者使用 `-hf unsloth\u002FQwen3-4B-Instruct-2507-GGUF:Q4_K_XL` 来从 HuggingFace 加载\n          --port 7000\n    ```\n\n    **vLLM（Docker）**\n    ```yaml\n    models:\n      \"qwen3-4b\":\n        proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n        cmd: |\n          docker run --name qwen-vllm\n          --init --rm -p 7000:8000\n          --ipc=host\n          vllm\u002Fvllm-openai:latest\n          --model \u002Fmodels\u002FQwen\u002FQwen3-4B-Instruct-2507\n        cmdStop: docker stop qwen-vllm\n    ```\n\n    **vLLM（本地）**\n    ```yaml\n    models:\n      \"qwen3-4b\":\n        proxy: \"http:\u002F\u002F127.0.0.1:7000\"\n        cmd: |\n          \u002Fapp\u002Fvllm\u002F.venv\u002Fbin\u002Fvllm serve\n          \u002Fmodels\u002FQwen\u002FQwen3-4B-Instruct-2507\n          --port 7000\n          --host 0.0.0.0\n        cmdStop: pkill -f \"vllm serve\"\n    ```\n\n    > 注意缩进：`cmd:` 块标量的内容必须比键本身缩进更深，否则 YAML 无法解析。此外，llama-swap 会自行解析 `cmd` 中的参数而不经过 shell，因此这里直接调用虚拟环境中的 `vllm` 可执行文件，无需先 `source activate`。\n\n3. 安装容器：\n\n    我们在这里使用 `cuda` 标签，但 llama-swap 也提供了 `cpu`、`intel`、`vulkan` 和 `musa` 等标签。发布版本可以在 [这里](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fpkgs\u002Fcontainer\u002Fllama-swap) 找到。\n\n    **llama.cpp**\n    ```bash\n    docker run -d --gpus all --restart unless-stopped --network app-net --pull=always --name llama-swap -p 9292:8080 \\\n    -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama-swap\u002Fconfig.yaml:\u002Fapp\u002Fconfig.yaml \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002Fllama-server:\u002Fapp\u002Fllama-server \\\n    ghcr.io\u002Fmostlygeek\u002Fllama-swap:cuda\n    ```\n\n    **vLLM（Docker\u002F本地）**\n    ```bash\n    docker run -d --gpus all --restart unless-stopped --network app-net --pull=always --name llama-swap -p 9292:8080 \\\n    -v \u002Fpath\u002Fto\u002Fmodels:\u002Fmodels \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fvllm:\u002Fapp\u002Fvllm \\\n    -v \u002Fhome\u002F\u003Cyour_username>\u002Fllama-swap\u002Fconfig.yaml:\u002Fapp\u002Fconfig.yaml \\\n    ghcr.io\u002Fmostlygeek\u002Fllama-swap:cuda\n    ```\n\n    > 请将 `\u003Cyour_username>` 替换为您的实际用户名，将 `\u002Fpath\u002Fto\u002Fmodels` 替换为您模型文件所在的路径。\n\n> [!NOTE]\n> llama-swap 更推荐基于 Docker 的 vLLM 运行方式，因为这样环境更加整洁，并且能够更好地响应服务器发送的 SIGTERM 信号。我在这里同时列出了两种方式。\n\n以上步骤应该会启动一个运行在 `http:\u002F\u002Flocalhost:9292` 的 llama-swap 实例，您可以通过运行 `curl http:\u002F\u002Flocalhost:9292\u002Fhealth` 来确认其是否正常工作。**强烈建议**您阅读 [配置文档](https:\u002F\u002Fgithub.com\u002Fmostlygeek\u002Fllama-swap\u002Fwiki\u002FConfiguration)。llama-swap 文档非常详尽，且高度可配置——充分利用它的功能，您可以根据需要定制一套适合自己的部署方案。
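\n\n在继续之前，也可以直接对这个 OpenAI 兼容端点做一次冒烟测试（模型名与端口取自上面的示例配置）：\n\n```bash\n# 列出 llama-swap 中已配置的模型\ncurl http:\u002F\u002Flocalhost:9292\u002Fv1\u002Fmodels\n\n# 发送一次对话请求，llama-swap 会按需启动并切换到对应后端\ncurl http:\u002F\u002Flocalhost:9292\u002Fv1\u002Fchat\u002Fcompletions \\\n  -H \"Content-Type: application\u002Fjson\" \\\n  -d '{\"model\": \"qwen3-4b\", \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}]}'\n```\n\n### `systemd` 服务\n\n另一种让模型在系统重启后仍然保持运行的方法是将推理引擎放入一个 `.service` 文件中，该文件会在 Linux 系统启动时随系统一同运行，从而确保服务器开启时模型始终可用。如果您可以接受无法切换模型或后端的限制，并且只运行一个模型，那么这是一种开销最低的解决方案，效果也非常不错。\n\n让我们将即将创建的服务命名为 `llm-server.service`。我们假设所有模型都位于 `models` 子目录中——您可以根据需要进行调整。\n\n1. 创建 `systemd` 服务文件：\n    ```bash\n    sudo nano \u002Fetc\u002Fsystemd\u002Fsystem\u002Fllm-server.service\n    ```\n\n2. 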
配置服务文件：\n\n    **llama.cpp**\n    ```ini\n    [Unit]\n    Description=LLM Server Service\n    After=network.target\n\n    [Service]\n    User=\u003Cuser>\n    Group=\u003Cuser>\n    WorkingDirectory=\u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002F\n    ExecStart=\u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fbuild\u002Fbin\u002Fllama-server \\\n        --port \u003Cport> \\\n        --host 0.0.0.0 \\\n        -m \u002Fhome\u002F\u003Cuser>\u002Fllama.cpp\u002Fmodels\u002F\u003Cmodel> \\\n        --no-webui\n    Restart=always\n    RestartSec=10s\n\n    [Install]\n    WantedBy=multi-user.target\n    ```\n\n    **vLLM**\n    ```ini\n    [Unit]\n    Description=LLM Server Service\n    After=network.target\n\n    [Service]\n    User=\u003Cuser>\n    Group=\u003Cuser>\n    WorkingDirectory=\u002Fhome\u002F\u003Cuser>\u002Fvllm\u002F\n    ExecStart=\u002Fbin\u002Fbash -c 'source .venv\u002Fbin\u002Factivate && vllm serve \u002Fhome\u002F\u003Cuser>\u002Fvllm\u002Fmodels\u002F\u003Cmodel> --port \u003Cport> --host 0.0.0.0'\n    Restart=always\n    RestartSec=10s\n\n    [Install]\n    WantedBy=multi-user.target\n    ```\n    > 请将 `\u003Cuser>`, `\u003Cport>` 和 `\u003Cmodel>` 分别替换为您的 Linux 用户名、期望的服务端口以及所需的模型名称。如需其他引擎参数，直接追加在 `ExecStart` 命令之后即可（`systemd` 不支持行内 `#` 注释，因此不要在该行中写注释）。\n\n3. 重新加载 `systemd` 守护进程：\n    ```bash\n    sudo systemctl daemon-reload\n    ```\n4. 启动服务：\n\n    如果 `llm-server.service` 尚未存在：\n    ```bash\n    sudo systemctl enable llm-server.service\n    sudo systemctl start llm-server\n    ```\n\n    如果 `llm-server.service` 已经存在：\n    ```bash\n    sudo systemctl restart llm-server\n    ```\n5. （可选）检查服务状态：\n    ```bash\n    sudo systemctl status llm-server\n    ```\n\n### Open WebUI 集成\n\n#### llama-swap\n\n导航到 `Admin Panel > Settings > Connections`，并设置以下值：\n\n- 启用 OpenAI API\n- API 基础 URL：`http:\u002F\u002Fllama-swap:8080\u002Fv1`\n- API 密钥：`anything-you-like`\n\n#### `systemd` 服务\n\n同样导航到 `Admin Panel > Settings > Connections`，并设置以下值：\n\n- 启用 OpenAI API\n- API 基础 URL：`http:\u002F\u002Fhost.docker.internal:\u003Cport>\u002Fv1`\n- API 密钥：`anything-you-like`\n\n> 请将 `\u003Cport>` 替换为您的期望端口。由于 Open WebUI 运行在 Docker 容器中，而 `systemd` 服务运行在宿主机上，这里需要使用 `host.docker.internal` 而不是 `localhost`。\n\n## 聊天平台\n\n### Open WebUI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)  \n📖 [**文档**](https:\u002F\u002Fdocs.openwebui.com)\n\nOpen WebUI 是一个用于管理模型和聊天的基于 Web 的界面，提供美观且高性能的用户界面，方便您与模型进行交互。如果您希望通过 Web 界面访问您的模型，那么这就是您需要的操作。如果您习惯使用命令行，或者希望通过插件\u002F扩展来调用模型，则可以跳过此步骤。\n\n若不使用 Nvidia GPU，请运行以下命令进行安装：\n```bash\ndocker run -d -p 3000:8080 --network app-net --add-host=host.docker.internal:host-gateway -v open-webui:\u002Fapp\u002Fbackend\u002Fdata --name open-webui --restart always ghcr.io\u002Fopen-webui\u002Fopen-webui:main\n```\n\n对于配备 Nvidia GPU 的系统，请运行以下命令：\n```bash\ndocker run -d -p 3000:8080 --network app-net --gpus all --add-host=host.docker.internal:host-gateway -v open-webui:\u002Fapp\u002Fbackend\u002Fdata --name open-webui --restart always ghcr.io\u002Fopen-webui\u002Fopen-webui:cuda\n```\n\n您可以通过浏览器访问 `http:\u002F\u002Flocalhost:3000`，或在同网络下的其他设备上访问 `http:\u002F\u002F\u003Cserver_ip>:3000`。无需将此内容添加到 `init.bash` 脚本中，因为 Open WebUI 将通过 Docker 引擎在系统启动时自动运行。\n\n有关 Open WebUI 的更多信息，请参阅 [此处](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)。\n\n## MCP 代理服务器\n\n模型上下文协议（MCP）是一种以标准化方式将工具（用代码编写的函数或脚本）与大型语言模型连接起来的协议。如今的模型越来越多地被训练为能够原生调用工具，从而支持智能体任务——例如，让模型通过顺序思考（sequential thinking）生成多个想法、执行多次有针对性的网络搜索，并利用实时信息给出响应。对大多数人来说，更重要的是，MCP 还使模型能够调用第三方工具，如 GitHub、Azure 等。Anthropic 维护并整理的完整工具列表可在 [此处](https:\u002F\u002Fgithub.com\u002Fmodelcontextprotocol\u002Fservers) 查看。\n\n互联网上大多数关于 MCP 的指南都会建议您通过 VS Code、Cline 等客户端启动 MCP 服务器，因为大多数智能体应用场景都与编程或 Anthropic 的专有应用 Claude 
Desktop 相关，而这与本指南所追求的隐私保护目标并不一致。确实也有一些聊天客户端支持从其 UI 中直接管理 MCP 服务器（如 LobeChat、Cherry Studio 等），但我们希望以集中且模块化的方式管理 MCP 服务器。这样，(i) 它们不会绑定到特定客户端，而是可供您使用的任何客户端调用；(ii) 如果将来您更换聊天平台，您的 MCP 服务器也无需做任何更改，因为它们作为独立的服务运行——虽然初期维护工作稍多，但长期来看灵活性更高。我们可以通过搭建一个 MCP 代理服务器来实现这一点。\n\n该代理服务器会将通过 stdio（标准输入输出）协议运行的 MCP 服务器（只能由同一设备上的应用程序访问）转换为可通过 HTTP 流传输的形式。任何支持 MCP 的客户端都可以使用这种流式 HTTP 接口，因此它们也能使用我们在物理服务器上部署的所有 MCP 服务器。这样一来，您就可以在一个地方集中管理所有的 MCP 服务器：创建、编辑或删除服务器，并在不同的客户端（如 Open WebUI、VS Code 等）中使用它们。\n\n我们将使用 [fetch](https:\u002F\u002Fgithub.com\u002Fzcaceres\u002Ffetch-mcp)、[sequential-thinking](https:\u002F\u002Fgithub.com\u002Farben-adm\u002Fmcp-sequential-thinking) 和 [searxng](https:\u002F\u002Fgithub.com\u002Fihor-sokoliuk\u002FMCP-searxng) 这三个 MCP 服务器作为起点。后续添加更多服务器的过程与此完全相同。\n\n### mcp-proxy\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fsparfenyuk\u002Fmcp-proxy)  \n\nmcp-proxy 是一个服务器代理，允许在不同传输协议之间切换（stdio 与可流式传输的 HTTP 互转）。我将使用 3131 端口以避免端口冲突，您也可以根据需要自行更改。此外，我还会对 mcp-proxy 进行扩展，加入 `uv` 支持：大多数 MCP 服务器要么使用 `npx`，要么使用 `uv`，如果不配置 `uv`，将会影响您运行所需 MCP 服务器的能力。如果您不需要 `uv`，则 (i) 不需在 compose 文件中添加 `build` 部分，(ii) 可跳过第 4 步。\n\n1. 创建一个 compose 文件：\n    ```bash\n    mkdir mcp-proxy\n    cd mcp-proxy\n    sudo nano docker-compose.yaml\n    ```\n\n2. 输入以下内容：\n    ```yaml\n    services:\n      mcp-proxy:\n        container_name: mcp-proxy\n        build:\n            context: .\n            dockerfile: Dockerfile\n        networks:\n        - app-net\n        volumes:\n        - .:\u002Fconfig\n        - \u002F:\u002F\u003Cserver_hostname>:ro\n        restart: unless-stopped\n        ports:\n        - 3131:3131\n        command: \"--pass-environment --port=3131 --host 0.0.0.0 --transport streamablehttp --named-server-config \u002Fconfig\u002Fservers.json\"\n\n    networks:\n      app-net:\n        external: true\n    ```\n\n    > 请将 `\u003Cserver_hostname>` 替换为您实际的服务器主机名（或其他名称）。这主要在添加 `filesystem` 或类似需要读写文件系统的 MCP 服务器时有用。如果您的目标并非如此，可以跳过此步骤。\n\n3. 创建一个 `servers.json` 文件：\n    ```json\n    {\n        \"mcpServers\": {\n            \"fetch\": {\n                \"disabled\": false,\n                \"timeout\": 60,\n                \"command\": \"uvx\",\n                \"args\": [\n                    \"mcp-server-fetch\"\n                ],\n                \"transportType\": \"stdio\"\n            },\n            \"sequential-thinking\": {\n                \"command\": \"npx\",\n                \"args\": [\n                    \"-y\",\n                    \"@modelcontextprotocol\u002Fserver-sequential-thinking\"\n                ]\n            },\n            \"searxng\": {\n                \"command\": \"npx\",\n                \"args\": [\"-y\", \"mcp-searxng\"],\n                \"env\": {\n                    \"SEARXNG_URL\": \"http:\u002F\u002Fsearxng:8080\u002Fsearch?q=\u003Cquery>\"\n                }\n            }\n        }\n    }\n    ```\n\n4. 创建一个 `Dockerfile`：\n    ```bash\n    sudo nano Dockerfile\n    ```\n    输入以下内容：\n    ```Dockerfile\n    FROM ghcr.io\u002Fsparfenyuk\u002Fmcp-proxy:latest\n\n    # 安装 nvm 和 Node.js 的依赖\n    RUN apk add --update npm\n\n    # 安装 'uv' 包\n    RUN python3 -m ensurepip && pip install --no-cache-dir uv\n\n    ENV PATH=\"\u002Fusr\u002Flocal\u002Fbin:\u002Fusr\u002Fbin:$PATH\" \\\n        UV_PYTHON_PREFERENCE=only-system\n\n    ENTRYPOINT [\"catatonit\", \"--\", \"mcp-proxy\"]\n    ```\n\n5. 
使用 `docker compose up -d` 启动容器。\n\n您的 mcp-proxy 容器现在应该已经成功运行！添加服务器非常简单：只需将相关服务器信息添加到 `servers.json` 文件中（您可以直接使用 MCP 服务器开发者为 VS Code 提供的配置，两者完全一致），然后通过 `docker restart mcp-proxy` 重启容器即可。\n\n### MCPJungle\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fmcpjungle\u002FMCPJungle?tab=readme-ov-file)  \n\nMCPJungle 是另一款 MCP 代理服务器，但其侧重点有所不同。它更注重提供“生产级”的使用体验，而这些功能在应用程序的开发模式下默认是禁用的。我们将在本教程中使用该容器的标准开发版本，并将其运行在 4141 端口。\n\n1. 创建一个 `docker-compose` 文件：\n    ```bash\n    mkdir mcpjungle\n    cd mcpjungle\n    sudo nano docker-compose.yaml\n    ```\n\n   输入以下内容并保存：\n\n    ```yaml\n    # MCPJungle Docker Compose 配置文件，适用于个人用户。\n    # 如果您希望在本地运行 MCPJungle 来管理您的个人 MCP 和网关，请使用此配置文件。\n    # MCPJungle 服务器以开发模式运行。\n    services:\n      db:\n        image: postgres:latest\n        container_name: mcpjungle-db\n        environment:\n            POSTGRES_USER: mcpjungle\n            POSTGRES_PASSWORD: mcpjungle\n            POSTGRES_DB: mcpjungle\n        ports:\n        - \"5432:5432\"\n        networks:\n        - app-net\n        volumes:\n        - db_data:\u002Fvar\u002Flib\u002Fpostgresql\u002Fdata\n        healthcheck:\n            test: [\"CMD-SHELL\", \"PGPASSWORD=mcpjungle pg_isready -U mcpjungle\"]\n            interval: 10s\n            timeout: 5s\n            retries: 5\n        restart: unless-stopped\n\n      mcpjungle:\n        image: mcpjungle\u002Fmcpjungle:${MCPJUNGLE_IMAGE_TAG:-latest-stdio}\n        container_name: mcpjungle-server\n        environment:\n            DATABASE_URL: postgres:\u002F\u002Fmcpjungle:mcpjungle@db:5432\u002Fmcpjungle\n            SERVER_MODE: ${SERVER_MODE:-development}\n            OTEL_ENABLED: ${OTEL_ENABLED:-false}\n        ports:\n        - \"4141:8080\"\n        networks:\n        - app-net\n        volumes:\n        # 将主机文件系统的当前目录挂载进来，以便 MCP 服务器可以访问文件系统\n        - .:\u002Fhost\u002Fproject:ro\n        - \u002Fhome\u002F\u003Cyour_username>:\u002Fhost:ro\n        # 其他选项：\n        # - ${HOME}:\u002Fhost\u002Fhome:ro\n        # - \u002Ftmp:\u002Fhost\u002Ftmp:rw\n        depends_on:\n            db:\n                condition: service_healthy\n        restart: always\n\n    volumes:\n        db_data:\n\n    networks:\n      app-net:\n        external: true\n    ```\n\n2. 使用 `docker compose up -d` 启动容器。\n\n3. 创建一个工具文件：\n    ```bash\n    sudo nano fetch.json\n    ```\n\n    输入以下内容并保存（与 mcp-proxy 的 `servers.json` 保持一致，`fetch` 通过 `uvx` 运行）：\n    ```json\n    {\n        \"name\": \"fetch\",\n        \"transport\": \"stdio\",\n        \"command\": \"uvx\",\n        \"args\": [\"mcp-server-fetch\"]\n    }\n    ```\n\n4. 
注册工具：\n    ```bash\n    docker exec -i mcpjungle-server \u002Fmcpjungle register -c \u002Fhost\u002Fproject\u002Ffetch.json\n    ```\n\n对每个提到的工具重复步骤 3 和 4。`sequential-thinking` 和 `searxng` 的配置如下所示。\n\n**sequential-thinking**\n```json\n{\n    \"name\": \"sequential-thinking\",\n    \"transport\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\"-y\", \"@modelcontextprotocol\u002Fserver-sequential-thinking\"]\n}\n```\n\n**searxng**\n```json\n{\n    \"name\": \"searxng\",\n    \"transport\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\"-y\", \"mcp-searxng\"],\n    \"env\": {\n        \"SEARXNG_URL\": \"http:\u002F\u002Fsearxng:8080\u002Fsearch?q=\u003Cquery>\"\n    }\n}\n```\n\n### 对比\n\n选择哪项服务完全取决于您：我使用 mcp-proxy，是因为我觉得它的工作流程比 MCPJungle 稍微简单一些。以下是两者的对比及其各自的优势。\n\n**mcp-proxy > MCPJungle**\n\n- 服务器只需添加到 `servers.json` 文件中，容器重启时会自动注册——而 MCPJungle 则需要通过 CLI 手动注册工具。\n- 使用大多数客户端都能接受的标准 MCP 语法进行配置。\n- 占用资源更少，因为它不需要单独启动数据库容器。\n- 使用有状态连接——而 MCPJungle 每次调用工具时都会建立一个新的连接，这可能会导致一定的性能开销。\n\n**MCPJungle > mcp-proxy**\n\n- 将所有工具整合到一个端点下，非常便于集成到聊天前端。\n- 能够通过工具组、访问控制以及选择性启用\u002F禁用工具来创建高度可配置的设置。\n- 支持遥测等企业级功能。\n\n### Open WebUI 集成\n\nOpen WebUI 最近增加了对流式 HTTP 的支持——过去您可能需要使用 [mcpo](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fmcpo)（一个将 MCP 服务器自动转换为 OpenAPI 兼容 HTTP 服务器的工具），而现在您可以直接使用已搭建好的 MCP 服务器，无需任何修改。\n\n#### mcp-proxy\n\n导航至 `Admin Panel > Settings > External Tools`。点击 `+` 按钮添加新工具，并输入以下信息：\n\n- URL：`http:\u002F\u002Fmcp-proxy:\u003Cport>\u002Fservers\u002F\u003Ctool_name>\u002Fmcp`\n- API 密钥：`任意值`\n- ID：`\u003Ctool_name>`\n- 名称：`\u003Ctool_name>`\n\n> 将 `\u003Cport>` 替换为 MCP 服务的端口号，将 `\u003Ctool_name>` 替换为您要添加的具体工具名称。\n\n#### MCPJungle\n\n按照上述步骤操作。由于 MCPJungle 的设计是将所有工具暴露在一个端点下，因此您只需添加一次即可：\n\n- URL：`http:\u002F\u002Fmcpjungle-server:8080\u002Fmcp`\n- API 密钥：`任意值`\n- ID：`\u003Ctool_name>`\n- 名称：`\u003Ctool_name>`\n\n> [!IMPORTANT]\n> 在 Open WebUI 中配置模型（通过 `Admin Panel > Settings > Models > my-cool-model > Advanced Params`）时，需将 `Function Calling` 参数从 `Default` 更改为 `Native`。这一步将使模型能够在单个响应中发起多个工具调用，而不仅仅是一个工具调用。\n\n### VS Code\u002FClaude Desktop 集成\n\n将您的 MCP 代理服务器集成到其他客户端（如 VS Code、Claude Desktop、Zed 等）中的步骤类似，甚至完全相同。\n\n在您的 `mcp.json` 文件中添加以下键值对：\n\n```json\n\"your-mcp-proxy-name\": {\n    \"timeout\": 60,\n    \"type\": \"stdio\",\n    \"command\": \"npx\",\n    \"args\": [\n    \"mcp-remote\",\n    \"http:\u002F\u002F\u003Cyour-server-url>\u002Fmcp\",\n    \"--allow-http\"\n    ]\n}\n```\n\n## 文本转语音服务器\n\n### Kokoro FastAPI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fremsky\u002FKokoro-FastAPI)\n\nKokoro FastAPI 是一款文本转语音服务器，它封装了 [Kokoro-82M](https:\u002F\u002Fhuggingface.co\u002Fhexgrad\u002FKokoro-82M) 模型，并提供了与 OpenAI 兼容的 API 推理接口。这款模型属于最先进的 TTS 模型之一。该项目的文档非常出色，几乎涵盖了全部使用场景。\n\n要安装 Kokoro FastAPI，请执行以下命令：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fremsky\u002FKokoro-FastAPI.git\ncd Kokoro-FastAPI\ndocker compose up --build\n```\n\n该服务器有两种使用方式：API 和 UI。默认情况下，API 在 8880 端口提供服务，UI 则在 7860 端口提供服务。\n\n### Open WebUI 集成\n\n前往 `管理面板 > 设置 > 音频`，并设置以下值：\n\n- 文本转语音引擎：`OpenAI`\n- API 基础 URL：`http:\u002F\u002Fhost.docker.internal:8880\u002Fv1`\n- API 密钥：`anything-you-like`\n- 设置模型：`kokoro`\n- 响应拆分：无（这一点至关重要——Kokoro 使用一种新颖的音频拆分系统）
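\n\n集成之前，也可以先用 `curl` 直接测试 Kokoro 的 OpenAI 兼容接口（一个假设性示例；`voice` 的取值请以 Kokoro FastAPI 文档列出的可用音色为准）：\n\n```bash\n# 生成一段语音并保存为 MP3\ncurl http:\u002F\u002Flocalhost:8880\u002Fv1\u002Faudio\u002Fspeech \\\n  -H \"Content-Type: application\u002Fjson\" \\\n  -d '{\"model\": \"kokoro\", \"input\": \"Hello from my local LLM server!\", \"voice\": \"af_bella\"}' \\\n  --output test.mp3\n```\n\n## 图像生成服务器\n\n### ComfyUI\n\n🌟 [**GitHub**](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI)  \n📖 [**文档**](https:\u002F\u002Fdocs.comfy.org)\n\nComfyUI 是一款流行的开源节点式（graph-based）工具，用于使用图像生成模型（如 Stable Diffusion XL、Stable Diffusion 3 和 Flux 系列模型）生成图像。\n\n- 克隆并进入仓库：\n    ```\n    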
git clone https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI\n    cd ComfyUI\n    ```\n- 设置一个新的虚拟环境：\n    ```\n    python3 -m venv comfyui-env\n    source comfyui-env\u002Fbin\u002Factivate\n    ```\n- 下载平台特定的依赖项：\n  - Nvidia 显卡\n    ```\n    pip install torch torchvision torchaudio --extra-index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu121\n    ```\n  - AMD 显卡\n    ```\n    pip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Frocm6.0\n    ```\n  - Intel 显卡\n  \n    请阅读 ComfyUI 的 GitHub 页面上的安装说明：[ComfyUI 安装指南](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI?tab=readme-ov-file#intel-gpus)。\n    \n- 下载通用依赖项：\n    ```\n    pip install -r requirements.txt\n    ```\n\n现在，我们需要下载并加载一个模型。这里我们将使用 Black Forest Labs 推出的新一代中端模型 FLUX.1 [dev]，它非常适合 RTX 3090 24GB 显存的显卡。为了尽可能简化设置，我们将使用可以直接加载到 ComfyUI 中的完整检查点。如果需要完全自定义的工作流，也可以分别下载 CLIP、VAE 和模型。请按照 ComfyUI 创作者提供的[此指南](https:\u002F\u002Fcomfyanonymous.github.io\u002FComfyUI_examples\u002Fflux\u002F#simple-to-use-fp8-checkpoint-version)，以完全自定义的方式安装 FLUX.1 模型。\n\n> [!NOTE]\n> [FLUX.1 [schnell] HuggingFace](https:\u002F\u002Fhuggingface.co\u002FComfy-Org\u002Fflux1-schnell\u002Fblob\u002Fmain\u002Fflux1-schnell-fp8.safetensors)（较小，适合 \u003C24GB 显存）\n> \n> [FLUX.1 [dev] HuggingFace](https:\u002F\u002Fhuggingface.co\u002FComfy-Org\u002Fflux1-dev\u002Fblob\u002Fmain\u002Fflux1-dev-fp8.safetensors)（较大，适合 24GB 显存）\n\n- 将您所需的模型下载到 ComfyUI 的 `models\u002Fcheckpoints` 目录中。\n\n- 如果您希望 ComfyUI 随系统启动并实际作为服务运行，请将以下行添加到 `init.bash` 文件中：\n    ```\n    cd \u002Fpath\u002Fto\u002Fcomfyui\n    source comfyui-env\u002Fbin\u002Factivate\n    python main.py --listen\n    ```\n    > 请将 `\u002Fpath\u002Fto\u002Fcomfyui` 替换为 ComfyUI 仓库的实际路径（绝对路径，或相对于 `init.bash` 所在目录的路径）。\n\n    否则，如果您只想运行一次，只需在终端窗口中执行上述命令即可。\n\n### Open WebUI 集成\n\n前往 `管理面板 > 设置 > 图像`，并设置以下值：\n\n- 图像生成引擎：`ComfyUI`\n- API 基础 URL：`http:\u002F\u002Fhost.docker.internal:8188`（Open WebUI 运行在容器中，需通过该主机名访问宿主机上的 ComfyUI）\n\n> [!TIP]\n> 要在 Open WebUI 中使用 FLUX.1 [dev]，您要么需要超过 24GB 的显存，要么主要在 CPU 上使用小型语言模型。然而，FLUX.1 [schnell] 和小型语言模型应该可以很好地适应 24GB 显存，如果您打算经常同时使用文本和图像生成功能，这将带来更快的体验。\n\n## SSH\n\n启用 SSH 可让您远程连接到服务器。配置好 SSH 后，您可以使用 PuTTY 或终端等 SSH 客户端，从同一网络中的其他设备连接到服务器。这样，在完成初始设置后，您就可以无需显示器、键盘或鼠标地以无头模式运行您的服务器。\n\n在服务器上：\n- 运行以下命令：\n    ```\n    sudo apt install openssh-server\n    ```\n- 启动 SSH 服务：\n    ```\n    sudo systemctl start ssh\n    ```\n- 设置 SSH 服务在开机时自动启动：\n    ```\n    sudo systemctl enable ssh\n    ```\n- 查找服务器的 IP 地址：\n    ```\n    ip a\n    ```\n    \n在客户端上：\n- 使用 SSH 连接到服务器：\n    ```\n    ssh \u003Cusername>@\u003Cip_address>\n    ```\n    > 请将 `\u003Cusername>` 替换为您的用户名，`\u003Cip_address>` 替换为服务器的 IP 地址。\n\n> [!NOTE]\n> 如果您预计会频繁通过 SSH 访问您的服务器，强烈建议您遵循[此指南](https:\u002F\u002Fwww.raspberrypi.com\u002Fdocumentation\u002Fcomputers\u002Fremote-access.html#configure-ssh-without-a-password)，使用 `ssh-keygen` 和 `ssh-copy-id` 启用无密码 SSH。尽管该指南是为 Raspberry Pi OS 编写的，但它在我使用的 Debian 系统上同样完美运行。\n\n## 防火墙\n\n设置防火墙对于保护您的服务器至关重要。Uncomplicated Firewall (UFW) 是一款简单易用的 Linux 防火墙。您可以使用 UFW 允许或拒绝进出您服务器的流量。\n\n- 安装 UFW：\n    ```bash\n    sudo apt install ufw\n    ```\n\n- 允许来自本地网络的 SSH、HTTP 和 HTTPS 流量：\n    ```bash\n    # 允许 \u003Cip_range> 范围内的所有主机访问 \u003Cport> 端口\n    sudo ufw allow from \u003Cip_range> to any port \u003Cport> proto tcp\n    ```\n    \n    首先运行上述命令，为我们的本地网络开放 22（SSH）、80（HTTP）和 443（HTTPS）端口。由于我们使用 `app-net` Docker 网络来运行容器，因此无需再开放其他端口。请谨慎开放端口，最好仅对特定 IP 或本地网络开放。如果要为特定 IP 开放端口，只需将 IP 范围替换为单个 IP，效果将完全相同。
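\n\n例如，假设您的局域网网段是 `192.168.1.0\u002F24`（请替换为您自己的网段），上面的模板展开后就是：\n\n```bash\n# 为本地网段开放 SSH、HTTP 和 HTTPS（网段仅作示例）\nsudo ufw allow from 192.168.1.0\u002F24 to any port 22 proto tcp\nsudo ufw allow from 192.168.1.0\u002F24 to any port 80 proto tcp\nsudo ufw allow from 192.168.1.0\u002F24 to any port 443 proto tcp\n```\n\n> [!TIP]\n> 您可以通过运行 `ip route show` 来查找本地网络的 IP 地址范围。结果可能如下所示：\n> ```\n> me@my-cool-server:~$ ip route show\n> default 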
via \u003Crouter_ip> dev enp3s0 proto dhcp src \u003Cserver_ip> metric 100\n> \u003Cnetwork_ip_range> dev enp3s0 proto kernel scope link src \u003Cserver_ip> metric 100\n> # 更多路由\n> ```\n\n- 启用 UFW：\n    ```bash\n    sudo ufw enable\n    ```\n\n- 检查 UFW 状态：\n    ```bash\n    sudo ufw status verbose\n    ```\n\n> [!WARNING]\n> 如果在未允许 22 端口访问的情况下启用 UFW，将会中断您现有的 SSH 连接。如果您采用无头设置，这意味着需要将显示器连接到服务器，然后再通过 UFW 允许 SSH 访问。在更改 UFW 配置时，请务必确保已允许该端口访问。\n\n有关设置 UFW 的更多信息，请参阅[此指南](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10)。\n\n## 远程访问\n\n远程访问是指在家庭网络之外访问您的服务器的能力。例如，当您离开家时，将无法再通过 `http:\u002F\u002F\u003Cyour_server_ip>` 访问服务器，因为您的网络环境已从家庭网络切换到了其他网络（可能是移动运营商的网络，也可能是其他地方的本地网络）。这意味着您将无法访问服务器上运行的服务。网络上有许多解决方案可以解决这一问题，我们将在下面探讨其中一些易于使用的方案。\n\n### Tailscale\n\nTailscale 是一种点对点 VPN 服务，它将多种功能整合到一个平台中。最常见的用例是将各种不同类型的设备（Windows、Linux、macOS、iOS、Android 等）连接到同一个虚拟网络。这样一来，这些设备虽然连接在不同的物理网络上，但仍能像处于同一局域网内一样相互通信。Tailscale 并非完全开源（其图形界面为专有），但它基于 [Wireguard](https:\u002F\u002Fwww.wireguard.com) VPN 协议，而服务的核心部分则是开源的。关于该服务的全面文档可以在 [这里](https:\u002F\u002Ftailscale.com\u002Fkb) 找到，其中涵盖了此处未提及的诸多主题——建议仔细阅读以充分利用该服务。\n\n在 Tailscale 中，网络被称为 tailnet。创建和管理 tailnet 需要先注册一个 Tailscale 账户（这是 VPN 服务的常见要求），但实际的连接是点对点的，无需经过 Tailscale 的任何服务器中转。由于这种连接基于 Wireguard 协议，因此您的所有流量都会被 100% 加密，只有 tailnet 上的设备才能解密并查看这些数据。\n\n#### 安装\n\n首先，通过 Tailscale 的管理控制台创建一个 tailnet。然后，在您希望接入该 tailnet 的任何客户端设备上下载 Tailscale 应用程序。对于 Windows、macOS、iOS 和 Android，您可以在各自操作系统的应用商店中找到相应的应用程序。登录后，您的设备就会被添加到该 tailnet 中。\n\n对于 Linux，安装步骤如下：\n\n1) 安装 Tailscale  \n   ```\n   curl -fsSL https:\u002F\u002Ftailscale.com\u002Finstall.sh | sh\n   ```\n\n2) 启动服务  \n   ```\n   sudo tailscale up\n   ```\n\n如果需要使用 SSH，可以运行 `sudo tailscale up --ssh`。\n\n#### 出口节点\n\n出口节点允许您在保持在 tailnet 内的同时访问其他网络。例如，您可以使用此功能让网络中的某台服务器充当其他设备的隧道。这样，您不仅可以访问这台设备（因为它们都在同一个 tailnet 中），还可以访问该服务器所在主机网络上的所有设备。这对于访问网络中非 Tailscale 设备非常有用。\n\n要将某台设备设置为出口节点，可以运行 `sudo tailscale up --advertise-exit-node`。若要允许通过该设备访问本地网络，则需添加 `--exit-node-allow-lan-access` 标志。\n\n#### 本地 DNS\n\n如果您 tailnet 中的某台设备运行着类似 [Pi-hole](https:\u002F\u002Fpi-hole.net) 的 [DNS 沉洞（sinkhole）](https:\u002F\u002Fen.wikipedia.org\u002Fwiki\u002FDNS_sinkhole) 服务，您可能希望其他设备也将其作为自己的 DNS 服务器。假设这台设备名为 `poplar`，那么 tailnet 中的所有设备发出的 DNS 查询都会被发送到 `poplar`，由它根据 Pi-hole 的配置决定是解析还是拒绝该请求。然而，由于 `poplar` 本身也是 tailnet 中的一台设备，它也会按照这条规则把 DNS 查询发送给自己，而不是发送到能够真正解析请求的上游服务器。因此，我们不希望 `poplar` 这样提供 DNS 服务的设备接受 tailnet 的 DNS 设置，而应让它继续遵循自己原本配置的规则。\n\n要拒绝 tailnet 的 DNS 设置，可以在该设备上运行 `sudo tailscale up --accept-dns=false`。\n\n#### 第三方 VPN 集成\n\nTailscale 提供与 [Mullvad VPN](https:\u002F\u002Fmullvad.net\u002Fen) 配合使用的出口节点插件。该插件可以让您体验传统的 VPN 服务，将您的请求路由到位于其他地区的代理服务器，从而有效隐藏您的 IP 地址，并绕过网站服务的地理限制。您可以通过管理控制台为指定设备配置此功能。Mullvad VPN 已经 [证明了其无日志政策](https:\u002F\u002Fmullvad.net\u002Fen\u002Fblog\u002F2023\u002F4\u002F20\u002Fmullvad-vpn-was-subject-to-a-search-warrant-customer-data-not-compromised)，并且无论您选择支付多长时间的费用，每月只需固定支付 5 美元。\n\n要在您的某台设备上使用 Mullvad 出口节点，首先通过运行 `sudo tailscale exit-node list` 查找出您想要使用的出口节点，记下其 IP 地址，然后运行 `sudo tailscale up --exit-node=\u003Cyour_chosen_exit_node_ip>`。\n\n> [!WARNING]  \n> 请确保已在管理控制台中允许该设备使用 Mullvad 插件。\n\n## 更新\n\n定期更新系统有助于保持软件的最佳运行状态，并及时应用最新的安全补丁。Ollama 的更新可以支持新的模型架构推理，而 Open WebUI 的更新则带来了语音通话、函数调用、流水线等新功能。\n\n我将这些“核心功能”组件的更新步骤单独整理成一节，因为这样更便于查阅，而不必在多个子章节中寻找更新说明。\n\n### 一般步骤\n\n通过以下命令升级 Debian 软件包：\n```\nsudo apt update\nsudo apt upgrade\n```\n\n### Nvidia 驱动与 CUDA\n\n请按照 Nvidia 的官方指南 [这里](https:\u002F\u002Fdeveloper.nvidia.com\u002Fcuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian) 安装最新的 CUDA 驱动程序。
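\n\n更新并重启后，可以顺手核对驱动与 CUDA 工具链版本是否同步（假设 CUDA 已加入 PATH，参见参考资料中的相关条目）：\n\n```bash\n# 查看当前加载的驱动版本\nnvidia-smi --query-gpu=driver_version --format=csv,noheader\n# 查看 CUDA 编译器版本\nnvcc --version\n```\n\n> 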
[!WARNING]  \n> 请勿跳过此步骤。在升级 Debian 软件包后未安装最新驱动程序会导致系统各组件不同步，进而引发功能故障。更新时应一次性完成所有重要组件的升级。此外，完成此步骤后重启系统是个好习惯，以确保在更新关键驱动程序后系统能够正常运行。\n\n### Ollama\n\n重新运行安装 Ollama 的命令即可完成更新：\n```\ncurl -fsSL https:\u002F\u002Follama.com\u002Finstall.sh | sh\n```\n\n### llama.cpp\n\n进入 llama.cpp 目录并执行以下命令：\n```\ncd llama.cpp\ngit pull\n# 根据您的配置重新编译——如需 CUDA 支持，请取消注释 `-DGGML_CUDA=ON`\ncmake -B build # -DGGML_CUDA=ON\ncmake --build build --config Release\n```\n\n### vLLM\n\n如果是手动安装，请进入您的虚拟环境并通过 `pip` 更新：\n```\nsource vllm\u002F.venv\u002Fbin\u002Factivate\npip install vllm --upgrade\n```\n\n如果是 Docker 安装，只需重新运行 Docker 命令即可，因为 Docker 会自动拉取 vLLM 的最新镜像。\n\n### llama-swap\n\n首先停止并删除当前容器：\n```bash\ndocker stop llama-swap\ndocker rm llama-swap\n```\n\n然后按照 [llama-swap 章节](#llama-swap) 中的说明重新运行容器命令。\n\n### Open WebUI\n\n要一次性更新 Open WebUI，运行以下命令：\n```\ndocker run --rm --volume \u002Fvar\u002Frun\u002Fdocker.sock:\u002Fvar\u002Frun\u002Fdocker.sock containrrr\u002Fwatchtower --run-once open-webui\n```\n\n若希望自动保持更新，运行以下命令：\n```\ndocker run -d --name watchtower --volume \u002Fvar\u002Frun\u002Fdocker.sock:\u002Fvar\u002Frun\u002Fdocker.sock containrrr\u002Fwatchtower open-webui\n```\n\n### mcp-proxy\u002FMCPJungle\n\n进入相应目录并拉取最新容器镜像：\n```bash\ncd mcp-proxy # 或 mcpjungle\ndocker compose down\ndocker compose pull\ndocker compose up -d\n```\n\n### Kokoro FastAPI\n\n进入相应目录并拉取最新容器镜像：\n```\ncd Kokoro-FastAPI\ndocker compose pull\ndocker compose up -d\n```\n\n### ComfyUI\n\n进入目录，拉取最新更改并更新依赖：\n```\ncd ComfyUI\ngit pull\nsource comfyui-env\u002Fbin\u002Factivate\npip install -r requirements.txt\n```\n\n## 故障排除\n\n### Docker\n\n- 由于我们希望使用普通用户而非 root 用户来运行服务，当尝试挂载并非由当前用户拥有的卷时，可能会遇到权限问题。解决方法有以下两种：\n  - 更改目录的所有权：\n    ```bash\n    sudo chown -R $(id -u):$(id -g) \u002Fpath\u002Fto\u002Fvolume\n    ```\n\n  - 使用访问控制列表（ACL）为当前用户授予权限：\n    ```bash\n    # 授予特定用户的读写权限\n    sudo setfacl -R -m u:$(id -u):rw \u002Fpath\u002Fto\u002Fvolume\n\n    # 授予组的读写权限\n    sudo setfacl -R -m g:$(id -g):rw \u002Fpath\u002Fto\u002Fvolume\n    ```\n\n    如果系统未安装 `acl` 包，请先运行 `sudo apt update && sudo apt install acl` 进行安装。官方包页面可在 [这里](https:\u002F\u002Fpackages.debian.org\u002Fbookworm\u002Facl) 查阅。\n\n    > 请将 `\u002Fpath\u002Fto\u002Fvolume` 替换为实际路径。\n\n    我更倾向于使用 ACL，因为它能在不必要地更改所有权的情况下干净地解决问题，且根据我的经验，出错的概率更低。不过，如果资源明确应归特定用户所有，则出于职责分离的原则，建议直接更改所有权。具体选择可根据实际情况决定，两者都是可行的方案。但请注意，不要在容器已运行时随意更改权限。\n\n### `ssh`\n\n- 如果在使用 `ssh-copy-id` 设置无密码 SSH 时遇到问题，可在执行 `ssh-copy-id` 前先在客户端运行 `ssh-keygen -t rsa`。这会生成 `ssh-copy-id` 所需的 RSA 密钥对，并将其复制到服务器。\n\n### Nvidia 驱动程序\n\n- 如果 Nvidia 驱动程序无法正常工作，可尝试在 BIOS 中禁用安全启动。我曾遇到所有软件包均为最新版本、`nvidia-detect` 能正确识别 GPU，但 `nvidia-smi` 却一直报错“NVIDIA-SMI 已失败，因为它无法与 NVIDIA 驱动程序通信”。最终通过禁用安全启动解决了该问题。更好的做法是自行签名 Nvidia 驱动程序，但考虑到这台服务器并不关键，且可以承受安全启动被禁用的情况，因此选择了后者。\n\n- 如果出现 `docker: Error response from daemon: unknown or invalid runtime name: nvidia.` 错误，很可能是在 Docker 命令中使用了 `--runtime nvidia` 参数。此参数适用于现已弃用的 `nvidia-docker`（详情见 [这里](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F52865988\u002Fnvidia-docker-unknown-runtime-specified-nvidia)）。移除该参数即可解决此问题。\n\n### Ollama\n\n- 如果收到 `could not connect to ollama app, is it running?` 错误，说明 Ollama 实例未能正确启动。这可能是由于手动安装或希望按需运行而非作为服务所致。要一次性启动 Ollama 服务器，运行：\n    ```\n    ollama serve\n    ```\n    然后，在**另一个终端**中，您应该能够通过以下命令正常使用模型：\n    ```\n    ollama run \u003Cmodel>\n    ```\n    如需详细了解如何手动配置 Ollama 以作为服务（即开机自启动），请参阅官方文档 [此处](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Flinux.md)。除非您的系统受到限制而无法使用 Ollama 的自动安装程序，否则通常无需进行此类操作。\n\n- 如果在运行 `systemctl edit 
ollama.service` 后收到 `Failed to open \"\u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\u002F.#override.confb927ee3c846beff8\": Permission denied` 错误，只需创建该文件即可消除问题。具体步骤如下：\n  - 运行：\n    ```\n    sudo mkdir -p \u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\n    sudo nano \u002Fetc\u002Fsystemd\u002Fsystem\u002Follama.service.d\u002Foverride.conf\n    ```\n  - 再次尝试后续步骤。\n\n- 如果仍然无法连接到 API 端点，请检查防火墙设置。[Debian 上 UFW（Uncomplicated Firewall）指南](https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10)是一个不错的参考资源。\n\n### vLLM\n\n- 如果遇到 ```RuntimeError: An error occurred while downloading using `hf_transfer`. Consider disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling.``` 错误，可在 HuggingFace Hub 令牌后的 `--env` 标志中添加 `HF_HUB_ENABLE_HF_TRANSFER=0`。若仍无法解决问题：\n  - 确保当前用户拥有 HuggingFace 写入缓存所需的全部权限。要为当前用户（以及 `huggingface-cli`）授予对 HF 缓存的读写权限，运行：\n    ```\n    sudo chmod 777 ~\u002F.cache\u002Fhuggingface\n    sudo chmod 777 ~\u002F.cache\u002Fhuggingface\u002Fhub\n    ```\n  - 手动通过 HuggingFace CLI 下载模型，并在引擎参数中指定 `--download-dir=~\u002F.cache\u002Fhuggingface\u002Fhub`。如果 `.cache\u002Fhuggingface` 目录存在问题，可在引擎参数中指定其他下载目录，并确保在所有 `huggingface-cli` 命令中同时使用 `--local-dir` 标志。\n\n### Open WebUI\n\n- 如果遇到 `Ollama: llama runner process has terminated: signal: killed` 错误，需检查 `设置 > 常规 > 高级参数` 中的相关设置。对我而言，将上下文长度设置得过高以至于某些模型无法处理，会导致 Ollama 服务器崩溃。请将其恢复为默认值（或适当提高，但务必确保不超过所用模型的限制），即可解决此问题。\n\n## 监控\n\n要监控 GPU 使用率、功耗和温度，可以使用 `nvidia-smi` 命令。要监控 GPU 使用情况，运行：\n```\nwatch -n 1 nvidia-smi\n```\n此命令会每秒更新一次 GPU 使用情况，且不会使终端界面过于杂乱。按 `Ctrl+C` 可退出。\n\n## 备注\n\n这是我首次搭建服务器并接触 Linux，因此部分步骤可能存在更优解。随着学习的深入，我会持续更新本仓库。\n\n### 软件\n\n- 我选择了 Debian，因为它据称是最稳定的 Linux 发行版之一。同时，我也选择了 XFCE 桌面环境，因为它轻量级，而我当时还不太习惯完全使用命令行。\n- 使用普通用户进行自动登录，除非有特殊原因，否则不要以 root 用户登录。\n- 如果需要在不切换用户的前提下切换到 root 用户，可以在命令行中运行 `sudo -i`。\n- 如果某个使用 Docker 容器的服务无法正常工作，可以尝试运行 `docker ps -a` 查看容器是否正在运行。如果未运行，再尝试执行 `docker compose up -d`。如果容器正在运行但仍然有问题，可以尝试运行 `docker restart \u003Ccontainer_id>` 来重启容器。\n- 如果无论怎么操作都无法解决问题，可以尝试重启服务器。这通常是解决许多问题的常见方法。在花费数小时排查之前，不妨先试试这个办法。唉。\n- 虽然需要一些时间来熟悉，但与 Ollama 相比，使用 llama.cpp 和 vLLM 等推理引擎，确实能够最大限度地发挥硬件性能。如果你正在阅读本指南，并且还没有选择直接使用云服务提供商，那么可以合理推断你更倾向于将所有内容本地化部署。因此，通过优化服务器配置，尽量让本地体验接近云服务提供商的效果。\n\n### 硬件\n\n- 在默认设置下，我的 EVGA FTW3 Ultra RTX 3090 显卡功耗为 350W。我将其功耗上限设置为 250W，对于我的使用场景（主要是 VS Code 中的代码补全和聊天问答）来说，性能下降几乎可以忽略不计。\n- 使用功率监测仪，我对服务器的功耗进行了多日测量——平均运行功耗约为 60W。在处理提示和生成 token 时，功耗会短暂飙升至 350W，但这种情况只会持续几秒钟。在其余生成时间内，功耗通常维持在 250W 的限制水平，而在模型停止使用约 20 秒后，功耗又会回落到平均值。\n- 确保电源供应有足够的余量来应对瞬时峰值（尤其是在多 GPU 配置中），否则可能会出现随机关机的情况。显卡的实际功耗可能会超过其额定值，甚至突破你为其设置的软件限值。我通常会按照系统总功耗估算值的 150% 来选择电源。\n\n## 参考资料\n\n将用户添加到 `sudo` 组：\n- https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F168280\u002Fhow-do-i-grant-sudo-privileges-to-an-existing-user\n\n下载 Nvidia 驱动程序：\n- https:\u002F\u002Fdeveloper.nvidia.com\u002Fcuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Debian\n- https:\u002F\u002Fwiki.debian.org\u002FNvidiaGraphicsDrivers\n\n下载 AMD 驱动程序：\n- https:\u002F\u002Fwiki.debian.org\u002FAtiHowTo\n\n安全启动：\n- https:\u002F\u002Faskubuntu.com\u002Fa\u002F927470\n\n监控 GPU 使用情况和功耗：\n- https:\u002F\u002Funix.stackexchange.com\u002Fquestions\u002F38560\u002Fgpu-usage-monitoring-cuda\u002F78203#78203\n\n无密码 `sudo`：\n- https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F25215604\u002Fuse-sudo-without-password-inside-a-script\n- https:\u002F\u002Fwww.reddit.com\u002Fr\u002FFedora\u002Fcomments\u002F11lh9nn\u002Fset_nvidia_gpu_power_and_temp_limit_on_boot\u002F\n- 
https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F100051\u002Fwhy-is-sudoers-nopasswd-option-not-working\n\n自动登录：\n- https:\u002F\u002Fforums.debian.net\u002Fviewtopic.php?t=149849\n- https:\u002F\u002Fwiki.archlinux.org\u002Ftitle\u002FLightDM#Enabling_autologin\n\n将 Ollama 对外暴露到局域网：\n- https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Ffaq.md#setting-environment-variables-on-linux\n- https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fissues\u002F703\n\n防火墙：\n- https:\u002F\u002Fwww.digitalocean.com\u002Fcommunity\u002Ftutorials\u002Fhow-to-set-up-a-firewall-with-ufw-on-debian-10\n\n无密码 `ssh`：\n- https:\u002F\u002Fwww.raspberrypi.com\u002Fdocumentation\u002Fcomputers\u002Fremote-access.html#configure-ssh-without-a-password\n\n将 CUDA 添加到 PATH：\n- https:\u002F\u002Faskubuntu.com\u002Fquestions\u002F885610\u002Fnvcc-version-command-says-nvcc-is-not-installed\n\n文档：\n\n- [Debian](https:\u002F\u002Fwww.debian.org\u002Freleases\u002Fbuster\u002Famd64\u002F)\n- [Docker](https:\u002F\u002Fdocs.docker.com\u002Fengine\u002Finstall\u002Fdebian\u002F)\n- [Ollama](https:\u002F\u002Fgithub.com\u002Follama\u002Follama\u002Fblob\u002Fmain\u002Fdocs\u002Fapi.md)\n- [vLLM](https:\u002F\u002Fdocs.vllm.ai\u002Fen\u002Fstable\u002Findex.html)\n- [Open WebUI](https:\u002F\u002Fgithub.com\u002Fopen-webui\u002Fopen-webui)\n- [ComfyUI](https:\u002F\u002Fgithub.com\u002Fcomfyanonymous\u002FComfyUI)\n\n## 致谢\n\n向开源社区的所有杰出贡献者致以诚挚的敬意。如果没有这些项目及参考指南的众多贡献者们的努力，本指南便不会存在。若想及时了解机器学习、大语言模型以及其他视觉\u002F语音模型领域的最新进展，请关注 [r\u002FLocalLLaMA](https:\u002F\u002Fwww.reddit.com\u002Fr\u002FLocalLLaMA\u002F)。如果你想进一步探索或深入研究适合新服务器的自托管应用，不妨访问 [r\u002Fselfhosted](https:\u002F\u002Fwww.reddit.com\u002Fr\u002Fselfhosted)。\n\n> [!NOTE]\n> 如果你发现某些项目非常有用，请为它们点个赞；如果你有能力，也可以考虑为它们做出贡献。如果你觉得本指南对你有所帮助，也欢迎为它点个赞，这样可以帮助更多人找到它。","# llm-server-docs 快速上手指南\n\n本指南旨在帮助开发者在 Debian 系统上快速搭建一个本地化、完全私密的大语言模型（LLM）服务器。该方案集成了聊天、联网搜索、RAG（检索增强生成）、模型管理、MCP 服务、图像生成及语音合成等功能。\n\n## 环境准备\n\n### 系统要求\n- **操作系统**：Debian Linux（推荐最新稳定版）。其他 Linux 发行版步骤类似，但本指南基于 Debian 编写。\n- **硬件配置**：\n    - **CPU**：现代多核处理器（示例：Intel Core i5-12600KF 或同等 AMD 产品）。\n    - **内存**：建议 32GB 以上（示例配置为 96GB）。\n    - **存储**：建议 NVMe SSD，容量视模型大小而定（示例配置为 1TB）。\n    - **GPU**：\n        - **Nvidia**：支持 CUDA 的显卡（示例：2x RTX 3090）。需安装专有驱动。\n        - **AMD**：较新版本的 Ollama 已原生支持 AMD GPU，无需额外复杂配置，但无法设置功耗限制。\n        - **纯 CPU**：可跳过显卡驱动安装步骤，仅推理速度较慢。\n- **外设**：显示器、键盘、鼠标（用于初始安装），后续可通过 SSH 远程管理。\n\n### 前置依赖\n- 新鲜的 Debian 安装实例。\n- 稳定的互联网连接。\n- 基础的 Linux 终端操作知识。\n- **桌面环境**（可选）：推荐安装轻量级 XFCE，或使用功能更丰富的 GNOME\u002FKDE。\n\n## 安装步骤\n\n### 1. 基础系统配置\n\n**授予 sudo 权限**\n切换到 root 用户并将当前用户加入 sudo 组：\n```bash\nsu root\nsudo usermod -a -G sudo \u003Cusername>\n```\n*注：将 `\u003Cusername>` 替换为你的实际用户名。执行后需重新登录终端生效。*\n\n**更新系统包**\n```bash\nsudo apt update\nsudo apt upgrade\n```\n\n**安装通用依赖**\n```bash\nsudo apt install libcurl cmake\n```\n\n### 2. 显卡驱动安装\n\n**Nvidia GPU 用户**\n参考 Nvidia 官方指南下载 CUDA Toolkit，然后执行：\n```bash\nsudo apt install linux-headers-amd64\nsudo apt install nvidia-driver firmware-misc-nonfree\n```\n重启系统并验证安装：\n```bash\nreboot\nnvidia-smi\n```\n\n**AMD GPU 用户**\n编辑 `\u002Fetc\u002Fapt\u002Fsources.list` 添加非自由固件源（如尚未添加），然后安装驱动：\n```bash\napt install firmware-amd-graphics libgl1-mesa-dri libglx-mesa0 mesa-vulkan-drivers xserver-xorg-video-all\nreboot\n```\n*注：AMD 用户请跳过后续涉及 `nvidia-smi` 或功耗限制的步骤。*\n\n### 3. 
### 3. Docker Environment Setup

**Add your user to the docker group**
```bash
sudo usermod -aG docker $USER
newgrp docker
```

**Install the Nvidia Container Toolkit (Nvidia users only)**
This is what lets Docker containers use the GPU. For the exact install commands, see Nvidia's official documentation or the "Nvidia Container Toolkit" section of the project's full docs.

**Create a dedicated network**
```bash
docker network create llm-net
```

### 4. Core Component Deployment

The project uses a modular architecture with the following main components (the actual launch commands typically live in the project's `init.bash` or Docker Compose files):

- **Inference engine**: Ollama, llama.cpp, or vLLM.
- **Chat interface**: Open WebUI.
- **Search engine**: SearXNG.
- **Model scheduling**: llama-swap (run as a systemd service).
- **Other services**: Kokoro FastAPI (TTS), ComfyUI (image generation), MCP Proxy.

**Set up a boot script**
Create `init.bash` to set GPU power limits (for efficiency) and start the services:
```bash
touch init.bash
nano init.bash
```
Fill in the script (see the full version in the original repository; a minimal sketch follows this section). The core logic:
1. Cap GPU power (e.g., roughly 30% lower in exchange for a negligible performance loss).
2. Start Ollama or your other backend services.

Make the script executable (`chmod +x init.bash`) and hook it up to a systemd unit (or rc.local) so it runs at boot; a sketch of such a unit also follows.
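A minimal sketch of what `init.bash` might contain, not the repository's full script; the power value, GPU index, service name, and compose path are placeholders to adapt:

```bash
#!/bin/bash
# init.bash: minimal sketch. Cap GPU power, then bring up services.
# 250W, GPU index 0, and the paths below are assumptions; adjust to your build.

# Cap GPU 0 at 250W (skip this on AMD or CPU-only builds)
nvidia-smi -pm 1
nvidia-smi -i 0 -pl 250

# Start the inference backend and the Dockerized services
systemctl start ollama
docker compose -f /home/<username>/llm/docker-compose.yaml up -d
```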
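And one way to run it at boot: a sketch of a systemd oneshot unit (paths are assumptions). Enable it with `sudo systemctl enable init-llm.service`:

```text
# /etc/systemd/system/init-llm.service
[Unit]
Description=Set GPU power limits and start LLM services at boot
After=network-online.target docker.service

[Service]
Type=oneshot
ExecStart=/home/<username>/init.bash

[Install]
WantedBy=multi-user.target
```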
### 5. Security and Remote Access

**Configure the firewall**
Use `ufw` to open only the ports you need (e.g., SSH on 22, the WebUI on 8080):
```bash
sudo ufw allow 22/tcp
sudo ufw allow 8080/tcp
sudo ufw enable
```

**Set up Tailscale (recommended)**
For secure remote access without exposing ports to the public internet:
```bash
curl -fsSL https://tailscale.com/install.sh | sh
sudo tailscale up
```
Once enabled, you can reach Open WebUI and the other services directly via the server's Tailscale IP.

## Basic Usage

Once deployed, the primary way to use the server is the **Open WebUI** interface in a browser.

1. **Open the interface**
   Enter the server address (LAN or Tailscale IP) and port (8080 by default):
   ```text
   http://<server-ip>:8080
   ```

2. **Create an account**
   On first visit, create an admin account.

3. **Connect a model**
   - Go to Settings -> **Connections**.
   - Make sure the address for Ollama (or your other inference engine) is correct (typically `http://host.docker.internal:11434` or a LAN IP).
   - Click "Save".

4. **Start chatting**
   - Pick a downloaded model in the top-right of the main view (if none are downloaded yet, run `ollama pull <model-name>` in a terminal, or use the WebUI's model management).
   - Type a question to start chatting.

5. **Enable the advanced features**
   - **Web search**: turn on "Web Search" in the chat settings; queries are routed through SearXNG for up-to-date information.
   - **Document Q&A (RAG)**: upload PDF/TXT documents directly in the chat box; they are indexed automatically for question answering.
   - **Voice / images**: enable the TTS (Kokoro) and Image Generation (ComfyUI) integrations in settings to get spoken replies and generated images in chat.

That's it: you now have a fully featured, fully private local AI server.

# Example Use Case

A financial data analyst needs a fully private, locally deployed AI system to process sensitive earnings data and produce analysis reports with spoken summaries, while still being able to search the web for current market news.

### Without llm-server-docs
- **Integration nightmare**: Ollama, Open WebUI, SearXNG, ComfyUI, and the rest each need manual configuration; container networking and dependency conflicts pile up, and days later the full pipeline still isn't working.
- **Privacy risk**: without standardized firewall and SSH hardening guidance, remote access tends to expose ports, putting sensitive financial data at risk of leaking.
- **Fragmented features**: text generation, image generation, TTS, and web search live in separate interfaces and can't cooperate in a single chat window.
- **Hard to maintain**: without systematic model management (e.g., llama-swap) and update scripts, switching between models of different sizes or upgrading drivers easily breaks the services.

### With llm-server-docs
- **Full-stack deployment from one document**: the guide stands up the complete stack on Debian (inference engine, search, RAG, and multimodal generation), with all services wired together on a predefined network.
- **Solid security posture**: applying the documented Tailscale networking, firewall rules, and SSH configuration gives secure remote access without a public IP, so data never leaves the private network.
- **One unified interface**: from a single Open WebUI session, a local model can analyze an earnings report, pull live prices via SearXNG, render a trend chart through ComfyUI, and have Kokoro read out the conclusions.
- **Flexible resource management**: the integrated llama-swap and systemd service management make it easy to swap models to trade speed against accuracy, with a clear path for version upgrades.

llm-server-docs turns what used to be a sprawling, fiddly private-AI infrastructure project into a reusable, standardized process, letting developers focus on their use case instead of environment debugging.

# FAQ

Answers adapted from the project's issue tracker.

**How do I install Ollama manually and start it without it launching at boot?**

For air-gapped servers that can't run the automated install script, download the `ollama` binary manually and transfer it with `scp` (e.g., into `/usr/bin`). After installing, if you hit a `could not connect to ollama app` error, start the service by hand: run `ollama serve` in one terminal, then `ollama run <model-name>` in another. If you don't want it starting at boot, simply skip the systemd service and leave it out of `init.bash`; start it manually as needed. ([source](https://github.com/varunvasudeva1/llm-server-docs/issues/2))

**After Debian wakes from suspend, Ollama can't use the GPU (CUDA). What can I do?**

This is a known interaction between Linux suspend and the NVIDIA driver: suspending can unload the driver or switch to the integrated GPU (Optimus), leaving Ollama CPU-only after wake. There is no perfect automatic fix; options include: 1. disable suspend in the BIOS or OS and run the server 24/7; 2. on dual-graphics systems (iGPU + dGPU), configure the system to use only the NVIDIA GPU (see the Debian Wiki on single-GPU mode for NVIDIA Optimus); 3. avoid letting the system suspend at all. If the problem persists, a reboot may be required to reload the GPU driver. ([source](https://github.com/varunvasudeva1/llm-server-docs/issues/1))

**What license do the code and docs use? Can I use and modify them freely?**

MIT. You may copy, modify, distribute, and use the code and documentation freely, for personal or commercial purposes, as long as you keep the original license notice. ([source](https://github.com/varunvasudeva1/llm-server-docs/issues/4))

**What GPU hardware and driver versions are recommended for running Ollama?**

A recent NVIDIA architecture (e.g., Ampere: an RTX 3090 or 3060) is recommended. Older architectures (Pascal, Maxwell) may be unsupported or poorly optimized on recent drivers (e.g., the 590 series) and CUDA versions. On Debian, the 535 or 550 driver series generally works, but for the best llama.cpp performance, prefer newer hardware with a matching driver. Install via NVIDIA's official repository or Debian's non-free firmware component. ([source](https://github.com/varunvasudeva1/llm-server-docs/issues/8))
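Related to the last answer: a quick way to check which card and driver you're actually running before deciding whether to upgrade, using `nvidia-smi`'s standard query flags:

```
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv
```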