[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-coleam00--mcp-crawl4ai-rag":3,"tool-coleam00--mcp-crawl4ai-rag":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",157379,2,"2026-04-15T23:32:42",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":78,"owner_url":79,"languages":80,"stars":93,"forks":94,"last_commit_at":95,"license":96,"difficulty_score":97,"env_os":98,"env_gpu":99,"env_ram":100,"env_deps":101,"category_tags":112,"github_topics":77,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":113,"updated_at":114,"faqs":115,"releases":146},7963,"coleam00\u002Fmcp-crawl4ai-rag","mcp-crawl4ai-rag","Web Crawling and RAG Capabilities for AI Agents and AI Coding Assistants","mcp-crawl4ai-rag 是一款专为 AI 智能体和编程助手打造的开源工具，旨在赋予它们强大的网页抓取与检索增强生成（RAG）能力。它基于模型上下文协议（MCP），无缝集成了 Crawl4AI 爬虫引擎与 Supabase 向量数据库，让 AI 能够自动“阅读”互联网内容并将其转化为可查询的知识库，从而有效解决大模型因训练数据截止或缺乏特定领域信息而产生的知识滞后与幻觉问题。\n\n这款工具特别适合开发者、AI 研究人员以及希望构建定制化 AI 代理的技术团队使用。通过 mcp-crawl4ai-rag，用户可以轻松让 AI 递归抓取网站、智能分块处理内容，并利用这些实时数据进行精准回答。其技术亮点在于支持多种高级 RAG 
策略，包括混合搜索（结合向量与关键词）、上下文嵌入增强、结果重排序以及基于知识图谱的幻觉检测机制。此外，项目规划了完善的本地化部署路径，未来将支持 Ollama 等本地模型，确保数据隐私与完全可控。作为一个正在快速迭代的知识引擎雏形，mcp-crawl4ai-rag 为构建具备实时学习能力的下一代 AI 应用提供了坚实底座。","\u003Ch1 align=\"center\">Crawl4AI RAG MCP Server\u003C\u002Fh1>\n\n\u003Cp align=\"center\">\n  \u003Cem>Web Crawling and RAG Capabilities for AI Agents and AI Coding Assistants\u003C\u002Fem>\n\u003C\u002Fp>\n\nA powerful implementation of the [Model Context Protocol (MCP)](https:\u002F\u002Fmodelcontextprotocol.io) integrated with [Crawl4AI](https:\u002F\u002Fcrawl4ai.com) and [Supabase](https:\u002F\u002Fsupabase.com\u002F) for providing AI agents and AI coding assistants with advanced web crawling and RAG capabilities.\n\nWith this MCP server, you can \u003Cb>scrape anything\u003C\u002Fb> and then \u003Cb>use that knowledge anywhere\u003C\u002Fb> for RAG.\n\nThe primary goal is to bring this MCP server into [Archon](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002FArchon) as I evolve it to be more of a knowledge engine for AI coding assistants to build AI agents. This first version of the Crawl4AI\u002FRAG MCP server will be improved upon greatly soon, especially making it more configurable so you can use different embedding models and run everything locally with Ollama.\n\nConsider this GitHub repository a testbed, which is why I haven't been actively addressing issues and pull requests yet. I certainly will though as I bring this into Archon V2!\n\n## Overview\n\nThis MCP server provides tools that enable AI agents to crawl websites, store content in a vector database (Supabase), and perform RAG over the crawled content. 
It follows the best practices for building MCP servers based on the [Mem0 MCP server template](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-mem0\u002F) I provided on my channel previously.\n\nThe server includes several advanced RAG strategies that can be enabled to enhance retrieval quality:\n- **Contextual Embeddings** for enriched semantic understanding\n- **Hybrid Search** combining vector and keyword search\n- **Agentic RAG** for specialized code example extraction\n- **Reranking** for improved result relevance using cross-encoder models\n- **Knowledge Graph** for AI hallucination detection and repository code analysis\n\nSee the [Configuration section](#configuration) below for details on how to enable and configure these strategies.\n\n## Vision\n\nThe Crawl4AI RAG MCP server is just the beginning. Here's where we're headed:\n\n1. **Integration with Archon**: Building this system directly into [Archon](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002FArchon) to create a comprehensive knowledge engine for AI coding assistants to build better AI agents.\n\n2. **Multiple Embedding Models**: Expanding beyond OpenAI to support a variety of embedding models, including the ability to run everything locally with Ollama for complete control and privacy.\n\n3. **Advanced RAG Strategies**: Implementing sophisticated retrieval techniques like contextual retrieval, late chunking, and others to move beyond basic \"naive lookups\" and significantly enhance the power and precision of the RAG system, especially as it integrates with Archon.\n\n4. **Enhanced Chunking Strategy**: Implementing a Context 7-inspired chunking approach that focuses on examples and creates distinct, semantically meaningful sections for each chunk, improving retrieval precision.\n\n5. 
**Performance Optimization**: Increasing crawling and indexing speed to make it more realistic to \"quickly\" index new documentation to then leverage it within the same prompt in an AI coding assistant.\n\n## Features\n\n- **Smart URL Detection**: Automatically detects and handles different URL types (regular webpages, sitemaps, text files)\n- **Recursive Crawling**: Follows internal links to discover content\n- **Parallel Processing**: Efficiently crawls multiple pages simultaneously\n- **Content Chunking**: Intelligently splits content by headers and size for better processing\n- **Vector Search**: Performs RAG over crawled content, optionally filtering by data source for precision\n- **Source Retrieval**: Retrieve sources available for filtering to guide the RAG process\n\n## Tools\n\nThe server provides essential web crawling and search tools:\n\n### Core Tools (Always Available)\n\n1. **`crawl_single_page`**: Quickly crawl a single web page and store its content in the vector database\n2. **`smart_crawl_url`**: Intelligently crawl a full website based on the type of URL provided (sitemap, llms-full.txt, or a regular webpage that needs to be crawled recursively)\n3. **`get_available_sources`**: Get a list of all available sources (domains) in the database\n4. **`perform_rag_query`**: Search for relevant content using semantic search with optional source filtering\n\n### Conditional Tools\n\n5. **`search_code_examples`** (requires `USE_AGENTIC_RAG=true`): Search specifically for code examples and their summaries from crawled documentation. This tool provides targeted code snippet retrieval for AI coding assistants.\n\n### Knowledge Graph Tools (requires `USE_KNOWLEDGE_GRAPH=true`, see below)\n\n6. **`parse_github_repository`**: Parse a GitHub repository into a Neo4j knowledge graph, extracting classes, methods, functions, and their relationships for hallucination detection\n7. 
**`check_ai_script_hallucinations`**: Analyze Python scripts for AI hallucinations by validating imports, method calls, and class usage against the knowledge graph\n8. **`query_knowledge_graph`**: Explore and query the Neo4j knowledge graph with commands like `repos`, `classes`, `methods`, and custom Cypher queries\n\n## Prerequisites\n\n- [Docker\u002FDocker Desktop](https:\u002F\u002Fwww.docker.com\u002Fproducts\u002Fdocker-desktop\u002F) if running the MCP server as a container (recommended)\n- [Python 3.12+](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002F) if running the MCP server directly through uv\n- [Supabase](https:\u002F\u002Fsupabase.com\u002F) (database for RAG)\n- [OpenAI API key](https:\u002F\u002Fplatform.openai.com\u002Fapi-keys) (for generating embeddings)\n- [Neo4j](https:\u002F\u002Fneo4j.com\u002F) (optional, for knowledge graph functionality) - see [Knowledge Graph Setup](#knowledge-graph-setup) section\n\n## Installation\n\n### Using Docker (Recommended)\n\n1. Clone this repository:\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n   cd mcp-crawl4ai-rag\n   ```\n\n2. Build the Docker image:\n   ```bash\n   docker build -t mcp\u002Fcrawl4ai-rag --build-arg PORT=8051 .\n   ```\n\n3. Create a `.env` file based on the configuration section below\n\n### Using uv directly (no Docker)\n\n1. Clone this repository:\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n   cd mcp-crawl4ai-rag\n   ```\n\n2. Install uv if you don't have it:\n   ```bash\n   pip install uv\n   ```\n\n3. Create and activate a virtual environment:\n   ```bash\n   uv venv\n   .venv\\Scripts\\activate\n   # on Mac\u002FLinux: source .venv\u002Fbin\u002Factivate\n   ```\n\n4. Install dependencies:\n   ```bash\n   uv pip install -e .\n   crawl4ai-setup\n   ```\n\n5. 
Create a `.env` file based on the configuration section below\n\n## Database Setup\n\nBefore running the server, you need to set up the database with the pgvector extension:\n\n1. Go to the SQL Editor in your Supabase dashboard (create a new project first if necessary)\n\n2. Create a new query and paste the contents of `crawled_pages.sql`\n\n3. Run the query to create the necessary tables and functions\n\n## Knowledge Graph Setup (Optional)\n\nTo enable AI hallucination detection and repository analysis features, you need to set up Neo4j.\n\nAlso, the knowledge graph implementation isn't fully compatible with Docker yet, so I would recommend right now running directly through uv if you want to use the hallucination detection within the MCP server!\n\nFor installing Neo4j:\n\n### Local AI Package (Recommended)\n\nThe easiest way to get Neo4j running locally is with the [Local AI Package](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Flocal-ai-packaged) - a curated collection of local AI services including Neo4j:\n\n1. **Clone the Local AI Package**:\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Flocal-ai-packaged.git\n   cd local-ai-packaged\n   ```\n\n2. **Start Neo4j**:\n   Follow the instructions in the Local AI Package repository to start Neo4j with Docker Compose\n\n3. **Default connection details**:\n   - URI: `bolt:\u002F\u002Flocalhost:7687`\n   - Username: `neo4j`\n   - Password: Check the Local AI Package documentation for the default password\n\n### Manual Neo4j Installation\n\nAlternatively, install Neo4j directly:\n\n1. **Install Neo4j Desktop**: Download from [neo4j.com\u002Fdownload](https:\u002F\u002Fneo4j.com\u002Fdownload\u002F)\n\n2. **Create a new database**:\n   - Open Neo4j Desktop\n   - Create a new project and database\n   - Set a password for the `neo4j` user\n   - Start the database\n\n3. 
**Note your connection details**:\n   - URI: `bolt:\u002F\u002Flocalhost:7687` (default)\n   - Username: `neo4j` (default)\n   - Password: Whatever you set during creation\n\n## Configuration\n\nCreate a `.env` file in the project root with the following variables:\n\n```\n# MCP Server Configuration\nHOST=0.0.0.0\nPORT=8051\nTRANSPORT=sse\n\n# OpenAI API Configuration\nOPENAI_API_KEY=your_openai_api_key\n\n# LLM for summaries and contextual embeddings\nMODEL_CHOICE=gpt-4.1-nano\n\n# RAG Strategies (set to \"true\" or \"false\", default to \"false\")\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=false\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=false\nUSE_KNOWLEDGE_GRAPH=false\n\n# Supabase Configuration\nSUPABASE_URL=your_supabase_project_url\nSUPABASE_SERVICE_KEY=your_supabase_service_key\n\n# Neo4j Configuration (required for knowledge graph functionality)\nNEO4J_URI=bolt:\u002F\u002Flocalhost:7687\nNEO4J_USER=neo4j\nNEO4J_PASSWORD=your_neo4j_password\n```\n\n### RAG Strategy Options\n\nThe Crawl4AI RAG MCP server supports five powerful RAG strategies that can be enabled independently:\n\n#### 1. **USE_CONTEXTUAL_EMBEDDINGS**\nWhen enabled, this strategy enhances each chunk's embedding with additional context from the entire document. The system passes both the full document and the specific chunk to an LLM (configured via `MODEL_CHOICE`) to generate enriched context that gets embedded alongside the chunk content.\n\n- **When to use**: Enable this when you need high-precision retrieval where context matters, such as technical documentation where terms might have different meanings in different sections.\n- **Trade-offs**: Slower indexing due to LLM calls for each chunk, but significantly better retrieval accuracy.\n- **Cost**: Additional LLM API calls during indexing.\n\n#### 2. **USE_HYBRID_SEARCH**\nCombines traditional keyword search with semantic vector search to provide more comprehensive results. 
The system performs both searches in parallel and intelligently merges results, prioritizing documents that appear in both result sets.\n\n- **When to use**: Enable this when users might search using specific technical terms, function names, or when exact keyword matches are important alongside semantic understanding.\n- **Trade-offs**: Slightly slower search queries but more robust results, especially for technical content.\n- **Cost**: No additional API costs, just computational overhead.\n\n#### 3. **USE_AGENTIC_RAG**\nEnables specialized code example extraction and storage. When crawling documentation, the system identifies code blocks (≥300 characters), extracts them with surrounding context, generates summaries, and stores them in a separate vector database table specifically designed for code search.\n\n- **When to use**: Essential for AI coding assistants that need to find specific code examples, implementation patterns, or usage examples from documentation.\n- **Trade-offs**: Significantly slower crawling due to code extraction and summarization, requires more storage space.\n- **Cost**: Additional LLM API calls for summarizing each code example.\n- **Benefits**: Provides a dedicated `search_code_examples` tool that AI agents can use to find specific code implementations.\n\n#### 4. **USE_RERANKING**\nApplies cross-encoder reranking to search results after initial retrieval. Uses a lightweight cross-encoder model (`cross-encoder\u002Fms-marco-MiniLM-L-6-v2`) to score each result against the original query, then reorders results by relevance.\n\n- **When to use**: Enable this when search precision is critical and you need the most relevant results at the top. 
Particularly useful for complex queries where semantic similarity alone might not capture query intent.\n- **Trade-offs**: Adds ~100-200ms to search queries depending on result count, but significantly improves result ordering.\n- **Cost**: No additional API costs - uses a local model that runs on CPU.\n- **Benefits**: Better result relevance, especially for complex queries. Works with both regular RAG search and code example search.\n\n#### 5. **USE_KNOWLEDGE_GRAPH**\nEnables AI hallucination detection and repository analysis using Neo4j knowledge graphs. When enabled, the system can parse GitHub repositories into a graph database and validate AI-generated code against real repository structures. (Note: this is not yet fully compatible with Docker; I'd recommend running the server directly through uv.)\n\n- **When to use**: Enable this for AI coding assistants that need to validate generated code against real implementations, or when you want to detect when AI models hallucinate non-existent methods, classes, or incorrect usage patterns.\n- **Trade-offs**: Requires Neo4j setup and additional dependencies. 
Repository parsing can be slow for large codebases, and validation requires repositories to be pre-indexed.\n- **Cost**: No additional API costs for validation, but requires Neo4j infrastructure (can use free local installation or cloud AuraDB).\n- **Benefits**: Provides three powerful tools: `parse_github_repository` for indexing codebases, `check_ai_script_hallucinations` for validating AI-generated code, and `query_knowledge_graph` for exploring indexed repositories.\n\nYou can now tell the AI coding assistant to add a Python GitHub repository to the knowledge graph like:\n\n\"Add https:\u002F\u002Fgithub.com\u002Fpydantic\u002Fpydantic-ai.git to the knowledge graph\"\n\nMake sure the repo URL ends with .git.\n\nYou can also have the AI coding assistant check for hallucinations with scripts it just created, or you can manually run the command:\n\n```\npython knowledge_graphs\u002Fai_hallucination_detector.py [full path to your script to analyze]\n```\n\n### Recommended Configurations\n\n**For general documentation RAG:**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=true\n```\n\n**For AI coding assistant with code examples:**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=true\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=true\nUSE_RERANKING=true\nUSE_KNOWLEDGE_GRAPH=false\n```\n\n**For AI coding assistant with hallucination detection:**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=true\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=true\nUSE_RERANKING=true\nUSE_KNOWLEDGE_GRAPH=true\n```\n\n**For fast, basic RAG:**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=false\nUSE_KNOWLEDGE_GRAPH=false\n```\n\n## Running the Server\n\n### Using Docker\n\n```bash\ndocker run --env-file .env -p 8051:8051 mcp\u002Fcrawl4ai-rag\n```\n\n### Using Python\n\n```bash\nuv run src\u002Fcrawl4ai_mcp.py\n```\n\nThe server will start and listen on the configured host and port.\n\n## Integration with MCP 
Clients\n\n### SSE Configuration\n\nOnce you have the server running with SSE transport, you can connect to it using this configuration:\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"transport\": \"sse\",\n      \"url\": \"http:\u002F\u002Flocalhost:8051\u002Fsse\"\n    }\n  }\n}\n```\n\n> **Note for Windsurf users**: Use `serverUrl` instead of `url` in your configuration:\n> ```json\n> {\n>   \"mcpServers\": {\n>     \"crawl4ai-rag\": {\n>       \"transport\": \"sse\",\n>       \"serverUrl\": \"http:\u002F\u002Flocalhost:8051\u002Fsse\"\n>     }\n>   }\n> }\n> ```\n>\n> **Note for Docker users**: Use `host.docker.internal` instead of `localhost` if your client is running in a different container. This will apply if you are using this MCP server within n8n!\n\n> **Note for Claude Code users**: \n```\nclaude mcp add-json crawl4ai-rag '{\"type\":\"http\",\"url\":\"http:\u002F\u002Flocalhost:8051\u002Fsse\"}' --scope user\n```\n\n### Stdio Configuration\n\nAdd this server to your MCP configuration for Claude Desktop, Windsurf, or any other MCP client:\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"command\": \"python\",\n      \"args\": [\"path\u002Fto\u002Fcrawl4ai-mcp\u002Fsrc\u002Fcrawl4ai_mcp.py\"],\n      \"env\": {\n        \"TRANSPORT\": \"stdio\",\n        \"OPENAI_API_KEY\": \"your_openai_api_key\",\n        \"SUPABASE_URL\": \"your_supabase_url\",\n        \"SUPABASE_SERVICE_KEY\": \"your_supabase_service_key\",\n        \"USE_KNOWLEDGE_GRAPH\": \"false\",\n        \"NEO4J_URI\": \"bolt:\u002F\u002Flocalhost:7687\",\n        \"NEO4J_USER\": \"neo4j\",\n        \"NEO4J_PASSWORD\": \"your_neo4j_password\"\n      }\n    }\n  }\n}\n```\n\n### Docker with Stdio Configuration\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"command\": \"docker\",\n      \"args\": [\"run\", \"--rm\", \"-i\", \n               \"-e\", \"TRANSPORT\", \n               \"-e\", \"OPENAI_API_KEY\", \n               \"-e\", 
\"SUPABASE_URL\", \n               \"-e\", \"SUPABASE_SERVICE_KEY\",\n               \"-e\", \"USE_KNOWLEDGE_GRAPH\",\n               \"-e\", \"NEO4J_URI\",\n               \"-e\", \"NEO4J_USER\",\n               \"-e\", \"NEO4J_PASSWORD\",\n               \"mcp\u002Fcrawl4ai\"],\n      \"env\": {\n        \"TRANSPORT\": \"stdio\",\n        \"OPENAI_API_KEY\": \"your_openai_api_key\",\n        \"SUPABASE_URL\": \"your_supabase_url\",\n        \"SUPABASE_SERVICE_KEY\": \"your_supabase_service_key\",\n        \"USE_KNOWLEDGE_GRAPH\": \"false\",\n        \"NEO4J_URI\": \"bolt:\u002F\u002Flocalhost:7687\",\n        \"NEO4J_USER\": \"neo4j\",\n        \"NEO4J_PASSWORD\": \"your_neo4j_password\"\n      }\n    }\n  }\n}\n```\n\n## Knowledge Graph Architecture\n\nThe knowledge graph system stores repository code structure in Neo4j with the following components:\n\n### Core Components (`knowledge_graphs\u002F` folder):\n\n- **`parse_repo_into_neo4j.py`**: Clones and analyzes GitHub repositories, extracting Python classes, methods, functions, and imports into Neo4j nodes and relationships\n- **`ai_script_analyzer.py`**: Parses Python scripts using AST to extract imports, class instantiations, method calls, and function usage\n- **`knowledge_graph_validator.py`**: Validates AI-generated code against the knowledge graph to detect hallucinations (non-existent methods, incorrect parameters, etc.)\n- **`hallucination_reporter.py`**: Generates comprehensive reports about detected hallucinations with confidence scores and recommendations\n- **`query_knowledge_graph.py`**: Interactive CLI tool for exploring the knowledge graph (functionality now integrated into MCP tools)\n\n### Knowledge Graph Schema:\n\nThe Neo4j database stores code structure as:\n\n**Nodes:**\n- `Repository`: GitHub repositories\n- `File`: Python files within repositories  \n- `Class`: Python classes with methods and attributes\n- `Method`: Class methods with parameter information\n- `Function`: Standalone 
functions\n- `Attribute`: Class attributes\n\n**Relationships:**\n- `Repository` -[:CONTAINS]-> `File`\n- `File` -[:DEFINES]-> `Class`\n- `File` -[:DEFINES]-> `Function`\n- `Class` -[:HAS_METHOD]-> `Method`\n- `Class` -[:HAS_ATTRIBUTE]-> `Attribute`\n\n### Workflow:\n\n1. **Repository Parsing**: Use `parse_github_repository` tool to clone and analyze open-source repositories\n2. **Code Validation**: Use `check_ai_script_hallucinations` tool to validate AI-generated Python scripts\n3. **Knowledge Exploration**: Use `query_knowledge_graph` tool to explore available repositories, classes, and methods\n\n## Building Your Own Server\n\nThis implementation provides a foundation for building more complex MCP servers with web crawling capabilities. To build your own:\n\n1. Add your own tools by creating methods with the `@mcp.tool()` decorator\n2. Create your own lifespan function to add your own dependencies\n3. Modify the `utils.py` file for any helper functions you need\n4. Extend the crawling capabilities by adding more specialized crawlers\n","\u003Ch1 align=\"center\">Crawl4AI RAG MCP 服务器\u003C\u002Fh1>\n\n\u003Cp align=\"center\">\n  \u003Cem>为 AI 助手和 AI 编码助手提供网络爬取与 RAG 能力\u003C\u002Fem>\n\u003C\u002Fp>\n\n这是一个强大的 [模型上下文协议 (MCP)](https:\u002F\u002Fmodelcontextprotocol.io) 实现，集成了 [Crawl4AI](https:\u002F\u002Fcrawl4ai.com) 和 [Supabase](https:\u002F\u002Fsupabase.com\u002F)，旨在为 AI 助手和 AI 编码助手提供先进的网络爬取与 RAG 能力。\n\n借助此 MCP 服务器，您可以\u003Cb>抓取任何内容\u003C\u002Fb>,然后将这些知识\u003Cb>在任何地方用于 RAG\u003C\u002Fb>。\n\n我们的主要目标是将此 MCP 服务器整合到 [Archon](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002FArchon) 中，随着 Archon 的演进，使其逐渐成为 AI 编码助手构建 AI 助手的知识引擎。目前的 Crawl4AI\u002FRAG MCP 服务器初版将在不久后得到大幅改进，尤其是会增加更多的可配置性，以便您可以使用不同的嵌入模型，并通过 Ollama 在本地运行整个系统。\n\n请将此 GitHub 仓库视为一个测试平台，这也是我尚未积极处理问题和拉取请求的原因。不过，随着我们将它引入 Archon V2，我一定会更加积极地参与其中！\n\n## 概述\n\n此 MCP 服务器提供了一系列工具，使 AI 助手能够爬取网站、将内容存储到向量数据库（Supabase）中，并对爬取的内容执行 RAG 操作。它遵循基于我之前在频道上提供的 [Mem0 MCP 
服务器模板](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-mem0\u002F) 构建 MCP 服务器的最佳实践。\n\n该服务器包含多种高级 RAG 策略，可以启用以提升检索质量：\n- **上下文嵌入**，用于增强语义理解\n- **混合搜索**，结合向量搜索和关键词搜索\n- **代理式 RAG**，用于提取专门的代码示例\n- **重排序**，利用交叉编码器模型提高结果的相关性\n- **知识图谱**，用于检测 AI 幻觉并分析代码库\n\n有关如何启用和配置这些策略的详细信息，请参阅下方的[配置部分](#configuration)。\n\n## 愿景\n\nCrawl4AI RAG MCP 服务器仅仅是一个开始。我们的目标如下：\n\n1. **与 Archon 集成**：将此系统直接构建到 [Archon](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002FArchon) 中，打造一个全面的知识引擎，帮助 AI 编码助手构建更优秀的 AI 助手。\n2. **支持多种嵌入模型**：不仅限于 OpenAI，还将扩展到支持各种嵌入模型，包括通过 Ollama 在本地运行所有功能，以实现完全的控制和隐私保护。\n3. **高级 RAG 策略**：实施上下文检索、延迟分块等复杂检索技术，超越简单的“朴素查找”，显著提升 RAG 系统的功能和精度，尤其是在与 Archon 集成时。\n4. **优化分块策略**：采用受 Context 7 启发的分块方法，专注于示例并为每个分块创建具有明确语义意义的独立部分，从而提高检索精度。\n5. **性能优化**：提升爬取和索引速度，使快速索引新文档并在 AI 编码助手的同一提示中加以利用变得更加现实。\n\n## 功能\n\n- **智能 URL 检测**：自动检测并处理不同类型的 URL（普通网页、站点地图、文本文件）\n- **递归爬取**：跟随内部链接发现内容\n- **并行处理**：高效地同时爬取多个页面\n- **内容分块**：根据标题和大小智能分割内容，以便更好地处理\n- **向量搜索**：对爬取的内容执行 RAG 操作，可选择按数据源过滤以提高精确度\n- **来源检索**：检索可用于筛选的来源，以指导 RAG 流程\n\n## 工具\n\n该服务器提供了必要的网络爬取和搜索工具：\n\n### 核心工具（始终可用）\n\n1. **`crawl_single_page`**：快速爬取单个网页并将内容存储到向量数据库中\n2. **`smart_crawl_url`**：根据提供的 URL 类型（站点地图、llms-full.txt 或需要递归爬取的普通网页）智能地爬取整个网站\n3. **`get_available_sources`**：获取数据库中所有可用来源（域名）的列表\n4. **`perform_rag_query`**：使用语义搜索查找相关内容，可选择按来源过滤\n\n### 条件工具\n\n5. **`search_code_examples`**（需设置 `USE_AGENTIC_RAG=true`）：专门从爬取的文档中搜索代码示例及其摘要。此工具为 AI 编码助手提供有针对性的代码片段检索。\n\n### 知识图谱工具（需设置 `USE_KNOWLEDGE_GRAPH=true`，见下文）\n\n6. **`parse_github_repository`**：将 GitHub 代码库解析为 Neo4j 知识图谱，提取类、方法、函数及其关系，用于幻觉检测\n7. **`check_ai_script_hallucinations`**：通过验证导入、方法调用和类的使用是否符合知识图谱，分析 Python 脚本是否存在 AI 幻觉\n8. 
**`query_knowledge_graph`**：使用 `repos`、`classes`、`methods` 等命令以及自定义 Cypher 查询来探索和查询 Neo4j 知识图谱\n\n## 先决条件\n\n- 如果以容器方式运行 MCP 服务器（推荐），则需要 [Docker\u002FDocker Desktop](https:\u002F\u002Fwww.docker.com\u002Fproducts\u002Fdocker-desktop\u002F)\n- 如果直接通过 uv 运行 MCP 服务器，则需要 [Python 3.12+](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002F)\n- [Supabase](https:\u002F\u002Fsupabase.com\u002F)（用于 RAG 的数据库）\n- [OpenAI API 密钥](https:\u002F\u002Fplatform.openai.com\u002Fapi-keys)（用于生成嵌入）\n- [Neo4j](https:\u002F\u002Fneo4j.com\u002F)（可选，用于知识图谱功能）——请参阅[知识图谱设置](#knowledge-graph-setup)部分\n\n## 安装\n\n### 使用 Docker（推荐）\n\n1. 克隆此仓库：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n   cd mcp-crawl4ai-rag\n   ```\n\n2. 构建 Docker 镜像：\n   ```bash\n   docker build -t mcp\u002Fcrawl4ai-rag --build-arg PORT=8051 .\n   ```\n\n3. 根据下方的配置部分创建 `.env` 文件\n\n### 直接使用 uv（无需 Docker）\n\n1. 克隆此仓库：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n   cd mcp-crawl4ai-rag\n   ```\n\n2. 如果尚未安装 uv，请先安装：\n   ```bash\n   pip install uv\n   ```\n\n3. 创建并激活虚拟环境：\n   ```bash\n   uv venv\n   .venv\\Scripts\\activate\n   # 在 Mac\u002FLinux 上：source .venv\u002Fbin\u002Factivate\n   ```\n\n4. 安装依赖项：\n   ```bash\n   uv pip install -e .\n   crawl4ai-setup\n   ```\n\n5. 根据下方的配置部分创建 `.env` 文件\n\n## 数据库设置\n\n在运行服务器之前，您需要设置带有 pgvector 扩展的数据库：\n\n1. 前往 Supabase 控制台中的 SQL 编辑器（如有必要，先创建一个新项目）\n\n2. 创建一个新的查询，并粘贴 `crawled_pages.sql` 中的内容\n\n3. 运行该查询以创建必要的表和函数\n\n## 知识图谱设置（可选）\n\n要启用AI幻觉检测和代码库分析功能，您需要设置Neo4j。\n\n此外，知识图谱的实现目前尚未完全兼容Docker，因此如果您希望在MCP服务器中使用幻觉检测功能，建议直接通过uv运行！\n\n### 安装Neo4j\n\n#### 本地AI软件包（推荐）\n\n在本地运行Neo4j最简单的方式是使用[本地AI软件包](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Flocal-ai-packaged)——一个包含Neo4j在内的精选本地AI服务集合：\n\n1. **克隆本地AI软件包**：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Flocal-ai-packaged.git\n   cd local-ai-packaged\n   ```\n\n2. 
**启动Neo4j**：\n   按照本地AI软件包仓库中的说明，使用Docker Compose启动Neo4j。\n\n3. **默认连接信息**：\n   - URI：`bolt:\u002F\u002Flocalhost:7687`\n   - 用户名：`neo4j`\n   - 密码：请参考本地AI软件包文档获取默认密码。\n\n#### 手动安装Neo4j\n\n您也可以直接安装Neo4j：\n\n1. **安装Neo4j Desktop**：从[neo4j.com\u002Fdownload](https:\u002F\u002Fneo4j.com\u002Fdownload\u002F)下载。\n\n2. **创建新数据库**：\n   - 打开Neo4j Desktop。\n   - 创建新项目和数据库。\n   - 为`neo4j`用户设置密码。\n   - 启动数据库。\n\n3. **记录您的连接信息**：\n   - URI：`bolt:\u002F\u002Flocalhost:7687`（默认）\n   - 用户名：`neo4j`（默认）\n   - 密码：您在创建时设置的密码。\n\n## 配置\n\n在项目根目录下创建一个`.env`文件，并添加以下变量：\n\n```\n# MCP服务器配置\nHOST=0.0.0.0\nPORT=8051\nTRANSPORT=sse\n\n# OpenAI API配置\nOPENAI_API_KEY=your_openai_api_key\n\n# 用于摘要和上下文嵌入的LLM\nMODEL_CHOICE=gpt-4.1-nano\n\n# RAG策略（设置为“true”或“false”，默认为“false”）\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=false\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=false\nUSE_KNOWLEDGE_GRAPH=false\n\n# Supabase配置\nSUPABASE_URL=your_supabase_project_url\nSUPABASE_SERVICE_KEY=your_supabase_service_key\n\n# Neo4j配置（知识图谱功能所需）\nNEO4J_URI=bolt:\u002F\u002Flocalhost:7687\nNEO4J_USER=neo4j\nNEO4J_PASSWORD=your_neo4j_password\n```\n\n### RAG策略选项\n\nCrawl4AI RAG MCP服务器支持四种强大的RAG策略，您可以根据需求独立启用：\n\n#### 1. **USE_CONTEXTUAL_EMBEDDINGS**\n启用后，该策略会利用整个文档的额外上下文来增强每个文本块的嵌入。系统会将整篇文档和特定文本块同时传递给由`MODEL_CHOICE`配置的LLM，以生成丰富的上下文信息，并将其与文本块内容一同嵌入。\n\n- **适用场景**：当您需要高精度的检索结果且上下文至关重要时，例如技术文档中某些术语在不同章节可能具有不同含义。\n- **权衡**：由于需要对每个文本块调用LLM，索引过程会较慢，但检索准确性显著提高。\n- **成本**：索引过程中会产生额外的LLM API调用费用。\n\n#### 2. **USE_HYBRID_SEARCH**\n结合传统的关键词搜索与语义向量搜索，提供更全面的检索结果。系统会并行执行两种搜索，并智能地合并结果，优先展示同时出现在两个结果集中的文档。\n\n- **适用场景**：当用户可能使用特定的技术术语、函数名进行搜索，或者在注重语义理解的同时也需要精确匹配关键词时。\n- **权衡**：搜索查询速度稍慢，但结果更加稳健，尤其适用于技术内容。\n- **成本**：无需额外的API费用，仅需承担一定的计算开销。\n\n#### 3. 
**USE_AGENTIC_RAG**\n启用专门的代码示例提取与存储功能。在爬取文档时，系统会识别长度≥300字符的代码块，提取其周围上下文并生成摘要，然后将这些代码示例及其摘要存储到专为代码搜索设计的独立向量数据库表中。\n\n- **适用场景**：对于需要从文档中查找特定代码示例、实现模式或使用案例的AI编码助手来说，此功能至关重要。\n- **权衡**：由于涉及代码提取和摘要生成，爬取速度会显著减慢，且需要更多的存储空间。\n- **成本**：每提取一个代码示例都需要调用LLM API生成摘要。\n- **收益**：提供了一个专用的`search_code_examples`工具，供AI代理用于查找具体的代码实现。\n\n#### 4. **USE_RERANKING**\n在初始检索之后，对搜索结果应用交叉编码器重排序。系统会使用轻量级的交叉编码器模型（`cross-encoder\u002Fms-marco-MiniLM-L-6-v2`）对每个结果与原始查询进行打分，然后按相关性重新排序结果。\n\n- **适用场景**：当搜索精度至关重要，您需要将最相关的结果排在首位时。尤其适用于语义相似度本身可能无法准确捕捉查询意图的复杂查询。\n- **权衡**：根据结果数量的不同，每次搜索查询会增加约100–200毫秒的延迟，但可以显著提升结果的相关性。\n- **成本**：无需额外的API费用——使用的是可在CPU上运行的本地模型。\n- **收益**：提高结果的相关性，尤其适用于复杂查询。既可用于常规RAG搜索，也可用于代码示例搜索。\n\n#### 5. **USE_KNOWLEDGE_GRAPH**\n启用基于Neo4j知识图谱的AI幻觉检测和代码库分析功能。启用后，系统可以将GitHub代码库解析为图数据库，并将AI生成的代码与真实的代码库结构进行验证。（目前尚未完全兼容Docker，建议通过uv运行）\n\n- **适用场景**：对于需要将生成的代码与实际实现进行验证的AI编码助手，或者希望检测AI模型是否生成了不存在的方法、类或错误用法模式的情况。\n- **权衡**：需要设置Neo4j并引入额外依赖。对于大型代码库，解析过程可能会较慢；验证还需要预先索引代码库。\n- **成本**：验证本身不产生额外的API费用，但需要搭建Neo4j基础设施（可使用免费的本地安装版或云服务AuraDB）。\n- **收益**：提供三项强大功能：“parse_github_repository”用于索引代码库，“check_ai_script_hallucinations”用于验证AI生成的代码，以及“query_knowledge_graph”用于探索已索引的代码库。\n\n现在您可以指示AI编码助手将某个Python GitHub代码库添加到知识图谱中，例如：\n\n“将https:\u002F\u002Fgithub.com\u002Fpydantic\u002Fpydantic-ai.git添加到知识图谱”\n\n请确保仓库URL以`.git`结尾。\n\n您还可以让AI编码助手检查其刚刚生成的脚本是否存在幻觉，或者手动运行以下命令：\n\n```\npython knowledge_graphs\u002Fai_hallucination_detector.py [待分析脚本的完整路径]\n```\n\n### 推荐配置\n\n**适用于通用文档 RAG：**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=true\n```\n\n**适用于带有代码示例的 AI 编程助手：**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=true\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=true\nUSE_RERANKING=true\nUSE_KNOWLEDGE_GRAPH=false\n```\n\n**适用于具有幻觉检测功能的 AI 编程助手：**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=true\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=true\nUSE_RERANKING=true\nUSE_KNOWLEDGE_GRAPH=true\n```\n\n**适用于快速、基础的 
RAG：**\n```\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=true\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=false\nUSE_KNOWLEDGE_GRAPH=false\n```\n\n## 运行服务器\n\n### 使用 Docker\n\n```bash\ndocker run --env-file .env -p 8051:8051 mcp\u002Fcrawl4ai-rag\n```\n\n### 使用 Python\n\n```bash\nuv run src\u002Fcrawl4ai_mcp.py\n```\n\n服务器将启动，并在配置的主机和端口上监听。\n\n## 与 MCP 客户端集成\n\n### SSE 配置\n\n当您使用 SSE 传输方式运行服务器后，可以使用以下配置连接到它：\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"transport\": \"sse\",\n      \"url\": \"http:\u002F\u002Flocalhost:8051\u002Fsse\"\n    }\n  }\n}\n```\n\n> **Windsurf 用户注意**：请在配置中使用 `serverUrl` 而不是 `url`：\n> ```json\n> {\n>   \"mcpServers\": {\n>     \"crawl4ai-rag\": {\n>       \"transport\": \"sse\",\n>       \"serverUrl\": \"http:\u002F\u002Flocalhost:8051\u002Fsse\"\n>     }\n>   }\n> }\n> ```\n>\n> **Docker 用户注意**：如果您的客户端运行在不同的容器中，请使用 `host.docker.internal` 而不是 `localhost`。如果您在 n8n 中使用此 MCP 服务器，则适用此规则！\n>\n> **Claude Code 用户注意**：\n```\nclaude mcp add-json crawl4ai-rag '{\"type\":\"http\",\"url\":\"http:\u002F\u002Flocalhost:8051\u002Fsse\"}' --scope user\n```\n\n### Stdio 配置\n\n将此服务器添加到您的 MCP 配置中，以供 Claude Desktop、Windsurf 或其他 MCP 客户端使用：\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"command\": \"python\",\n      \"args\": [\"path\u002Fto\u002Fcrawl4ai-mcp\u002Fsrc\u002Fcrawl4ai_mcp.py\"],\n      \"env\": {\n        \"TRANSPORT\": \"stdio\",\n        \"OPENAI_API_KEY\": \"your_openai_api_key\",\n        \"SUPABASE_URL\": \"your_supabase_url\",\n        \"SUPABASE_SERVICE_KEY\": \"your_supabase_service_key\",\n        \"USE_KNOWLEDGE_GRAPH\": \"false\",\n        \"NEO4J_URI\": \"bolt:\u002F\u002Flocalhost:7687\",\n        \"NEO4J_USER\": \"neo4j\",\n        \"NEO4J_PASSWORD\": \"your_neo4j_password\"\n      }\n    }\n  }\n}\n```\n\n### Docker 与 Stdio 配置\n\n```json\n{\n  \"mcpServers\": {\n    \"crawl4ai-rag\": {\n      \"command\": \"docker\",\n      \"args\": [\"run\", \"--rm\", \"-i\", \n               \"-e\", 
\"TRANSPORT\", \n               \"-e\", \"OPENAI_API_KEY\", \n               \"-e\", \"SUPABASE_URL\", \n               \"-e\", \"SUPABASE_SERVICE_KEY\",\n               \"-e\", \"USE_KNOWLEDGE_GRAPH\",\n               \"-e\", \"NEO4J_URI\",\n               \"-e\", \"NEO4J_USER\",\n               \"-e\", \"NEO4J_PASSWORD\",\n               \"mcp\u002Fcrawl4ai-rag\"],\n      \"env\": {\n        \"TRANSPORT\": \"stdio\",\n        \"OPENAI_API_KEY\": \"your_openai_api_key\",\n        \"SUPABASE_URL\": \"your_supabase_url\",\n        \"SUPABASE_SERVICE_KEY\": \"your_supabase_service_key\",\n        \"USE_KNOWLEDGE_GRAPH\": \"false\",\n        \"NEO4J_URI\": \"bolt:\u002F\u002Flocalhost:7687\",\n        \"NEO4J_USER\": \"neo4j\",\n        \"NEO4J_PASSWORD\": \"your_neo4j_password\"\n      }\n    }\n  }\n}\n```\n\n## 知识图谱架构\n\n知识图谱系统将仓库代码结构存储在 Neo4j 中，包含以下组件：\n\n### 核心组件（`knowledge_graphs\u002F` 文件夹）：\n\n- **`parse_repo_into_neo4j.py`**：克隆并分析 GitHub 仓库，将 Python 类、方法、函数和导入提取为 Neo4j 中的节点和关系。\n- **`ai_script_analyzer.py`**：使用 AST 解析 Python 脚本，提取导入、类实例化、方法调用和函数使用情况。\n- **`knowledge_graph_validator.py`**：根据知识图谱验证 AI 生成的代码，以检测幻觉（不存在的方法、错误参数等）。\n- **`hallucination_reporter.py`**：生成关于检测到的幻觉的综合报告，包括置信度评分和建议。\n- **`query_knowledge_graph.py`**：交互式 CLI 工具，用于探索知识图谱（该功能现已集成到 MCP 工具中）。\n\n### 知识图谱模式：\n\nNeo4j 数据库将代码结构存储为以下内容：\n\n**节点：**\n- `Repository`：GitHub 仓库\n- `File`：仓库中的 Python 文件\n- `Class`：包含方法和属性的 Python 类\n- `Method`：带有参数信息的类方法\n- `Function`：独立函数\n- `Attribute`：类属性\n\n**关系：**\n- `Repository` -[:CONTAINS]-> `File`\n- `File` -[:DEFINES]-> `Class`\n- `File` -[:DEFINES]-> `Function`\n- `Class` -[:HAS_METHOD]-> `Method`\n- `Class` -[:HAS_ATTRIBUTE]-> `Attribute`\n\n### 工作流程：\n\n1. **仓库解析**：使用 `parse_github_repository` 工具克隆并分析开源仓库。\n2. **代码验证**：使用 `check_ai_script_hallucinations` 工具验证 AI 生成的 Python 代码。\n3. **知识探索**：使用 `query_knowledge_graph` 工具探索可用的仓库、类和方法。\n\n## 构建您自己的服务器\n\n此实现为构建具有网络爬取功能的更复杂的 MCP 服务器提供了基础。要构建您自己的服务器：\n\n1. 使用 `@mcp.tool()` 装饰器创建您自己的工具方法。\n2. 
创建您自己的生命周期函数，以添加您自己的依赖项。\n3. 修改 `utils.py` 文件，以添加您所需的任何辅助函数。\n4. 通过添加更多专用爬虫来扩展爬取功能。","# mcp-crawl4ai-rag 快速上手指南\n\n`mcp-crawl4ai-rag` 是一个基于模型上下文协议 (MCP) 的服务器，集成了 Crawl4AI 和 Supabase。它赋予 AI Agent 强大的网页爬取能力，并将内容存入向量数据库以支持检索增强生成 (RAG)。通过该工具，你可以轻松抓取网页内容并在任意场景中利用这些知识。\n\n## 环境准备\n\n在开始之前，请确保你的系统满足以下要求：\n\n*   **操作系统**: Windows, macOS 或 Linux\n*   **运行时环境** (二选一):\n    *   **推荐**: [Docker Desktop](https:\u002F\u002Fwww.docker.com\u002Fproducts\u002Fdocker-desktop\u002F) (容器化运行)\n    *   **备选**: [Python 3.12+](https:\u002F\u002Fwww.python.org\u002Fdownloads\u002F) (直接运行，需配合 `uv` 包管理器)\n*   **外部服务依赖**:\n    *   **Supabase**: 用于存储向量数据的项目 URL 和 Service Key ([官网](https:\u002F\u002Fsupabase.com))\n    *   **OpenAI API Key**: 用于生成文本嵌入 (Embeddings) ([获取地址](https:\u002F\u002Fplatform.openai.com\u002Fapi-keys))\n    *   **Neo4j** (可选): 仅当需要开启“知识图谱”和“AI 幻觉检测”功能时需要 ([本地安装指南见下文](#知识图谱设置可选))\n\n## 安装步骤\n\n### 方案一：使用 Docker (推荐)\n\n这是最简便且隔离性最好的部署方式。\n\n1.  **克隆仓库**:\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n    cd mcp-crawl4ai-rag\n    ```\n\n2.  **构建镜像**:\n    ```bash\n    docker build -t mcp\u002Fcrawl4ai-rag --build-arg PORT=8051 .\n    ```\n\n3.  **配置环境变量**:\n    在项目根目录创建 `.env` 文件（参考下方[配置说明](#配置说明)）。\n\n### 方案二：使用 uv 直接运行 (适合开发或需要知识图谱功能)\n\n如果你需要使用 Neo4j 知识图谱功能（目前 Docker 支持尚不完善），建议使用此方案。\n\n1.  **克隆仓库**:\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag.git\n    cd mcp-crawl4ai-rag\n    ```\n\n2.  **安装 uv 并设置虚拟环境**:\n    ```bash\n    pip install uv\n    uv venv\n    # Windows:\n    .venv\\Scripts\\activate\n    # Mac\u002FLinux:\n    source .venv\u002Fbin\u002Factivate\n    ```\n\n3.  **安装依赖**:\n    ```bash\n    uv pip install -e .\n    crawl4ai-setup\n    ```\n\n4.  **配置环境变量**:\n    在项目根目录创建 `.env` 文件。\n\n### 数据库初始化 (必须执行)\n\n无论使用哪种安装方式，都必须先在 Supabase 中初始化数据库结构：\n\n1.  登录你的 Supabase 项目控制台，进入 **SQL Editor**。\n2.  新建一个 Query。\n3.  复制项目根目录下的 `crawled_pages.sql` 文件内容并粘贴到编辑器中。\n4.  
点击 **Run** 执行，以创建必要的表和函数。\n\n### 知识图谱设置 (可选)\n\n若需启用 `USE_KNOWLEDGE_GRAPH=true`，需本地运行 Neo4j：\n\n*   **推荐方式**: 使用 [Local AI Package](https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Flocal-ai-packaged) 一键启动 Neo4j 容器。\n*   **手动方式**: 下载 [Neo4j Desktop](https:\u002F\u002Fneo4j.com\u002Fdownload\u002F)，创建新数据库并记录连接信息 (默认 URI: `bolt:\u002F\u002Flocalhost:7687`, 用户: `neo4j`)。\n\n## 基本使用\n\n### 1. 配置环境变量\n\n在项目根目录创建 `.env` 文件，填入以下关键信息：\n\n```ini\n# MCP 服务配置\nHOST=0.0.0.0\nPORT=8051\nTRANSPORT=sse\n\n# OpenAI 配置\nOPENAI_API_KEY=sk-your-openai-api-key\nMODEL_CHOICE=gpt-4.1-nano\n\n# Supabase 配置 (从 Supabase 控制台获取)\nSUPABASE_URL=https:\u002F\u002Fyour-project.supabase.co\nSUPABASE_SERVICE_KEY=your-supabase-service-key\n\n# RAG 策略开关 (按需设为 true 或 false)\nUSE_CONTEXTUAL_EMBEDDINGS=false\nUSE_HYBRID_SEARCH=false\nUSE_AGENTIC_RAG=false\nUSE_RERANKING=false\nUSE_KNOWLEDGE_GRAPH=false\n\n# Neo4j 配置 (仅当 USE_KNOWLEDGE_GRAPH=true 时必填)\nNEO4J_URI=bolt:\u002F\u002Flocalhost:7687\nNEO4J_USER=neo4j\nNEO4J_PASSWORD=your-neo4j-password\n```\n\n### 2. 启动服务\n\n*   **Docker 方式**:\n    ```bash\n    docker run -p 8051:8051 --env-file .env mcp\u002Fcrawl4ai-rag\n    ```\n*   **uv 方式**:\n    确保虚拟环境已激活，然后运行：\n    ```bash\n    uv run src\u002Fcrawl4ai_mcp.py\n    ```\n\n### 3. 
核心功能示例\n\n启动后，AI Agent (如 Archon 或其他支持 MCP 的客户端) 可通过以下工具与之交互：\n\n*   **抓取单个页面并存入知识库**:\n    调用工具 `crawl_single_page`，传入目标 URL。系统会自动抓取、分块并存入 Supabase。\n\n*   **智能全站抓取**:\n    调用工具 `smart_crawl_url`，传入网站主页或 sitemap 地址。系统会递归抓取内部链接。\n\n*   **执行 RAG 查询**:\n    调用工具 `perform_rag_query`，输入你的问题。系统会在已抓取的内容中进行语义搜索并返回相关片段。\n    *   *进阶*: 如果开启了 `USE_AGENTIC_RAG`，还可使用 `search_code_examples` 专门搜索代码示例。\n\n*   **查看可用数据源**:\n    调用 `get_available_sources` 获取当前数据库中已索引的域名列表，可用于过滤查询范围。\n\n> **提示**: 本工具旨在作为 AI 编程助手（如 Archon）的知识引擎。配置完成后，请在你的 AI 助手 MCP 配置中添加该服务器地址即可开始使用。","一位全栈开发者正在基于最新的 LangChain v0.2 文档构建一个复杂的智能客服 Agent，需要确保代码实现与官方最新特性完全同步。\n\n### 没有 mcp-crawl4ai-rag 时\n- **信息滞后严重**：开发者只能依赖模型训练数据中的旧版文档，导致生成的代码频繁调用已废弃的 API，调试耗时极长。\n- **手动整理低效**：为了获取准确信息，必须人工浏览数十个网页，复制粘贴内容到本地文件，过程繁琐且容易遗漏关键细节。\n- **检索精度不足**：在海量文本中查找特定参数用法时，简单的关键词搜索无法理解语义上下文，常常返回无关结果。\n- **知识孤岛现象**：爬取的数据散落在各个浏览器标签页或笔记软件中，无法被 AI 编程助手直接调用进行实时辅助编码。\n\n### 使用 mcp-crawl4ai-rag 后\n- **实时知识同步**：mcp-crawl4ai-rag 自动递归抓取 LangChain 最新官方文档并索引至向量数据库，让 AI 助手瞬间掌握 v0.2 的最新语法。\n- **自动化数据处理**：工具智能识别 URL 类型并并行处理多个页面，自动按标题分割内容块，无需人工干预即可完成知识库构建。\n- **混合搜索增强**：结合向量语义搜索与关键词匹配，并通过重排序模型优化结果，精准定位到具体的代码示例和参数说明。\n- **无缝集成工作流**：作为 MCP 服务器直接嵌入开发环境，AI 助手在编写代码时可实时检索外部文档，自动修正过时的实现方案。\n\nmcp-crawl4ai-rag 将静态的网页信息转化为 AI 可即时调用的动态知识引擎，彻底消除了大模型因数据滞后产生的“幻觉”问题。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcoleam00_mcp-crawl4ai-rag_e51c3638.png","coleam00","Cole Medin","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fcoleam00_92be4757.png","Generative AI specialist with a wide range of experience developing AI Agents, RAG solutions, local AI deployments, generative AI libraries\u002Fpackages, and 
more.","Dynamous",null,"https:\u002F\u002Fdynamous.youcanbook.me\u002F","https:\u002F\u002Fgithub.com\u002Fcoleam00",[81,85,89],{"name":82,"color":83,"percentage":84},"Python","#3572A5",98.1,{"name":86,"color":87,"percentage":88},"PLpgSQL","#336790",1.8,{"name":90,"color":91,"percentage":92},"Dockerfile","#384d54",0.1,2142,571,"2026-04-15T18:29:53","MIT",4,"Linux, macOS, Windows","未说明 (默认使用 OpenAI API 进行嵌入生成；若启用本地 Ollama 或重排序模型，需根据具体模型需求配置，文中未指定具体显卡要求)","未说明",{"notes":102,"python":103,"dependencies":104},"1. 推荐使用 Docker 运行，也可直接通过 uv 在本地运行。2. 必须配置 Supabase 数据库并执行 SQL 脚本初始化表结构。3. 默认使用 OpenAI API 生成嵌入，需在 .env 中配置 API Key。4. 知识图谱功能（幻觉检测等）为可选，需额外安装和配置 Neo4j，且目前该功能在 Docker 中兼容性不佳，建议本地运行。5. 未来版本计划支持本地 Ollama 运行以替代 OpenAI。6. 需创建 .env 文件配置端口、API 密钥及各项 RAG 策略开关。","3.12+",[105,106,107,108,109,110,111],"uv","Crawl4AI","Supabase (pgvector)","OpenAI API","Neo4j (可选)","cross-encoder\u002Fms-marco-MiniLM-L-6-v2 (用于重排序)","Docker (可选)",[14,16,13],"2026-03-27T02:49:30.150509","2026-04-16T08:13:29.032140",[116,121,126,131,136,141],{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},35647,"使用自定义客户端时遇到请求超时（Timeout）错误怎么办？","MCP 对某些客户端的默认超时时间为 60 秒，且目前可能无法直接调整。维护者表示，未来的版本会将爬虫作为独立进程运行，而不是通过 MCP 调用，以解决此问题。目前如果遇到超时，可能是由于爬取任务耗时超过了 MCP 的默认限制。","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F26",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},35648,"运行 docker build 命令时报错 'docker buildx build requires 1 argument' 如何解决？","这通常是因为命令末尾缺少了构建上下文路径。请尝试在命令末尾添加一个点（.），表示当前目录：\n`docker build -t mcp\u002Fcrawl4ai-rag --build-arg PORT=8051 .`\n或者如果您使用的是 buildx，可以使用：\n`docker buildx build -t mcp\u002Fcrawl4ai-rag --build-arg PORT=8051 .`","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F34",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},35649,"如何在本地获取 Supabase 的 SUPABASE_URL 和 ANON_KEY？访问链接一直重定向。","在最新版本的 Supabase 管理页面中，不要直接访问旧的设置链接。请点击页面顶部的 \"Connect\"（连接）按钮，然后切换到 \"App 
Frameworks\"（应用框架）标签页，在那里您可以直接找到所需的 URL 和 ANON_KEY。","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F24",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},35650,"Docker 容器启动后，访问 \u002Fsse 接口返回 405 Method Not Allowed 错误是什么原因？","该错误表明客户端使用了错误的 HTTP 方法访问 SSE 端点。SSE (Server-Sent Events) 连接应当使用 GET 请求而不是 POST 请求。请检查您的客户端配置，确保向 `http:\u002F\u002Flocalhost:8051\u002Fsse` 发送的是 GET 请求。日志中显示的 `POST \u002Fsse` 是导致 405 错误的直接原因。","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F65",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},35651,"为什么每次运行 Docker 镜像时都会重新下载大量依赖（如 torch, transformers）？","这是因为 Docker 镜像虽然系统层面安装了依赖，但运行时 `uv` 工具会创建一个新的虚拟环境并重新下载包，导致时间和空间的浪费。这是一个已知的效率问题，社区已提出通过修改 Dockerfile 来避免在虚拟环境中重复下载依赖的修复方案。","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F35",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},35652,"如何将 crawl4ai-rag 连接到真实域名或通过 Caddy 反向代理配置？","目前官方文档或 Issue 中尚未提供详细的 Caddy 配置模板。用户需要自行配置反向代理，将外部域名的请求转发到 crawl4ai-rag 运行的端口（默认为 8051）。如果您有成功的配置经验（包括 Caddyfile 和 Windsurf 的地址端口设置），建议在社区中分享以帮助他人。","https:\u002F\u002Fgithub.com\u002Fcoleam00\u002Fmcp-crawl4ai-rag\u002Fissues\u002F11",[]]