[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-codefuse-ai--CodeFuse-CGM":3,"tool-codefuse-ai--CodeFuse-CGM":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",150037,2,"2026-04-10T23:33:47",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":75,"owner_location":75,"owner_email":75,"owner_twitter":75,"owner_website":75,"owner_url":76,"languages":77,"stars":86,"forks":87,"last_commit_at":88,"license":75,"difficulty_score":89,"env_os":90,"env_gpu":91,"env_ram":92,"env_deps":93,"category_tags":107,"github_topics":75,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":108,"updated_at":109,"faqs":110,"releases":126},6448,"codefuse-ai\u002FCodeFuse-CGM","CodeFuse-CGM","[NeurIPS 2025] A Graph-based LLM Framework for Real-world SE Tasks","CodeFuse-CGM 是一款专为仓库级软件工程任务设计的图基大语言模型框架，其核心成果已入选 NeurIPS 2025。它主要解决传统 AI 编程助手在处理大型代码库时，因缺乏全局结构感知而难以精准定位和修复复杂 Bug 的痛点。\n\n该工具创新性地引入了“代码图”概念，通过构建仓库级的代码结构图谱，让模型不仅能读懂单段代码，更能理解整个项目的上下文关联。其独特的 R4 链式架构包含四个关键步骤：重写器（Rewriter）提炼问题关键词，检索器（Retriever）基于图谱锚点锁定相关子图，重排序器（Reranker）筛选最可能修改的文件，最后由阅读器（Reader）生成精准的代码补丁。这种机制使其在 SWE-Bench-Lite 基准测试中取得了卓越的修复率。\n\nCodeFuse-CGM 
特别适合需要处理复杂遗留系统、进行大规模代码重构的研发团队，以及致力于探索代码智能与图神经网络结合的研究人员。它不仅支持全参数微调，还兼容 LoRA 等高效训练方式，降低了大模型的应用门槛。无论是希望提升自动化运维效率的企业开发者，还是关注前沿 AI 编程技术的学术研究者，都能从中获得强大的辅助能","CodeFuse-CGM 是一款专为仓库级软件工程任务设计的图基大语言模型框架，其核心成果已入选 NeurIPS 2025。它主要解决传统 AI 编程助手在处理大型代码库时，因缺乏全局结构感知而难以精准定位和修复复杂 Bug 的痛点。\n\n该工具创新性地引入了“代码图”概念，通过构建仓库级的代码结构图谱，让模型不仅能读懂单段代码，更能理解整个项目的上下文关联。其独特的 R4 链式架构包含四个关键步骤：重写器（Rewriter）提炼问题关键词，检索器（Retriever）基于图谱锚点锁定相关子图，重排序器（Reranker）筛选最可能修改的文件，最后由阅读器（Reader）生成精准的代码补丁。这种机制使其在 SWE-Bench-Lite 基准测试中取得了卓越的修复率。\n\nCodeFuse-CGM 特别适合需要处理复杂遗留系统、进行大规模代码重构的研发团队，以及致力于探索代码智能与图神经网络结合的研究人员。它不仅支持全参数微调，还兼容 LoRA 等高效训练方式，降低了大模型的应用门槛。无论是希望提升自动化运维效率的企业开发者，还是关注前沿 AI 编程技术的学术研究者，都能从中获得强大的辅助能力，让代码修复变得更加智能且可控。","# CGM: Code Graph LLM\n\n![CodefuseLogo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_55a9f13b7867.jpg)\n\n## Contents\n- [News](#news)\n- [Introduction](#introduction)\n- [Installation](#installation)\n- [Examples](#examples)\n  - [Rewriter](#rewriter)\n  - [Retriever](#retriever)\n  - [Reranker](#reranker)\n  - [Reader](#reader)\n- [Contributing](#contributing)\n- [Citation](#citation)\n- [Join Us](#join-us)\n\n## News\n\n🔥🔥🔥 [2025\u002F09\u002F19] Our paper [Code Graph Model (CGM): A Graph-Integrated Large Language Model for Repository-Level Software Engineering Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901) has been accepted to NeurIPS 2025!\n\n![SWE-Bench-Lite](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_f32405145bd8.png)\n\n🔥🔥🔥 [2025\u002F01\u002F15] We are pleased to announce the updated version of the CGM-72B-V1.2. The model further achieves a remarkable 44.00% resolve rate on the SWE-Bench-Lite leaderboard.\n\n🔥🔥🔥 [2024\u002F12\u002F28] We are pleased to announce the updated version of the CGM-72B-V1.1. 
The model further achieves a remarkable 41.67% resolve rate on the SWE-Bench-Lite leaderboard.\n\n🔥🔥🔥 [2024\u002F10\u002F28] We are pleased to announce that CGM-72B achieves a remarkable 35.67% resolve rate on the SWE-Bench-Lite leaderboard.\n\n🔥🔥🔥 [2024\u002F10\u002F28] We released **CGM**, mainly for repository-level coding tasks.\n\n- 📜 **Paper**: [Code Graph Model (CGM): A Graph-Integrated Large Language Model for Repository-Level Software Engineering Tasks](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901)\n- 🤖 **Model**: [codefuse-ai\u002FCodeFuse-CGM-72B](https:\u002F\u002Fhuggingface.co\u002Fcodefuse-ai\u002FCodeFuse-CGM-72B)\n- 📊 **Data**: [codefuse-ai\u002FCodeGraph](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcodefuse-ai\u002FCodeGraph)\n\n## Introduction\nWe propose a graph-based framework CGM for real-world SE tasks. Before CGM starts its work, we construct a repository-level code graph to better represent the repository context and its structure by Code Graph Generator. Inspired by the Retrieval-Augmented Generation (RAG) approach, CGM framework is designed as a chain structure consisting of four atomic nodes, termed as R4 (Rewriter, Retriever, Reranker, and Reader) chain for this scenario. Given an issue, the initial input to the CGM framework includes the issue description and the corresponding code graph. Rewriter will first rewrite the original issue by extracting keywords and generating relevant queries for code graph. Then a heuristic code subgraph is retrieved through Retriever based on the matching anchor nodes from rewriter output. Given that the resulting subgraph provides a relatively broad context necessary for reference, we need a Reranker to identify the files most likely to be modified as a further hint. 
Subsequently, both the retrieved subgraph and the identified files are input into a trainable, graph-based Reader to generate the corresponding code patch.\n\n### Framework\n\n![Framework](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_382f8a545fee.png)\n\n### Highlights\n:white_check_mark: **Code Graph**: Train models on multiple tasks while maintaining a balance between them. The models can even generalize to new, previously unseen tasks.\n\n:white_check_mark: **Multi-framework**: It provides support for Accelerate (with Deepspeed and FSDP)\n\n:white_check_mark: **Efficient fine-tuning**: It supports LoRA, QLoRA as well as Full-parameters training, enabling fine-tuning of large models with minimal resources. The training speed meets the demands of almost all fine-tuning scenarios.\n\n## Installation\n### Prerequisites\n- Python 3.8+\n- pip\n\n### Required Packages\n\n```bash\ntransformers==4.46.1\ntokenizers==0.20.0\naccelerate==1.0.1\npeft==0.13.2\njinja2==2.11.3\nfuzzywuzzy==0.18.0\npython-Levenshtein==0.25.1\nnetworkx==3.0\n```\n\n## Examples\n\nThe following chart illustrates the whole processing pipeline of R3.\n![R3 Pipeline](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_314cbbb85c94.png)\n\n\n### Pre-process for Retriever: Generate Node Embedding\n\nBefore Retriever, we need to embed \n- all the nodes in Code Graph into embeddings \n- Queries generated by Rewriter, into embeddings\n\nby [CGE-large](https:\u002F\u002Fhuggingface.co\u002Fcodefuse-ai\u002FCodeFuse-CGE-Large)\n\n```bash\npython generate_code_content.py # this step will preprocess each code graph and extract content from each node (save a .json file for each repo)\npython generate_code_embedding.py # this step will generate embedding for each node by CGE-large (save a .pkl file for each repo)\npython generate_rewriter_embedding.py # this step will generate embedding for each Query generated by Rewriter (save a 
.pkl file)\n```\n\nRequirements for CGE-large\n```bash\ntorch==2.1.0\ntransformers==4.39.2\ntokenizers==0.15.2\naccelerate==0.28.0\n```\n\n### Rewriter\n\nGiven the issue metadata, execute the following scripts to generate Rewriter results (both Inferer and Extractor).\n\n```bash \npython generate_rewriter_prompt.py # this step will generate a json file \"test_rewriter_prompt.json\" containing prompts used in Rewriter\n\npython inference_rewriter.py --prompt_path test_rewriter_prompt.json # this step will load Qwen Model to execute inference and generate Rewriter's output \"test_rewriter_output.json\"\n\npython rewriter_output_post_processing.py # this step will load Rewriter's output \"rewriter_output.json\" and generate post processed output \"test_rewriter_output.json\"\n```\n\nUse function ```generate_prompt_for_extractor``` and ```generate_prompt_for_inferer``` in ```rewriter\u002Fprompt.py```\n```python\ndef generate_prompt_for_extractor(problem_statement, repo_name):\n    prompt = \"\"\"\n    \u003Cissue>\n    {}\n    \u003C\u002Fissue> \n    This is an issue related to repository '{}'. \n    Instructions:\n    1. Analysis:\n    ○ Analyze the provided issue description. Identify the relevant File, Class, or Function involved.\n    ○ Determine the specific problem or error encountered and note any clues that may assist in locating the relevant or problematic area.\n    2. 
Extraction:\n    ○ After the analysis, extract ALL the mentioned code entities (File, Class, or Function), especially Files.\n    ○ Then extract three potential and meaningful keywords, responding in the following format:\n\n    [start_of_analysis] \n    \u003Cdetailed_analysis> \n    [end_of_analysis] \n\n    [start_of_related_code_entities] \n    \u003Centity_name_with_path>\n    [end_of_related_code_entities] \n\n    [start_of_related_keywords] \n    \u003Ckeywords>\n    [end_of_related_keywords]\n\n    Notes:\n    - Pay attention to the information in the error logs (if exists).\n    - The buggy code exists solely in the project described in the issue (e.g., django, sklearn). Buggy location is usually not in the tests files or external packages.\n    - Your extracted entities should be CONCISE, ACCURATE and INFORMATIVE.\n    - Provide the relative path for code entities if specified (e.g., package\u002Ffoo.py). Relative path is relative to the repository itself, do not include suffix like '\u002Fhome\u002Fusername\u002F', '\u002Fetc\u002Fservice\u002F' or '\u002Ftree\u002Fmaster'.\n    - Do not include any additional information such as line numbers or explanations in your extraction result.\n\n    Preferred extraction Examples of Code Entities:\n    - repo\u002Fcart.py\n    - Class User()\n    - def getData()\n    Preferred extraction Examples of Keywords:\n    - train_loop\n    - hooks\n    - docker\n    \n    Unpreferred extraction Examples of keywords:\n    - something wrong\n    - input validation\n    - TypeError\n    \"\"\".format(problem_statement, repo_name)\n        \n    return prompt\n\ndef generate_prompt_for_inferer(problem_statement, repo_name):\n    prompt = \"\"\"\n    \u003Cissue>\n    {}\n    \u003C\u002Fissue> \n    This is an issue related to repository '{}'. \n    Task:\n    Based on the issue description provided, identify the characteristics of code entities (files, functions, class) that might need to be modified. 
\n    For each characteristic, generate a search query that could help locate relevant code entities in a codebase.\n    Instructions:\n    First, analyze the issue description and identify keywords, features, and functionalities that are likely relevant to the modification of code entities.\n    Then, create queries that capture these characteristics, focusing on:\n    ● File names that may implement relevant functionalities.\n    ● Functions or methods that are related to the features described in the issue.\n    ● Any patterns or structures that might be relevant to the functionalities mentioned.\n    For example:\n    ● File related to the initialization of a neural network.\n    ● Function related to the training process.\n    ● Code used to configure the service.\n    Please answer in the following format:\n\n    [start_of_analysis] \n    \u003Cdetailed_analysis> \n    [end_of_analysis] \n\n    [start_of_related_queries] \n    query 1:\n    query 2:\n    ...\n    [end_of_related_queries] \n\n    Notes:\n    - Your queries should be DETAILED, ACCURATE and INFORMATIVE. 
\n    - Your queries should be a complete sentences and do not include additional explanation.\n    - The number of queries is up to five, so be focus on the important characteristics.\n    - Your queries should focus on the repository code itself, rather than other information like commit history.\n    - Pay attention to the information in the error logs (if exists).\n\n    Preferred Query Examples:\n    - Look for references to \"tqdm\" or \"progress_bar\" within the training loop files to find where progress bars are currently updated.\n    - Code snippets where 'gethostbyname' function from 'socket' module is called.\n    - File name containing 'mysql.py' AND functions related to 'MySQLStatementSamples' initialization.\n    - Functions or methods handling hostname resolution or encoding within 'datadog_checks' directory.\n    - Find all occurrences of \"early_stopping\" within files that also mention \"Trainer\" to identify where early stopping logic is implemented and potentially needs adjustment for non-default 'val_check_interval'.\n    \"\"\".format(problem_statement, repo_name)\n        \n    return prompt\n```\nYou can use the rewriter prompt by\n```python\nfrom rewriter.prompt import generate_prompt_for_extractor, generate_prompt_for_inferer\n\n# Generate extraction prompt\nextraction_prompt = generate_prompt_for_extractor(problem_statement, repo_name)\n\n# Generate inference prompt\ninference_prompt = generate_prompt_for_inferer(problem_statement, repo_name)\n```\n\n### Retriever\nNow, we have\n- Original CodeGraph: `codegraph\u002F`\n- Node Embedding of CodeGraph: `node_embedding\u002F`\n- Query Embedding of Rewriter's Inferer: `rewriter_embedding.pkl`\n- Output of Rewriter's Extractor: `rewriter_output.json`\n\nThen we can execute Retriever:\n\n```bash\npython locate_anchor_node.py # this step will use the above inputs and generate anchor nodes \"anchor_node.json\" for all samples\npython subgraph.py # this step will then expand the anchor nodes to a 
connected subgraph (saving as a set of node_id)\npython serialize_subgraph.py # based on the above subgraph node ids, this step will serialize the subgraph into json format (which is the final output of Retriever)\n```\n\nRequirements for Retriever\n```bash\nRapidFuzz==1.5.0\nfaiss-cpu\n```\n\n### Reranker\n\nReranker is used to determine the most relevant files from the subgraph generated by Retriever. The input is the subgraph json file which is the output of Retriever\n\n```bash\npython reranker.py --stage_1_k 10 --stage_2_k 5 # this step will load the subgraph json file and generate the output of Reranker.\n```\n\nRequirements for Reranker\n```bash\nvllm>=0.8.5\n```\n\nUse function ```generate_prompt_for_reranker_stage_1``` and ```generate_prompt_for_reranker_stage_2``` in ```reranker\u002Fprompt.py```\n```python\n\"\"\"\nPrompt Template for Reranker\n\"\"\"\n\nreranker_stage_1_system_prompt = \"\"\"\nYou are an experienced software developer who specializes in extracting the most relevant files for solving issues from many reference files.\n\nTask:\nBased on the information received about the issue from a repository, find the most likely few files from among those that may be able to resolve the issue.\n\nInstructions:\n1. Analysis:\n- Analyze the provided issue description and files, and pay attention to the relevance of the provided files with the given issue, especially those might be modified during fixing the issue.\n- Determine the specific problem or error mentioned in the issue and note any clues that could help your judgment.\n2. Extraction:\n- Based on your analysis, choose the Top **1** relevant files which might be used in fixing the issue.\n- You should choose files from the provided files, and should not modify their name in any way.\n\nRespond in the following format:\n[start_of_analysis]\n\u003Cdetailed_analysis> \n[end_of_analysis] \n\n[start_of_relevant_files] \n1. \u003Cfile_with_its_path>\n2. \u003Cfile_with_its_path>\n3. 
...\n[end_of_relevant_files] \n\nNotes:\n- You can refer to to the information in the error logs (if exists).\n- The relevant file usually exists in the project described in the issue (e.g., django, sklearn). File need modification is usually not in the tests files or external packages.\n- The file you choose should be contained in the provided files.\n- Provide the file path with files. Do not include redundant suffix like '\u002Fhome\u002Fusername\u002F', '\u002Fetc\u002Fservice\u002F' or '\u002Ftree\u002Fmaster'.\n- Do not include any additional information such as line numbers or explanations in your extraction result.\n- Files for initialization and configuration might be modified during changing the code.\n\nPreferred extraction Examples of Related Files:\n1. src\u002Futils\u002Ffile_handler.py\n2. core\u002Fservices\u002Fservice_manager.py\n3. ...\n\"\"\".strip()\n\nreranker_stage_1_user_prompt_template = \"\"\"\n\u003Crepository>\n{}\n\u003C\u002Frepository>\n\n\u003Cissue>\n{}\n\u003C\u002Fissue>\n \n\u003Creference_python_file_list>\n{}\n\u003C\u002Freference_python_file_list>\n\n\u003Cother_reference_file_list>\n{}\n\u003C\u002Fother_reference_file_list>\n\"\"\"\n\nreranker_stage_2_system_prompt = \"\"\"\nYou are an experienced software developer who specializes in assessing the relevance of the file for solving the issue in software repositories.\n\nTask:\nFor a file provided, evaluate the likelihood that modifying this file would resolve the given issue, and assign a score based on specific criteria.\n\nInstructions:\n1. 
Analysis:\n- Analyze the provided issue description and the content of the single relevant file, pay attention to any keywords, error messages, or specific functionalities mentioned that relate to the file.\n- Determine how closely the contents and functionality of the file are tied to the problem or error described in the issue.\n- Consider the role of the file in the overall project structure (e.g., configuration files, core logic files versus test files, or utility scripts).\n2. Scoring:\n- Based on your analysis, assign a score from 1 to 5 that represents the relevance of modifying the given file in order to solve the issue.\n\nScore Specifications:\n1. **Score 1**: The file is almost certainly unrelated to the issue, with no apparent connection to the functionality or error described in the issue.\n2. **Score 2**: The file may be tangentially related, but modifying it is unlikely to resolve the issue directly; possible in rare edge cases.\n3. **Score 3**: The file has some relevance to the issue; it might interact with the affected functionality indirectly and tweaking it could be part of a broader fix.\n4. **Score 4**: The file is likely related to the issue; it includes code that interacts directly with the functionality in question and could plausibly contain bugs that lead to the issue.\n5. 
**Score 5**: The file is very likely the root cause or heavily involved in the issue and modifying it should directly address the error or problem mentioned.\n\nRespond in the following format:\n[start_of_analysis]\n\u003Cdetailed_analysis>\n[end_of_analysis]\n\n[start_of_score]\nScore \u003Cnumber>\n[end_of_score]\n\nNotes:\n- The content of the file shows only the structure of this file, including the names of the classes and functions defined in this file.\n- You can refer to to the information in the error logs (if exists).\n\"\"\".strip()\n\nreranker_stage_2_user_prompt_template = \"\"\"\n\u003Crepository>\n{}\n\u003C\u002Frepository>\n\n\u003Cissue>\n{}\n\u003C\u002Fissue>\n\n\u003Cfile_name>\n{}\n\u003C\u002Ffile_name>\n\n\u003Cfile_content>\n{}\n\u003C\u002Ffile_content>\n\"\"\"\n\ndef generate_prompt_for_reranker_stage_1(problem_statement, repo_name, py_file, other_file):\n  \"\"\"\n  problem_statement: issue\n  repo_name: repo\n  py_file: py file list\n  other_file: related file list\n  \"\"\"\n  return reranker_stage_1_system_prompt, reranker_stage_1_user_prompt_template.format(repo_name, problem_statement, py_file, other_file)\n\ndef generate_prompt_for_reranker_stage_2(problem_statement, repo_name, file_name, file_content):\n  \"\"\"\n  problem_statement: issue\n  repo_name: repo\n  file_name: file\n  file_content: file content（class xxx和def xxx）\n  \"\"\"\n  return reranker_stage_2_system_prompt, reranker_stage_2_user_prompt_template.format(repo_name, problem_statement, file_name, file_content)\n```\nYou can use the reranker prompt by\n```python\nfrom reranker.prompt import generate_prompt_for_reranker_stage_1, generate_prompt_for_reranker_stage_2\n\n# Stage 1: Identify relevant files\nsystem_prompt, user_prompt = generate_prompt_for_reranker_stage_1(\n    problem_statement, \n    repo_name, \n    py_file_list, \n    other_file_list\n)\n\n# Stage 2: Score file relevance\nsystem_prompt, user_prompt = generate_prompt_for_reranker_stage_2(\n    
problem_statement,\n    repo_name,\n    target_file,\n    file_content\n)\n```\n\n### Reader\nExecute the Reader module with DeepSpeed configurations:\n```bash\n# Zero-2 Configuration\nEXPORT N_NODE={YOUR_MACHINE_NUM} && \\\nEXPORT N_GPU_PER_NODE={YOUR_GPU_NUM} && \\\nEXPORT TRAIN_CONFIG={TRAIN_CONFIG}.json && \\\nbash launch\u002Fzero2.sh\n\n# Zero-3 Configuration\nEXPORT N_NODE={YOUR_MACHINE_NUM} && \\\nEXPORT N_GPU_PER_NODE={YOUR_GPU_NUM} && \\\nEXPORT TRAIN_CONFIG={TRAIN_CONFIG}.json && \\\nbash launch\u002Fzero3.sh\n```\n\n## Contributing\nContributions are welcome! If you have any suggestions, ideas, bug reports, or new model\u002Ffeature supported, please open an issue or submit a pull request.\n\nWe welcome contributions from the community! Please follow these guidelines:\n\n1. Fork the repository\n\n2. Create your feature branch\n\n3. Commit your changes\n\n4. Push to the branch\n\n5. Open a Pull Request\n\nFor major changes, please open an issue first to discuss the proposed changes.\n\n\n## Citation\nIf you find our work useful or helpful for your R&D works, please feel free to cite our paper as below.\n```bibtex\n@misc{tao2025codegraphmodelcgm,\n      title={Code Graph Model (CGM): A Graph-Integrated Large Language Model for Repository-Level Software Engineering Tasks}, \n      author={Hongyuan Tao and Ying Zhang and Zhenhao Tang and Hongen Peng and Xukun Zhu and Bingchang Liu and Yingguang Yang and Ziyin Zhang and Zhaogui Xu and Haipeng Zhang and Linchao Zhu and Rui Wang and Hang Yu and Jianguo Li and Peng Di},\n      year={2025},\n      eprint={2505.16901},\n      archivePrefix={arXiv},\n      primaryClass={cs.SE},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901}, \n}\n```\n## Join-US\n\nWe are the AI Native team within the Platform Technology Business Group at Ant Group, dedicated to the intelligentization of Ant Group's platform engineering. 
Established for over three years, our team has played a pivotal role in supporting the intelligent operation and maintenance of Ant Group's cloud computing infrastructure. Our mission is to build algorithm services and platforms with a wide user base through world-class technological innovation and impact, supporting the implementation of internal and external products and businesses.\nEmbracing an innovation-driven ethos, our team not only supports business implementation but also propels technological influence. Over the past three years, we have published more than 20 papers at top conferences like ICLR, NeurIPS, KDD, and ACL. Our innovative business outcomes have earned us two Ant Technology's highest T-Star awards and one SuperMA award from Ant Group. Our open-source project CodeFuse has received 4K stars as of February 2024, and our models have been downloaded over 1.5 million times on Huggingface and Modelscope.\n\nWe are on the lookout for top talents to join our vibrant team! If you're eager to develop your career in an environment filled with energy, innovation, and a culture of excellence, we welcome you to explore our career opportunities for both campus and experienced hires. 
Join us and be a part of creating the next milestone in the industry.\n\n**Contact**: hyu.hugo@antgroup.com \n","# CGM：代码图大语言模型\n\n![CodefuseLogo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_55a9f13b7867.jpg)\n\n## 目录\n- [新闻](#news)\n- [简介](#introduction)\n- [安装](#installation)\n- [示例](#examples)\n  - [重写器](#rewriter)\n  - [检索器](#retriever)\n  - [重新排序器](#reranker)\n  - [阅读器](#reader)\n- [贡献](#contributing)\n- [引用](#citation)\n- [加入我们](#join-us)\n\n## 新闻\n\n🔥🔥🔥 [2025\u002F09\u002F19] 我们的论文[代码图模型（CGM）：一种用于仓库级软件工程任务的图集成大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901)已被NeurIPS 2025接收！\n\n![SWE-Bench-Lite](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_f32405145bd8.png)\n\n🔥🔥🔥 [2025\u002F01\u002F15] 我们很高兴地宣布CGM-72B-V1.2版本更新。该模型在SWE-Bench-Lite排行榜上进一步取得了令人瞩目的44.00%解决率。\n\n🔥🔥🔥 [2024\u002F12\u002F28] 我们很高兴地宣布CGM-72B-V1.1版本更新。该模型在SWE-Bench-Lite排行榜上进一步取得了令人瞩目的41.67%解决率。\n\n🔥🔥🔥 [2024\u002F10\u002F28] 我们很高兴地宣布，CGM-72B在SWE-Bench-Lite排行榜上取得了35.67%的解决率。\n\n🔥🔥🔥 [2024\u002F10\u002F28] 我们发布了**CGM**，主要用于仓库级别的编码任务。\n\n- 📜 **论文**：[代码图模型（CGM）：一种用于仓库级软件工程任务的图集成大型语言模型](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901)\n- 🤖 **模型**：[codefuse-ai\u002FCodeFuse-CGM-72B](https:\u002F\u002Fhuggingface.co\u002Fcodefuse-ai\u002FCodeFuse-CGM-72B)\n- 📊 **数据**：[codefuse-ai\u002FCodeGraph](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcodefuse-ai\u002FCodeGraph)\n\n## 简介\n我们提出了一种基于图的框架CGM，用于解决现实世界的软件工程任务。在CGM开始工作之前，我们通过代码图生成器构建一个仓库级别的代码图，以更好地表示仓库的上下文及其结构。受检索增强生成（RAG）方法的启发，CGM框架被设计为一个由四个原子节点组成的链式结构，针对这一场景称为R4（重写器、检索器、重新排序器和阅读器）链。给定一个问题，CGM框架的初始输入包括问题描述和相应的代码图。重写器首先会通过提取关键词并生成与代码图相关的查询来重写原始问题。然后，检索器会根据重写器输出中的匹配锚点节点，检索出一个启发式的代码子图。由于所得子图提供了相对广泛的参考上下文，我们需要一个重新排序器来识别最有可能被修改的文件，作为进一步的提示。随后，检索到的子图和识别出的文件都会被输入到一个可训练的基于图的阅读器中，以生成相应的代码补丁。\n\n### 框架\n\n![Framework](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_382f8a545fee.png)\n\n### 亮点\n:white_check_mark: 
**代码图**：在多个任务上训练模型，同时保持各任务之间的平衡。这些模型甚至可以泛化到以前未见过的新任务。\n\n:white_check_mark: **多框架支持**：它同时支持Accelerate（结合Deepspeed和FSDP）。\n\n:white_check_mark: **高效微调**：它支持LoRA、QLoRA以及全参数训练，使得使用最少资源即可对大型模型进行微调。训练速度几乎满足所有微调场景的需求。\n\n## 安装\n### 先决条件\n- Python 3.8+\n- pip\n\n### 必需包\n\n```bash\ntransformers==4.46.1\ntokenizers==0.20.0\naccelerate==1.0.1\npeft==0.13.2\njinja2==2.11.3\nfuzzywuzzy==0.18.0\npython-Levenshtein==0.25.1\nnetworkx==3.0\n```\n\n## 示例\n\n下图展示了R3的整个处理流程。\n![R3 Pipeline](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_readme_314cbbb85c94.png)\n\n\n### 检索器预处理：生成节点嵌入\n\n在检索器之前，我们需要将\n- 代码图中的所有节点嵌入为向量\n- 重写器生成的查询嵌入为向量\n\n通过[CGE-large](https:\u002F\u002Fhuggingface.co\u002Fcodefuse-ai\u002FCodeFuse-CGE-Large)完成。\n\n```bash\npython generate_code_content.py # 此步骤将预处理每个代码图，并从每个节点中提取内容（为每个仓库保存一个.json文件）\npython generate_code_embedding.py # 此步骤将使用CGE-large为每个节点生成嵌入（为每个仓库保存一个.pkl文件）\npython generate_rewriter_embedding.py # 此步骤将为重写器生成的所有查询生成嵌入（保存一个.pkl文件）\n```\n\nCGE-large的要求\n```bash\ntorch==2.1.0\ntransformers==4.39.2\ntokenizers==0.15.2\naccelerate==0.28.0\n```\n\n### 重写器\n\n给定问题元数据，执行以下脚本以生成重写器的结果（包括推理器和提取器）。\n\n```bash \npython generate_rewriter_prompt.py # 此步骤将生成一个名为 \"test_rewriter_prompt.json\" 的 JSON 文件，其中包含重写器使用的提示语\n\npython inference_rewriter.py --prompt_path test_rewriter_prompt.json # 此步骤将加载通义千问模型进行推理，并生成重写器的输出文件 \"test_rewriter_output.json\"\n\npython rewriter_output_post_processing.py # 此步骤将加载重写器的输出文件 \"rewriter_output.json\"，并生成经过后处理的输出文件 \"test_rewriter_output.json\"\n```\n\n在 `rewriter\u002Fprompt.py` 中使用函数 `generate_prompt_for_extractor` 和 `generate_prompt_for_inferer`：\n```python\ndef generate_prompt_for_extractor(problem_statement, repo_name):\n    prompt = \"\"\"\n    \u003Cissue>\n    {}\n    \u003C\u002Fissue> \n    这是一个与仓库 '{}' 相关的问题。 \n    指令：\n    1. 分析：\n    ○ 分析提供的问题描述。识别涉及的相关文件、类或函数。\n    ○ 确定遇到的具体问题或错误，并注意任何可能有助于定位相关或存在问题区域的线索。\n    2. 
提取：\n    ○ 分析完成后，提取所有提到的代码实体（文件、类或函数），尤其是文件。\n    ○ 然后提取三个潜在且有意义的关键词，按以下格式回答：\n\n    [start_of_analysis] \n    \u003Cdetailed_analysis> \n    [end_of_analysis] \n\n    [start_of_related_code_entities] \n    \u003Centity_name_with_path>\n    [end_of_related_code_entities] \n\n    [start_of_related_keywords] \n    \u003Ckeywords>\n    [end_of_related_keywords]\n\n    注意事项：\n    - 请注意错误日志中的信息（如果存在）。\n    - 存在缺陷的代码仅存在于问题中描述的项目中（例如 django、sklearn）。缺陷位置通常不在测试文件或外部包中。\n    - 您提取的实体应简洁、准确且具有信息量。\n    - 如果指定了相对路径，请提供代码实体的相对路径（例如 package\u002Ffoo.py）。相对路径是相对于仓库本身的，不要包含诸如 '\u002Fhome\u002Fusername\u002F'、'\u002Fetc\u002Fservice\u002F' 或 '\u002Ftree\u002Fmaster' 等后缀。\n    - 在您的提取结果中，请勿包含行号或解释等附加信息。\n\n    推荐的代码实体提取示例：\n    - repo\u002Fcart.py\n    - Class User()\n    - def getData()\n    推荐的关键词提取示例：\n    - train_loop\n    - hooks\n    - docker\n    \n    不推荐的关键词提取示例：\n    - something wrong\n    - input validation\n    - TypeError\n    \"\"\".format(problem_statement, repo_name)\n        \n    return prompt\n\ndef generate_prompt_for_inferer(problem_statement, repo_name):\n    prompt = \"\"\"\n    \u003Cissue>\n    {}\n    \u003C\u002Fissue> \n    这是一个与仓库 '{}' 相关的问题。 \n    任务：\n    根据提供的问题描述，识别可能需要修改的代码实体（文件、函数、类）的特征。 \n    对于每个特征，生成一个搜索查询，以帮助在代码库中找到相关的代码实体。\n    指令：\n    首先，分析问题描述，识别可能与代码实体修改相关的关键词、功能和特性。\n    然后，创建能够捕捉这些特征的查询，重点关注：\n    ● 可能实现相关功能的文件名。\n    ● 与问题中描述的功能相关的函数或方法。\n    ● 可能与所述功能相关的任何模式或结构。\n    例如：\n    ● 与神经网络初始化相关的文件。\n    ● 与训练过程相关的函数。\n    ● 用于配置服务的代码。\n    请按照以下格式回答：\n\n    [start_of_analysis] \n    \u003Cdetailed_analysis> \n    [end_of_analysis] \n\n    [start_of_related_queries] \n    query 1:\n    query 2:\n    ...\n    [end_of_related_queries] \n\n    注意事项：\n    - 您的查询应详细、准确且具有信息量。 \n    - 查询应为完整的句子，不得包含额外的解释。\n    - 查询数量最多为五个，因此请专注于重要的特征。\n    - 您的查询应聚焦于仓库本身的代码，而非提交历史等其他信息。\n    - 请注意错误日志中的信息（如果存在）。\n\n    推荐的查询示例：\n    - 在训练循环文件中查找对 'tqdm' 或 'progress_bar' 的引用，以确定当前进度条更新的位置。\n    - 查找调用 'socket' 模块中 'gethostbyname' 函数的代码片段。\n    - 文件名包含 'mysql.py' 
并且包含与 'MySQLStatementSamples' 初始化相关的函数。\n    - 在 'datadog_checks' 目录中处理主机名解析或编码的函数或方法。\n    - 在同时提及 'Trainer' 的文件中查找所有出现 'early_stopping' 的地方，以确定早期停止逻辑的实施位置，并可能需要针对非默认的 'val_check_interval' 进行调整。\n    \"\"\".format(problem_statement, repo_name)\n        \n    return prompt\n```\n您可以通过以下方式使用重写器的提示语：\n```python\nfrom rewriter.prompt import generate_prompt_for_extractor, generate_prompt_for_inferer\n\n# 生成提取提示语\nextraction_prompt = generate_prompt_for_extractor(problem_statement, repo_name)\n\n# 生成推理提示语\ninference_prompt = generate_prompt_for_inferer(problem_statement, repo_name)\n```\n\n### 检索器\n现在我们有：\n- 原始代码图：`codegraph\u002F`\n- 代码图的节点嵌入：`node_embedding\u002F`\n- 重写器推理器的查询嵌入：`rewriter_embedding.pkl`\n- 重写器提取器的输出：`rewriter_output.json`\n\n然后我们可以执行检索器：\n\n```bash\npython locate_anchor_node.py # 此步骤将使用上述输入，为所有样本生成锚点节点文件 \"anchor_node.json\"\npython subgraph.py # 此步骤将进一步扩展锚点节点，形成一个连通子图（保存为一组节点 ID）\npython serialize_subgraph.py # 基于上述子图的节点 ID，此步骤会将子图序列化为 JSON 格式（即检索器的最终输出）\n```\n\n检索器所需依赖：\n```bash\nRapidFuzz==1.5.0\nfaiss-cpu\n```\n\n### 重排序器\n\n重排序器用于从检索器生成的子图中确定最相关的文件。输入是检索器的输出——子图 JSON 文件。\n\n```bash\npython reranker.py --stage_1_k 10 --stage_2_k 5 # 此步骤将加载子图 JSON 文件并生成重排序器的输出。\n```\n\n重排序器的要求：\n```bash\nvllm>=0.8.5\n```\n\n使用 `reranker\u002Fprompt.py` 中的函数 `generate_prompt_for_reranker_stage_1` 和 `generate_prompt_for_reranker_stage_2`：\n```python\n\"\"\"\n重排序器提示模板\n\"\"\"\n\nreranker_stage_1_system_prompt = \"\"\"\n你是一位经验丰富的软件开发人员，擅长从大量参考文件中提取解决特定问题最相关的文件。\n\n任务：\n根据从代码库中获取的问题相关信息，从可能有助于解决问题的文件中找出最有可能的几份文件。\n\n指令：\n1. 分析：\n- 分析提供的问题描述和文件，重点关注这些文件与问题的相关性，尤其是那些在修复问题时可能被修改的文件。\n- 确定问题中提到的具体问题或错误，并注意任何有助于判断的线索。\n2. 提取：\n- 根据你的分析，选择最相关的 **1** 份文件，这些文件可能用于修复该问题。\n- 你应从提供的文件中选择，且不得以任何方式修改文件名。\n\n请按照以下格式作答：\n[start_of_analysis]\n\u003Cdetailed_analysis> \n[end_of_analysis] \n\n[start_of_relevant_files] \n1. \u003Cfile_with_its_path>\n2. \u003Cfile_with_its_path>\n3. 
...\n[end_of_relevant_files] \n\n注：\n- 你可以参考错误日志中的信息（如果存在）。\n- 相关文件通常存在于问题描述的项目中（例如，django、sklearn）。需要修改的文件通常不在测试文件或外部包中。\n- 你选择的文件必须包含在提供的文件列表中。\n- 提供文件的完整路径，不要添加冗余后缀，如 '\u002Fhome\u002Fusername\u002F'、'\u002Fetc\u002Fservice\u002F' 或 '\u002Ftree\u002Fmaster'。\n- 在你的提取结果中不要包含行号或解释等额外信息。\n- 初始化和配置文件在代码更改过程中可能会被修改。\n\n相关文件提取示例：\n1. src\u002Futils\u002Ffile_handler.py\n2. core\u002Fservices\u002Fservice_manager.py\n3. ...\n\"\"\".strip()\n\nreranker_stage_1_user_prompt_template = \"\"\"\n\u003Crepository>\n{}\n\u003C\u002Frepository>\n\n\u003Cissue>\n{}\n\u003C\u002Fissue>\n \n\u003Creference_python_file_list>\n{}\n\u003C\u002Freference_python_file_list>\n\n\u003Cother_reference_file_list>\n{}\n\u003C\u002Fother_reference_file_list>\n\"\"\"\n\nreranker_stage_2_system_prompt = \"\"\"\n你是一位经验丰富的软件开发人员，擅长评估文件在软件仓库中解决特定问题时的相关性。\n\n任务：\n针对给定的文件，评估修改该文件是否能够解决所提出的问题，并根据特定标准给出评分。\n\n指令：\n1. 分析：\n- 分析提供的问题描述和单个相关文件的内容，注意其中提及的与文件相关的关键词、错误消息或具体功能。\n- 确定文件内容和功能与问题中描述的问题或错误之间的关联程度。\n- 考虑文件在整个项目结构中的角色（例如，配置文件、核心逻辑文件与测试文件或工具脚本的区别）。\n2. 评分：\n- 根据你的分析，给出 1 到 5 分的评分，表示修改该文件对解决问题的相关程度。\n\n评分说明：\n1. **1 分**：该文件几乎肯定与问题无关，与问题中描述的功能或错误没有任何明显联系。\n2. **2 分**：该文件可能与问题有间接关系，但修改它不太可能直接解决问题；仅在极少数情况下才有可能。\n3. **3 分**：该文件与问题有一定相关性；它可能间接影响受影响的功能，对其进行调整可能是更广泛修复的一部分。\n4. **4 分**：该文件很可能与问题相关；其中包含与相关功能直接交互的代码，很可能存在导致问题的缺陷。\n5. 
**5 分**：该文件很可能是问题的根本原因或深度参与了问题，修改它应该可以直接解决所提到的错误或问题。\n\n请按照以下格式作答：\n[start_of_analysis]\n\u003Cdetailed_analysis>\n[end_of_analysis]\n\n[start_of_score]\nScore \u003Cnumber>\n[end_of_score]\n\n注：\n- 文件内容仅展示该文件的结构，包括其中定义的类和函数名称。\n- 你可以参考错误日志中的信息（如果存在）。\n\"\"\".strip()\n\nreranker_stage_2_user_prompt_template = \"\"\"\n\u003Crepository>\n{}\n\u003C\u002Frepository>\n\n\u003Cissue>\n{}\n\u003C\u002Fissue>\n\n\u003Cfile_name>\n{}\n\u003C\u002Ffile_name>\n\n\u003Cfile_content>\n{}\n\u003C\u002Ffile_content>\n\"\"\"\n\ndef generate_prompt_for_reranker_stage_1(problem_statement, repo_name, py_file, other_file):\n  \"\"\"\n  problem_statement: 问题\n  repo_name: 代码库\n  py_file: Python 文件列表\n  other_file: 其他相关文件列表\n  \"\"\"\n  return reranker_stage_1_system_prompt, reranker_stage_1_user_prompt_template.format(repo_name, problem_statement, py_file, other_file)\n\ndef generate_prompt_for_reranker_stage_2(problem_statement, repo_name, file_name, file_content):\n  \"\"\"\n  problem_statement: 问题\n  repo_name: 代码库\n  file_name: 文件名\n  file_content: 文件内容（class xxx 和 def xxx）\n  \"\"\"\n  return reranker_stage_2_system_prompt, reranker_stage_2_user_prompt_template.format(repo_name, problem_statement, file_name, file_content)\n```\n你可以通过以下方式使用重排序器提示：\n```python\nfrom reranker.prompt import generate_prompt_for_reranker_stage_1, generate_prompt_for_reranker_stage_2\n\n# 第一阶段：识别相关文件\nsystem_prompt, user_prompt = generate_prompt_for_reranker_stage_1(\n    problem_statement, \n    repo_name, \n    py_file_list, \n    other_file_list\n)\n\n# 第二阶段：评估文件相关性\nsystem_prompt, user_prompt = generate_prompt_for_reranker_stage_2(\n    problem_statement,\n    repo_name,\n    target_file,\n    file_content\n)\n```\n\n### 阅读器\n使用 DeepSpeed 配置运行阅读器模块：\n```bash\n# Zero-2 配置\nexport N_NODE={YOUR_MACHINE_NUM} && \\\nexport N_GPU_PER_NODE={YOUR_GPU_NUM} && \\\nexport TRAIN_CONFIG={TRAIN_CONFIG}.json && \\\nbash launch\u002Fzero2.sh\n\n# Zero-3 配置\nexport N_NODE={YOUR_MACHINE_NUM} && \\\nexport 
N_GPU_PER_NODE={YOUR_GPU_NUM} && \\\nexport TRAIN_CONFIG={TRAIN_CONFIG}.json && \\\nbash launch\u002Fzero3.sh\n```\n\n## 贡献\n我们欢迎各种形式的贡献！如果您有任何建议、想法、错误报告，或是希望新增支持的模型或功能，请随时提交一个问题或拉取请求。\n\n我们非常欢迎社区的参与！请按照以下步骤进行贡献：\n\n1. 克隆仓库并创建分支  \n2. 在您的分支上开发新功能或修复问题  \n3. 提交更改  \n4. 将更改推送到您的分支  \n5. 打开一个拉取请求  \n\n对于重大变更，请先提交一个问题，与我们讨论您的计划。\n\n\n## 引用\n如果您觉得我们的工作对您的研发工作有所帮助或有价值，请随时引用我们的论文，引用格式如下：\n```bibtex\n@misc{tao2025codegraphmodelcgm,\n      title={Code Graph Model (CGM): A Graph-Integrated Large Language Model for Repository-Level Software Engineering Tasks}, \n      author={Hongyuan Tao, Ying Zhang, Zhenhao Tang, Hongen Peng, Xukun Zhu, Bingchang Liu, Yingguang Yang, Ziyin Zhang, Zhaogui Xu, Haipeng Zhang, Linchao Zhu, Rui Wang, Hang Yu, Jianguo Li, Peng Di},\n      year={2025},\n      eprint={2505.16901},\n      archivePrefix={arXiv},\n      primaryClass={cs.SE},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16901}, \n}\n```\n\n## 加入我们\n\n我们是蚂蚁集团平台技术事业群下的 AI Native 团队，专注于推动蚂蚁集团平台工程的智能化发展。团队成立三年多来，在支撑蚂蚁集团云计算基础设施的智能化运维方面发挥了关键作用。我们的使命是通过世界级的技术创新与影响力，构建具有广泛用户基础的算法服务与平台，助力内外部产品与业务的落地实施。\n\n秉持创新驱动的理念，我们不仅支持业务落地，更致力于提升技术影响力。过去三年中，我们在 ICLR、NeurIPS、KDD 和 ACL 等顶级会议上发表了20余篇论文。我们的创新性业务成果荣获了蚂蚁集团两项最高级别的 T-Star 奖以及一项 SuperMA 奖。我们的开源项目 CodeFuse 截至2024年2月已获得4000颗星，相关模型在 Huggingface 和 Modelscope 上的下载量超过150万次。\n\n我们正在寻找顶尖人才加入这个充满活力的团队！如果您渴望在一个充满活力、创新精神和卓越文化的工作环境中发展自己的职业生涯，欢迎您了解我们的校招和社招职位信息。加入我们，共同创造行业的新里程碑！\n\n**联系方式**: hyu.hugo@antgroup.com","# CodeFuse-CGM 快速上手指南\n\nCodeFuse-CGM 是一个基于代码图（Code Graph）的大语言模型框架，专为仓库级（Repository-Level）软件工程任务设计。它通过 R4 链式结构（Rewriter, Retriever, Reranker, Reader）理解代码上下文并生成修复补丁。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux \u002F macOS (推荐)\n*   **Python 版本**: 3.8 或更高\n*   **包管理器**: pip\n*   **硬件建议**: 运行 72B 模型及进行微调需要高性能 GPU（支持 DeepSpeed\u002FFSDP）。\n\n## 安装步骤\n\n### 1. 
安装核心依赖\n\n创建虚拟环境并安装主要依赖包：\n\n```bash\npip install transformers==4.46.1 tokenizers==0.20.0 accelerate==1.0.1 peft==0.13.2 jinja2==2.11.3 fuzzywuzzy==0.18.0 python-Levenshtein==0.25.1 networkx==3.0\n```\n\n> **提示**：国内用户可使用清华或阿里镜像源加速安装：\n> `pip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple \u003Cpackage_name>`\n\n### 2. 安装组件特定依赖\n\n根据您打算使用的模块，需额外安装以下依赖。注意：CGE-large 要求的 transformers、tokenizers 和 accelerate 版本与上述核心依赖不同，请在独立的虚拟环境中安装，以免覆盖核心依赖的版本：\n\n**用于检索器预处理 (CGE-large 嵌入生成):**\n```bash\npip install torch==2.1.0 transformers==4.39.2 tokenizers==0.15.2 accelerate==0.28.0\n```\n\n**用于检索器 (Retriever):**\n```bash\npip install RapidFuzz==1.5.0 faiss-cpu\n```\n\n**用于重排序器 (Reranker):**\n```bash\npip install \"vllm>=0.8.5\"\n```\n\n## 基本使用流程\n\nCGM 的核心工作流包含四个阶段：**重写 (Rewriter) -> 检索 (Retriever) -> 重排序 (Reranker) -> 读取\u002F生成 (Reader)**。以下是简化的执行步骤：\n\n### 第一步：数据预处理 (生成节点嵌入)\n\n在使用检索器前，需利用 `CGE-large` 模型将代码图节点和查询语句转化为向量嵌入。\n\n```bash\n# 1. 提取代码图节点内容\npython generate_code_content.py\n\n# 2. 为每个节点生成嵌入 (输出 .pkl 文件)\npython generate_code_embedding.py\n\n# 3. 为重写器生成的查询语句生成嵌入 (输出 .pkl 文件)\npython generate_rewriter_embedding.py\n```\n\n### 第二步：重写器 (Rewriter)\n\n解析 Issue 描述，提取关键代码实体并生成搜索查询。\n\n```bash\n# 1. 生成提示词\npython generate_rewriter_prompt.py\n\n# 2. 加载模型进行推理 (需指定 Qwen 模型)\npython inference_rewriter.py --prompt_path test_rewriter_prompt.json\n\n# 3. 后处理输出结果\npython rewriter_output_post_processing.py\n```\n\n*编程调用示例:*\n```python\nfrom rewriter.prompt import generate_prompt_for_extractor, generate_prompt_for_inferer\n\n# 生成提取器提示词\nextraction_prompt = generate_prompt_for_extractor(problem_statement, repo_name)\n\n# 生成推理器提示词\ninference_prompt = generate_prompt_for_inferer(problem_statement, repo_name)\n```\n\n### 第三步：检索器 (Retriever)\n\n基于重写器的输出，从代码图中定位锚点节点并扩展为子图。\n\n```bash\n# 1. 定位锚点节点\npython locate_anchor_node.py\n\n# 2. 扩展为连通子图\npython subgraph.py\n\n# 3. 
序列化子图为 JSON 格式\npython serialize_subgraph.py\n```\n\n### 第四步：重排序器 (Reranker)\n\n从检索到的子图中筛选出最可能需要修改的文件。\n\n```bash\n# 加载子图并进行两阶段重排序\npython reranker.py --stage_1_k 10 --stage_2_k 5\n```\n\n完成上述步骤后，最终的子图上下文和筛选出的文件将输入到 **Reader** 模块（基于图的训练模型），以生成最终的代码修复补丁。\n\n---\n*更多详细信息、模型权重及数据集请访问 Hugging Face: [codefuse-ai](https:\u002F\u002Fhuggingface.co\u002Fcodefuse-ai)*","某金融科技公司后端团队在维护一个拥有百万行代码的遗留支付系统时，急需修复一个涉及多模块调用的复杂并发漏洞。\n\n### 没有 CodeFuse-CGM 时\n- **上下文缺失**：开发者仅凭文件名搜索，难以定位分散在不同微服务中的关联调用链，导致漏改关键逻辑。\n- **人工梳理耗时**：需要手动阅读大量无关代码来理解仓库级结构，排查一个 Bug 平均耗时超过 4 小时。\n- **修复准确率低**：由于缺乏对整体代码图谱的理解，生成的补丁常引发回归错误，需反复回滚和重试。\n- **知识断层**：新入职员工面对复杂的依赖关系无从下手，严重依赖资深架构师的口头指导。\n\n### 使用 CodeFuse-CGM 后\n- **图谱精准定位**：CodeFuse-CGM 自动构建仓库级代码图，通过 Retriever 节点瞬间锁定跨文件的深层依赖路径。\n- **智能上下文筛选**：利用 Reranker 节点从庞大的子图中提炼出最可能修改的核心文件，将有效信息密度提升 80%。\n- **高质量补丁生成**：Reader 节点结合图结构与问题描述，直接输出经过逻辑验证的代码修复方案，一次性通过率显著提高。\n- **自动化流程闭环**：从问题重写到最终补丁生成的 R4 链条全自动运行，将单次修复周期缩短至 30 分钟以内。\n\nCodeFuse-CGM 通过将仓库结构转化为可理解的图谱，让大模型真正具备了“全局视野”，彻底解决了复杂系统中局部视角导致的修复难题。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fcodefuse-ai_CodeFuse-CGM_f3240514.png","codefuse-ai","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fcodefuse-ai_ad46a773.png",null,"https:\u002F\u002Fgithub.com\u002Fcodefuse-ai",[78,82],{"name":79,"color":80,"percentage":81},"Python","#3572A5",99.7,{"name":83,"color":84,"percentage":85},"Shell","#89e051",0.3,528,55,"2026-03-19T11:50:19",4,"未说明","Retriever 阶段支持 CPU (faiss-cpu)；Reranker 阶段需要 GPU 以运行 vllm (vllm>=0.8.5)，具体显存需求取决于模型大小 (如 CGM-72B 需高显存)，CUDA 版本未明确说明 (依赖 torch==2.1.0 或更高)","未说明 (建议 32GB+ 以处理代码图嵌入及 72B 模型推理)",{"notes":94,"python":95,"dependencies":96},"该工具包含多个阶段：预处理需使用 CGE-large 模型生成嵌入；Rewriter 阶段需加载 Qwen 模型；Reranker 阶段强依赖 vllm 进行推理。不同阶段（如 CGE-large 预处理与主框架）对 transformers 和 accelerate 的版本要求不一致，需注意环境隔离或版本兼容。支持 LoRA\u002FQLoRA 
高效微调。","3.8+",[97,98,99,100,101,102,103,104,105,106],"torch==2.1.0","transformers==4.46.1","tokenizers==0.20.0","accelerate==1.0.1","peft==0.13.2","networkx==3.0","faiss-cpu","vllm>=0.8.5","RapidFuzz==1.5.0","jinja2==2.11.3",[35,14],"2026-03-27T02:49:30.150509","2026-04-11T08:01:48.840061",[111,116,121],{"id":112,"question_zh":113,"answer_zh":114,"source_url":115},29186,"如何生成代码图（Code Graph）？","目前代码图生成功能主要用于企业内部场景，暂无立即开源的计划。团队正致力于在 Hugging Face 数据集 (https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcodefuse-ai\u002FCodeGraph) 中开源代码图数据。如果您想了解背后的技术细节，可以参考论文《Incremental Call Graph Construction in Industrial Practice》(https:\u002F\u002Fdl.acm.org\u002Fdoi\u002F10.1109\u002FICSE-SEIP58684.2023.00048)。","https:\u002F\u002Fgithub.com\u002Fcodefuse-ai\u002FCodeFuse-CGM\u002Fissues\u002F2",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},29187,"项目中的模型在哪里可以获取？","模型现已在 Hugging Face 上发布。维护者表示将很快更新 README 文件以包含相关的详细信息，请访问 Hugging Face 页面查找该项目的模型仓库。","https:\u002F\u002Fgithub.com\u002Fcodefuse-ai\u002FCodeFuse-CGM\u002Fissues\u002F1",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},29188,"仓库代码是如何转变成图结构的？图是在哪一步生成的？","虽然 `generate_code_content` 函数直接读取 \"codegraph\u002F\" 下的内容，但该图结构的生成部分目前属于企业内部功能，尚未开源。您可以关注 Hugging Face 上的 CodeGraph 数据集 (https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fcodefuse-ai\u002FCodeGraph) 获取相关数据，或查阅论文《Incremental Call Graph Construction in Industrial Practice》了解具体实现原理。","https:\u002F\u002Fgithub.com\u002Fcodefuse-ai\u002FCodeFuse-CGM\u002Fissues\u002F3",[]]