[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-stanford-futuredata--ARES":3,"tool-stanford-futuredata--ARES":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":79,"owner_email":79,"owner_twitter":79,"owner_website":79,"owner_url":80,"languages":81,"stars":86,"forks":87,"last_commit_at":88,"license":89,"difficulty_score":10,"env_os":90,"env_gpu":91,"env_ram":92,"env_deps":93,"category_tags":101,"github_topics":79,"view_count":10,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":102,"updated_at":103,"faqs":104,"releases":133},1139,"stanford-futuredata\u002FARES","ARES","Automated Evaluation of RAG Systems","ARES是一款专注于评估检索增强生成（RAG）系统性能的自动化工具。它通过合成数据生成技术与微调分类器结合，可高效检测模型在上下文相关性、答案忠实度和答案相关性三个核心维度的表现，大幅降低对人工标注的依赖。其独特的Prediction-Powered Inference机制能量化评估结果的统计置信度，同时支持用户基于自有文档生成定制化测试用例。该工具适合需要优化RAG模型效果的开发者和研究者使用，尤其适用于希望快速验证系统改进方案或对比不同配置性能的场景。通过简化评估流程，ARES让模型迭代更聚焦于数据质量与算法优化本身。","\u003Ch2 align=\"center\">ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems\u003C\u002Fh2>\n\n\u003Cp align=\"center\">\n  \u003Ca>Table of Contents:\u003C\u002Fa>\n  \u003Ca href=\"#section1\">Installation\u003C\u002Fa> |\n  \u003Ca href=\"#section2\">Requirements\u003C\u002Fa> |\n  \u003Ca href=\"#section3\">Quick Start\u003C\u002Fa> |\n  \u003Ca href=\"#section4\">Citation\u003C\u002Fa>\n\u003C\u002Fp>\n\n\n\u003Cp 
align=\"center\">\n\n  \u003Ca href=\"https:\u002F\u002Fpypi.org\u002Fproject\u002Fares-ai\u002F\">\n  \u003Cimg alt=\"Static Badge\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Frelease-v0.5.7-blue?style=flat&link=https%3A%2F%2Fpython.org%2F\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09476\">\n  \u003Cimg alt=\"Static Badge\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FRead-ARES%20Paper-blue?style=flat&link=https%3A%2F%2Farxiv.org%2Fabs%2F2311.09476\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Fares-ai.vercel.app\u002F\">\n    \u003Cimg alt=\"Static Badge\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FRead-documentation-purple?style=flat\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1DvXr9SvWOw6xaNW8LHcy9C06LKevDPxe#scrollTo=wBDuO0n5c1mz\" target=\"_blank\">\n    \u003Cimg src=\"https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg\" alt=\"Open In Colab\"\u002F>\n  \u003C\u002Fa>\n\n  \u003Ca>\n  \u003Cimg alt=\"Static Badge\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMade%20with-Python-red?style=flat&link=https%3A%2F%2Fpython.org%2F\">\n  \u003C\u002Fa>\n\n\u003C\u002Fp>\n\n\nARES is a groundbreaking framework for evaluating Retrieval-Augmented Generation (RAG) models. The automated process combines synthetic data generation with fine-tuned classifiers to efficiently assess context relevance, answer faithfulness, and answer relevance, minimizing the need for extensive human annotations. ARES employs synthetic query generation and Prediction-Powered Inference (PPI), providing accurate evaluations with statistical confidence.\n\n\n### 💬 Mini Q&A\n\u003Chr>\n\n**What does ARES assess in RAG models?**\n\nARES conducts a comprehensive evaluation of Retrieval-Augmented Generation (RAG) models, assessing the systems for context relevance, answer faithfulness, and answer relevance. 
This thorough assessment ensures a complete understanding of the performance of the RAG system.\n\n**How does ARES automate the evaluation process?**\n\nARES minimizes the need for human labeling by leveraging fine-tuned classifiers and synthetic data. Its PPI component, Prediction-Powered Inference, refines evaluations considering model response variability and provides statistical confidence in the results. By using fine-tuned classifiers and synthetically generated data, ARES cuts down on human labeling needs while providing accurate assessments. \n\n**Can ARES handle my custom RAG model?**\n\nYes, ARES is a model-agnostic tool that enables you to generate synthetic queries and answers from your documents. With ARES, you can evaluate these generated queries and answers from your RAG model.\n​\n### ⚙️ Installation\n\u003Ca id=\"section1\">\u003C\u002Fa>\n\u003Chr>\n​\nTo install ARES, run the following commands:\n​\n\n```python\n\npip install ares-ai\n\n```\n​\n*Optional: Initialize OpenAI or TogetherAI API key with the following command:*\n\n\n```python\n\nexport OPENAI_API_KEY=\u003Cyour key here>\nexport TOGETHER_API_KEY=\u003Cyour key here>\n\n```\n\n### 📝 Requirements\n\u003Ca id=\"section2\">\u003C\u002Fa>\n\u003Chr>\n\nTo implement ARES for scoring your RAG system and comparing to other RAG configurations, you need three components:​\n\n* A human preference validation set of annotated query, document, and answer triples for the evaluation criteria (e.g. context relevance, answer faithfulness, and\u002For answer relevance). There should be at least 50 examples but several hundred examples is ideal.\n* A set of few-shot examples for scoring context relevance, answer faithfulness, and\u002For answer relevance in your system\n* A much larger set of unlabeled query-document-answer triples outputted by your RAG system for scoring\n\n\u003Ca id=\"section3\">\u003C\u002Fa>\n\u003Chr>\n\nTo get started with ARES, you'll need to set up your configuration. 
Below is an example of a configuration for ARES!\n\nCopy-paste each step to see ARES in action!\n\n\u003Chr>\n\n### 📥 Download datasets\n\n\u003Chr>\n\nUse the following command to quickly obtain the necessary files for getting started! This includes the 'few_shot_prompt' file for judge scoring and synthetic query generation, as well as both labeled and unlabeled datasets.\n```python \nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_few_shot_prompt_for_judge_scoring.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_few_shot_prompt_for_synthetic_query_generation.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_labeled_output.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_unlabeled_output.tsv\n```\n\nOPTIONAL: You can run the following command to get the full NQ dataset! (37.3 GB)\n```python\nfrom ares import ARES\nares = ARES() \nares.KILT_dataset(\"nq\")\n\n# Fetches NQ datasets with ratios including 0.5, 0.6, 0.7, etc.\n# For purposes of our quick start guide, we rename nq_ratio_0.5 to nq_unlabeled_output and nq_labeled_output.\n```\n\u003Chr>\n\n### 🚀 Quick Start - #1\n\n\u003Chr>\n\nTo get started with ARES's PPI, you'll need to set up your configuration. 
Below is an example of a configuration for ARES!\n\nJust copy-paste as you go to see ARES in action!\n\n#### Step 1) Run the following to retrieve the UES\u002FIDP scores with GPT3.5!\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\" : \"gpt-3.5-turbo-0125\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n# {'Context Relevance Scores': [Score], 'Answer Faithfulness Scores': [Score], 'Answer Relevance Scores': [Score]}\n```\n\n#### Step 2) Run the following to retrieve ARES's PPI scores with GPT3.5!\n\n\n```python\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"few_shot_examples_filepath\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"llm_judge\": \"gpt-3.5-turbo-1106\",\n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\", \n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n```\n\n\u003Chr>\n\n### 🚀 Quick Start - #2\n\n\u003Chr>\n\n#### Step 1) Run the following to see GPT 3.5's accuracy on the NQ unlabeled dataset!\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\" : \"gpt-3.5-turbo-0125\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n# {'Context Relevance Scores': [Score], 'Answer Faithfulness Scores': [Score], 'Answer Relevance Scores': [Score]}\n```\n\n#### Step 2) Run the following to see ARES's synthetic generation in action! 
\n```python\n\nfrom ares import ARES\n\nsynth_config = { \n    \"document_filepaths\": [\"nq_labeled_output.tsv\"] ,\n    \"few_shot_prompt_filename\": \"nq_few_shot_prompt_for_synthetic_query_generation.tsv\",\n    \"synthetic_queries_filenames\": [\"synthetic_queries_1.tsv\"], \n    \"documents_sampled\": 6189\n}\n\nares_module = ARES(synthetic_query_generator=synth_config)\nresults = ares_module.generate_synthetic_data()\nprint(results)\n```\n\n\u003Chr>\n\n#### Step 3) Run the following to see ARES's training classifier in action!\n```python\n\nfrom ares import ARES\n\nclassifier_config = {\n    \"training_dataset\": [\"synthetic_queries_1.tsv\"], \n    \"validation_set\": [\"nq_labeled_output.tsv\"], \n    \"label_column\": [\"Context_Relevance_Label\"], \n    \"num_epochs\": 10, \n    \"patience_value\": 3, \n    \"learning_rate\": 5e-6,\n    \"assigned_batch_size\": 1,  \n    \"gradient_accumulation_multiplier\": 32,  \n}\n\nares = ARES(classifier_model=classifier_config)\nresults = ares.train_classifier()\nprint(results)\n```\n\nNote: This code creates a checkpoint for the trained classifier.\nTraining may take some time. 
You can download our jointly trained checkpoint on context relevance here!:\n[Download Checkpoint](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1INyHfZpsUsn5UEBLSRehI9AX08AI12Lt\u002Fview?usp=sharing)\n\n\n\u003Chr>\n\n#### Step 4) Run the following to see ARES's PPI in action!\n```python\n\nfrom ares import ARES\n\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"checkpoints\": [\"Context_Relevance_Label_nq_labeled_output_date_time.pt\"], \n    \"rag_type\": \"question_answering\", \n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\", \n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n\n# Output Should be: \n\"\"\" \nContext_Relevance_Label Scoring\nARES Ranking\nARES Prediction: [0.6056978059262574]\nARES Confidence Interval: [[0.547, 0.664]]\nNumber of Examples in Evaluation Set: [4421]\nGround Truth Performance: [0.6]\nARES LLM Judge Accuracy on Ground Truth Labels: [0.789]\nAnnotated Examples used for PPI: 300\n\"\"\"\n\n```\n\n\u003Cbr>\n\n### 🚀 Local Model Execution with vLLM\n\nARES supports [vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm), allowing for local execution of LLM models, offering enhanced privacy and the ability to operate ARES offline. 
Below are the steps to use vLLM for ARES's UES\u002FIDP and PPI!\n\n#### 1) UES\u002FIDP w\u002F vLLM\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\": \"meta-llama\u002FLlama-2-13b-hf\", # Specify vLLM model\n    \"vllm\": True, # Toggle vLLM to True \n    \"host_url\": \"http:\u002F\u002F0.0.0.0:8000\u002Fv1\" # Replace with server hosting model followed by \"\u002Fv1\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n```\n\n\u003Chr>\n\n#### 2) PPI w\u002F vLLM\n\n```python\nfrom ares import ARES\n\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"few_shot_examples_filepath\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"llm_judge\": \"meta-llama\u002FLlama-2-13b-hf\", # Specify vLLM model\n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\",\n    \"vllm\": True, # Toggle vLLM to True \n    \"host_url\": \"http:\u002F\u002F0.0.0.0:8000\u002Fv1\" # Replace with server hosting model followed by \"\u002Fv1\"\n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n```\n\nFor more details, refer to our [documentation](https:\u002F\u002Fares-ai.vercel.app\u002F).\n\n\u003Cbr>\n\n## Results Replication\n\nWe include synthetic datasets for key experimental results in `synthetic_datasets`. The few-shot prompts used for generation and evaluation are included in `datasets`. We also include instructions for fine-tuning LLM judges in the paper itself. 
Please reach out to jonsaadfalcon@stanford.edu or manihani@stanford.edu if you have any further questions.\n\n## Citation\n\u003Ca id=\"section4\">\u003C\u002Fa>\n\nTo cite our work, please use the following Bibtex:\n\n````\n@misc{saadfalcon2023ares,\n      title={ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems}, \n      author={Jon Saad-Falcon and Omar Khattab and Christopher Potts and Matei Zaharia},\n      year={2023},\n      eprint={2311.09476},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n````\n\n# Appendix\n### Machine requirements and setup when not using OpenAI API\n**Machine requirements**\n\n- Over ~100 GB of available disk space\n- GPU\n    - Should work: A100 (e.g. `Standard_NC24ads_A100_v4` on Azure)\n    - Does not work:\n        - Tested on 2023-12-17 with both `Standard_NC6s_v3` and `Standard_NC12s_v3`, and ran into this error: `torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 160.00 MiB (GPU 0; 15.77 GiB total capacity; 15.12 GiB already allocated; 95.44 MiB free; 15.12 GiB reserved in total by PyTorch)`\n\n\n**Machine setup**\n\nFor example, on an Azure VM running Linux (ubuntu 20.04), you will need to do the following:\n- Install conda\n    - First set of commands (can copy-paste multiple lines)\n        - `wget https:\u002F\u002Frepo.anaconda.com\u002Fminiconda\u002FMiniconda3-latest-Linux-x86_64.sh`\n        - `chmod +x Miniconda3-latest-Linux-x86_64.sh`\n        - `.\u002FMiniconda3-latest-Linux-x86_64.sh -b`\n    - Second set of commands (can copy-paste multiple lines)\n        - `export PATH=\"~\u002Fminiconda3\u002Fbin:$PATH\"`\n        - `conda init`\n- Install gcc\n    - `sudo apt-get -y update`\n    - `sudo apt-get -y upgrade`\n    - `sudo apt-get -y install build-essential`\n    - `sudo apt-get -y install libpcre3-dev`\n- Install NVIDIA drivers\n    - `sudo apt install ubuntu-drivers-common -y`\n    - `sudo ubuntu-drivers autoinstall`\n    - `sudo reboot`\n    - SSH 
in again and confirm the installation was successful by running `nvidia-smi`\n- `cd` to ARES folder and follow the rest of the README\n","\u003Ch2 align=\"center\">ARES：一种用于检索增强生成系统的自动化评估框架\u003C\u002Fh2>\n\n\u003Cp align=\"center\">\n  \u003Ca>目录：\u003C\u002Fa>\n  \u003Ca href=\"#section1\">安装\u003C\u002Fa> |\n  \u003Ca href=\"#section2\">要求\u003C\u002Fa> |\n  \u003Ca href=\"#section3\">快速入门\u003C\u002Fa> |\n  \u003Ca href=\"#section4\">引用\u003C\u002Fa>\n\u003C\u002Fp>\n\n\n\u003Cp align=\"center\">\n\n  \u003Ca href=\"https:\u002F\u002Fpypi.org\u002Fproject\u002Fares-ai\u002F\">\n  \u003Cimg alt=\"静态徽章\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002Frelease-v0.5.7-blue?style=flat&link=https%3A%2F%2Fpython.org%2F\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.09476\">\n  \u003Cimg alt=\"静态徽章\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FRead-ARES%20Paper-blue?style=flat&link=https%3A%2F%2Farxiv.org%2Fabs%2F2311.09476\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Fares-ai.vercel.app\u002F\">\n    \u003Cimg alt=\"静态徽章\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FRead-documentation-purple?style=flat\">\n  \u003C\u002Fa>\n\n  \u003Ca href=\"https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1DvXr9SvWOw6xaNW8LHcy9C06LKevDPxe#scrollTo=wBDuO0n5c1mz\" target=\"_blank\">\n    \u003Cimg src=\"https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg\" alt=\"在Colab中打开\"\u002F>\n  \u003C\u002Fa>\n\n  \u003Ca>\n  \u003Cimg alt=\"静态徽章\" src=\"https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMade%20with-Python-red?style=flat&link=https%3A%2F%2Fpython.org%2F\">\n  \u003C\u002Fa>\n\n\u003C\u002Fp>\n\n\nARES 是一个用于评估检索增强生成（RAG）模型的开创性框架。该自动化流程结合了合成数据生成与微调分类器，能够高效地评估上下文相关性、答案忠实性和答案相关性，从而最大限度地减少对大量人工标注的需求。ARES 采用合成查询生成和预测驱动推理（PPI），在统计置信度下提供准确的评估结果。\n\n\n### 💬 小问答\n\u003Chr>\n\n**ARES 在 RAG 模型中评估哪些方面？**\n\nARES 
对检索增强生成（RAG）模型进行全面评估，重点考察系统在上下文相关性、答案忠实性和答案相关性方面的表现。这一全面的评估确保能够完整地了解 RAG 系统的性能。\n\n**ARES 如何实现评估过程的自动化？**\n\nARES 通过利用微调分类器和合成数据，最大限度地减少了人工标注的需求。其 PPI 组件——预测驱动推理，能够根据模型响应的变异性进一步优化评估，并为结果提供统计置信度。借助微调分类器和合成生成的数据，ARES 在大幅降低人工标注需求的同时，仍能提供准确可靠的评估结果。\n\n**ARES 能否用于我的自定义 RAG 模型？**\n\n当然可以。ARES 是一种与模型无关的工具，允许您从自己的文档中生成合成查询和答案。使用 ARES，您可以对这些由您的 RAG 模型生成的查询和答案进行评估。\n​\n### ⚙️ 安装\n\u003Ca id=\"section1\">\u003C\u002Fa>\n\u003Chr>\n​\n要安装 ARES，请运行以下命令：\n​\n\n```python\n\npip install ares-ai\n\n```\n​\n*可选：使用以下命令初始化 OpenAI 或 TogetherAI 的 API 密钥：*\n\n\n```python\n\nexport OPENAI_API_KEY=\u003Cyour key here>\nexport TOGETHER_API_KEY=\u003Cyour key here>\n\n```\n\n### 📝 要求\n\u003Ca id=\"section2\">\u003C\u002Fa>\n\u003Chr>\n\n要使用 ARES 对您的 RAG 系统进行评分并与其他 RAG 配置进行比较，您需要三个组成部分：\n\n* 一组经过人工标注的查询、文档和答案三元组组成的偏好验证集，用于评估各项指标（例如上下文相关性、答案忠实性和\u002F或答案相关性）。建议至少包含 50 个示例，但几百个示例更为理想。\n* 一组用于评分上下文相关性、答案忠实性和\u002F或答案相关性的少样本示例。\n* 一个规模更大的未标注查询-文档-答案三元组集合，由您的 RAG 系统输出，用于评分。\n\n\u003Ca id=\"section3\">\u003C\u002Fa>\n\u003Chr>\n\n要开始使用 ARES，您需要先设置配置。下面是一个 ARES 配置的示例！\n\n请逐条复制粘贴，即可体验 ARES 的实际效果！\n\n\u003Chr>\n\n### 📥 下载数据集\n\n\u003Chr>\n\n使用以下命令可快速获取入门所需的文件！其中包括用于人工评分和合成查询生成的“few_shot_prompt”文件，以及已标注和未标注的数据集。\n```python \nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_few_shot_prompt_for_judge_scoring.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_few_shot_prompt_for_synthetic_query_generation.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_labeled_output.tsv\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_unlabeled_output.tsv\n```\n\n可选：您还可以运行以下命令来获取完整的 NQ 数据集！（37.3 GB）\n```python\nfrom ares import ARES\nares = ARES() 
\nares.KILT_dataset(\"nq\")\n\n# 获取包括 0.5、0.6、0.7 等比例的 NQ 数据集。\n# 为了我们的快速入门指南，我们将 nq_ratio_0.5 重命名为 nq_unlabeled_output 和 nq_labeled_output。\n```\n\u003Chr>\n\n### 🚀 快速入门 - #1\n\n\u003Chr>\n\n要开始使用 ARES 的 PPI，您需要先设置配置。下面是一个 ARES 配置的示例！\n\n只需按步骤复制粘贴，即可看到 ARES 的实际效果！\n\n#### 步骤 1) 运行以下代码以使用 GPT-3.5 获取 UES\u002FIDP 分数！\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\" : \"gpt-3.5-turbo-0125\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n# {'Context Relevance Scores': [Score], 'Answer Faithfulness Scores': [Score], 'Answer Relevance Scores': [Score]}\n```\n\n#### 步骤 2) 运行以下代码以使用 GPT-3.5 获取 ARES 的 PPI 分数！\n\n\n```python\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"few_shot_examples_filepath\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"llm_judge\": \"gpt-3.5-turbo-1106\",\n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\", \n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n```\n\n\u003Chr>\n\n### 🚀 快速入门 - #2\n\n\u003Chr>\n\n#### 步骤 1) 运行以下代码以查看 GPT-3.5 在 NQ 未标注数据集上的准确率！\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\" : \"gpt-3.5-turbo-0125\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n\n# {'Context Relevance Scores': [Score], 'Answer Faithfulness Scores': [Score], 'Answer Relevance Scores': [Score]}\n```\n\n#### 第2步) 运行以下代码，查看ARES的合成数据生成效果！\n```python\n\nfrom ares import ARES\n\nsynth_config = { \n    \"document_filepaths\": [\"nq_labeled_output.tsv\"] ,\n    \"few_shot_prompt_filename\": \"nq_few_shot_prompt_for_synthetic_query_generation.tsv\",\n    \"synthetic_queries_filenames\": [\"synthetic_queries_1.tsv\"], \n 
   \"documents_sampled\": 6189\n}\n\nares_module = ARES(synthetic_query_generator=synth_config)\nresults = ares_module.generate_synthetic_data()\nprint(results)\n```\n\n\u003Chr>\n\n#### 第3步) 运行以下代码，查看ARES的训练分类器效果！\n```python\n\nfrom ares import ARES\n\nclassifier_config = {\n    \"training_dataset\": [\"synthetic_queries_1.tsv\"], \n    \"validation_set\": [\"nq_labeled_output.tsv\"], \n    \"label_column\": [\"Context_Relevance_Label\"], \n    \"num_epochs\": 10, \n    \"patience_value\": 3, \n    \"learning_rate\": 5e-6,\n    \"assigned_batch_size\": 1,  \n    \"gradient_accumulation_multiplier\": 32,  \n}\n\nares = ARES(classifier_model=classifier_config)\nresults = ares.train_classifier()\nprint(results)\n```\n\n注意：此代码会为训练好的分类器创建检查点。训练可能需要一些时间。您可在此处下载我们联合训练的上下文相关性检查点：\n[下载检查点](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1INyHfZpsUsn5UEBLSRehI9AX08AI12Lt\u002Fview?usp=sharing)\n\n\n\u003Chr>\n\n#### 第4步) 运行以下代码，查看ARES的PPI效果！\n```python\n\nfrom ares import ARES\n\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"checkpoints\": [\"Context_Relevance_Label_nq_labeled_output_date_time.pt\"], \n    \"rag_type\": \"question_answering\", \n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\", \n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n\n# 输出应为： \n\"\"\" \nContext_Relevance_Label Scoring\nARES Ranking\nARES Prediction: [0.6056978059262574]\nARES Confidence Interval: [[0.547, 0.664]]\nNumber of Examples in Evaluation Set: [4421]\nGround Truth Performance: [0.6]\nARES LLM Judge Accuracy on Ground Truth Labels: [0.789]\nAnnotated Examples used for PPI: 300\n\"\"\"\n\n```\n\n\u003Cbr>\n\n### 🚀 使用vLLM进行本地模型运行\n\nARES支持[vLLM](https:\u002F\u002Fgithub.com\u002Fvllm-project\u002Fvllm)，允许在本地运行LLM模型，从而提升隐私保护并实现离线运行。以下是使用vLLM运行ARES的UES\u002FIDP和PPI的步骤！\n\n#### 1) UES\u002FIDP w\u002F vLLM\n\n```python\nfrom ares import ARES\n\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\": \"meta-llama\u002FLlama-2-13b-hf\", # 指定vLLM模型\n    \"vllm\": True, # 将vLLM设置为True \n    \"host_url\": \"http:\u002F\u002F0.0.0.0:8000\u002Fv1\" # 
替换为托管模型的服务器地址，并加上\"\u002Fv1\"\n} \n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n```\n\n\u003Chr>\n\n#### 2) PPI w\u002F vLLM\n\n```python\nfrom ares import ARES\n\nppi_config = { \n    \"evaluation_datasets\": ['nq_unlabeled_output.tsv'], \n    \"few_shot_examples_filepath\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"llm_judge\": \"meta-llama\u002FLlama-2-13b-hf\", # 指定vLLM模型\n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\",\n    \"vllm\": True, # 将vLLM设置为True \n    \"host_url\": \"http:\u002F\u002F0.0.0.0:8000\u002Fv1\" # 替换为托管模型的服务器地址，并加上\"\u002Fv1\"\n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n```\n\n更多详情请参阅我们的[文档](https:\u002F\u002Fares-ai.vercel.app\u002F)。\n\n\u003Cbr>\n\n## 结果复现\n\n我们在`synthetic_datasets`中包含了关键实验结果的合成数据集。用于生成和评估的少样本提示包含在`datasets`中。此外，论文中还提供了微调LLM法官的说明。如有任何疑问，请联系jonsaadfalcon@stanford.edu或manihani@stanford.edu。\n\n## 引用\n\u003Ca id=\"section4\">\u003C\u002Fa>\n\n如需引用我们的工作，请使用以下BibTeX格式：\n\n````\n@misc{saadfalcon2023ares,\n      title={ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems}, \n      author={Jon Saad-Falcon and Omar Khattab and Christopher Potts and Matei Zaharia},\n      year={2023},\n      eprint={2311.09476},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n````\n\n# 附录\n### 不使用OpenAI API时的机器要求与设置\n**机器要求**\n\n- 硬盘可用空间超过100 GB\n- GPU\n    - 可用：A100（例如Azure上的`Standard_NC24ads_A100_v4`）\n    - 不可用：\n        - 我们于2023年12月17日分别测试了`Standard_NC6s_v3`和`Standard_NC12s_v3`，均遇到了以下错误：`torch.cuda.OutOfMemoryError: CUDA out of memory. 
Tried to allocate 160.00 MiB (GPU 0; 15.77 GiB total capacity; 15.12 GiB already allocated; 95.44 MiB free; 15.12 GiB reserved in total by PyTorch)`\n\n\n**机器设置**\n\n以运行Linux（ubuntu 20.04）的Azure虚拟机为例，您需要执行以下操作：\n- 安装conda\n    - 第一组命令（可复制多行）\n        - `wget https:\u002F\u002Frepo.anaconda.com\u002Fminiconda\u002FMiniconda3-latest-Linux-x86_64.sh`\n        - `chmod +x Miniconda3-latest-Linux-x86_64.sh`\n        - `.\u002FMiniconda3-latest-Linux-x86_64.sh -b`\n    - 第二组命令（可复制多行）\n        - `export PATH=\"~\u002Fminiconda3\u002Fbin:$PATH\"`\n        - `conda init`\n- 安装gcc\n    - `sudo apt-get -y update`\n    - `sudo apt-get -y upgrade`\n    - `sudo apt-get -y install build-essential`\n    - `sudo apt-get -y install libpcre3-dev`\n- 安装NVIDIA驱动程序\n    - `sudo apt install ubuntu-drivers-common -y`\n    - `sudo ubuntu-drivers autoinstall`\n    - `sudo reboot`\n    - 再次通过SSH登录，并运行`nvidia-smi`确认安装成功。\n- 切换到ARES文件夹，并按照README中的其余步骤操作。","# ARES 快速上手指南\n\n## 环境准备\n### 系统要求\n- Python 3.8+ 环境\n- 至少 100GB 可用磁盘空间\n- GPU 支持（推荐 A100 或更高显存型号，如 Azure 的 `Standard_NC24ads_A100_v4`）\n- Linux 或 macOS 系统\n\n### 前置依赖\n```bash\n# 安装依赖（推荐使用国内镜像加速）\npip install -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple torch transformers\n```\n\n## 安装步骤\n```bash\n# 安装 ARES 框架\npip install ares-ai\n\n# 可选：配置 API 密钥（需替换为实际密钥）\nexport OPENAI_API_KEY=\u003Cyour_key_here>\nexport TOGETHER_API_KEY=\u003Cyour_key_here>\n```\n\n## 基本使用\n### 快速运行示例\n```python\nfrom ares import ARES\n\n# UES\u002FIDP 评估示例\nues_idp_config = {\n    \"in_domain_prompts_dataset\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    \"unlabeled_evaluation_set\": \"nq_unlabeled_output.tsv\", \n    \"model_choice\": \"gpt-3.5-turbo-0125\"\n}\n\nares = ARES(ues_idp=ues_idp_config)\nresults = ares.ues_idp()\nprint(results)\n```\n\n```python\n# PPI 评估示例\nppi_config = { \n    \"evaluation_datasets\": [\"nq_unlabeled_output.tsv\"], \n    \"few_shot_examples_filepath\": \"nq_few_shot_prompt_for_judge_scoring.tsv\",\n    
\"llm_judge\": \"gpt-3.5-turbo-1106\",\n    \"labels\": [\"Context_Relevance_Label\"], \n    \"gold_label_path\": \"nq_labeled_output.tsv\", \n}\n\nares = ARES(ppi=ppi_config)\nresults = ares.evaluate_RAG()\nprint(results)\n```\n\n> 注意：需提前下载示例数据文件（`nq_few_shot_prompt_for_judge_scoring.tsv` 等），可通过以下命令获取：\n```bash\nwget https:\u002F\u002Fraw.githubusercontent.com\u002Fstanford-futuredata\u002FARES\u002Fmain\u002Fdatasets\u002Fexample_files\u002Fnq_few_shot_prompt_for_judge_scoring.tsv\n```","某智能客服系统开发团队在部署基于RAG的问答系统时，需要持续优化模型性能。他们需定期评估不同版本的检索-生成模块，但传统评估流程存在明显瓶颈。\n\n### 没有 ARES 时\n- 人工标注成本高昂：每轮评估需雇佣5名工程师标注200个query-document-answer对，耗时3天且成本超万元\n- 评估结果稳定性差：不同标注者对\"答案忠实性\"的判断标准差异导致评估结果波动达25%\n- 模型迭代效率低下：无法快速验证新检索策略效果，每次调优需等待人工标注完成\n- 缺乏统计置信度：手动抽样分析无法提供显著性检验结果，决策依赖主观判断\n\n### 使用 ARES 后\n- 标注成本降低90%：通过合成数据生成和微调分类器，2小时完成2000个样本的自动评估\n- 评估结果更可靠：PPI机制使关键指标波动率降至5%以内，提供置信区间和显著性检验\n- 迭代速度提升4倍：新策略验证周期从3天缩短至6小时，支持每日多次A\u002FB测试\n- 决策依据更科学：自动生成的评估报告包含上下文相关性、答案忠实性等多维指标趋势分析\n\nARES通过自动化评估流程将RAG系统优化效率提升至传统方法的5倍以上，使团队能更聚焦于模型架构创新。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fstanford-futuredata_ARES_c30e2148.png","stanford-futuredata","Future Data Systems","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fstanford-futuredata_1afa6134.png","We are a CS research group building data-intensive systems",null,"https:\u002F\u002Fgithub.com\u002Fstanford-futuredata",[82],{"name":83,"color":84,"percentage":85},"Python","#3572A5",100,699,68,"2026-03-21T09:45:40","Apache-2.0","Linux","需要 NVIDIA GPU，显存 8GB+","16GB+",{"notes":94,"python":95,"dependencies":96},"建议使用 conda 管理环境，首次运行需下载约 5GB 
模型文件","3.8+",[97,98,99,100],"torch>=2.0","transformers>=4.30","accelerate","vllm",[13,54],"2026-03-27T02:49:30.150509","2026-04-06T05:44:21.088236",[105,110,115,120,124,129],{"id":106,"question_zh":107,"answer_zh":108,"source_url":109},5143,"无法在Mac上导入ares库如何解决？","由于vllm库不支持Mac系统，需使用Colab笔记本运行或尝试Triton编译。维护者已更新Colab笔记本链接：https:\u002F\u002Fcolab.research.google.com\u002Fdrive\u002F1DvXr9SvWOw6xaNW8LHcy9C06LKevDPxe#scrollTo=ymP8Ne5yLo06","https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES\u002Fissues\u002F54",{"id":111,"question_zh":112,"answer_zh":113,"source_url":114},5144,"如何解决PPI评估只评估第一个数据集的问题？","该问题已通过PR #51修复，需确保使用最新代码库版本。若仍存在问题，请检查代码中是否存在提前return语句，需移除循环内的return保证多数据集迭代","https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES\u002Fissues\u002F44",{"id":116,"question_zh":117,"answer_zh":118,"source_url":119},5145,"运行PPI时得到重复的评估结果如何解决？","维护者已修复上下文相关检查点问题，需使用更新后的Colab笔记本和最新检查点链接：Context Relevance https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1yg1q...（需完整链接）。请确保使用修复后的代码版本","https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES\u002Fissues\u002F65",{"id":121,"question_zh":122,"answer_zh":123,"source_url":109},5146,"如何解决教程无法运行的错误？","Colab笔记本已更新至最新版本，需使用维护者提供的新链接运行。本地环境需安装vllm（仅限Linux）或通过Triton编译替代方案，当前Mac用户无法直接安装vllm",{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},5147,"如何正确配置合成数据生成？","需使用new-dev分支的抽象层代码，确保文档中的文件路径与实际代码一致。示例文件需包含'Document'列，若使用旧版代码需注意Query\u002FAnswer列的新增依赖","https:\u002F\u002Fgithub.com\u002Fstanford-futuredata\u002FARES\u002Fissues\u002F23",{"id":130,"question_zh":131,"answer_zh":132,"source_url":128},5148,"文档与代码不一致导致的错误如何处理？","维护者建议使用new-dev分支替代主分支，该分支重构了代码结构。需注意文档中的路径差异（如data与\u002Fdata）、文件名拼写错误（如nq_ratio_0.5_.tsv）及模型配置信息的准确性",[]]