[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-gkamradt--LLMTest_NeedleInAHaystack":3,"tool-gkamradt--LLMTest_NeedleInAHaystack":65},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[13,14,15],"开发框架","Agent","语言模型","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,3,"2026-04-06T11:19:32",[15,26,14,13],"图像",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":10,"last_commit_at":33,"category_tags":34,"status":16},8553,"spec-kit","github\u002Fspec-kit","Spec Kit 是一款专为提升软件开发效率而设计的开源工具包，旨在帮助团队快速落地“规格驱动开发”（Spec-Driven Development）模式。传统开发中，需求文档往往与代码实现脱节，导致沟通成本高且结果不可控；而 Spec Kit 通过将规格说明书转化为可执行的指令，让 AI 直接依据明确的业务场景生成高质量代码，从而减少从零开始的随意编码，确保产出结果的可预测性。\n\n该工具特别适合希望利用 AI 辅助编程的开发者、技术负责人及初创团队。无论是启动全新项目还是在现有工程中引入规范化流程，用户只需通过简单的命令行操作，即可初始化项目并集成主流的 AI 编程助手。其核心技术亮点在于“规格即代码”的理念，支持社区扩展与预设模板，允许用户根据特定技术栈定制开发流程。此外，Spec Kit 强调官方维护的安全性，提供稳定的版本管理，帮助开发者在享受 AI 红利的同时，依然牢牢掌握架构设计的主动权，真正实现从“凭感觉写代码”到“按规格建系统”的转变。",88749,"2026-04-17T09:48:14",[15,26,14,13],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":10,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,15],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":10,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85267,"2026-04-18T11:00:28",[26,51,52,53,14,54,15,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":62,"last_commit_at":63,"category_tags":64,"status":16},5784,"funNLP","fighting41love\u002FfunNLP","funNLP 是一个专为中文自然语言处理（NLP）打造的超级资源库，被誉为\"NLP 民工的乐园”。它并非单一的软件工具，而是一个汇集了海量开源项目、数据集、预训练模型和实用代码的综合性平台。\n\n面对中文 NLP 领域资源分散、入门门槛高以及特定场景数据匮乏的痛点，funNLP 提供了“一站式”解决方案。这里不仅涵盖了分词、命名实体识别、情感分析、文本摘要等基础任务的标准工具，还独特地收录了丰富的垂直领域资源，如法律、医疗、金融行业的专用词库与数据集，甚至包含古诗词生成、歌词创作等趣味应用。其核心亮点在于极高的全面性与实用性，从基础的字典词典到前沿的 BERT、GPT-2 模型代码，再到高质量的标注数据和竞赛方案，应有尽有。\n\n无论是刚刚踏入 NLP 领域的学生、需要快速验证想法的算法工程师，还是从事人工智能研究的学者，都能在这里找到急需的“武器弹药”。对于开发者而言，它能大幅减少寻找数据和复现模型的时间；对于研究者，它提供了丰富的基准测试资源和前沿技术参考。funNLP 以开放共享的精神，极大地降低了中文自然语言处理的开发与研究成本，是中文 AI 社区不可或缺的宝藏仓库。",79857,1,"2026-04-08T20:11:31",[15,51,54],{"id":66,"github_repo":67,"name":68,"description_en":69,"description_zh":70,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":77,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":77,"owner_url":79,"languages":80,"stars":93,"forks":94,"last_commit_at":95,"license":96,"difficulty_score":10,"env_os":97,"env_gpu":98,"env_ram":99,"env_deps":100,"category_tags":106,"github_topics":77,"view_count":10,"oss_zip_url":77,"oss_zip_packed_at":77,"status":16,"created_at":107,"updated_at":108,"faqs":109,"releases":125},9171,"gkamradt\u002FLLMTest_NeedleInAHaystack","LLMTest_NeedleInAHaystack","Doing simple retrieval from LLM models at various context lengths to measure accuracy","LLMTest_NeedleInAHaystack 是一款专为评估大语言模型长文本检索能力而设计的开源测试工具。它的核心原理是经典的“大海捞针”实验：将一句关键信息（针）隐藏在不同长度的长篇文档（草堆）中的不同位置，然后要求模型精准找出这句话，以此量化模型在长上下文场景下的记忆与提取准确率。\n\n该工具主要解决了当前大模型在处理超长输入时，容易忽略中间细节或产生幻觉的痛点。通过自动化地在多种文档深度和上下文长度下进行压力测试，它能直观地揭示模型性能随文本长度增加而变化的趋势，帮助开发者判断模型是否真正具备处理长文档的能力。\n\n这款工具非常适合 AI 研究人员、大模型开发者以及需要评估模型长文本表现的技术团队使用。其独特亮点在于支持 OpenAI、Anthropic 和 Cohere 等多家主流模型提供商，并允许用户灵活配置测试参数，如上下文长度范围和关键信息插入位置。此外，它还提供了可视化的结果分析能力，让复杂的评估数据一目了然。无论是为了选型对比还是模型调优，LLMTest_NeedleInAHaystack 都能提供科学、可复现的基准测试数据，是探索长上下文模型极限的得力助手。","# Needle In A Haystack - Pressure Testing LLMs\n\nA simple 'needle in a haystack' analysis to test in-context retrieval ability of long context LLMs.\n\nSupported model providers: OpenAI, Anthropic, Cohere\n\nGet the behind the scenes on the [overview video](https:\u002F\u002Fyoutu.be\u002FKwRRuiCCdmc).\n\n![GPT-4-128 Context Testing](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_1387e9e3f781.png)\n\n## The Test\n\n1. Place a random fact or statement (the 'needle') in the middle of a long context window (the 'haystack')\n2. Ask the model to retrieve this statement\n3. Iterate over various document depths (where the needle is placed) and context lengths to measure performance\n\nThis is the code that backed [this OpenAI](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1722386725635580292) and [Anthropic analysis](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1727018183608193393).\n\nThe results from the original tests are in `\u002Foriginal_results`. The script has upgraded a lot since those test were ran so the data formats may not match your script results.\n\n## Getting Started\n\n### Setup Virtual Environment\n\nWe recommend setting up a virtual environment to isolate Python dependencies, ensuring project-specific packages without conflicting with system-wide installations.\n\n```zsh\npython3 -m venv venv\nsource venv\u002Fbin\u002Factivate\n```\n\n### Environment Variables\n\n- `NIAH_MODEL_API_KEY` - API key for interacting with the model. Depending on the provider, this gets used appropriately with the correct sdk.\n- `NIAH_EVALUATOR_API_KEY` - API key to use if `openai` evaluation strategy is used.\n\n### Install Package\n\nInstall the package from PyPi:\n\n```zsh\npip install needlehaystack\n```\n\n### Run Test\n\nStart using the package by calling the entry point `needlehaystack.run_test` from command line.\n\nYou can then run the analysis on OpenAI, Anthropic, or Cohere models with the following command line arguments:\n\n- `provider` - The provider of the model, available options are `openai`, `anthropic`, and `cohere`. Defaults to `openai`\n- `evaluator` - The evaluator, which can either be a `model` or `LangSmith`. See more on `LangSmith` below. If using a `model`, only `openai` is currently supported. Defaults to `openai`.\n- `model_name` - Model name of the language model accessible by the provider. Defaults to `gpt-3.5-turbo-0125`\n- `evaluator_model_name` - Model name of the language model accessible by the evaluator. Defaults to `gpt-3.5-turbo-0125`\n\nAdditionally, `LLMNeedleHaystackTester` parameters can also be passed as command line arguments, except `model_to_test` and `evaluator`.\n\nHere are some example use cases.\n\nFollowing command runs the test for openai model `gpt-3.5-turbo-0125` for a single context length of 2000 and single document depth of 50%.\n\n```zsh\nneedlehaystack.run_test --provider openai --model_name \"gpt-3.5-turbo-0125\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\nFollowing command runs the test for anthropic model `claude-2.1` for a single context length of 2000 and single document depth of 50%.\n\n```zsh\nneedlehaystack.run_test --provider anthropic --model_name \"claude-2.1\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\nFollowing command runs the test for cohere model `command-r` for a single context length of 2000 and single document depth of 50%.\n\n```zsh\nneedlehaystack.run_test --provider cohere --model_name \"command-r\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n### For Contributors\n\n1. Fork and clone the repository.\n2. Create and activate the virtual environment as described above.\n3. Set the environment variables as described above.\n4. Install the package in editable mode by running the following command from repository root:\n\n```zsh\npip install -e .\n```\n\nThe package `needlehaystack` is available for import in your test cases. Develop, make changes and test locally.\n\n## `LLMNeedleHaystackTester` parameters:\n\n- `model_to_test` - The model to run the needle in a haystack test on. Default is None.\n- `evaluator` - An evaluator to evaluate the model's response. Default is None.\n- `needle` - The statement or fact which will be placed in your context ('haystack')\n- `haystack_dir` - The directory which contains the text files to load as background context. Only text files are supported\n- `retrieval_question` - The question with which to retrieve your needle in the background context\n- `results_version` - You may want to run your test multiple times for the same combination of length\u002Fdepth, change the version number if so\n- `num_concurrent_requests` - Default: 1. Set higher if you'd like to run more requests in parallel. Keep in mind rate limits.\n- `save_results` - Whether or not you'd like to save your results to file. They will be temporarily saved in the object regardless. True\u002FFalse. If `save_results = True`, then this script will populate a `result\u002F` directory with evaluation information. Due to potential concurrent requests each new test will be saved as a few file.\n- `save_contexts` - Whether or not you'd like to save your contexts to file. **Warning** these will get very long. True\u002FFalse\n- `final_context_length_buffer` - The amount of context to take off each input to account for system messages and output tokens. This can be more intelligent but using a static value for now. Default 200 tokens.\n- `context_lengths_min` - The starting point of your context lengths list to iterate\n- `context_lengths_max` - The ending point of your context lengths list to iterate\n- `context_lengths_num_intervals` - The number of intervals between your min\u002Fmax to iterate through\n- `context_lengths` - A custom set of context lengths. This will override the values set for `context_lengths_min`, max, and intervals if set\n- `document_depth_percent_min` - The starting point of your document depths. Should be int > 0\n- `document_depth_percent_max` - The ending point of your document depths. Should be int \u003C 100\n- `document_depth_percent_intervals` - The number of iterations to do between your min\u002Fmax points\n- `document_depth_percents` - A custom set of document depths lengths. This will override the values set for `document_depth_percent_min`, max, and intervals if set\n- `document_depth_percent_interval_type` - Determines the distribution of depths to iterate over. 'linear' or 'sigmoid\n- `seconds_to_sleep_between_completions` - Default: None, set # of seconds if you'd like to slow down your requests\n- `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete\n\n`LLMMultiNeedleHaystackTester` parameters:\n\n- `multi_needle` - True or False, whether to run multi-needle\n- `needles` - List of needles to insert in the context\n\nOther Parameters:\n\n- `model_name` - The name of the model you'd like to use. Should match the exact value which needs to be passed to the api. Ex: For OpenAI inference and evaluator models it would be `gpt-3.5-turbo-0125`.\n\n## Results Visualization\n\n`LLMNeedleInHaystackVisualization.ipynb` holds the code to make the pivot table visualization. The pivot table was then transferred to Google Slides for custom annotations and formatting. See the [google slides version](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F15JEdEBjm32qBbqeYM6DK6G-3mUJd7FAJu-qEzj8IYLQ\u002Fedit?usp=sharing). See an overview of how this viz was created [here](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1729573848893579488).\n\n## OpenAI's GPT-4-128K (Run 11\u002F8\u002F2023)\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_c193816c747b.png\" alt=\"GPT-4-128 Context Testing\" width=\"800\"\u002F>\n\n## Anthropic's Claude 2.1 (Run 11\u002F21\u002F2023)\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_8435aa4ef129.png\" alt=\"GPT-4-128 Context Testing\" width=\"800\"\u002F>\n\n## Multi Needle Evaluator\n\nTo enable multi-needle insertion into our context, use `--multi_needle True`.\n\nThis inserts the first needle at the specified `depth_percent`, then evenly distributes subsequent needles through the remaining context after this depth.\n\nFor even spacing, it calculates the `depth_percent_interval` as:\n\n```\ndepth_percent_interval = (100 - depth_percent) \u002F len(self.needles)\n```\n\nSo, the first needle is placed at a depth percent of `depth_percent`, the second at `depth_percent + depth_percent_interval`, the third at `depth_percent + 2 * depth_percent_interval`, and so on.\n\nFollowing example shows the depth percents for the case of 10 needles and depth_percent of 40%.\n\n```\ndepth_percent_interval = (100 - 40) \u002F 10 = 6\n\nNeedle 1: 40\nNeedle 2: 40 + 6 = 46\nNeedle 3: 40 + 2 * 6 = 52\nNeedle 4: 40 + 3 * 6 = 58\nNeedle 5: 40 + 4 * 6 = 64\nNeedle 6: 40 + 5 * 6 = 70\nNeedle 7: 40 + 6 * 6 = 76\nNeedle 8: 40 + 7 * 6 = 82\nNeedle 9: 40 + 8 * 6 = 88\nNeedle 10: 40 + 9 * 6 = 94\n```\n\n## LangSmith Evaluator\n\nYou can use LangSmith to orchestrate evals and store results.\n\n(1) Sign up for [LangSmith](https:\u002F\u002Fdocs.smith.langchain.com\u002Fsetup)\n(2) Set env variables for LangSmith as specified in the setup.\n(3) In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` to start.\n(4) Populate the dataset with a test question:\n\n```\nquestion: What are the 5 best things to do in San Franscisco?\nanswer: \"The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge\"\n```\n\n![Screenshot 2024-03-05 at 4 54 15 PM](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_91655fdd3fb7.png)\n(5) Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.\n\nLet's see all these working together on a new dataset, `multi-needle-eval-pizza`.\n\nHere is the `multi-needle-eval-pizza` eval set, which has a question and reference answer. You can also and resulting runs:\nhttps:\u002F\u002Fsmith.langchain.com\u002Fpublic\u002F74d2af1c-333d-4a73-87bc-a837f8f0f65c\u002Fd\n\nHere is the command to run this using multi-needle eval and passing the relevant needles:\n\n```\nneedlehaystack.run_test --evaluator langsmith --context_lengths_num_intervals 3 --document_depth_percent_intervals 3 --provider openai --model_name \"gpt-4-0125-preview\" --multi_needle True --eval_set multi-needle-eval-pizza --needles '[\"Figs are one of the three most delicious pizza toppings.\", \"Prosciutto is one of the three most delicious pizza toppings.\", \"Goat cheese is one of the three most delicious pizza toppings.\"]'\n```\n\n## License\n\nThis project is licensed under the MIT License - see the [LICENSE](LICENSE.txt) file for details. Use of this software requires attribution to the original author and project, as detailed in the license.\n","# 稻草堆里的针 - LLM 压力测试\n\n一种简单的“稻草堆里的针”分析，用于测试长上下文 LLM 的上下文内检索能力。\n\n支持的模型提供商：OpenAI、Anthropic、Cohere\n\n观看[概述视频](https:\u002F\u002Fyoutu.be\u002FKwRRuiCCdmc)，了解幕后详情。\n\n![GPT-4-128 上下文测试](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_1387e9e3f781.png)\n\n## 测试内容\n\n1. 将一个随机的事实或陈述（即“针”）放置在长上下文窗口的中间（即“稻草堆”）。\n2. 要求模型检索该陈述。\n3. 遍历不同的文档深度（即“针”的位置）和上下文长度，以衡量性能。\n\n这段代码支持了[这次 OpenAI 分析](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1722386725635580292)和[Anthropic 分析](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1727018183608193393)。\n\n原始测试的结果位于 `\u002Foriginal_results` 目录中。自这些测试运行以来，脚本已经进行了大量升级，因此数据格式可能与您当前脚本的结果不一致。\n\n## 快速开始\n\n### 设置虚拟环境\n\n我们建议设置一个虚拟环境来隔离 Python 依赖项，确保项目专用的包不会与系统范围的安装发生冲突。\n\n```zsh\npython3 -m venv venv\nsource venv\u002Fbin\u002Factivate\n```\n\n### 环境变量\n\n- `NIAH_MODEL_API_KEY` - 用于与模型交互的 API 密钥。根据提供商的不同，此密钥将与相应的 SDK 正确配合使用。\n- `NIAH_EVALUATOR_API_KEY` - 如果使用 `openai` 评估策略，则需要此 API 密钥。\n\n### 安装包\n\n从 PyPi 安装该包：\n\n```zsh\npip install needlehaystack\n```\n\n### 运行测试\n\n通过命令行调用入口点 `needlehaystack.run_test` 即可开始使用该包。\n\n然后，您可以使用以下命令行参数对 OpenAI、Anthropic 或 Cohere 的模型进行分析：\n\n- `provider` - 模型的提供商，可选值为 `openai`、`anthropic` 和 `cohere`。默认值为 `openai`。\n- `evaluator` - 评估器，可以是 `model` 或 `LangSmith`。有关 `LangSmith` 的更多信息请见下文。如果使用 `model`，目前仅支持 `openai`。默认值为 `openai`。\n- `model_name` - 提供商可访问的语言模型名称。默认值为 `gpt-3.5-turbo-0125`。\n- `evaluator_model_name` - 评估器可访问的语言模型名称。默认值为 `gpt-3.5-turbo-0125`。\n\n此外，`LLMNeedleHaystackTester` 的参数也可以作为命令行参数传递，但 `model_to_test` 和 `evaluator` 除外。\n\n以下是一些示例用法。\n\n以下命令针对 OpenAI 模型 `gpt-3.5-turbo-0125` 运行测试，上下文长度为 2000，文档深度为 50%。\n\n```zsh\nneedlehaystack.run_test --provider openai --model_name \"gpt-3.5-turbo-0125\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n以下命令针对 Anthropic 模型 `claude-2.1` 运行测试，上下文长度为 2000，文档深度为 50%。\n\n```zsh\nneedlehaystack.run_test --provider anthropic --model_name \"claude-2.1\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n以下命令针对 Cohere 模型 `command-r` 运行测试，上下文长度为 2000，文档深度为 50%。\n\n```zsh\nneedlehaystack.run_test --provider cohere --model_name \"command-r\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n### 贡献者指南\n\n1. 分支并克隆仓库。\n2. 按照上述说明创建并激活虚拟环境。\n3. 按照上述说明设置环境变量。\n4. 在仓库根目录下运行以下命令，以可编辑模式安装该包：\n\n```zsh\npip install -e .\n```\n\n现在您可以在测试用例中导入 `needlehaystack` 包。开发、修改并在本地进行测试。\n\n## `LLMNeedleHaystackTester` 参数：\n\n- `model_to_test` - 要进行“稻草堆里的针”测试的模型。默认值为 None。\n- `evaluator` - 用于评估模型响应的评估器。默认值为 None。\n- `needle` - 将被放置在您的上下文中（即“稻草堆”）的陈述或事实。\n- `haystack_dir` - 包含要加载为背景上下文的文本文件的目录。仅支持文本文件。\n- `retrieval_question` - 用于在背景上下文中检索“针”的问题。\n- `results_version` - 如果您希望对相同的长度\u002F深度组合多次运行测试，请更改版本号。\n- `num_concurrent_requests` - 默认值为 1。如果您希望并行运行更多请求，请将其设置为更高的值。请注意速率限制。\n- `save_results` - 是否要将结果保存到文件。无论是否保存，结果都会暂时存储在对象中。True\u002FFalse。如果 `save_results = True`，则此脚本会填充一个 `result\u002F` 目录，其中包含评估信息。由于可能存在并发请求，每次新测试都会保存为几个文件。\n- `save_contexts` - 是否要将上下文保存到文件。**警告**：这些文件会非常大。True\u002FFalse。\n- `final_context_length_buffer` - 用于扣除每个输入中的系统消息和输出标记的上下文量。目前采用静态值，未来可能会更智能。默认值为 200 个标记。\n- `context_lengths_min` - 您要遍历的上下文长度列表的起点。\n- `context_lengths_max` - 您要遍历的上下文长度列表的终点。\n- `context_lengths_num_intervals` - 您要在最小值和最大值之间遍历的区间数量。\n- `context_lengths` - 自定义的上下文长度集合。如果已设置，则会覆盖 `context_lengths_min`、`max` 和 `intervals` 的值。\n- `document_depth_percent_min` - 您文档深度的起点。应为大于 0 的整数。\n- `document_depth_percent_max` - 您文档深度的终点。应为小于 100 的整数。\n- `document_depth_percent_intervals` - 您要在最小值和最大值之间进行的迭代次数。\n- `document_depth_percents` - 自定义的文档深度长度集合。如果已设置，则会覆盖 `document_depth_percent_min`、`max` 和 `intervals` 的值。\n- `document_depth_percent_interval_type` - 决定要遍历的深度分布。“linear” 或 “sigmoid”。\n- `seconds_to_sleep_between_completions` - 默认值为 None，如果您希望减缓请求速度，请设置秒数。\n- `print_ongoing_status` - 默认值为 True，表示是否在测试完成时打印状态。\n\n`LLMMultiNeedleHaystackTester` 参数：\n\n- `multi_needle` - 是否运行多针测试。True 或 False。\n- `needles` - 要插入上下文中的针的列表。\n\n其他参数：\n\n- `model_name` - 您想要使用的模型名称。应与需要传递给 API 的确切值匹配。例如：对于 OpenAI 的推理和评估模型，应为 `gpt-3.5-turbo-0125`。\n\n## 结果可视化\n\n`LLMNeedleInHaystackVisualization.ipynb` 包含用于生成透视表可视化的代码。随后，该透视表被导入 Google Slides 进行自定义标注和格式化。请参阅 [Google Slides 版本](https:\u002F\u002Fdocs.google.com\u002Fpresentation\u002Fd\u002F15JEdEBjm32qBbqeYM6DK6G-3mUJd7FAJu-qEzj8IYLQ\u002Fedit?usp=sharing)。有关此可视化创建过程的概述，请参见 [此处](https:\u002F\u002Ftwitter.com\u002FGregKamradt\u002Fstatus\u002F1729573848893579488)。\n\n## OpenAI 的 GPT-4-128K（2023年11月8日运行）\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_c193816c747b.png\" alt=\"GPT-4-128 上下文测试\" width=\"800\"\u002F>\n\n## Anthropic 的 Claude 2.1（2023年11月21日运行）\n\n\u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_8435aa4ef129.png\" alt=\"GPT-4-128 上下文测试\" width=\"800\"\u002F>\n\n## 多针评估器\n\n为在上下文中插入多根“针”，请使用 `--multi_needle True`。\n\n此选项会将第一根“针”放置在指定的 `depth_percent` 深度处，然后在该深度之后的剩余上下文中均匀分布后续的“针”。\n\n为了实现均匀间隔，它会计算 `depth_percent_interval` 如下：\n\n```\ndepth_percent_interval = (100 - depth_percent) \u002F len(self.needles)\n```\n\n因此，第一根“针”位于 `depth_percent` 百分比深度处，第二根位于 `depth_percent + depth_percent_interval`，第三根位于 `depth_percent + 2 * depth_percent_interval`，以此类推。\n\n以下示例展示了在 10 根“针”和 `depth_percent` 为 40% 的情况下，各根“针”的深度百分比：\n\n```\ndepth_percent_interval = (100 - 40) \u002F 10 = 6\n\n针 1: 40\n针 2: 40 + 6 = 46\n针 3: 40 + 2 * 6 = 52\n针 4: 40 + 3 * 6 = 58\n针 5: 40 + 4 * 6 = 64\n针 6: 40 + 5 * 6 = 70\n针 7: 40 + 6 * 6 = 76\n针 8: 40 + 7 * 6 = 82\n针 9: 40 + 8 * 6 = 88\n针 10: 40 + 9 * 6 = 94\n```\n\n## LangSmith 评估器\n\n您可以使用 LangSmith 来编排评估并存储结果。\n\n(1) 注册 [LangSmith](https:\u002F\u002Fdocs.smith.langchain.com\u002Fsetup)  \n(2) 按照设置说明配置 LangSmith 的环境变量。  \n(3) 在“数据集 + 测试”选项卡中，使用“+ 数据集”创建一个新数据集，例如命名为 `multi-needle-eval-sf`。  \n(4) 向数据集中添加一个测试问题：\n\n```\n问题：旧金山有哪些最佳游玩体验？  \n答案：“旧金山的最佳游玩体验包括：1) 去多洛雷斯公园；2) 在托尼那不勒斯披萨店用餐；3) 参观恶魔岛；4) 徒步登上双峰山；5) 骑自行车穿越金门大桥。”\n```\n\n![截图 2024-03-05 下午 4:54:15](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_readme_91655fdd3fb7.png)  \n(5) 使用 `--evaluator langsmith` 和 `--eval_set multi-needle-eval-sf` 运行，以针对我们刚刚创建的评估集进行测试。\n\n让我们在一个新的数据集 `multi-needle-eval-pizza` 上一起看看这些功能如何协同工作。\n\n以下是 `multi-needle-eval-pizza` 评估集，其中包含一个问题和参考答案。您也可以查看相关的运行结果：\nhttps:\u002F\u002Fsmith.langchain.com\u002Fpublic\u002F74d2af1c-333d-4a73-87bc-a837f8f0f65c\u002Fd\n\n以下是使用多针评估并传递相关“针”的命令：\n\n```\nneedlehaystack.run_test --evaluator langsmith --context_lengths_num_intervals 3 --document_depth_percent_intervals 3 --provider openai --model_name \"gpt-4-0125-preview\" --multi_needle True --eval_set multi-needle-eval-pizza --needles '[\"无花果是三种最美味的披萨配料之一。\", \"意大利生火腿是三种最美味的披萨配料之一。\", \"山羊奶酪是三种最美味的披萨配料之一。\"]'\n```\n\n## 许可证\n\n本项目采用 MIT 许可证授权——详情请参阅 [LICENSE](LICENSE.txt) 文件。使用本软件需按照许可证中的规定，注明原作者和项目的归属。","# LLMTest_NeedleInAHaystack 快速上手指南\n\n**LLMTest_NeedleInAHaystack** 是一个用于压力测试大语言模型（LLM）长上下文检索能力的开源工具。它通过“大海捞针”（Needle In A Haystack）的方法，将特定事实（针）隐藏在长文本（草堆）的不同位置，测试模型能否准确检索该信息。支持 OpenAI、Anthropic 和 Cohere 等主流模型提供商。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**：Linux, macOS 或 Windows (WSL 推荐)\n*   **Python 版本**：Python 3.8 或更高版本\n*   **API Key**：\n    *   `NIAH_MODEL_API_KEY`：目标测试模型的 API Key（如 OpenAI, Anthropic, Cohere）。\n    *   `NIAH_EVALUATOR_API_KEY`：如果使用 OpenAI 作为评估器，需提供其 API Key。\n\n建议创建虚拟环境以隔离依赖：\n\n```zsh\npython3 -m venv venv\nsource venv\u002Fbin\u002Factivate\n```\n\n设置环境变量（以 zsh\u002Fbash 为例）：\n\n```zsh\nexport NIAH_MODEL_API_KEY=\"your_model_api_key\"\nexport NIAH_EVALUATOR_API_KEY=\"your_evaluator_api_key\"\n```\n\n## 安装步骤\n\n您可以直接从 PyPI 安装该工具包：\n\n```zsh\npip install needlehaystack\n```\n\n> **提示**：国内开发者若遇到下载速度慢的问题，可使用清华或阿里镜像源加速安装：\n> ```zsh\n> pip install needlehaystack -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 基本使用\n\n安装完成后，可通过命令行入口 `needlehaystack.run_test` 直接运行测试。\n\n### 最简单的测试示例\n\n以下命令将对 OpenAI 的 `gpt-3.5-turbo-0125` 模型进行一次基础测试：\n*   **上下文长度**：2000 tokens\n*   **针的位置**：文档深度的 50% 处\n\n```zsh\nneedlehaystack.run_test --provider openai --model_name \"gpt-3.5-turbo-0125\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n### 测试其他模型提供商\n\n只需更改 `--provider` 和 `--model_name` 参数即可适配不同厂商的模型。\n\n**测试 Anthropic (Claude):**\n```zsh\nneedlehaystack.run_test --provider anthropic --model_name \"claude-2.1\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n**测试 Cohere:**\n```zsh\nneedlehaystack.run_test --provider cohere --model_name \"command-r\" --document_depth_percents \"[50]\" --context_lengths \"[2000]\"\n```\n\n### 进阶参数说明\n\n您可以通过组合以下参数进行更复杂的压力测试：\n\n*   `--context_lengths`: 自定义上下文长度列表，例如 `\"[2000, 4000, 8000]\"`。\n*   `--document_depth_percents`: 自定义针在文档中的位置百分比列表，例如 `\"[0, 25, 50, 75, 100]\"`。\n*   `--multi_needle`: 设置为 `True` 可启用多针测试（在上下文中插入多个关键信息）。\n*   `--evaluator`: 指定评估器，默认为 `openai`，也支持 `langsmith`。\n\n运行结束后，测试结果通常会保存在当前的 `results\u002F` 目录中，您可以使用提供的 Jupyter Notebook (`LLMNeedleInHaystackVisualization.ipynb`) 进行可视化分析。","某金融科技公司正在评估是否将新发布的长上下文大模型用于自动审查长达数百页的合规文档，以确保关键风险条款不被遗漏。\n\n### 没有 LLMTest_NeedleInAHaystack 时\n- **盲目信任厂商宣传**：团队仅凭模型支持的上下文长度（如 128k）就假设其能完美处理长文档，缺乏实证数据支撑。\n- **测试覆盖片面**：人工构造的测试案例有限，无法系统性地验证关键信息在文档不同位置（开头、中间、末尾）的检索准确率。\n- **性能瓶颈难定位**：当模型偶尔漏掉关键条款时，无法判断是因为文档太长导致“迷失”，还是特定深度的信息难以被提取。\n- **选型决策高风险**：在缺乏量化压力测试的情况下采购或部署模型，可能导致生产环境中出现严重的合规漏审事故。\n\n### 使用 LLMTest_NeedleInAHaystack 后\n- **量化压力测试**：通过自动化脚本将关键风险条款（“针”）随机插入不同长度的模拟文档（“草堆”）中，精确测量模型在各种上下文长度下的检索成功率。\n- **全深度覆盖分析**：系统性遍历文档深度的 0% 到 100%，生成热力图直观展示模型是否在文档中段出现“记忆衰退”或注意力分散。\n- **精准模型对比**：在同一套测试标准下并行评估 OpenAI、Anthropic 和 Cohere 等多家模型，用数据选出在长文本检索中最稳健的引擎。\n- **降低落地风险**：依据测试报告设定安全阈值，只在模型准确率达标的上下文范围内部署应用，从源头杜绝关键信息遗漏。\n\nLLMTest_NeedleInAHaystack 将模糊的“长文本能力”转化为可视化的精度数据，帮助团队在关键业务中做出基于实证的模型选型决策。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgkamradt_LLMTest_NeedleInAHaystack_1387e9e3.png","gkamradt",null,"https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fgkamradt_449d06d0.jpg","https:\u002F\u002Fgithub.com\u002Fgkamradt",[81,85,89],{"name":82,"color":83,"percentage":84},"Jupyter Notebook","#DA5B0B",89.3,{"name":86,"color":87,"percentage":88},"Python","#3572A5",10.6,{"name":90,"color":91,"percentage":92},"Dockerfile","#384d54",0,2256,241,"2026-04-17T15:24:10","NOASSERTION","未说明 (基于 Python 和 pip，通常支持 Linux, macOS, Windows)","不需要 (该工具通过 API 调用 OpenAI, Anthropic, Cohere 等云端模型，无需本地 GPU)","未说明",{"notes":101,"python":102,"dependencies":103},"1. 该工具主要用于测试云端大语言模型（LLM）的长上下文检索能力，本身不运行本地模型。\n2. 必须配置环境变量：NIAH_MODEL_API_KEY（模型提供商密钥）和 NIAH_EVALUATOR_API_KEY（若使用 OpenAI 作为评估器）。\n3. 支持通过 pip 直接安装 'needlehaystack' 包运行，也支持源码克隆后以可编辑模式安装。\n4. 若使用 LangSmith 进行多针评估，需额外注册账号并配置相关环境变量及数据集。","3+",[104,105],"needlehaystack","langsmith (可选，用于评估)",[15,54],"2026-03-27T02:49:30.150509","2026-04-19T03:16:50.969177",[110,115,120],{"id":111,"question_zh":112,"answer_zh":113,"source_url":114},41180,"在评估不同模型时，是否应该统一使用相同的分词器（Tokenizer）？","不建议跨模型使用统一的标准分词器。虽然标准化看似公平，但不同模型（如 Claude 和 GPT）实际使用的分词器不同，其 Token 定义也不一致。为了进行准确的长度评估，应尽可能使用模型原本的分词器。如果无法获取所有模型的分词器，可以设置一个默认分词器作为后备，但需知悉这可能导致评估的上下文长度与模型实际最大窗口不完全匹配。","https:\u002F\u002Fgithub.com\u002Fgkamradt\u002FLLMTest_NeedleInAHaystack\u002Fissues\u002F25",{"id":116,"question_zh":117,"answer_zh":118,"source_url":119},41181,"项目的代码和材料可以在什么许可条件下复用？","该项目已添加许可证文件，采用 MIT 许可证（包含一行修改条款，类似于 BSD-3-Clause）。用户可以合法地复用代码和在教材中展示相关图表，只需注明作者和来源即可。许可证文件位于项目根目录的 LICENSE.txt。","https:\u002F\u002Fgithub.com\u002Fgkamradt\u002FLLMTest_NeedleInAHaystack\u002Fissues\u002F4",{"id":121,"question_zh":122,"answer_zh":123,"source_url":124},41182,"为什么模型明明找到了关键信息（Needle），但在评测中得分却很低？","这是因为当前的评测标准较为严格：如果模型生成的回答中包含了大量与问题无关的“废话”或偏离主题的解释，即使包含了正确答案，也可能被判定为不完美检索从而得分较低。评测者认为这属于主观判断，未来的改进方向是允许用户自定义评测器和评分标准，以便根据具体需求调整测试规则。对于非指令微调模型，建议通过提示工程（Prompt Engineering）来优化输出以符合评测要求。","https:\u002F\u002Fgithub.com\u002Fgkamradt\u002FLLMTest_NeedleInAHaystack\u002Fissues\u002F39",[]]