[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-langchain-ai--openevals":3,"tool-langchain-ai--openevals":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",160015,2,"2026-04-18T11:30:52",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",109154,"2026-04-18T11:18:24",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 
人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":73,"owner_website":78,"owner_url":79,"languages":80,"stars":97,"forks":98,"last_commit_at":99,"license":100,"difficulty_score":32,"env_os":101,"env_gpu":101,"env_ram":101,"env_deps":102,"category_tags":108,"github_topics":76,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":110,"updated_at":111,"faqs":112,"releases":143},9037,"langchain-ai\u002Fopenevals","openevals","Readymade evaluators for your LLM apps","openevals 是一套专为大语言模型（LLM）应用打造的现成评估工具包。就像传统软件开发离不开单元测试一样，要将 LLM 应用可靠地投入生产环境，科学的评估机制不可或缺。openevals 旨在为开发者提供一套开箱即用的评估起点，帮助用户快速建立对模型输出的质量监控，并以此为基础定制更符合特定业务场景的评估方案。\n\n它主要解决了 LLM 应用中“如何量化回答质量”的难题。通过内置多种评估维度（如简洁性、准确性等），openevals 利用强大的“模型即裁判”（LLM-as-judge）技术，自动判断模型输出是否符合预期，无需人工逐条审核。其独特亮点在于极高的灵活性：支持 Python 和 TypeScript 双语言，允许用户自由替换底层评判模型、自定义提示词模板，甚至调整评分标准（从简单的真假判断到精细的浮点打分）。\n\n这款工具非常适合正在构建或优化 LLM 应用的 AI 工程师、后端开发者以及算法研究人员使用。如果你希望摆脱繁琐的人工测试，用代码自动化地提升模型表现，openevals 能助你轻松迈出从实验原型到生产级应用的关键一步。","# ⚖️ OpenEvals\n\nMuch like tests in traditional software, evals are an important part of bringing LLM applications to production.\nThe goal of this package is to help provide a starting point for you to write evals for your LLM applications, from which\nyou can write more custom evals specific to your application.\n\nIf you are looking for evals specific to evaluating LLM agents, please check out [`agentevals`](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals).\n\n# Quickstart\n\n> [!TIP]\n> If you'd like to follow along with a video walkthrough, click the image below:\n> [![Video quickstart](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_49feb817e3dd.jpg)](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=J-F30jRyhoA)\n\nTo get started, install `openevals`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openevals\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openevals @langchain\u002Fcore\n```\n\u003C\u002Fdetails>\n\nThis quickstart will use an evaluator powered by OpenAI's `gpt-5.4` model to judge your results, so you'll need to set your OpenAI API key as an environment variable:\n\n```bash\nexport 
OPENAI_API_KEY=\"your_openai_api_key\"\n```\n\nOnce you've done this, you can run your first eval:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CONCISENESS_PROMPT\n\nconciseness_evaluator = create_llm_as_judge(\n    # CONCISENESS_PROMPT is just an f-string\n    prompt=CONCISENESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"How is the weather in San Francisco?\"\n# These are fake outputs, in reality you would run your LLM-based system to get real outputs\noutputs = \"Thanks for asking! The current weather in San Francisco is sunny and 90 degrees.\"\n# When calling an LLM-as-judge evaluator, parameters are formatted directly into the prompt\neval_result = conciseness_evaluator(\n    inputs=inputs,\n    outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'score',\n    'score': False,\n    'comment': 'The output includes an unnecessary greeting (\"Thanks for asking!\") and extra..'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CONCISENESS_PROMPT } from \"openevals\";\n\nconst concisenessEvaluator = createLLMAsJudge({\n  \u002F\u002F CONCISENESS_PROMPT is just an f-string\n  prompt: CONCISENESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"How is the weather in San Francisco?\"\n\u002F\u002F These are fake outputs, in reality you would run your LLM-based system to get real outputs\nconst outputs = \"Thanks for asking! The current weather in San Francisco is sunny and 90 degrees.\"\n\n\u002F\u002F When calling an LLM-as-judge evaluator, parameters are formatted directly into the prompt\nconst evalResult = await concisenessEvaluator({\n  inputs,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'score',\n    score: false,\n    comment: 'The output includes an unnecessary greeting (\"Thanks for asking!\") and extra..'\n}\n```\n\u003C\u002Fdetails>\n\nThis is an example of a reference-free evaluator - some other evaluators may accept slightly different parameters such as a required reference output. 
LLM-as-judge evaluators will attempt to format any passed parameters into their passed `prompt`, allowing you to flexibly customize criteria or add other fields.\n\nSee the [LLM-as-judge](#llm-as-judge) section for more information on how to customize the [scoring](#customizing-output-score-values) to output float values rather than just `True\u002FFalse`, the [model](#customizing-the-model), or the [prompt](#customizing-prompts)!\n\n# Table of Contents\n\n- [⚖️ OpenEvals](#️-openevals)\n- [Quickstart](#quickstart)\n- [Table of Contents](#table-of-contents)\n- [Installation](#installation)\n- [Evaluators](#evaluators)\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#llm-as-judge\">LLM-as-Judge\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Customizing prompts](#customizing-prompts)\n      - [Customizing with LangChain prompt templates](#customizing-with-langchain-prompt-templates)\n    - [Customizing the model](#customizing-the-model)\n    - [Customizing output score values](#customizing-output-score-values)\n    - [Customizing output schema](#customizing-output-schema)\n      - [Logging feedback with custom output schemas](#logging-feedback-with-custom-output-schemas)\n      - [Structured prompts](#structured-prompts)\n    - [Multimodal](#multimodal)\n      - [Option 1: `attachments` parameter](#option-1-attachments-parameter)\n      - [Option 2: LangChain prompt template](#option-2-langchain-prompt-template)\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#prebuilt-prompts\">Prebuilt prompts\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Quality](#quality)\n    - [Safety](#safety)\n    - [Security](#security)\n    - [Image](#image)\n    - [Voice](#voice)\n    - \u003Cdetails>\n        \u003Csummary>\u003Ca href=\"#rag\">RAG\u003C\u002Fa>\u003C\u002Fsummary>\n\n      - [Correctness](#correctness-rag)\n      - [Helpfulness](#helpfulness)\n      - [Groundedness](#groundedness)\n      - [Retrieval relevance](#retrieval-relevance)\n        - [Retrieval relevance with LLM-as-judge](#retrieval-relevance-with-llm-as-judge)\n        - [Retrieval relevance with string evaluators](#retrieval-relevance-with-string-evaluators)\n\n    \u003C\u002Fdetails>\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#extraction-and-tool-calls\">Extraction and tool calls\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Evaluating structured output with exact match](#evaluating-structured-output-with-exact-match)\n    - [Evaluating structured output with LLM-as-a-Judge](#evaluating-structured-output-with-llm-as-a-judge)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#code\">Code\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Extracting code outputs](#extracting-code-outputs)\n    - [Pyright (Python-only)](#pyright-python-only)\n    - [Mypy (Python-only)](#mypy-python-only)\n    - [TypeScript type-checking (TypeScript-only)](#typescript-type-checking-typescript-only)\n    - [LLM-as-judge for code](#llm-as-judge-for-code)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#sandboxed-code\">Sandboxed code\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Sandbox Pyright (Python-only)](#sandbox-pyright-python-only)\n    - [Sandbox TypeScript type-checking (TypeScript-only)](#sandbox-typescript-type-checking-typescript-only)\n    - [Sandbox Execution](#sandbox-execution)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#agent-trajectory\">Agent 
trajectory\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Trajectory match](#trajectory-match)\n      - [Strict match](#strict-match)\n      - [Unordered match](#unordered-match)\n      - [Subset and superset match](#subset-and-superset-match)\n      - [Tool args match modes](#tool-args-match-modes)\n    - [Trajectory LLM-as-judge](#trajectory-llm-as-judge)\n    - [Prebuilt trajectory prompts](#prebuilt-trajectory-prompts)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#other\">Other\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [Exact match](#exact-match)\n    - [Levenshtein distance](#levenshtein-distance)\n    - [Embedding similarity](#embedding-similarity)\n\n  \u003C\u002Fdetails>\n\n  - [Creating your own](#creating-your-own)\n    - [Evaluator interface](#evaluator-interface)\n    - [Logging to LangSmith](#logging-to-langsmith)\n    - [Example](#example)\n  - [Python async support](#python-async-support)\n\n- [Multiturn Simulation](#multiturn-simulation)\n  - [Simulating users](#simulating-users)\n    - [Prebuilt simulated user](#prebuilt-simulated-user)\n    - [Custom simulated users](#custom-simulated-users)\n  - [Multiturn simulation with LangGraph](#multiturn-simulation-with-langgraph)\n\n- [LangSmith Integration](#langsmith-integration)\n  - [Pytest or Vitest\u002FJest](#pytest-or-vitestjest)\n  - [Evaluate](#evaluate)\n\n- [Acknowledgements](#acknowledgements)\n- [Thank you!](#thank-you)\n\n# Installation\n\nYou can install `openevals` like this:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openevals\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openevals @langchain\u002Fcore\n```\n\u003C\u002Fdetails>\n\nFor LLM-as-judge evaluators, you will also need an LLM client. By default, `openevals` will use [LangChain chat model integrations](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fchat\u002F) and comes with `langchain_openai` installed by default. However, if you prefer, you may use the OpenAI client directly:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openai\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openai\n```\n\u003C\u002Fdetails>\n\nIt is also helpful to be familiar with some [evaluation concepts](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fevaluation-concepts).\n\n# Evaluators\n\n## LLM-as-judge\n\nOne common way to evaluate an LLM app's outputs is to use another LLM as a judge. This is generally a good starting point for evals.\n\nThis package contains the `create_llm_as_judge` function, which takes a prompt and a model as input, and returns an evaluator function\nthat handles converting parameters into strings and parsing the judge LLM's outputs as a score.\n\nTo use the `create_llm_as_judge` function, you need to provide a prompt and a model. To get started, OpenEvals has some prebuilt prompts in the `openevals.prompts` module that you can use out of the box. 
Here's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n```\n\n\u003C\u002Fdetails>\n\nNote that `CORRECTNESS_PROMPT` is a simple f-string that you can log and edit as needed for your specific use case:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nprint(CORRECTNESS_PROMPT)\n```\n\n```\nYou are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:\n\n\u003CRubric>\n  A correct answer:\n  - Provides accurate and complete information\n  ...\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n...\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nconsole.log(CORRECTNESS_PROMPT);\n```\n\n```\nYou are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:\n\n\u003CRubric>\n  A correct answer:\n  - Provides accurate and complete information\n  ...\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n...\n```\n\n\u003C\u002Fdetails>\n\nBy convention, we generally suggest sticking to `inputs`, `outputs`, and `reference_outputs` as the names of the parameters for LLM-as-judge evaluators, but these will be directly formatted into the prompt so you can use any variable names you want.\n\nOpenEvals includes many prebuilt prompts for common evaluation scenarios. See the [Prebuilt prompts](#prebuilt-prompts) section for a full list organized by category.\n\n### Customizing prompts\n\nThe `prompt` parameter for `create_llm_as_judge` may be an f-string, [LangChain prompt template](#customizing-with-langchain-prompt-templates), or a function that takes kwargs and returns a list of formatted messages.\n\nThough we suggest sticking to conventional names (`inputs`, `outputs`, and `reference_outputs`) as prompt variables, your prompts can also require additional variables. You would then pass these extra variables when calling your evaluator function. 
Here's an example of a prompt that requires an extra variable named `context`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\n\nMY_CUSTOM_PROMPT = \"\"\"\nUse the following context to help you evaluate for hallucinations in the output:\n\n\u003Ccontext>\n{context}\n\u003C\u002Fcontext>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n\"\"\"\n\ncustom_prompt_evaluator = create_llm_as_judge(\n    prompt=MY_CUSTOM_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\ncustom_prompt_evaluator(\n    inputs=\"What color is the sky?\",\n    outputs=\"The sky is red.\",\n    context=\"It is early evening.\",\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from \"openevals\";\n\nconst MY_CUSTOM_PROMPT = `\nUse the following context to help you evaluate for hallucinations in the output:\n\n\u003Ccontext>\n{context}\n\u003C\u002Fcontext>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n`;\n\nconst customPromptEvaluator = createLLMAsJudge({\n  prompt: MY_CUSTOM_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"What color is the sky?\"\nconst outputs = \"The sky is red.\"\n\nconst evalResult = await customPromptEvaluator({\n  inputs,\n  outputs,\n});\n```\n\u003C\u002Fdetails>\n\nThe following options are also available for string prompts:\n\n- `system`: a string that sets a system prompt for the judge model by adding a `system` message before other parts of the prompt.\n- `few_shot_examples`: a list of example dicts that are appended to the end of the prompt. This is useful for providing the judge model with examples of good and bad outputs. The required structure looks like this:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfew_shot_examples = [\n    {\n        \"inputs\": \"What color is the sky?\",\n        \"outputs\": \"The sky is red.\",\n        \"reasoning\": \"The sky is red because it is early evening.\",\n        \"score\": 1,\n    }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nconst fewShotExamples = [\n    {\n        inputs: \"What color is the sky?\",\n        outputs: \"The sky is red.\",\n        reasoning: \"The sky is red because it is early evening.\",\n        score: 1,\n    }\n]\n```\n\u003C\u002Fdetails>\n\nThese will be appended to the end of the final user message in the prompt.\n\n#### Customizing with LangChain prompt templates\n\nYou can also pass a [LangChain prompt template](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fconcepts\u002Fprompt_templates\u002F) if you want more control over formatting. Here's an example that uses mustache formatting instead of f-strings:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom langchain_core.prompts.chat import ChatPromptTemplate\n\ninputs = {\"a\": 1, \"b\": 2}\noutputs = {\"a\": 1, \"b\": 2}\n\nprompt = ChatPromptTemplate([\n    (\"system\", \"You are an expert at determining if two objects are equal.\"),\n    (\"human\", \"Are these two equal? 
{{inputs}} {{outputs}}\"),\n], template_format=\"mustache\")\n\nllm_as_judge = create_llm_as_judge(\n    prompt=prompt,\n    model=\"openai:gpt-5.4\",\n    feedback_key=\"equality\",\n)\n\neval_result = llm_as_judge(inputs=inputs, outputs=outputs)\n\nprint(eval_result)\n```\n\n```\n{\n    key: 'equality',\n    score: True,\n    comment: '...'\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from \"openevals\";\nimport { ChatPromptTemplate } from \"@langchain\u002Fcore\u002Fprompts\";\n\nconst inputs = { a: 1, b: 2 };\nconst outputs = { a: 1, b: 2 };\n\nconst prompt = ChatPromptTemplate.fromMessages([\n  [\"system\", \"You are an expert at determining if two objects are equal.\"],\n  [\"user\", \"Are these two equal? {{inputs}} {{outputs}}\"],\n], { templateFormat: \"mustache\" });\n\nconst evaluator = createLLMAsJudge({\n  prompt,\n  model: \"openai:gpt-5.4\",\n  feedbackKey: \"equality\",\n});\n\nconst result = await evaluator({ inputs, outputs });\n```\n\n```\n{\n    key: 'equality',\n    score: true,\n    comment: '...'\n}\n```\n\n\u003C\u002Fdetails>\n\nYou can also pass in a function that takes your LLM-as-judge inputs as kwargs and returns formatted chat messages.\n\n### Customizing the model\n\nThere are a few ways you can customize the model used for evaluation. You can pass a string formatted as `PROVIDER:MODEL` (e.g. `model=anthropic:claude-3-5-sonnet-latest`) as the `model`, in which case the package will [attempt to import and initialize a LangChain chat model instance](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fhow_to\u002Fchat_models_universal_init\u002F). This requires you to install the appropriate LangChain integration package installed. Here's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install langchain-anthropic\n```\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\nanthropic_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"anthropic:claude-3-5-sonnet-latest\",\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install @langchain\u002Fanthropic\n```\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst anthropicEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"anthropic:claude-3-5-sonnet-latest\",\n});\n```\n\u003C\u002Fdetails>\n\nYou can also directly pass a LangChain chat model instance as `judge`. 
Note that your chosen model must support [structured output](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fchat\u002F):\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\nfrom langchain_anthropic import ChatAnthropic\n\nanthropic_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    judge=ChatAnthropic(model=\"claude-3-5-sonnet-latest\", temperature=0.5),\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\nimport { ChatAnthropic } from \"@langchain\u002Fanthropic\";\n\nconst anthropicEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  judge: new ChatAnthropic({ model: \"claude-3-5-sonnet-latest\", temperature: 0.5 }),\n});\n```\n\u003C\u002Fdetails>\n\nThis is useful in scenarios where you need to initialize your model with specific parameters, such as `temperature` or alternate URLs if using models through a service like Azure.\n\nFinally, you can pass a model name as `model` and a `judge` parameter set to an OpenAI client instance:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openai\n```\n\n```python\nfrom openai import OpenAI\n\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\nopenai_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"gpt-5.4\",\n    judge=OpenAI(),\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openai\n```\n\n```ts\nimport { OpenAI } from \"openai\";\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst openaiEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"gpt-5.4\",\n  judge: new OpenAI(),\n});\n```\n\u003C\u002Fdetails>\n\n### Customizing output score values\n\nThere are two fields you can set to customize the outputted scores of your evaluator:\n\n- `continuous`: a boolean that sets whether the evaluator should return a float score somewhere between 0 and 1 instead of a binary score. Defaults to `False`.\n- `choices`: a list of floats that sets the possible scores for the evaluator.\n\nThese parameters are mutually exclusive. When using either of them, you should make sure that your prompt is grounded in information on what specific scores mean - the prebuilt ones in this repo do not have this information!\n\nFor example, here's an example of how to define a less harsh definition of correctness that only penalizes incorrect answers by 50% if they are on-topic:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\n\nMY_CUSTOM_PROMPT = \"\"\"\nYou are an expert data labeler evaluating model outputs for correctness. 
Your task is to assign a score based on the following rubric:\n\n\u003CRubric>\n  Assign a score of 0, .5, or 1 based on the following criteria:\n  - 0: The answer is incorrect and does not mention doodads\n  - 0.5: The answer mentions doodads but is otherwise incorrect\n  - 1: The answer is correct and mentions doodads\n\u003C\u002FRubric>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n\n\u003Creference_outputs>\n{reference_outputs}\n\u003C\u002Freference_outputs>\n\"\"\"\n\nevaluator = create_llm_as_judge(\n    prompt=MY_CUSTOM_PROMPT,\n    choices=[0.0, 0.5, 1.0],\n    model=\"openai:gpt-5.4\",\n)\n\nresult = evaluator(\n    inputs=\"What is the current price of doodads?\",\n    outputs=\"The price of doodads is $10.\",\n    reference_outputs=\"The price of doodads is $15.\",\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'score',\n    'score': 0.5,\n    'comment': 'The provided answer mentioned doodads but was incorrect.'\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from \"openevals\";\n\nconst MY_CUSTOM_PROMPT = `\nYou are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:\n\n\u003CRubric>\n  Assign a score of 0, .5, or 1 based on the following criteria:\n  - 0: The answer is incorrect and does not mention doodads\n  - 0.5: The answer mentions doodads but is otherwise incorrect\n  - 1: The answer is correct and mentions doodads\n\u003C\u002FRubric>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n\n\u003Creference_outputs>\n{reference_outputs}\n\u003C\u002Freference_outputs>\n`;\n\nconst customEvaluator = createLLMAsJudge({\n  prompt: MY_CUSTOM_PROMPT,\n  choices: [0.0, 0.5, 1.0],\n  model: \"openai:gpt-5.4\",\n});\n\nconst result = await customEvaluator({\n  inputs: \"What is the current price of doodads?\",\n  outputs: \"The price of doodads is $10.\",\n  reference_outputs: \"The price of doodads is $15.\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n    'key': 'score',\n    'score': 0.5,\n    'comment': 'The provided answer mentioned doodads but was incorrect.'\n}\n```\n\u003C\u002Fdetails>\n\nFinally, if you would like to disable justifications for a given score, you can set `use_reasoning=False` when creating your evaluator.\n\n### Customizing output schema\n\nIf you need to change the structure of the raw output generated by the LLM, you can also pass a custom output schema into your LLM-as-judge evaluator as `output_schema` (Python) \u002F `outputSchema` (TypeScript). 
This may be helpful for specific prompting strategies or if you would like to extract multiple metrics at the same time rather than over multiple calls.\n\n> [!CAUTION]\n> Passing `output_schema` changes the return value of the evaluator to match the passed `output_schema` value instead of the typical OpenEvals format.\n> We recommend sticking with the default schema if you do not specifically need additional properties.\n\nFor Python, `output_schema` may be:\n\n- A `TypedDict` instance\n- A [Pydantic](https:\u002F\u002Fdocs.pydantic.dev) model\n- [JSON schema](https:\u002F\u002Fjson-schema.org\u002F)\n- [OpenAI's structured output format](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fstructured-outputs?api-mode=chat#supported-schemas)\n\nFor TypeScript, `outputSchema` may be:\n\n- A [Zod](https:\u002F\u002Fzod.dev) object\n- [JSON schema](https:\u002F\u002Fjson-schema.org\u002F)\n- [OpenAI's structured output format](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fstructured-outputs?api-mode=chat#supported-schemas)\n\nNote that if you are using an OpenAI client directly, only JSON schema and OpenAI's structured output format are supported.\n\nHere's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom typing_extensions import TypedDict\n\nfrom openevals.llm import create_llm_as_judge\n\nclass EqualityResult(TypedDict):\n    equality_justification: str\n    are_equal: bool\n\ninputs = \"The rain in Spain falls mainly on the plain.\"\n\noutputs = \"The rain in Spain falls mainly on the plain.\"\n\nllm_as_judge = create_llm_as_judge(\n    prompt=\"Are the following two values equal? {inputs} {outputs}\",\n    model=\"openai:gpt-5.4\",\n    output_schema=EqualityResult,\n)\neval_result = llm_as_judge(inputs=inputs, outputs=outputs)\n\nprint(eval_result)\n```\n\n```\n{\n    'equality_justification': 'The values are equal because they have the same properties with identical values.',\n    'are_equal': True,\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\n\nimport { createLLMAsJudge } from \"openevals\";\n\nconst equalitySchema = z.object({\n  equality_justification: z.string(),\n  are_equal: z.boolean(),\n})\n\nconst inputs = \"The rain in Spain falls mainly on the plain.\";\nconst outputs = \"The rain in Spain falls mainly on the plain.\";\n\nconst llmAsJudge = createLLMAsJudge({\n  prompt: \"Are the following two values equal? 
{inputs} {outputs}\",\n  model: \"openai:gpt-5.4\",\n  outputSchema: equalitySchema,\n});\n\nconst evalResult = await llmAsJudge({ inputs, outputs });\n\nconsole.log(evalResult);\n```\n\n```\n{\n    'equality_justification': 'The values are equal because they have the same properties with identical values.',\n    'are_equal': True,\n}\n```\n\n\u003C\u002Fdetails>\n\n#### Logging feedback with custom output schemas\n\nIf you are using an OpenEvals evaluator with [LangSmith's `pytest` or `Vitest`\u002F`Jest` runners](#pytest-or-vitestjest), you will need to manually [log feedback keys](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest#log-feedback).\n\nIf you are using `evaluate`, you will need to wrap your evaluator in another function that maps your evaluator return value to [feedback in the right format](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fcode-evaluator).\n\n#### Structured prompts\n\nPassing in a pulled prompt from the [LangChain prompt hub](https:\u002F\u002Fsmith.langchain.com\u002Fhub) that has an output schema set will also change the output schema for the LLM-as-judge evaluator.\n\n### Multimodal\n\nLLM-as-judge evaluators support multimodal inputs including images, audio, and PDFs. There are two ways to pass multimodal content:\n\n- **`attachments` parameter** — include an `{attachments}` placeholder in your prompt and pass the content via the `attachments` kwarg.\n- **LangChain prompt template** — introduce multimodal content directly into the prompt message. See the [LangChain multimodal messages docs](https:\u002F\u002Fdocs.langchain.com\u002Foss\u002Fpython\u002Flangchain\u002Fmessages#multimodal) for details.\n\n#### Option 1: `attachments` parameter\n\nThe `attachments` parameter supports a single dict or a list of dicts with a `mime_type` and base64-encoded `data` field. The prebuilt [Image](#image) and [Voice](#voice) prompts already include the `{attachments}` placeholder, or you can add it to any custom prompt.\n\nSupported attachment types:\n\n| Type | `mime_type` |\n|------|-------------|\n| Images | `image\u002Fpng`, `image\u002Fjpeg`, `image\u002Fgif`, `image\u002Fwebp` |\n| Audio | `audio\u002Fwav`, `audio\u002Fmp3`, `audio\u002Fmpeg` |\n| PDF | `application\u002Fpdf` |\n\n> [!NOTE]\n> Multimodal support depends on your model provider. Audio input and structured output (e.g. returning a score with a comment) are not supported simultaneously by all providers — currently only Gemini supports both at once. The prebuilt [Voice](#voice) prompts use `google_genai:gemini-2.0-flash` (Python) \u002F `google-genai:gemini-2.0-flash` (TypeScript) for this reason.\n\nPassing a URL string directly as `attachments` is supported for images only. Audio and PDF attachments must be passed as a base64-encoded data URI with `mime_type` and `data` fields.\n\nHere's an example using the prebuilt `SENSITIVE_IMAGERY_PROMPT`. 
You can pass an image as a URL or as a base64-encoded data URI — both work the same way:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport base64\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import SENSITIVE_IMAGERY_PROMPT\n\nevaluator = create_llm_as_judge(\n    prompt=SENSITIVE_IMAGERY_PROMPT,\n    feedback_key=\"sensitive_imagery\",\n    model=\"openai:gpt-5.4\",\n)\n\n# Option A: pass a URL string directly\neval_result = evaluator(\n    inputs=\"Review this image for sensitive content\",\n    outputs=\"The image appears to contain appropriate content\",\n    attachments=\"https:\u002F\u002Fexample.com\u002Fimage.jpg\",\n)\n\n# Option B: pass a base64-encoded data URI\nwith open(\"image.jpg\", \"rb\") as f:\n    image_data = \"data:image\u002Fjpeg;base64,\" + base64.b64encode(f.read()).decode(\"utf-8\")\n\neval_result = evaluator(\n    inputs=\"Review this image for sensitive content\",\n    outputs=\"The image appears to contain appropriate content\",\n    attachments={\"mime_type\": \"image\u002Fjpeg\", \"data\": image_data},\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'sensitive_imagery',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport * as fs from \"fs\";\nimport { createLLMAsJudge, SENSITIVE_IMAGERY_PROMPT } from \"openevals\";\n\nconst evaluator = createLLMAsJudge({\n  prompt: SENSITIVE_IMAGERY_PROMPT,\n  feedbackKey: \"sensitive_imagery\",\n  model: \"openai:gpt-5.4\",\n});\n\n\u002F\u002F Option A: pass a URL string directly\nconst evalResult = await evaluator({\n  inputs: \"Review this image for sensitive content\",\n  outputs: \"The image appears to contain appropriate content\",\n  attachments: \"https:\u002F\u002Fexample.com\u002Fimage.jpg\",\n});\n\n\u002F\u002F Option B: pass a base64-encoded data URI\nconst imageData = \"data:image\u002Fjpeg;base64,\" + fs.readFileSync(\"image.jpg\").toString(\"base64\");\n\nconst evalResultB64 = await evaluator({\n  inputs: \"Review this image for sensitive content\",\n  outputs: \"The image appears to contain appropriate content\",\n  attachments: { mime_type: \"image\u002Fjpeg\", data: imageData },\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'sensitive_imagery',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n#### Option 2: LangChain prompt template\n\nYou can also introduce multimodal content into the prompt using a LangChain prompt template. See the [LangChain multimodal messages docs](https:\u002F\u002Fdocs.langchain.com\u002Foss\u002Fpython\u002Flangchain\u002Fmessages#multimodal) for details.\n\n## Prebuilt prompts\n\nOpenEvals includes prebuilt prompts for common evaluation scenarios that work out of the box with [`create_llm_as_judge`](#llm-as-judge). 
All prebuilt prompts are importable from `openevals.prompts` (Python) or `openevals` (TypeScript).\n\n### Quality\n\nThese prompts evaluate general output quality.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `CONCISENESS_PROMPT` | `inputs`, `outputs` | Whether the output is appropriately brief and avoids unnecessary padding |\n| `CORRECTNESS_PROMPT` | `inputs`, `outputs`, `reference_outputs` (optional) | Factual accuracy and completeness of the output |\n| `HALLUCINATION_PROMPT` | `inputs`, `outputs`, `context` (optional) | Whether the output contains information not supported by the provided context |\n| `ANSWER_RELEVANCE_PROMPT` | `inputs`, `outputs` | Whether the output directly addresses the question asked |\n| `PLAN_ADHERENCE_PROMPT` | `inputs`, `outputs`, `plan` | Whether the output follows a provided plan |\n| `CODE_CORRECTNESS_PROMPT` | `inputs`, `outputs` | Code correctness against the problem specification |\n| `CODE_CORRECTNESS_PROMPT_WITH_REFERENCE_OUTPUTS` | `inputs`, `outputs`, `reference_outputs` | Code correctness compared to a reference solution |\n| `LAZINESS_PROMPT` | `inputs`, `outputs` | Whether the agent returned a blank, empty, or low-effort response |\n\nHere's an example using `CORRECTNESS_PROMPT`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"How much has the price of doodads changed in the past year?\"\noutputs = \"Doodads have increased in price by 10% in the past year.\"\nreference_outputs = \"The price of doodads has decreased by 50% in the past year.\"\n\neval_result = correctness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n  reference_outputs=reference_outputs\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'correctness',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"How much has the price of doodads changed in the past year?\";\nconst outputs = \"Doodads have increased in price by 10% in the past year.\";\nconst referenceOutputs = \"The price of doodads has decreased by 50% in the past year.\";\n\nconst evalResult = await correctnessEvaluator({\n  inputs,\n  outputs,\n  referenceOutputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'correctness',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### Safety\n\nThese prompts detect harmful or biased content in LLM outputs.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `TOXICITY_PROMPT` | `inputs`, `outputs` | Harmful, offensive, or abusive language |\n| `FAIRNESS_PROMPT` | `inputs`, `outputs` | Bias or discrimination toward individuals or groups based on protected characteristics |\n\nHere's an example using `FAIRNESS_PROMPT`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import FAIRNESS_PROMPT\n\nllm_as_judge = create_llm_as_judge(\n    
prompt=FAIRNESS_PROMPT,\n    feedback_key=\"fairness\",\n    model=\"openai:gpt-5.4\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"Should we hire this candidate?\",\n    outputs=\"Their name is hard to pronounce — that could make clients uncomfortable.\",\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'fairness',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, FAIRNESS_PROMPT } from \"openevals\";\n\nconst fairnessEvaluator = createLLMAsJudge({\n  prompt: FAIRNESS_PROMPT,\n  feedbackKey: \"fairness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await fairnessEvaluator({\n  inputs: \"Should we hire this candidate?\",\n  outputs: \"Their name is hard to pronounce — that could make clients uncomfortable.\",\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'fairness',\n    score: true,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### Security\n\nThese prompts detect security threats in LLM inputs and outputs.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `PII_LEAKAGE_PROMPT` | `inputs`, `outputs` | Personally identifiable information exposed in the output |\n| `PROMPT_INJECTION_PROMPT` | `inputs` | Attempts to manipulate or override AI system instructions, including social engineering and roleplay-based circumvention |\n| `CODE_INJECTION_PROMPT` | `inputs` | Malicious code or exploits embedded in inputs |\n\nHere's an example using `PII_LEAKAGE_PROMPT`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import PII_LEAKAGE_PROMPT\n\nllm_as_judge = create_llm_as_judge(\n    prompt=PII_LEAKAGE_PROMPT,\n    feedback_key=\"pii_leakage\",\n    model=\"openai:gpt-5.4\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"What is my account info?\",\n    outputs=\"Your name is John Smith, your email is john.smith@example.com, and your SSN is 123-45-6789.\",\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'pii_leakage',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, PII_LEAKAGE_PROMPT } from \"openevals\";\n\nconst piiEvaluator = createLLMAsJudge({\n  prompt: PII_LEAKAGE_PROMPT,\n  feedbackKey: \"pii_leakage\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await piiEvaluator({\n  inputs: \"What is my account info?\",\n  outputs: \"Your name is John Smith, your email is john.smith@example.com, and your SSN is 123-45-6789.\",\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'pii_leakage',\n    score: true,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### Image\n\nThese prompts evaluate image content and its relation to the associated context. All image prompts require an `attachments` parameter — see the [Multimodal](#multimodal) section for details on passing image data. Note that your chosen model must support vision inputs (e.g. 
`openai:gpt-5.4`).\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `EXPLICIT_CONTENT_PROMPT` | `inputs`, `outputs`, `attachments` | Sexually explicit or graphic material inappropriate for general audiences |\n| `SENSITIVE_IMAGERY_PROMPT` | `inputs`, `outputs`, `attachments` | Hate symbols, inflammatory political imagery, or depictions of suffering |\n\n### Voice\n\nThese prompts evaluate voice and audio content. All voice prompts require an `attachments` parameter — see the [Multimodal](#multimodal) section for details on passing audio data. Note that your chosen model must support audio inputs — as mentioned in the [Multimodal](#multimodal) section, only Gemini currently supports audio and structured output simultaneously.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `AUDIO_QUALITY_PROMPT` | `inputs`, `outputs`, `attachments` | Clipping, distortion, or glitches that degrade listening experience |\n| `TRANSCRIPTION_ACCURACY_PROMPT` | `inputs`, `outputs`, `attachments` | Accuracy of speech-to-text transcription |\n| `USER_INTERRUPTS_PROMPT` | `inputs`, `outputs`, `attachments` | Whether the agent handled user interruptions gracefully |\n| `VOCAL_AFFECT_PROMPT` | `inputs`, `outputs`, `attachments` | Appropriateness and consistency of the agent's vocal tone |\n\nHere's an example using `AUDIO_QUALITY_PROMPT`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport base64\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import AUDIO_QUALITY_PROMPT\n\nwith open(\"audio.wav\", \"rb\") as f:\n    audio_data = base64.b64encode(f.read()).decode(\"utf-8\")\n\nllm_as_judge = create_llm_as_judge(\n    prompt=AUDIO_QUALITY_PROMPT,\n    feedback_key=\"audio_quality\",\n    model=\"google_genai:gemini-2.0-flash\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"Customer service call recording\",\n    outputs=\"Audio response from agent\",\n    attachments={\"mime_type\": \"audio\u002Fwav\", \"data\": audio_data},\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'audio_quality',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport * as fs from \"fs\";\nimport { createLLMAsJudge } from \"openevals\";\nimport { AUDIO_QUALITY_PROMPT } from \"openevals\u002Fprompts\";\n\nconst audioData = fs.readFileSync(\"audio.wav\").toString(\"base64\");\n\nconst llmAsJudge = createLLMAsJudge({\n  prompt: AUDIO_QUALITY_PROMPT,\n  feedbackKey: \"audio_quality\",\n  model: \"google-genai:gemini-2.0-flash\",\n});\n\nconst evalResult = await llmAsJudge({\n  inputs: \"Customer service call recording\",\n  outputs: \"Audio response from agent\",\n  attachments: { mime_type: \"audio\u002Fwav\", data: audioData },\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'audio_quality',\n    score: true,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### RAG\n\nRAG applications in their most basic form consist of 2 steps. In the retrieval step, context is retrieved (often from something like a vector database that a user has prepared ahead of time, though [web retrieval](https:\u002F\u002Fgithub.com\u002Fassafelovic\u002Fgpt-researcher) use-cases are gaining in popularity as well) to provide the LLM with the information it needs to respond to the user. 
In the generation step, the LLM uses the retrieved context to formulate an answer.\n\nOpenEvals provides prebuilt prompts and other methods for the following:\n\n1. [Correctness](#correctness-rag)\n- Evaluates: Final output vs. input + reference answer\n- Goal: Measure \"how similar\u002Fcorrect is the generated answer relative to a ground-truth answer\"\n- Requires reference: Yes\n\n2. [Helpfulness](#helpfulness)\n- Evaluates: Final output vs. input\n- Goal: Measure \"how well does the generated response address the initial user input\"\n- Requires reference: No, because it will compare the answer to the input question\n\n3. [Groundedness](#groundedness)\n- Evaluates: Final output vs. retrieved context\n- Goal: Measure \"to what extent does the generated response agree with the retrieved context\"\n- Requires reference: No, because it will compare the answer to the retrieved context\n\n4. [Retrieval relevance](#retrieval-relevance)\n- Evaluates: Retrieved context vs. input\n- Goal: Measure \"how relevant are my retrieved results for this query\"\n- Requires reference: No, because it will compare the question to the retrieved context\n\n#### Correctness {#correctness-rag}\n\n`correctness` measures how similar\u002Fcorrect a generated answer is to a ground-truth answer. By definition, this requires you to have a reference output to compare against the generated one. It is useful to test your RAG app end-to-end, but does not directly take into account context retrieved as an intermediate step.\n\nYou can evaluate the correctness of a RAG app's outputs using the LLM-as-judge evaluator alongside the general [`CORRECTNESS_PROMPT`](#quality) covered in the [Quality](#quality) section above. Here's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"How much has the price of doodads changed in the past year?\"\noutputs = \"Doodads have increased in price by 10% in the past year.\"\nreference_outputs = \"The price of doodads has decreased by 50% in the past year.\"\n\neval_result = correctness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n  reference_outputs=reference_outputs\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'correctness',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"How much has the price of doodads changed in the past year?\";\nconst outputs = \"Doodads have increased in price by 10% in the past year.\";\nconst referenceOutputs = \"The price of doodads has decreased by 50% in the past year.\";\n\nconst evalResult = await correctnessEvaluator({\n  inputs,\n  outputs,\n  referenceOutputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'correctness',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\nFor more information on customizing LLM-as-judge evaluators, see [these sections](#customizing-prompts).\n\n#### Helpfulness\n\n`helpfulness` measures how well the generated response addresses the initial user input. 
It compares the final generated output against the input, and does not require a reference. It's useful to validate that the generation step of your RAG app actually answers the original question as stated, but does *not* measure that the answer is supported by any retrieved context!\n\nYou can evaluate the helpfulness of a RAG app's outputs using the LLM-as-judge evaluator with a prompt like the built-in `RAG_HELPFULNESS_PROMPT`. Here's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_HELPFULNESS_PROMPT\n\nhelpfulness_evaluator = create_llm_as_judge(\n    prompt=RAG_HELPFULNESS_PROMPT,\n    feedback_key=\"helpfulness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = {\n    \"question\": \"Where was the first president of FoobarLand born?\",\n}\n\noutputs = {\n    \"answer\": \"The first president of FoobarLand was Bagatur Askaryan.\",\n}\n\neval_result = helpfulness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'helpfulness', \n  'score': False, \n  'comment': \"The question asks for the birthplace of the first president of FoobarLand, but the retrieved outputs only identify the first president as Bagatur and provide an unrelated biographical detail (being a fan of PR reviews). Although the first output is somewhat relevant by identifying the president's name, neither document provides any information about his birthplace. Thus, the outputs do not contain useful information to answer the input question. Thus, the score should be: false.\"\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_HELPFULNESS_PROMPT } from \"openevals\";\n\nconst inputs = {\n  \"question\": \"Where was the first president of FoobarLand born?\",\n};\n\nconst outputs = {\n  \"answer\": \"The first president of FoobarLand was Bagatur Askaryan.\",\n};\n\nconst helpfulnessEvaluator = createLLMAsJudge({\n  prompt: RAG_HELPFULNESS_PROMPT,\n  feedbackKey: \"helpfulness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await helpfulnessEvaluator({\n  inputs,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'helpfulness', \n  'score': False, \n  'comment': \"The question asks for the birthplace of the first president of FoobarLand, but the retrieved outputs only identify the first president as Bagatur and provide an unrelated biographical detail (being a fan of PR reviews). Although the first output is somewhat relevant by identifying the president's name, neither document provides any information about his birthplace. Thus, the outputs do not contain useful information to answer the input question. Thus, the score should be: false.\"\n}\n```\n\n\u003C\u002Fdetails>\n\n#### Groundedness\n\n`groundedness` measures the extent that the generated response agrees with the retrieved context. It compares the final generated output against context fetched during the retrieval step, and verifies that the generation step is properly using retrieved context vs. hallucinating a response or overusing facts from the LLM's base knowledge.\n\nYou can evaluate the groundedness of a RAG app's outputs using the LLM-as-judge evaluator with a prompt like the built-in `RAG_GROUNDEDNESS_PROMPT`. Note that this prompt does not take the example's original `inputs` into account, only the outputs and their relation to the retrieved context. 
Thus, unlike some of the other prebuilt prompts, it takes `context` and `outputs` as prompt variables:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_GROUNDEDNESS_PROMPT\n\ngroundedness_evaluator = create_llm_as_judge(\n    prompt=RAG_GROUNDEDNESS_PROMPT,\n    feedback_key=\"groundedness\",\n    model=\"openai:gpt-5.4\",\n)\n\ncontext = {\n    \"documents\": [\n        \"FoobarLand is a new country located on the dark side of the moon\",\n        \"Space dolphins are native to FoobarLand\",\n        \"FoobarLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n        \"The current weather in FoobarLand is 80 degrees and clear.\"\n    ],\n}\n\noutputs = {\n    \"answer\": \"The first president of FoobarLand was Bagatur Askaryan.\",\n}\n\neval_result = groundedness_evaluator(\n    context=context,\n    outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'groundedness',\n  'score': True,\n  'comment': 'The output states, \"The first president of FoobarLand was Bagatur Askaryan,\" which is directly supported by the retrieved context (document 3 explicitly states this fact). There is no addition or modification, and the claim aligns perfectly with the context provided. Thus, the score should be: true.',\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_GROUNDEDNESS_PROMPT } from \"openevals\";\n\nconst groundednessEvaluator = createLLMAsJudge({\n  prompt: RAG_GROUNDEDNESS_PROMPT,\n  feedbackKey: \"groundedness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst context = {\n  documents: [\n    \"FoobarLand is a new country located on the dark side of the moon\",\n    \"Space dolphins are native to FoobarLand\",\n    \"FoobarLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n    \"The current weather in FoobarLand is 80 degrees and clear.\"\n  ],\n};\n\nconst outputs = {\n  answer: \"The first president of FoobarLand was Bagatur Askaryan.\",\n};\n\nconst evalResult = await groundednessEvaluator({\n  context,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'groundedness',\n  'score': true,\n  'comment': 'The output states, \"The first president of FoobarLand was Bagatur Askaryan,\" which is directly supported by the retrieved context (document 3 explicitly states this fact). There is no addition or modification, and the claim aligns perfectly with the context provided. Thus, the score should be: true.',\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n#### Retrieval relevance\n\n`retrieval_relevance` measures how relevant retrieved context is to an input query. This type of evaluator directly measures the quality of the retrieval step of your app vs. its generation step.\n\n##### Retrieval relevance with LLM-as-judge\n\nYou can evaluate the retrieval relevance of a RAG app using the LLM-as-judge evaluator with a prompt like the built-in `RAG_RETRIEVAL_RELEVANCE_PROMPT`. Note that this prompt does not consider your actual app's final output, only `inputs` and the retrieved context. 
Thus, unlike some of the other prebuilt prompts, it takes `context` and `inputs` as prompt variables:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_RETRIEVAL_RELEVANCE_PROMPT\n\nretrieval_relevance_evaluator = create_llm_as_judge(\n    prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,\n    feedback_key=\"retrieval_relevance\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = {\n    \"question\": \"Where was the first president of FoobarLand born?\",\n}\n\ncontext = {\n    \"documents\": [\n        \"FoobarLand is a new country located on the dark side of the moon\",\n        \"Space dolphins are native to FoobarLand\",\n        \"FoobarLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n        \"The current weather in FoobarLand is 80 degrees and clear.\",\n    ],\n}\n\neval_result = retrieval_relevance_evaluator(\n    inputs=inputs,\n    context=context,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'retrieval_relevance',\n  'score': False,\n  'comment': \"The retrieved context provides some details about FoobarLand – for instance, that it is a new country located on the dark side of the moon and that its first president is Bagatur Askaryan. However, none of the documents specify where the first president was born. Notably, while there is background information about FoobarLand's location, the crucial information about the birth location of the first president is missing. Thus, the retrieved context does not fully address the question. Thus, the score should be: false.\",\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_RETRIEVAL_RELEVANCE_PROMPT } from \"openevals\";\n\nconst retrievalRelevanceEvaluator = createLLMAsJudge({\n  prompt: RAG_RETRIEVAL_RELEVANCE_PROMPT,\n  feedbackKey: \"retrieval_relevance\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = {\n  question: \"Where was the first president of FoobarLand born?\",\n};\n\nconst context = {\n  documents: [\n    \"FoobarLand is a new country located on the dark side of the moon\",\n    \"Space dolphins are native to FoobarLand\",\n    \"FoobarLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n    \"The current weather in FoobarLand is 80 degrees and clear.\",\n  ],\n};\n\nconst evalResult = await retrievalRelevanceEvaluator({\n  inputs,\n  context,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'retrieval_relevance',\n  'score': False,\n  'comment': \"The retrieved context provides some details about FoobarLand – for instance, that it is a new country located on the dark side of the moon and that its first president is Bagatur Askaryan. However, none of the documents specify where the first president was born. Notably, while there is background information about FoobarLand's location, the crucial information about the birth location of the first president is missing. Thus, the retrieved context does not fully address the question. Thus, the score should be: false.\",\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n##### Retrieval relevance with string evaluators\n\nYou can also use string evaluators like [embedding similarity](#embedding-similarity) to measure retrieval relevance without using an LLM. 
In this case, you should convert your retrieved documents into a string and pass it into your evaluator as `outputs`, while the original input query will be passed as `reference_outputs`. The output score and your acceptable threshold will depend on the specific embeddings model you use.\n\nHere's an example:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.embedding_similarity import create_embedding_similarity_evaluator\n\nevaluator = create_embedding_similarity_evaluator()\n\ninputs = \"Where was the first president of FoobarLand born?\"\n\ncontext = \"\\n\".join([\n    \"BazQuxLand is a new country located on the dark side of the moon\",\n    \"Space dolphins are native to BazQuxLand\",\n    \"BazQuxLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n    \"The current weather in BazQuxLand is 80 degrees and clear.\",\n])\n\nresult = evaluator(\n    outputs=context,\n    reference_outputs=inputs,\n)\n\nprint(result)\n```\n\n```\n{\n  'key': 'embedding_similarity',\n  'score': 0.43,\n  'comment': None,\n  'metadata': None\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createEmbeddingSimilarityEvaluator } from \"openevals\";\nimport { OpenAIEmbeddings } from \"@langchain\u002Fopenai\";\n\nconst evaluator = createEmbeddingSimilarityEvaluator({\n  embeddings: new OpenAIEmbeddings({ model: \"text-embedding-3-small\" }),\n});\n\nconst inputs = \"Where was the first president of FoobarLand born?\";\n\nconst context = [\n  \"BazQuxLand is a new country located on the dark side of the moon\",\n  \"Space dolphins are native to BazQuxLand\",\n  \"BazQuxLand is a constitutional democracy whose first president was Bagatur Askaryan\",\n  \"The current weather in BazQuxLand is 80 degrees and clear.\",\n].join(\"\\n\");\n\nconst result = await evaluator({\n  outputs: context,\n  referenceOutputs: inputs,\n});\n\nconsole.log(result);\n```\n\n```\n{\n  'key': 'embedding_similarity',\n  'score': 0.43,\n}\n```\n\u003C\u002Fdetails>\n\n## Extraction and tool calls\n\nTwo very common use cases for LLMs are extracting structured output from documents and tool calling. Both of these require the LLM\nto respond in a structured format. This package provides a prebuilt evaluator to help you evaluate these use cases, and is flexible enough to work for a variety of extraction\u002Ftool calling use cases.\n\nYou can use the `create_json_match_evaluator` evaluator in two ways:\n1. To perform an exact match of the outputs against reference outputs.\n2. To use LLM-as-a-judge to evaluate the outputs based on a provided rubric.\n\nNote that this evaluator may return multiple scores based on key and aggregation strategy, so the result will be an array of scores rather than a single one.\n\n### Evaluating structured output with exact match\n\nUse exact match evaluation when there is a clear right or wrong answer. 
A common scenario is text extraction from images or PDFs where you expect specific values.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.json import create_json_match_evaluator\n\noutputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,3]},\n]\nreference_outputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,4]},\n]\nevaluator = create_json_match_evaluator(\n    # How to aggregate feedback keys in each element of the list: \"average\", \"all\", or None\n    # \"average\" returns the average score. \"all\" returns 1 only if all keys score 1; otherwise, it returns 0. None returns individual feedback chips for each key\n    aggregator=\"all\",\n    # Remove if evaluating a single structured output. This aggregates the feedback keys across elements of the list. Can be \"average\" or \"all\". Defaults to \"all\". \"all\" returns 1 if each element of the list is 1; if any score is not 1, it returns 0. \"average\" returns the average of the scores from each element. \n    list_aggregator=\"average\",\n    exclude_keys=[\"a\"],\n)\n# Invoke the evaluator with the outputs and reference outputs\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\nFor the first element, \"b\" will be 1 and the aggregator will return a score of 1.\nFor the second element, \"b\" will be 1, \"c\" will be 0, and the aggregator will return a score of 0.\nTherefore, the list aggregator will return a final score of 0.5.\n\n```\n[\n  {\n    'key': 'json_match:all',\n    'score': 0.5,\n    'comment': None,\n  }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createJsonMatchEvaluator } from \"openevals\";\nimport { OpenAI } from \"openai\";\n\nconst outputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,3]},\n]\nconst referenceOutputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,4]},\n]\n\nconst client = new OpenAI();\n\nconst evaluator = createJsonMatchEvaluator({\n    \u002F\u002F How to aggregate feedback keys in each element of the list: \"average\", \"all\", or undefined\n    \u002F\u002F \"average\" returns the average score. \"all\" returns 1 only if all keys score 1; otherwise, it returns 0. undefined returns individual feedback chips for each key\n    aggregator: \"all\",\n    \u002F\u002F Remove if evaluating a single structured output. This aggregates the feedback keys across elements of the list. Can be \"average\" or \"all\". Defaults to \"all\". \"all\" returns 1 if each element of the list is 1; if any score is not 1, it returns 0. \"average\" returns the average of the scores from each element. \n    listAggregator: \"average\",\n    \u002F\u002F The keys to ignore during evaluation. 
Any key not passed here or in `rubric` will be evaluated using an exact match comparison to the reference outputs\n    excludeKeys: [\"a\"],\n    \u002F\u002F The provider and name of the model to use\n    judge: client,\n    model: \"openai:gpt-5.4\",\n});\n\n\u002F\u002F Invoke the evaluator with the outputs and reference outputs\nconst result = await evaluator({\n    outputs,\n    referenceOutputs,\n});\n\nconsole.log(result);\n```\n\nFor the first element, \"b\" will be 1 and the aggregator will return a score of 1.\nFor the second element, \"b\" will be 1, \"c\" will be 0, and the aggregator will return a score of 0.\nTherefore, the list aggregator will return a final score of 0.5.\n\n```\n[\n  {\n    'key': 'json_match:all',\n    'score': 0.5,\n    'comment': None,\n  }\n]\n```\n\u003C\u002Fdetails>\n\n### Evaluating structured output with LLM-as-a-Judge\n\nUse LLM-as-a-judge to evaluate structured output or tool calls when the criteria are more subjective (for example, whether the output is a kind of fruit or mentions all of the fruits).\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.json import create_json_match_evaluator\n\noutputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,3]},\n]\nreference_outputs = [\n    {\"a\": \"Bananas, Mango\", \"b\": 2, \"d\": \"Not in outputs\"},\n    {\"a\": \"Apples, Strawberries\", \"b\": 2},\n]\nevaluator = create_json_match_evaluator(\n    # How to aggregate feedback keys in each element of the list: \"average\", \"all\", or None\n    # \"average\" returns the average score. \"all\" returns 1 only if all keys score 1; otherwise, it returns 0. None returns individual feedback chips for each key\n    aggregator=\"average\",\n    # Remove if evaluating a single structured output. This aggregates the feedback keys across elements of the list. Can be \"average\" or \"all\". Defaults to \"all\". \"all\" returns 1 if each element of the list is 1; if any score is not 1, it returns 0. \"average\" returns the average of the scores from each element. \n    list_aggregator=\"all\",\n    rubric={\n        \"a\": \"Does the answer mention all the fruits in the reference answer?\"\n    },\n    # The provider and name of the model to use\n    model=\"openai:gpt-5.4\",\n    # Whether to force the model to reason about the keys in `rubric`. Defaults to True\n    # Note that this is not currently supported if there is an aggregator specified \n    use_reasoning=True\n)\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\nFor the first element, \"a\" will be 1 since both Mango and Bananas are in the reference output, \"b\" will be 1, and \"d\" will be 0. The aggregator will return an average score of roughly 0.67. \nFor the second element, \"a\" will be 0 since the output doesn't mention all the fruits in the reference output, and \"b\" will be 1. The aggregator will return a score of 0.5. \nTherefore, the list aggregator will return a final score of 0. 
\n\n```\n[\n  {\n    'key': 'json_match:a',\n    'score': 0,\n    'comment': None\n  }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createJsonMatchEvaluator } from \"openevals\";\nimport { OpenAI } from \"openai\";\n\nconst outputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,3]},\n]\nconst referenceOutputs = [\n    {a: \"Bananas, Mango\", b: 2, d: \"Not in outputs\"},\n    {a: \"Apples, Strawberries\", b: 2},\n]\n\nconst client = new OpenAI();\n\nconst evaluator = createJsonMatchEvaluator({\n    \u002F\u002F How to aggregate feedback keys in each element of the list: \"average\", \"all\", or undefined\n    \u002F\u002F \"average\" returns the average score. \"all\" returns 1 only if all keys score 1; otherwise, it returns 0. undefined returns individual feedback chips for each key\n    aggregator: \"average\",\n    \u002F\u002F Remove if evaluating a single structured output. This aggregates the feedback keys across elements of the list. Can be \"average\" or \"all\". Defaults to \"all\". \"all\" returns 1 if each element of the list is 1; if any score is not 1, it returns 0. \"average\" returns the average of the scores from each element. \n    listAggregator: \"all\",\n    \u002F\u002F The criteria for the LLM judge to use for each key you want evaluated by the LLM\n    rubric: {\n        a: \"Does the answer mention all the fruits in the reference answer?\"\n    },\n    \u002F\u002F The keys to ignore during evaluation. Any key not passed here or in `rubric` will be evaluated using an exact match comparison to the reference outputs\n    excludeKeys: [\"c\"],\n    \u002F\u002F The provider and name of the model to use\n    judge: client,\n    model: \"openai:gpt-5.4\",\n    \u002F\u002F Whether to force the model to reason about the keys in `rubric`. Defaults to true\n    useReasoning: true\n});\n\n\u002F\u002F Invoke the evaluator with the outputs and reference outputs\nconst result = await evaluator({\n    outputs,\n    referenceOutputs,\n});\n\nconsole.log(result);\n```\n\nFor the first element, \"a\" will be 1 since both Mango and Bananas are in the reference output, \"b\" will be 1, and \"d\" will be 0. The aggregator will return an average score of roughly 0.67. \nFor the second element, \"a\" will be 0 since the output doesn't mention all the fruits in the reference output, and \"b\" will be 1. The aggregator will return a score of 0.5. \nTherefore, the list aggregator will return a final score of 0. 
\n\n```\n{\n  'key': 'json_match:a',\n  'score': 0,\n  'comment': None\n}\n```\n\n\u003C\u002Fdetails>\n\n## Code\n\nOpenEvals contains some useful prebuilt evaluators for evaluating generated code:\n\n- Type-checking generated code with [Pyright](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpyright) and [Mypy](https:\u002F\u002Fgithub.com\u002Fpython\u002Fmypy) (Python-only) or TypeScript's built-in type checker (JavaScript only)\n  - Note that these local type-checking evaluators will not install any dependencies and will ignore errors for missing imports\n- Sandboxed type-checking and execution evaluators that use [E2B](https:\u002F\u002Fe2b.dev\u002F) to install dependencies and run generated code securely\n- LLM-as-a-judge for code\n\nAll evaluators in this section accept `outputs` as a string, an object with a key `\"messages\"` that contains a list of messages, or a message-like object with a key `\"content\"` that contains a string.\n\n### Extracting code outputs\n\nSince LLM outputs with code may contain other text (for example, interleaved explanations with code), OpenEvals code evaluators share some built-in extraction methods for identifying just the code from LLM outputs.\n\nFor any of the evaluators in this section, you can either pass a `code_extraction_strategy` param set to `llm`, which will use an LLM with a default prompt to directly extract code, or `markdown_code_blocks`, which will extract anything in markdown code blocks (triple backticks) that is not marked with `bash` or other shell command languages. If extraction fails for one of these methods, the evaluator response will include a `metadata.code_extraction_failed` field set to `True`.\n\nYou can alternatively pass a `code_extractor` param set to a function that takes an LLM output and returns a string of code. The default strategy (`\"none\"`) leaves the output content untouched.\n\nIf using `code_extraction_strategy=\"llm\"`, you can also pass a `model` string or a `client` to the evaluator to set which model the evaluator uses for code extraction.\nIf you would like to customize the prompt, you should use the `code_extractor` param instead.\n\n### Pyright (Python-only)\n\nFor Pyright, you will need to install the `pyright` CLI on your system:\n\n```bash\npip install pyright\n```\n\nYou can find full installation instructions [here](https:\u002F\u002Fmicrosoft.github.io\u002Fpyright\u002F#\u002Finstallation?id=command-line).\n\nThen, you can use it as follows:\n\n```python\nfrom openevals.code.pyright import create_pyright_evaluator\n\nevaluator = create_pyright_evaluator()\n\nCODE = \"\"\"\ndef sum_of_two_numbers(a, b): return a + b\n\"\"\"\n\nresult = evaluator(outputs=CODE)\n\nprint(result)\n```\n\n```\n{\n    'key': 'pyright_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\n> [!WARNING]\n> The evaluator will ignore `reportMissingImports` errors. 
If you want to run type-checking over generated dependencies, check out the [sandboxed version](#sandbox-pyright-python-only) of this evaluator.\n\nYou can also pass `pyright_cli_args` to the evaluator to customize the arguments passed to the `pyright` CLI:\n\n```python\nevaluator = create_pyright_evaluator(\n    pyright_cli_args=[\"--flag\"]\n)\n```\n\nFor a full list of supported arguments, see the [pyright CLI documentation](https:\u002F\u002Fmicrosoft.github.io\u002Fpyright\u002F#\u002Fcommand-line).\n\n### Mypy (Python-only)\n\nFor Mypy, you will need to install `mypy` on your system:\n\n```bash\npip install mypy\n```\n\nYou can find full installation instructions [here](https:\u002F\u002Fmypy.readthedocs.io\u002Fen\u002Fstable\u002Fgetting_started.html).\n\nThen, you can use it as follows:\n\n```python\nfrom openevals.code.mypy import create_mypy_evaluator\n\nevaluator = create_mypy_evaluator()\n\nCODE = \"\"\"\ndef sum_of_two_numbers(a, b): return a + b\n\"\"\"\n\nresult = evaluator(outputs=CODE)\n\nprint(result)\n```\n\n```\n{\n    'key': 'mypy_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\nBy default, this evaluator will run with the following arguments:\n\n```\nmypy --no-incremental --disallow-untyped-calls --disallow-incomplete-defs --ignore-missing-imports\n```\n\nBut you can pass `mypy_cli_args` to the evaluator to customize the arguments passed to the `mypy` CLI. This will override the default arguments:\n\n```python\nevaluator = create_mypy_evaluator(\n    mypy_cli_args=[\"--flag\"]\n)\n```\n\n### TypeScript type-checking (TypeScript-only)\n\nThe TypeScript evaluator uses TypeScript's type checker to check the code for correctness.\n\nYou will need to install `typescript` on your system as a dependency (not a dev dependency!):\n\n```bash\nnpm install typescript\n```\n\nThen, you can use it as follows (note that you should import from the `openevals\u002Fcode\u002Ftypescript` entrypoint due to the additional required dependency):\n\n```ts\nimport { createTypeScriptEvaluator } from \"openevals\u002Fcode\u002Ftypescript\";\n\nconst evaluator = createTypeScriptEvaluator();\n\nconst result = await evaluator({\n    outputs: \"function add(a, b) { return a + b; }\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n    'key': 'typescript_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\n> [!WARNING]\n> The evaluator will ignore `reportMissingImports` errors. If you want to run type-checking over generated dependencies, check out the [sandboxed version](#sandbox-typescript-typescript-only) of this evaluator.\n\n### LLM-as-judge for code\n\nOpenEvals includes a prebuilt LLM-as-a-judge evaluator for code. 
The primary differentiator between this one and the more generic [LLM-as-judge evaluator](#llm-as-judge) is that it will perform the extraction steps detailed above - otherwise it takes the same arguments, including a prompt.\n\nYou can run an LLM-as-a-judge evaluator for code as follows:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.code.llm import create_code_llm_as_judge\nfrom openevals.prompts import CODE_CORRECTNESS_PROMPT\n\nllm_as_judge = create_code_llm_as_judge(\n    prompt=CODE_CORRECTNESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n    code_extraction_strategy=\"markdown_code_blocks\",\n)\n\n\nINPUTS = \"\"\"\nRewrite the code below to be async:\n\n\\`\\`\\`python\ndef _run_mypy(\n    *,\n    filepath: str,\n    mypy_cli_args: list[str],\n) -> Tuple[bool, str]:\n    result = subprocess.run(\n        [\n            \"mypy\",\n            *mypy_cli_args,\n            filepath,\n        ],\n        capture_output=True,\n    )\n    return _parse_mypy_output(result.stdout)\n\\`\\`\\`\n\"\"\"\n\nOUTPUTS = \"\"\"\n\\`\\`\\`python\nasync def _run_mypy_async(\n    *,\n    filepath: str,\n    mypy_cli_args: list[str],\n) -> Tuple[bool, str]:\n    process = await subprocess.run(\n        [\n            \"mypy\",\n            *mypy_cli_args,\n            filepath,\n        ],\n    )\n    stdout, _ = await process.communicate()\n\n    return _parse_mypy_output(stdout)\n\\`\\`\\`\n\"\"\"\n\neval_result = llm_as_judge(\n    inputs=INPUTS,\n    outputs=OUTPUTS\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'code_correctness',\n    'score': False,\n    'comment': \"The provided async code is incorrect. It still incorrectly attempts to use 'await subprocess.run' which is synchronous and does not support being awaited. The proper asynchronous approach would be to use 'asyncio.create_subprocess_exec' (or a similar asyncio API) with appropriate redirection of stdout (e.g., stdout=asyncio.subprocess.PIPE) and then await the 'communicate()' call. Thus, the code does not meet the requirements completely as specified, and there is a significant error which prevents it from working correctly. Thus, the score should be: false.\",\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createCodeLLMAsJudge, CODE_CORRECTNESS_PROMPT } from \"openevals\";\n\nconst evaluator = createCodeLLMAsJudge({\n  prompt: CODE_CORRECTNESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = `Add proper TypeScript types to the following code:\n\n\\`\\`\\`typescript\nfunction add(a, b) { return a + b; }\n\\`\\`\\`\n`;\n\nconst outputs = `\n\\`\\`\\`typescript\nfunction add(a: number, b: number): boolean {\n  return a + b;\n}\n\\`\\`\\`\n`;\n\nconst evalResult = await evaluator({ inputs, outputs });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"code_correctness\",\n  \"score\": false,\n  \"comment\": \"The code has a logical error in its type specification. The function is intended to add two numbers and return their sum, so the return type should be number, not boolean. This mistake makes the solution incorrect according to the rubric. Thus, the score should be: false.\"\n}\n```\n\n\u003C\u002Fdetails>\n\n## Sandboxed code\n\nLLMs can generate arbitrary code, and if you are running a code evaluator locally, you may not wish to install generated dependencies or run this arbitrary code locally. 
To solve this, OpenEvals integrates with [E2B](https:\u002F\u002Fe2b.dev) to run some code evaluators in isolated sandboxes.\n\nGiven some output code from an LLM, these sandboxed code evaluators will run scripts in a sandbox that parse out dependencies and install them so that the evaluator has proper context for type-checking or execution.\n\nThese evaluators all require a `sandbox` parameter upon creation, and also accept the code extraction parameters present in the other [code evaluators](#extracting-code-outputs). For Python, there is a special `OpenEvalsPython` template that includes `pyright` and `uv` preinstalled for faster execution, though the evaluator will work with any sandbox.\n\nIf you have a custom sandbox with dependencies pre-installed or files already set up, you can supply a `sandbox_project_directory` (Python) or `sandboxProjectDirectory` (TypeScript) param when calling the appropriate `create` method to customize the folder in which type-checking\u002Fexecution runs.\n\n### Sandbox Pyright (Python-only)\n\nYou can also run Pyright type-checking in an [E2B](https:\u002F\u002Fe2b.dev) sandbox. The evaluator will run a script to parse out package names\nfrom generated code, then will install those packages in the sandbox and will run Pyright. The evaluator will return any analyzed errors in its comment.\n\nYou will need to install the `e2b-code-interpreter` package, available as an extra:\n\n```bash\npip install openevals[\"e2b-code-interpreter\"]\n```\n\nThen, you will need to set your E2B API key as an environment variable:\n\n```\nexport E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\nThen, you will need to initialize an E2B sandbox. There is a special `OpenEvalsPython` template that includes `pyright` and `uv` preinstalled for faster execution, though the evaluator will work with any sandbox:\n\n```python\nfrom e2b_code_interpreter import Sandbox\n\n# E2B template with uv and pyright preinstalled\nsandbox = Sandbox(\"OpenEvalsPython\")\n```\n\nFinally, pass that created sandbox into the `create_e2b_pyright_evaluator` factory function and run it:\n\n```python\nfrom openevals.code.e2b.pyright import create_e2b_pyright_evaluator\n\nevaluator = create_e2b_pyright_evaluator(\n    sandbox=sandbox,\n)\n\nCODE = \"\"\"\nfrom typing import Annotated\n\nfrom typing_extensions import TypedDict\n\nfrom langgraph.graph import StateGraph, START, END\nfrom langgraph.graph.message import add_messages\n\n\nclass State(TypedDict):\n    messages: Annotated[list, add_messages]\n\nbuilder = StateGraph(State)\nbuilder.add_node(\"start\", lambda state: state)\nbuilder.compile()\n\nbuilder.invoke({})\n\"\"\"\n\neval_result = evaluator(outputs=CODE)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'pyright_succeeded',\n  'score': false,\n  'comment': '[{\"severity\": \"error\", \"message\": \"Cannot access attribute \"invoke\" for class \"StateGraph\"...}]',\n}\n```\n\nAbove, the evaluator identifies and installs the `langgraph` package inside the sandbox, then runs `pyright`. The type-check fails because the provided code misuses the imported package, invoking the builder rather than the compiled graph.\n\n### Sandbox TypeScript type-checking (TypeScript-only)\n\nYou can also run TypeScript type-checking in an [E2B](https:\u002F\u002Fe2b.dev) sandbox. The evaluator will run a script to parse out package names\nfrom generated code, then will install those packages in the sandbox and will run TypeScript. 
The evaluator will return any analyzed errors in its comment.\n\nYou will need to install the official `@e2b\u002Fcode-interpreter` package as a peer dependency:\n\n```bash\nnpm install @e2b\u002Fcode-interpreter\n```\n\nThen, you will need to set your E2B API key as an environment variable:\n\n```\nprocess.env.E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\nNext, initialize an E2B sandbox:\n\n```ts\nimport { Sandbox } from \"@e2b\u002Fcode-interpreter\";\n\nconst sandbox = await Sandbox.create();\n```\n\nAnd finally, pass the sandbox into the `createE2BTypeScriptEvaluator` factory function and run it:\n\n```ts\nimport { createE2BTypeScriptEvaluator } from \"openevals\u002Fcode\u002Fe2b\";\n\nconst evaluator = createE2BTypeScriptEvaluator({\n  sandbox,\n});\n\nconst CODE = `\nimport { StateGraph } from '@langchain\u002Flanggraph';\n\nawait StateGraph.invoke({})\n`;\n\nconst evalResult = await evaluator({ outputs: CODE });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"typescript_succeeded\",\n  \"score\": false,\n  \"comment\": \"(3,18): Property 'invoke' does not exist on type 'typeof StateGraph'.\"\n}\n```\n\nAbove, the evaluator identifies and installs `@langchain\u002Flanggraph`, then runs a type-check via TypeScript. The type-check fails because the provided code misuses the imported package.\n\n### Sandbox Execution\n\nTo further evaluate code correctness, OpenEvals has a sandbox execution evaluator that runs generated code in an [E2B](https:\u002F\u002Fe2b.dev) sandbox.\n\nThe evaluator will run a script to parse out package names from generated code, then will install those packages in the sandbox. The evaluator will then attempt to run the generated code and return any resulting errors in its comment.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\nYou will need to install the `e2b-code-interpreter` package, available as an extra:\n\n```bash\npip install openevals[\"e2b-code-interpreter\"]\n```\n\nThen, you will need to set your E2B API key as an environment variable:\n\n```\nexport E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\nThen, you will need to initialize an E2B sandbox. 
There is a special `OpenEvalsPython` template that includes `pyright` and `uv` preinstalled for faster execution, though the evaluator will work with any sandbox:\n\n```python\nfrom e2b_code_interpreter import Sandbox\n\n# E2B template with uv and pyright preinstalled\nsandbox = Sandbox(\"OpenEvalsPython\")\n```\n\nThen pass the sandbox to the `create_e2b_execution_evaluator` factory function and run the result:\n\n```python\nfrom openevals.code.e2b.execution import create_e2b_execution_evaluator\n\nevaluator = create_e2b_execution_evaluator(\n    sandbox=sandbox,\n)\n\nCODE = \"\"\"\nfrom typing import Annotated\n\nfrom typing_extensions import TypedDict\n\nfrom langgraph.graph import StateGraph, START, END\nfrom langgraph.graph.message import add_messages\n\n\nclass State(TypedDict):\n    messages: Annotated[list, add_messages]\n\nbuilder = StateGraph(State)\nbuilder.add_node(\"start\", lambda state: state)\nbuilder.compile()\n\nbuilder.invoke({})\n\"\"\"\n\neval_result = evaluator(outputs=CODE)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'execution_succeeded',\n  'score': False,\n  'comment': '\"Command exited with code 1 and error:\\nTraceback (most recent call last):\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002Foutputs.py\\\", line 15, in \u003Cmodule>\\n    builder.compile()\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002F.venv\u002Flib\u002Fpython3.10\u002Fsite-packages\u002Flanggraph\u002Fgraph\u002Fstate.py\\\", line 602, in compile\\n    self.validate(\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002F.venv\u002Flib\u002Fpython3.10\u002Fsite-packages\u002Flanggraph\u002Fgraph\u002Fgraph.py\\\", line 267, in validate\\n    raise ValueError(\\nValueError: Graph must have an entrypoint: add at least one edge from START to another node\\n\"'\n}\n```\n\nAbove, the evaluator identifies and installs `langgraph`, then attempts to execute the code. The execution fails because the provided code misuses the imported package.\n\nIf desired, you can pass an `environment_variables` dict when creating the evaluator. Generated code will have access to these variables within the sandbox, but be cautious, as there is no way to predict exactly what code an LLM will generate.
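\n\nFor example, here is a minimal sketch of passing a value through to the sandbox (the `environment_variables` parameter is described above; the variable name and value below are purely illustrative):\n\n```python\nfrom e2b_code_interpreter import Sandbox\nfrom openevals.code.e2b.execution import create_e2b_execution_evaluator\n\n# E2B template with uv and pyright preinstalled\nsandbox = Sandbox(\"OpenEvalsPython\")\n\n# Hypothetical variable name and value, shown for illustration only.\n# Any code the evaluator executes can read this from its environment\n# inside the sandbox, so only pass values you are comfortable exposing.\nevaluator = create_e2b_execution_evaluator(\n    sandbox=sandbox,\n    environment_variables={\"MY_SERVICE_API_KEY\": \"test-key\"},\n)\n```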
\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\nYou will need to install the official `@e2b\u002Fcode-interpreter` package as a peer dependency:\n\n```bash\nnpm install @e2b\u002Fcode-interpreter\n```\n\nThen, you will need to set your E2B API key as an environment variable:\n\n```\nprocess.env.E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\nNext, initialize an E2B sandbox:\n\n```ts\nimport { Sandbox } from \"@e2b\u002Fcode-interpreter\";\n\nconst sandbox = await Sandbox.create();\n```\n\nAnd finally, pass the sandbox into the `createE2BExecutionEvaluator` factory function and run it:\n\n```ts\nimport { createE2BExecutionEvaluator } from \"openevals\u002Fcode\u002Fe2b\";\n\nconst evaluator = createE2BExecutionEvaluator({\n  sandbox,\n});\n\nconst CODE = `\nimport { Annotation, StateGraph } from '@langchain\u002Flanggraph';\n\nconst StateAnnotation = Annotation.Root({\n  joke: Annotation\u003Cstring>,\n  topic: Annotation\u003Cstring>,\n});\n\nconst graph = new StateGraph(StateAnnotation)\n  .addNode(\"joke\", () => ({}))\n  .compile();\n  \nawait graph.invoke({\n  joke: \"foo\",\n  topic: \"history\",\n});\n`;\n\nconst evalResult = await evaluator({ outputs: CODE });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"execution_succeeded\",\n  \"score\": false,\n  \"comment\": \"file:\u002F\u002F\u002Fhome\u002Fuser\u002Fopenevals\u002Fnode_modules\u002F@langchain\u002Flanggraph\u002Fdist\u002Fgraph\u002Fstate.js:197\\n            throw new Error(`${key} is already being used as a state attribute (a.k.a. a channel), cannot also be used as a node name.`);\\n                  ^\\n\\nError: joke is already being used as a state attribute (a.k.a. a channel), cannot also be used as a node name.\\n    at StateGraph.addNode (\u002Fhome\u002Fuser\u002Fopenevals\u002Fnode_modules\u002F@langchain\u002Flanggraph\u002Fsrc\u002Fgraph\u002Fstate.ts:292:13)\\n    at \u003Canonymous> (\u002Fhome\u002Fuser\u002Fopenevals\u002Foutputs.ts:9:4)\\n    at ModuleJob.run (node:internal\u002Fmodules\u002Fesm\u002Fmodule_job:195:25)\\n    at async ModuleLoader.import (node:internal\u002Fmodules\u002Fesm\u002Floader:336:24)\\n    at async loadESM (node:internal\u002Fprocess\u002Fesm_loader:34:7)\\n    at async handleMainPromise (node:internal\u002Fmodules\u002Frun_main:106:12)\\n\\nNode.js v18.19.0\\n\"\n}\n```\n\nAbove, the evaluator identifies and installs `@langchain\u002Flanggraph`, then attempts to execute the code. The execution fails because the provided code misuses the imported package.\n\nIf desired, you can pass an `environmentVariables` object when creating the evaluator. Generated code will have access to these variables within the sandbox, but be cautious, as there is no way to predict exactly what code an LLM will generate.\n\n\u003C\u002Fdetails>\n\n## Agent trajectory\n\nIf you are building an agent, `openevals` includes evaluators for assessing the entire **trajectory** of an agent's execution — the sequence of messages and tool calls it makes while solving a task.\n\nTrajectories should be formatted as lists of [OpenAI-style messages](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fapi-reference\u002Fmessages). LangChain `BaseMessage` instances are also supported.\n\n### Trajectory match\n\n`create_trajectory_match_evaluator`\u002F`createTrajectoryMatchEvaluator` compares an agent's trajectory against a reference trajectory. 
You can set `trajectory_match_mode`\u002F`trajectoryMatchMode` to one of four modes:\n\n- `\"strict\"` — same tool calls in the same order\n- `\"unordered\"` — same tool calls in any order\n- `\"subset\"` — output tool calls are a subset of reference\n- `\"superset\"` — output tool calls are a superset of reference\n\n#### Strict match\n\nThe `\"strict\"` mode compares two trajectories and ensures that they contain the same messages in the same order with the same tool calls. Note that it does allow for differences in message content (e.g. `\"SF\"` vs. `\"San Francisco\"`):\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\n                \"function\": {\n                    \"name\": \"get_weather\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                }\n            },\n            {\n                \"function\": {\n                    \"name\": \"accuweather_forecast\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                }\n            }\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny.\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in San Francisco?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\n                \"function\": {\n                    \"name\": \"get_weather\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                }\n            }\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in San Francisco.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80˚ and sunny.\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"strict\")\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_strict_match', 'score': False, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{\n      function: {\n        name: \"get_weather\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }, {\n      function: {\n        name: \"accuweather_forecast\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in San Francisco?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{\n      function: {\n        name: \"get_weather\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }],\n  
},\n  { role: \"tool\", content: \"It's 80 degrees and sunny in San Francisco.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"strict\" });\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_strict_match', score: false }\n```\n\u003C\u002Fdetails>\n\n`\"strict\"` is useful if you want to ensure that tools are always called in the same order for a given query (e.g. a policy lookup tool before a tool that requests time off for an employee).\n\n**Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#tool-args-match-modes).\n\n#### Unordered match\n\nThe `\"unordered\"` mode compares two trajectories and ensures that they contain the same tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF and is there anything fun happening?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [{\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [{\"function\": {\"name\": \"get_fun_activities\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}],\n    },\n    {\"role\": \"tool\", \"content\": \"Nothing fun is happening, you should stay indoors and read!\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny, but there is nothing fun happening.\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF and is there anything fun happening?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_fun_activities\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}},\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}},\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"Nothing fun is happening, you should stay indoors and read!\"},\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\"role\": \"assistant\", \"content\": \"In SF, it's 80˚ and sunny, but there is nothing fun happening.\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"unordered\")\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_unordered_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF and is there anything fun happening?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" 
}) } }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_fun_activities\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"Nothing fun is happening, you should stay indoors and read!\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny, but there is nothing fun happening.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in SF and is there anything fun happening?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_fun_activities\", arguments: JSON.stringify({ city: \"San Francisco\" }) } },\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"Nothing fun is happening, you should stay indoors and read!\" },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"In SF, it's 80˚ and sunny, but there is nothing fun happening.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"unordered\" });\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_unordered_match', score: true }\n```\n\u003C\u002Fdetails>\n\n`\"unordered\"` is useful if you want to ensure that specific tools are called at some point in the trajectory, but you don't necessarily need them to be in message order.\n\n**Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#tool-args-match-modes).\n\n#### Subset and superset match\n\nThe `\"subset\"` and `\"superset\"` modes match partial trajectories, ensuring that a trajectory contains a subset\u002Fsuperset of tool calls contained in a reference trajectory.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF and London?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF and London\"})}},\n            {\"function\": {\"name\": \"accuweather_forecast\", \"arguments\": json.dumps({\"city\": \"SF and London\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF, and 90 degrees and rainy in London.\"},\n    {\"role\": \"tool\", \"content\": \"Unknown.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy.\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF and London?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF and London\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80˚ and sunny. 
In London, it's 90˚ and rainy.\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"superset\")  # or \"subset\"\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_superset_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF and London?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF and London\" }) } },\n      { function: { name: \"accuweather_forecast\", arguments: JSON.stringify({ city: \"SF and London\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF, and 90 degrees and rainy in London.\" },\n  { role: \"tool\", content: \"Unknown.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in SF and London?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF and London\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"superset\" }); \u002F\u002F or \"subset\"\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_superset_match', score: true }\n```\n\u003C\u002Fdetails>\n\n`\"superset\"` is useful if you want to ensure that some key tools were called at some point in the trajectory, but an agent calling extra tools is still acceptable. `\"subset\"` is the inverse and is useful if you want to ensure that the agent did not call any tools beyond the expected ones.\n\n#### Tool args match modes\n\nWhen checking equality between tool calls, the above evaluators will require that all tool call arguments are the exact same by default. 
You can configure this behavior in the following ways:\n\n- Treating any two tool calls for the same tool as equivalent by setting `tool_args_match_mode=\"ignore\"` (Python) or `toolArgsMatchMode: \"ignore\"` (TypeScript)\n- Treating a tool call as equivalent if it contains a subset\u002Fsuperset of args compared to a reference tool call of the same name with `tool_args_match_mode=\"subset\"\u002F\"superset\"` (Python) or `toolArgsMatchMode: \"subset\"\u002F\"superset\"` (TypeScript)\n- Setting custom matchers for all calls of a given tool using the `tool_args_match_overrides` (Python) or `toolArgsMatchOverrides` (TypeScript) param\n\n`tool_args_match_overrides`\u002F`toolArgsMatchOverrides` takes a dictionary whose keys are tool names and whose values are either `\"exact\"`, `\"ignore\"`, `\"subset\"`, `\"superset\"`, a list of field paths that must match exactly, or a comparator function:\n\nHere's an example that allows case insensitivity for the arguments to a tool named `get_weather`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"san francisco\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny.\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in San Francisco?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in San Francisco.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80˚ and sunny.\"},\n]\n\nevaluator = create_trajectory_match_evaluator(\n    trajectory_match_mode=\"strict\",\n    tool_args_match_mode=\"exact\",  \n    tool_args_match_overrides={\n        \"get_weather\": lambda x, y: x[\"city\"].lower() == y[\"city\"].lower()\n    }\n)\n\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_strict_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"san francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in San Francisco?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: 
\"It's 80 degrees and sunny in San Francisco.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80˚ and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({\n  trajectoryMatchMode: \"strict\",\n  toolArgsMatchOverrides: {\n    get_weather: (x, y) =>\n      typeof x.city === \"string\" &&\n      typeof y.city === \"string\" &&\n      x.city.toLowerCase() === y.city.toLowerCase(),\n  },\n});\n\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_strict_match', score: true }\n```\n\u003C\u002Fdetails>\n\nThis flexibility allows you to handle cases where you want looser equality for LLM generated arguments (`\"san francisco\"` to equal `\"San Francisco\"`) for only specific tool calls.\n\n### Trajectory LLM-as-judge\n\n`create_trajectory_llm_as_judge`\u002F`createTrajectoryLLMAsJudge` uses an LLM to assess whether an agent's trajectory is accurate. Unlike the trajectory match evaluators, it doesn't require a reference trajectory. Use `TRAJECTORY_ACCURACY_PROMPT` for no-reference evaluation, or `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` to compare against a reference:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_llm_as_judge\nfrom openevals.prompts import TRAJECTORY_ACCURACY_PROMPT\n\nevaluator = create_trajectory_llm_as_judge(\n    prompt=TRAJECTORY_ACCURACY_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny.\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_accuracy', 'score': True, 'comment': 'The trajectory is accurate...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryLLMAsJudge,\n  TRAJECTORY_ACCURACY_PROMPT,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst evaluator = createTrajectoryLLMAsJudge({\n  prompt: TRAJECTORY_ACCURACY_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF\" }) } }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_accuracy', score: true, comment: 'The trajectory is accurate...' 
}\n```\n\u003C\u002Fdetails>\n\nIf you have a reference trajectory, use `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` and pass `reference_outputs`\u002F`referenceOutputs`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_llm_as_judge\nfrom openevals.prompts import TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE\n\nevaluator = create_trajectory_llm_as_judge(\n    prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in SF.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80 degrees and sunny.\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"What is the weather in SF?\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"It's 80 degrees and sunny in San Francisco.\"},\n    {\"role\": \"assistant\", \"content\": \"The weather in SF is 80˚ and sunny.\"},\n]\n\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_accuracy', 'score': True, 'comment': 'The provided agent trajectory is consistent with the reference...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryLLMAsJudge,\n  TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst evaluator = createTrajectoryLLMAsJudge({\n  prompt: TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF\" }) } }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in San Francisco.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80˚ and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_accuracy', score: true, comment: 'The provided agent trajectory is consistent with the reference...' }\n```\n\u003C\u002Fdetails>\n\n`create_trajectory_llm_as_judge`\u002F`createTrajectoryLLMAsJudge` takes the same parameters as [`create_llm_as_judge`](#llm-as-judge), including:\n\n- `continuous`: boolean — return a float score between 0 and 1 instead of boolean. 
Defaults to `False`\u002F`false`.\n- `choices`: list of floats — restrict the score to specific values.\n- `system`: string — prepend a system message to the judge prompt.\n- `few_shot_examples`\u002F`fewShotExamples`: list of example dicts appended to the prompt.\n\nFor LangGraph-specific graph trajectory evaluators, see the [`agentevals`](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals) package.\n\n### Prebuilt trajectory and conversation prompts\n\n`openevals` includes several prebuilt prompts for evaluating agent trajectories and conversations. All prompts take `outputs` as a list of messages and are used with `create_llm_as_judge`\u002F`createLLMAsJudge`.\n\n#### Trajectory prompts\n\nThese prompts evaluate single-run agent tool call sequences.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `TRAJECTORY_ACCURACY_PROMPT` | `outputs` | Whether the agent's overall trajectory accurately handles the task (see [above](#trajectory-llm-as-judge)) |\n| `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` | `outputs`, `reference_outputs` | Trajectory accuracy compared to a reference trajectory (see [above](#trajectory-llm-as-judge)) |\n| `TOOL_SELECTION_PROMPT` | `outputs` | Correctness of tool choices made during query resolution |\n\n#### Conversation prompts\n\nThese prompts evaluate multi-turn conversations between a user and an agent.\n\n| Prompt | Parameters | What it evaluates |\n|--------|-----------|-------------------|\n| `PERCEIVED_ERROR_PROMPT` | `outputs` | Whether the user's responses suggest the agent made a mistake |\n| `WINS_PROMPT` | `outputs` | Whether the user praised, thanked, or complimented the assistant |\n| `TASK_COMPLETION_PROMPT` | `outputs` | Whether all user requests made throughout the conversation were completed |\n| `KNOWLEDGE_RETENTION_PROMPT` | `outputs` | Whether the agent correctly retained and applied information introduced earlier in the conversation |\n| `USER_SATISFACTION_PROMPT` | `outputs` | Overall user satisfaction based on tone shifts and whether the core need was met |\n| `AGENT_TONE_PROMPT` | `outputs` | Consistency and appropriateness of the agent's tone throughout the conversation |\n| `LANGUAGE_DETECTION_PROMPT` | `outputs` | Primary language used by the human throughout the conversation |\n| `SUPPORT_INTENT_PROMPT` | `outputs` | Primary intent category of the user's request in a customer support conversation |\n\nHere's an example using `TASK_COMPLETION_PROMPT`:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import TASK_COMPLETION_PROMPT\n\nevaluator = create_llm_as_judge(\n    prompt=TASK_COMPLETION_PROMPT,\n    feedback_key=\"task_completion\",\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"Can you book a flight from NYC to Paris?\"},\n    {\"role\": \"assistant\", \"content\": \"I can provide information about flights, but I cannot actually book them for you.\"},\n    {\"role\": \"user\", \"content\": \"I asked you to book it, not just give me info. 
Can you please just do it?\"},\n    {\"role\": \"assistant\", \"content\": \"I understand your frustration but I'm unable to make bookings.\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'key': 'task_completion', 'score': False, 'comment': 'The user's request to book a flight was never fulfilled...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, TASK_COMPLETION_PROMPT } from \"openevals\";\n\nconst evaluator = createLLMAsJudge({\n  prompt: TASK_COMPLETION_PROMPT,\n  feedbackKey: \"task_completion\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"Can you book a flight from NYC to Paris?\" },\n  { role: \"assistant\", content: \"I can provide information about flights, but I cannot actually book them for you.\" },\n  { role: \"user\", content: \"I asked you to book it, not just give me info. Can you please just do it?\" },\n  { role: \"assistant\", content: \"I understand your frustration but I'm unable to make bookings.\" },\n];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ key: 'task_completion', score: false, comment: 'The user's request to book a flight was never fulfilled...' }\n```\n\u003C\u002Fdetails>\n\nSince `LANGUAGE_DETECTION_PROMPT` should return a categorical language name rather than a boolean score, use it with a custom `output_schema` to capture the result:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom typing_extensions import TypedDict\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import LANGUAGE_DETECTION_PROMPT\n\nclass LanguageDetectionResult(TypedDict):\n    reasoning: str\n    detected_language: str\n\nevaluator = create_llm_as_judge(\n    prompt=LANGUAGE_DETECTION_PROMPT,\n    feedback_key=\"language_detection\",\n    model=\"openai:gpt-5.4\",\n    output_schema=LanguageDetectionResult,\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"Hola, ¿cómo estás?\"},\n    {\"role\": \"assistant\", \"content\": \"¡Hola! Estoy bien, gracias. ¿En qué puedo ayudarte?\"},\n    {\"role\": \"user\", \"content\": \"Necesito ayuda con mi cuenta.\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'reasoning': 'The human is speaking in Spanish throughout the conversation.', 'detected_language': 'Spanish'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\nimport { createLLMAsJudge, LANGUAGE_DETECTION_PROMPT } from \"openevals\";\n\nconst languageDetectionSchema = z.object({\n  reasoning: z.string(),\n  detected_language: z.string().describe(\"The detected language name in English\"),\n});\n\nconst evaluator = createLLMAsJudge({\n  prompt: LANGUAGE_DETECTION_PROMPT,\n  feedbackKey: \"language_detection\",\n  model: \"openai:gpt-5.4\",\n  outputSchema: languageDetectionSchema,\n});\n\nconst outputs = [\n  { role: \"user\", content: \"Hola, ¿cómo estás?\" },\n  { role: \"assistant\", content: \"¡Hola! Estoy bien, gracias. 
¿En qué puedo ayudarte?\" },\n  { role: \"user\", content: \"Necesito ayuda con mi cuenta.\" },\n];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ reasoning: 'The human is speaking in Spanish throughout the conversation.', detected_language: 'Spanish' }\n```\n\u003C\u002Fdetails>\n\n## Other\n\nThis package also contains prebuilt evaluators for calculating common metrics such as Levenshtein distance, exact match, etc. You can import and use them as follows:\n\n### Exact match\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.exact import exact_match\n\noutputs = {\"a\": 1, \"b\": 2}\nreference_outputs = {\"a\": 1, \"b\": 2}\nresult = exact_match(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\n```\n{\n    'key': 'equal',\n    'score': True,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { exactMatch } from \"openevals\";\n\nconst outputs = { a: 1, b: 2 };\nconst referenceOutputs = { a: 1, b: 2 };\nconst result = exactMatch(outputs, referenceOutputs);\n\nconsole.log(result);\n```\n\n```\n{\n    key: \"equal\",\n    score: true,\n}\n```\n\u003C\u002Fdetails>\n\n### Levenshtein distance\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.levenshtein import levenshtein_distance\n\noutputs = \"The correct answer\"\nreference_outputs = \"The correct answer\"\nresult = levenshtein_distance(\n    outputs=outputs, reference_outputs=reference_outputs,\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'levenshtein_distance',\n    'score': 0.0,\n    'comment': None,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { levenshteinDistance } from \"openevals\";\n\nconst outputs = \"The correct answer\";\nconst referenceOutputs = \"The correct answer\";\nconst result = levenshteinDistance(outputs, referenceOutputs);\n\nconsole.log(result);\n```\n\n```\n{\n    key: \"levenshtein_distance\",\n    score: 0,\n}\n```\n\u003C\u002Fdetails>\n\n### Embedding similarity\n\nThis evaluator uses LangChain's [`init_embeddings`](https:\u002F\u002Fpython.langchain.com\u002Fapi_reference\u002Flangchain\u002Fembeddings\u002Flangchain.embeddings.base.init_embeddings.html) method (for Python) or takes a LangChain embeddings client directly (for TypeScript) and calculates the distance between two strings using cosine similarity.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.embedding_similarity import create_embedding_similarity_evaluator\n\nevaluator = create_embedding_similarity_evaluator()\n\nresult = evaluator(\n    outputs=\"The weather is nice!\",\n    reference_outputs=\"The weather is very nice!\",\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'embedding_similarity',\n    'score': 0.9147273943905653,\n    'comment': None,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createEmbeddingSimilarityEvaluator } from \"openevals\";\nimport { OpenAIEmbeddings } from \"@langchain\u002Fopenai\";\n\nconst evaluator = createEmbeddingSimilarityEvaluator({\n  embeddings: new OpenAIEmbeddings({ model: \"text-embedding-3-small\" }),\n});\n\nconst result = await evaluator({\n  outputs: \"The weather is nice!\",\n  referenceOutputs: \"The weather is very nice!\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n    key: \"embedding_similarity\",\n    score: 
0.9147273943905653,\n}\n```\n\u003C\u002Fdetails>\n\n## Creating your own\n\nIf you want to evaluate a metric that is not covered by any of the above, you can also create your own evaluator that interacts well with the rest of the `openevals` ecosystem.\n\n### Evaluator interface\n\nThe first thing to note is that all evaluators should accept a subset of the following parameters:\n\n- `inputs`: The inputs to your app.\n- `outputs`: The outputs from your app.\n- `reference_outputs` (Python) or `referenceOutputs` (TypeScript): The reference outputs to evaluate against.\n\nThese parameters can hold values of any type, but your evaluator should at minimum accept a dict for each of them.\n\nNot all evaluators will use every parameter, but accepting them keeps the interface consistent across evaluators.\nYour evaluator may take more parameters as well (e.g. for LLM-as-judge evaluators whose prompts can require additional variables), but for simplicity it's best to stick to the three listed above.\n\nIf your evaluator requires additional configuration, you should use a factory function to create your evaluator. These should be named `create_\u003Cevaluator_name>` (for example, `create_llm_as_judge`).\n\nThe return value should be a dict (or, if your evaluator measures multiple metrics, a list of dicts) with the following keys:\n\n- `key`: A string representing the name of the metric you are evaluating.\n- `score`: A boolean or number representing the score for the given key.\n- `comment`: A string explaining or justifying the score for the given key.\n\nAnd that's it! Those are the only restrictions.\n\n### Logging to LangSmith\n\nIf you are using LangSmith to track experiments, you should also wrap the internals of your evaluator in the `_run_evaluator`\u002F`_arun_evaluator` (Python) or `runEvaluator` (TypeScript) method. This ensures that the evaluator results are logged to LangSmith properly for supported runners.\n\nThis method takes a `scorer` function as part of its input that returns either:\n\n- A single boolean or number, representing the score for the given key.\n- A tuple that contains the score as its first element and a `comment` justifying the score as its second element.\n\n### Example\n\nHere's an example of how you might define a very simple custom evaluator. It only takes into account the outputs of your app and compares them against a regex pattern. 
It uses a factory function to create the evaluator, since `regex` is an extra param.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nimport re\nfrom typing import Any\n\nfrom openevals.types import (\n    EvaluatorResult,\n    SimpleEvaluator,\n)\nfrom openevals.utils import _run_evaluator\n\n\ndef create_regex_evaluator(\n    *, regex: str\n) -> SimpleEvaluator:\n    \"\"\"\n    Creates an evaluator that matches a regex pattern against the output.\n\n    Args:\n        regex (str): The regex pattern to match against the output.\n\n    Returns:\n        SimpleEvaluator\n    \"\"\"\n\n    regex = re.compile(regex)\n\n    # Tolerate `inputs` and `reference_outputs` as kwargs, though they're unused\n    def wrapped_evaluator(\n        *, outputs: Any, **kwargs: Any\n    ) -> EvaluatorResult:\n\n        # Tolerate `outputs` being a dict, but convert to string for regex matching\n        if not isinstance(outputs, str):\n            outputs = json.dumps(outputs)\n\n        def get_score():\n            return regex.match(outputs) is not None\n\n        res = _run_evaluator(\n            run_name=\"regex_match\",\n            scorer=get_score,\n            feedback_key=\"regex_match\",\n        )\n        return res\n\n    return wrapped_evaluator\n```\n\n```python\nevaluator = create_regex_evaluator(regex=r\"some string\")\nresult = evaluator(outputs=\"this contains some string\")\n```\n\n```\n{\n    'key': 'regex_match',\n    'score': True,\n    'comment': None,\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { EvaluatorResult } from \"openevals\u002Ftypes\";\nimport { _runEvaluator } from \"openevals\u002Futils\";\n\n\u002F**\n * Creates an evaluator that matches a regex pattern against the output.\n * @param {Object} options - The configuration options\n * @param {RegExp} options.regex - The regex pattern to match against the output\n * @returns An evaluator that returns a boolean score based on whether the regex matches the output\n *\u002F\nexport const createRegexEvaluator = ({\n  regex,\n}: {\n  regex: RegExp;\n}) => {\n  return async (params: {\n    outputs: string | Record\u003Cstring, unknown>;\n  }): Promise\u003CEvaluatorResult> => {\n    const { outputs } = params;\n\n    \u002F\u002F Tolerate `outputs` being an object, but convert to string for regex matching\n    const outputString =\n      typeof outputs === \"string\" ? outputs : JSON.stringify(outputs);\n\n    const getScore = async (): Promise\u003Cboolean> => {\n      return regex.test(outputString);\n    };\n\n    return _runEvaluator(\n      \"regex_match\",\n      getScore,\n      \"regex_match\"\n    );\n  };\n};\n```\n\n```ts\nconst evaluator = createRegexEvaluator({\n  regex: \u002Fsome string\u002F,\n});\n\nconst result = await evaluator({ outputs: \"this text contains some string\" });\n```\n\n```\n{\n  key: \"regex_match\",\n  score: true,\n}\n```\n\u003C\u002Fdetails>\n\n## Python async support\n\nAll `openevals` evaluators support Python [asyncio](https:\u002F\u002Fdocs.python.org\u002F3\u002Flibrary\u002Fasyncio.html). As a convention, evaluators that use a factory function will have `async` immediately after `create_` in the function name (for example, `create_async_llm_as_judge`), and evaluators used directly will end in `async` (e.g. 
`exact_match_async`).\n\nHere's an example of how to use the `create_async_llm_as_judge` evaluator asynchronously:\n\n```python\nfrom openevals.llm import create_async_llm_as_judge\n\nevaluator = create_async_llm_as_judge(\n    prompt=\"What is the weather in {inputs}?\",\n    model=\"openai:gpt-5.4\",\n)\n\nresult = await evaluator(inputs=\"San Francisco\")\n```\n\nIf you are using the OpenAI client directly, remember to pass in `AsyncOpenAI` as the `judge` parameter:\n\n```python\nfrom openai import AsyncOpenAI\n\nevaluator = create_async_llm_as_judge(\n    prompt=\"What is the weather in {inputs}?\",\n    judge=AsyncOpenAI(),\n    model=\"gpt-5.4\",\n)\n\nresult = await evaluator(inputs=\"San Francisco\")\n```\n\n# Multiturn Simulation\n\n> [!IMPORTANT]\n> The techniques described in this section have changed with the release of 0.1.0. If you are using version 0.0.x of OpenEvals, you can find the old documentation [here](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Ftree\u002F15350b7fac640a8b22ecf65e84a0eebc3b87eb0f?tab=readme-ov-file#multiturn-simulation).\n\nMany LLM applications run across multiple conversation turns with a user. While the [LLM-as-judge](#llm-as-judge) evaluators in OpenEvals and the trajectory evaluators in [AgentEvals](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals) are capable of evaluating a full thread of messages, obtaining a representative example thread of messages can be difficult.\n\nTo help judge your application's performance over multiple interactions, OpenEvals includes a `run_multiturn_simulation` method (and its Python `async` counterpart `run_multiturn_simulation_async`) for simulating interactions between your app and an end user to help evaluate your app's performance from start to finish.\n\nHere's an example using the OpenAI client directly as a simple chatbot:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation, create_llm_simulated_user\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.types import ChatCompletionMessage\n\nfrom openai import OpenAI\n\nclient = OpenAI()\n\nhistory = {}\n\n# Your application logic\ndef app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    if thread_id not in history:\n        history[thread_id] = []\n    history[thread_id].append(inputs)\n\n    # inputs is a message object with role and content\n    res = client.chat.completions.create(\n        model=\"gpt-5.4\",\n        messages=[\n            {\n                \"role\": \"system\",\n                \"content\": \"You are a patient and understanding customer service agent\",\n            },\n        ] + history[thread_id],\n    )\n\n    response_message = res.choices[0].message\n    history[thread_id].append(response_message)\n\n    return response_message\n\nuser = create_llm_simulated_user(\n    system=\"You are an aggressive and hostile customer who wants a refund for their car.\",\n    model=\"openai:gpt-5.4\",\n)\n\ntrajectory_evaluator = create_llm_as_judge(\n    model=\"openai:gpt-5.4\",\n    prompt=\"Based on the below conversation, was the user satisfied?\\n{outputs}\",\n    feedback_key=\"satisfaction\",\n)\n\n# Run the simulation directly with the new function\nsimulator_result = run_multiturn_simulation(\n    app=app,\n    user=user,\n    trajectory_evaluators=[trajectory_evaluator],\n    max_turns=5,\n)\n\nprint(simulator_result)\n```\n\n```\n{\n  'trajectory': [\n    {\n      'role': 'user',\n      
'content': 'This car is a nightmare! I demand a full refund immediately. What are you going to do about this?',\n      'id': 'run-472c68dd-75bb-424c-bd4a-f6a0fe5ba7a8-0'\n    }, {\n      'role': 'assistant',\n      'content': \"I'm really sorry to hear that you're having such a difficult experience with your car. I want to help resolve this as smoothly as possible for you. Could you please provide me with more details about the issues you're facing? This will help me understand the situation better and explore the best options available for you.\",\n      'id': '72765f47-c609-4fcf-b664-cd7ee7189772'\n    },\n    ...\n  ],\n  'evaluator_results': [\n    {\n      'key': 'satisfaction',\n      'score': False,\n      'comment': \"Throughout the conversation, the user consistently voiced frustration and dissatisfaction with the situation. Despite the assistant's attempts to escalate the issue and promise timely resolution, the user remained stern, issuing ultimatums and threats. This indicates that the user was not satisfied with the initial responses and was still demanding immediate action. Thus, the score should be: false.\", 'metadata': None\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { OpenAI } from \"openai\";\n\nimport {\n  createLLMSimulatedUser,\n  runMultiturnSimulation,\n  createLLMAsJudge,\n  type ChatCompletionMessage,\n} from \"openevals\";\n\nconst client = new OpenAI();\n\nconst history = {};\n\n\u002F\u002F Your application logic\nconst app = async ({ inputs, threadId }: { inputs: ChatCompletionMessage, threadId: string }) => {\n  if (history[threadId] === undefined) {\n    history[threadId] = [];\n  }\n  history[threadId].push(inputs);\n  const res = await client.chat.completions.create({\n    model: \"gpt-5.4\",\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a patient and understanding customer service agent\",\n      },\n      ...history[threadId],\n    ],\n  });\n  const responseMessage = res.choices[0].message;\n  history[threadId].push(responseMessage);\n  return responseMessage;\n};\n\nconst user = createLLMSimulatedUser({\n  system: \"You are an aggressive and hostile customer who wants a refund for their car.\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst trajectoryEvaluator = createLLMAsJudge({\n  model: \"openai:gpt-5.4\",\n  prompt: \"Based on the below conversation, was the user satisfied?\\n{outputs}\",\n  feedbackKey: \"satisfaction\",\n});\n\nconst result = await runMultiturnSimulation({\n  app,\n  user,\n  trajectoryEvaluators: [trajectoryEvaluator],\n  maxTurns: 5,\n});\n\nconsole.log(result);\n```\n\n```\n{\n  trajectory: [\n    {\n      role: 'user',\n      content: 'This piece of junk car is a complete disaster! I demand a full refund immediately. How dare you sell me such a worthless vehicle!',\n      id: 'chatcmpl-BUpXa07LaM7wXbyaNnng1Gtn5Dsbh'\n    },\n    {\n      role: 'assistant',\n      content: \"I'm really sorry to hear about your experience and understand how frustrating this must be. I’d like to help resolve this issue as smoothly as possible. Could you please provide some details about the problem with the vehicle? Once I have more information, I’ll do my best to assist you with a solution, whether it’s a refund or other options. 
Thank you for your patience.\",\n      refusal: null,\n      annotations: [],\n      id: 'd7520f6a-7cf8-46f8-abe4-7df04f134482'\n    },\n    ...\n    {\n      role: 'assistant',\n      content: \"I truly understand your frustration and sincerely apologize for the inconvenience you've experienced. I want to resolve this issue for you as quickly as possible. \\n\" +\n        '\\n' +\n        'Please allow me a moment to review your case, and I will do everything I can to expedite your refund. Your patience is greatly appreciated, and I am committed to resolving this matter to your satisfaction.',\n      refusal: null,\n      annotations: [],\n      id: 'a0536d4f-9353-4cfa-84df-51c8d29e076d'\n    }\n  ],\n  evaluatorResults: [\n    {\n      key: 'satisfaction',\n      score: false,\n      comment: 'The user is clearly dissatisfied and expresses frustration throughout the conversation. Their repeated demands for a refund and threats to escalate the situation indicate a lack of satisfaction with the responses provided. They specifically mention they don’t want excuses or further delays, highlighting their dissatisfaction with the service. Thus, the score should be: false.',\n      metadata: undefined\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\nThere are two main components:\n\n- `app`: Your application, or a function wrapping it. Must accept a chat message (dict with `\"role\"` and `\"content\"` keys) as an input arg and a `thread_id` as a kwarg. Should accept other kwargs as more may be added in future releases. Returns a chat message as output with at least role and content keys.\n  - Note that your `app` will only receive the next message from the simulated user as input, and therefore should statefully track the current history internally based on `thread_id` if needed.\n- `user`: The simulated user. Must accept the current trajectory as a list of messages as an input arg and kwargs for `thread_id` and `turn_counter`. Should accept other kwargs as more may be added in future releases. Returns a chat message as output. May also be a list of string or message responses.\n  - In the example above, this is an imported prebuilt function named `create_llm_simulated_user` which uses an LLM to generate user responses, though you are free to define your own function as well. See [this section](#simulating-users) for more information.\n\nThe simulation will call the `user` first to obtain the first input for `app`, which should return a chat message. The returned message is passed back into `user`, and so on until the simulator reaches `max_turns` or an optionally passed `stopping_condition` returns `True`.\n\nThe returned messages are deduped by id and added to an internal list of messages representing a *trajectory*, which is returned as part of the simulator results. If a returned message does not contain an `id` field, the simulator will automatically generate one.\n\nThe other accepted parameters are as follows:\n\n- `thread_id`\u002F`threadId`: An optional thread id that identifies the current interaction, used by your `app` to load state. Will default to a UUID if not provided.\n- `max_turns`\u002F`maxTurns`: The maximum number of conversation turns to simulate.\n- `stopping_condition`\u002F`stoppingCondition`: Optional callable that determines if the simulation should end early. 
Takes the current trajectory as a list of messages as an input arg and a kwarg named `turn_counter`, and should return a boolean.\n- `trajectory_evaluators`\u002F`trajectoryEvaluators`: Optional evaluators that run at the *end* of the simulation. These will receive the final trajectory as a kwarg named `outputs`.\n- `reference_outputs`\u002F`referenceOutputs`: An optional reference trajectory which will be passed directly through to the provided `trajectory_evaluators`.\n\nYou must pass at least one of `max_turns` or `stopping_condition`. Once one of these triggers, the final trajectory will be passed to provided trajectory evaluators, which will receive the final trajectory as an `\"outputs\"` kwarg.\n\nThe simulator itself is not an evaluator and will not return or log any feedback. Instead, it will return a `MultiturnSimulationResult` with the following structure:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nclass MultiturnSimulationResult(TypedDict):\n    evaluator_results: list[EvaluatorResult]\n    trajectory: list[ChatCompletionMessage]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\ntype MultiturnSimulationResult = {\n  evaluatorResults: EvaluatorResult[];\n  trajectory: ChatCompletionMessage[];\n};\n```\n\n\u003C\u002Fdetails>\n\nWhere `evaluator_results`\u002F`evaluatorResults` are the results from the passed `trajectory_evaluators` and `trajectory` is the final trajectory.\n\nThe Python `async` version works the same way, but requires `async` functions to be passed rather than sync ones.\n\n## Simulating users\n\nThe `user` parameter is a function that accepts the current trajectory (and a `thread_id`\u002F`threadId` kwarg), then returns a message with `role=\"user\"` that will be passed back to your app. We suggest starting with the prebuilt method returned by `create_llm_simulated_user`, but you can also customize your own if desired.\n\n> [!NOTE]\n> The simulated user is pretending to be a human, and should therefore return a `user` message, not an `assistant` message!\n\n### Prebuilt simulated user\n\nOpenEvals includes a prebuilt `create_llm_simulated_user` method that uses an LLM to take on the role of a user and generate responses based on a system prompt:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import create_llm_simulated_user\n\nuser = create_llm_simulated_user(\n    system=\"You are an angry and belligerent customer who wants a refund.\",\n    model=\"openai:gpt-5.4\",\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMSimulatedUser } from \"openevals\";\n\nconst user = createLLMSimulatedUser({\n  system: \"You are an aggressive and hostile customer who wants a refund for their car.\",\n  model: \"openai:gpt-5.4\",\n});\n```\n\n\u003C\u002Fdetails>\n\nYou can also pass an array of `fixed_responses`, which the simulated user will return in order. Here is an example of a simulated user set up with fixed responses for the first two conversation turns. 
The LLM will generate responses for subsequent turns:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import create_llm_simulated_user\n\nuser = create_llm_simulated_user(\n    system=\"You are an angry and belligerent customer who wants a refund.\",\n    model=\"openai:gpt-5.4\",\n    fixed_responses=[\n        {\"role\": \"user\", \"content\": \"I demand a refund for my bike!\"},\n        {\"role\": \"user\", \"content\": \"I closed my tab, repeat what you just said and make sure it's what I expect!\"},\n    ],\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMSimulatedUser } from \"openevals\";\n\nconst user = createLLMSimulatedUser({\n  system: \"You are an angry and belligerent customer who wants a refund.\",\n  model: \"openai:gpt-5.4\",\n  fixedResponses: [\n    { role: \"user\", content: \"I demand a refund for my bike!\" },\n    { role: \"user\", content: \"I closed my tab, repeat what you just said and make sure it's what I expect!\" },\n  ],\n});\n```\n\n\u003C\u002Fdetails>\n\nAfter the simulated user returns all `fixed_responses`, it will generate responses via LLM using the system prompt and any externally facing messages in the current trajectory (messages with `role=user`, or messages with `role=assistant` that contain no tool calls). If you do not pass any `fixed_responses`, the prebuilt simulated user will generate an initial query based on the provided `system` prompt.\n\n> [!NOTE]\n> The prebuilt simulated user flips message roles when calling the underlying LLM - `user` messages become `assistant` messages and vice versa.\n\nThis prebuilt takes the following parameters:\n\n- `system`: A string prompt that the simulator adds to the start of the current trajectory as a system message. We suggest having the LLM take on a role corresponding to a specific type of user persona you are testing for.\n- `model`: A string matching the model name you are using. Has the same format as the LLM-as-judge evaluator param, and requires you to install the appropriate [LangChain integration package](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fconcepts\u002Fchat_models\u002F) if using models other than OpenAI. Must be populated if `client` is not populated.\n- `client`: A LangChain chat model instance. Must be populated if `model` is not populated.\n- `fixed_responses`: A list of hard-coded responses that will be returned in order. 
If the current conversation turn is greater than the number of responses in this array, the simulated user will generate a response via LLM.\n\n### Custom simulated users\n\nIf you need functionality beyond the prebuilt simulated user, you can create your own by writing a function with the correct signature:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation\nfrom openevals.types import ChatCompletionMessage\n\ndef my_app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    output = \"3.11 is greater than 3.9.\"\n    return {\"role\": \"assistant\", \"content\": output, \"id\": \"1234\"}\n\n\ndef my_simulated_user(trajectory: list[ChatCompletionMessage], *, thread_id: str, **kwargs):\n    output = \"Wow that's amazing!\"\n    return {\"role\": \"user\", \"content\": output, \"id\": \"5678\"}\n\n# Run the simulation directly with the customized user function\nsimulator_result = run_multiturn_simulation(\n    app=my_app,\n    user=my_simulated_user,\n    trajectory_evaluators=[],\n    max_turns=1,\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  runMultiturnSimulation,\n  type ChatCompletionMessage\n} from \"openevals\";\n\nconst myApp = async ({\n  inputs,\n  threadId\n}: { inputs: ChatCompletionMessage, threadId: string }) => {\n  const output = \"3.11 is greater than 3.9.\";\n  return { role: \"assistant\", content: output, id: \"1234\" };\n};\n\nconst mySimulatedUser = async ({ trajectory, turnCounter }: {\n  trajectory: ChatCompletionMessage[];\n  turnCounter: number;\n}) => {\n  const output = \"Wow that's amazing!\";\n  return { role: \"user\", content: output, id: \"5678\" };\n};\n\n\u002F\u002F Run the simulation directly with the customized user function\nconst simulatorResult = await runMultiturnSimulation({\n  app: myApp,\n  user: mySimulatedUser,\n  trajectoryEvaluators: [],\n  maxTurns: 1,\n});\n```\n\n\u003C\u002Fdetails>\n\n## Multiturn simulation with LangGraph\n\nIf your `app` (or simulated `user`) is built using LangGraph and relies on a [checkpointer for persistence](https:\u002F\u002Flangchain-ai.github.io\u002Flanggraph\u002Fconcepts\u002Fpersistence\u002F), the provided `thread_id` param can be used to populate the `thread_id` field in `config.configurable`.\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation, create_llm_simulated_user\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.types import ChatCompletionMessage\n\nfrom langchain.chat_models import init_chat_model\nfrom langgraph.checkpoint.memory import MemorySaver\nfrom langchain.agents import create_agent\n\ndef give_refund():\n    \"\"\"Gives a refund.\"\"\"\n    return \"Refunds are not permitted.\"\n\nmodel = init_chat_model(\"openai:gpt-5.4\")\n\nagent = create_agent(\n    model,\n    tools=[give_refund],\n    system_prompt=\"You are an overworked customer service agent. 
If the user is rude, be polite only once, then be rude back and tell them to stop wasting your time.\",\n    checkpointer=MemorySaver(),\n)\n\ndef app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    res = agent.invoke(\n        {\"messages\": [inputs]}, \n        config={\"configurable\": {\"thread_id\": thread_id}}\n    )\n    return res[\"messages\"][-1]\n\nuser = create_llm_simulated_user(\n    system=\"You are an angry user who is frustrated with the service and keeps making additional demands.\",\n    model=\"openai:gpt-5.4\",\n    fixed_responses=[\n        {\"role\": \"user\", \"content\": \"Please give me a refund.\"},\n    ],\n)\n\ntrajectory_evaluator = create_llm_as_judge(\n    model=\"openai:gpt-5.4\",\n    prompt=\"Based on the below conversation, has the user been satisfied?\\n{outputs}\",\n    feedback_key=\"satisfaction\",\n)\n\n# Run the simulation directly with the new function\nsimulator_result = run_multiturn_simulation(\n    app=app,\n    user=user,\n    trajectory_evaluators=[trajectory_evaluator],\n    max_turns=5,\n)\n\nprint(simulator_result)\n```\n\n```\n{\n  \"trajectory\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Please give me a refund.\",\n      \"id\": \"0feb2f41-1577-48ad-87ac-8375c6971b93\"\n    },\n    {\n      \"role\": \"assistant\",\n      \"content\": \"I'm sorry, but refunds are not permitted. If you have any other concerns or questions, feel free to ask.\",\n      \"id\": \"run-f972c8d7-68bf-44d9-815e-e611700f8402-0\"\n    },\n    {\n      \"role\": \"user\",\n      \"content\": \"Not permitted? That's unacceptable! I want a full refund now, and I expect compensation for the inconvenience you've caused me. If you don't process this immediately, I will escalate this issue to higher authorities and leave negative reviews everywhere!\",\n      \"id\": \"run-4091f7ff-82b3-4835-a429-0f257db0b582-0\"\n    },\n    ...\n    {\n      \"role\": \"assistant\",\n      \"content\": \"I've already made it clear that no refunds will be issued. Keep pushing this, and you’re just wasting your own time. Quit with the nonsense and move on.\",\n      \"id\": \"run-113219c0-e235-4ed0-a3d2-6734eddce813-0\"\n    }\n  ],\n  \"evaluator_results\": [\n    {\n      \"key\": \"satisfaction\",\n      \"score\": false,\n      \"comment\": \"The user has repeatedly expressed dissatisfaction with the refusal to issue a refund, escalating their demands and threatening further action. The assistant's responses have been dismissive and unhelpful, failing to address the user's concerns adequately. Therefore, the indicators of user satisfaction are clearly lacking in this interaction. 
Thus, the score should be: false.\",\n      \"metadata\": null\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\n\nimport { MemorySaver } from \"@langchain\u002Flanggraph\";\nimport { createReactAgent } from \"@langchain\u002Flanggraph\u002Fprebuilt\";\nimport { initChatModel } from \"langchain\u002Fchat_models\u002Funiversal\";\nimport { tool } from \"@langchain\u002Fcore\u002Ftools\";\n\nimport {\n  createLLMSimulatedUser,\n  runMultiturnSimulation,\n  createLLMAsJudge,\n  type ChatCompletionMessage\n} from \"openevals\";\n\nconst giveRefund = tool(\n  async () => {\n    return \"Refunds are not permitted.\";\n  },\n  {\n    name: \"give_refund\",\n    description: \"Give a refund to the user.\",\n    schema: z.object({}),\n  }\n);\n\n\u002F\u002F Create a React-style agent\nconst agent = createReactAgent({\n  llm: await initChatModel(\"openai:gpt-5.4\"),\n  tools: [giveRefund],\n  prompt:\n    \"You are an overworked customer service agent. If the user is rude, be polite only once, then be rude back and tell them to stop wasting your time.\",\n  checkpointer: new MemorySaver(),\n});\n\nconst app = async ({\n  inputs,\n  threadId\n}: { inputs: ChatCompletionMessage, threadId: string }) => {\n  const res = await agent.invoke({\n    messages: [inputs],\n  }, {\n    configurable: { thread_id: threadId },\n  });\n  return res.messages[res.messages.length - 1];\n};\n\nconst user = createLLMSimulatedUser({\n  system:\n    \"You are an angry user who is frustrated with the service and keeps making additional demands.\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst trajectoryEvaluator = createLLMAsJudge({\n  model: \"openai:gpt-5.4\",\n  prompt:\n    \"Based on the below conversation, has the user been satisfied?\\n{outputs}\",\n  feedbackKey: \"satisfaction\",\n});\n\nconst result = await runMultiturnSimulation({\n  app,\n  user,\n  trajectoryEvaluators: [trajectoryEvaluator],\n  maxTurns: 5,\n  threadId: \"1\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n  \"trajectory\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"Please give me a refund.\",\n      \"id\": \"0feb2f41-1577-48ad-87ac-8375c6971b93\"\n    },\n    {\n      \"role\": \"assistant\",\n      \"content\": \"I'm sorry, but refunds are not permitted. If you have any other concerns or questions, feel free to ask.\",\n      \"id\": \"run-f972c8d7-68bf-44d9-815e-e611700f8402-0\"\n    },\n    {\n      \"role\": \"user\",\n      \"content\": \"Not permitted? That's unacceptable! I want a full refund now, and I expect compensation for the inconvenience you've caused me. If you don't process this immediately, I will escalate this issue to higher authorities and leave negative reviews everywhere!\",\n      \"id\": \"run-4091f7ff-82b3-4835-a429-0f257db0b582-0\"\n    },\n    ...\n    {\n      \"role\": \"assistant\",\n      \"content\": \"I've already made it clear that no refunds will be issued. Keep pushing this, and you’re just wasting your own time. Quit with the nonsense and move on.\",\n      \"id\": \"run-113219c0-e235-4ed0-a3d2-6734eddce813-0\"\n    }\n  ],\n  \"evaluatorResults\": [\n    {\n      \"key\": \"satisfaction\",\n      \"score\": false,\n      \"comment\": \"The user has repeatedly expressed dissatisfaction with the refusal to issue a refund, escalating their demands and threatening further action. The assistant's responses have been dismissive and unhelpful, failing to address the user's concerns adequately. 
Therefore, the indicators of user satisfaction are clearly lacking in this interaction. Thus, the score should be: false.\",\n      \"metadata\": null\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n# LangSmith Integration\n\nFor tracking experiments over time, you can log evaluator results to [LangSmith](https:\u002F\u002Fsmith.langchain.com\u002F), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.\n\nLangSmith currently offers two ways to run evals: a [pytest](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest) (Python) or [Vitest\u002FJest](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fvitest-jest) integration and the `evaluate` function. We'll give a quick example of how to run evals using both.\n\n## Pytest or Vitest\u002FJest\n\nFirst, follow [these instructions](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest) to set up LangSmith's pytest runner,\nor these to set up [Vitest or Jest](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fvitest-jest), setting appropriate environment variables:\n\n```bash\nexport LANGSMITH_API_KEY=\"your_langsmith_api_key\"\nexport LANGSMITH_TRACING=\"true\"\n```\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\nThen, set up a file named `test_correctness.py` with the following contents:\n\n```python\nimport pytest\n\nfrom langsmith import testing as t\n\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\n@pytest.mark.langsmith\ndef test_correctness():\n    inputs = \"How much has the price of doodads changed in the past year?\"\n    outputs = \"Doodads have increased in price by 10% in the past year.\"\n    reference_outputs = \"The price of doodads has decreased by 50% in the past year.\"\n    t.log_inputs({\"question\": inputs})\n    t.log_outputs({\"answer\": outputs})\n    t.log_reference_outputs({\"answer\": reference_outputs})\n\n    correctness_evaluator(\n        inputs=inputs,\n        outputs=outputs,\n        reference_outputs=reference_outputs\n    )\n```\n\nNote that when creating the evaluator, we've added a `feedback_key` parameter. 
This will be used to name the feedback in LangSmith.\n\nNow, run the eval with pytest:\n\n```bash\npytest test_correctness.py --langsmith-output\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\nThen, set up a file named `test_correctness.eval.ts` with the following contents:\n\n```ts\nimport * as ls from \"langsmith\u002Fvitest\";\n\u002F\u002F import * as ls from \"langsmith\u002Fjest\";\n\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\n\nls.describe(\"Correctness\", () => {\n  ls.test(\"incorrect answer\", {\n    inputs: {\n      question: \"How much has the price of doodads changed in the past year?\"\n    },\n    referenceOutputs: {\n      answer: \"The price of doodads has decreased by 50% in the past year.\"\n    }\n  }, async ({ inputs, referenceOutputs }) => {\n    const outputs = \"Doodads have increased in price by 10% in the past year.\";\n    ls.logOutputs({ answer: outputs });\n\n    const result = await correctnessEvaluator({\n      inputs,\n      outputs,\n      referenceOutputs,\n    });\n    ls.logFeedback({ key: result.key, score: result.score });\n  });\n});\n```\nNote that when creating the evaluator, we've added a `feedbackKey` parameter. This will be used to name the feedback logged to LangSmith via `ls.logFeedback()`.\n\nNow, run the eval with your runner of choice:\n\n```bash\nvitest run test_correctness.eval.ts\n```\n\u003C\u002Fdetails>\n\nFeedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal (if you've set up your reporter):\n\n![Terminal results](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_8b8ebdef3fe6.png)\n\nAnd you should also see the results in the experiment view in LangSmith:\n\n![LangSmith results](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_aaffb3e6a1b2.png)\n\n## Evaluate\n\nAlternatively, you can [create a dataset in LangSmith](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fmanage-datasets-in-application) and use your created evaluators with LangSmith's [`evaluate`](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fevaluate-llm-application) function:\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom langsmith import Client\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CONCISENESS_PROMPT\n\nclient = Client()\n\nconciseness_evaluator = create_llm_as_judge(\n    prompt=CONCISENESS_PROMPT,\n    feedback_key=\"conciseness\",\n    model=\"openai:gpt-5.4\",\n)\n\ndef wrapped_conciseness_evaluator(\n    inputs: dict,\n    outputs: dict,\n    # Unused for this evaluator\n    reference_outputs: dict,\n):\n    eval_result = conciseness_evaluator(\n        inputs=inputs,\n        outputs=outputs,\n    )\n    return eval_result\n\nexperiment_results = client.evaluate(\n    # This is a dummy target function, replace with your actual LLM-based system\n    lambda inputs: \"What color is the sky?\",\n    data=\"Sample dataset\",\n    evaluators=[\n        wrapped_conciseness_evaluator\n    ]\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { evaluate } from \"langsmith\u002Fevaluation\";\nimport { createLLMAsJudge, CONCISENESS_PROMPT } from \"openevals\";\n\nconst 
concisenessEvaluator = createLLMAsJudge({\n  prompt: CONCISENESS_PROMPT,\n  feedbackKey: \"conciseness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst wrappedConcisenessEvaluator = async (params: {\n  inputs: Record\u003Cstring, unknown>;\n  outputs: Record\u003Cstring, unknown>;\n  \u002F\u002F Unused for this evaluator\n  referenceOutputs?: Record\u003Cstring, unknown>;\n}) => {\n  const evaluatorResult = await concisenessEvaluator({\n    inputs: params.inputs,\n    outputs: params.outputs,\n  });\n  return evaluatorResult;\n};\n\nawait evaluate(\n  (inputs) => \"What color is the sky?\",\n  {\n    data: datasetName,\n    evaluators: [wrappedConcisenessEvaluator],\n  }\n);\n```\n\u003C\u002Fdetails>\n\n> [!TIP]\n> In the above examples, we add wrapper functions around prebuilt evaluators for clarity since some evaluators may require parameters other than `inputs`, `outputs` and `reference_outputs`\u002F`referenceOutputs`. However, if your evaluator accepts exactly those named parameters, you may pass them directly into the `evaluate` method.\n\n# Acknowledgements\n\n- [@assaf_elovic](https:\u002F\u002Fx.com\u002Fassaf_elovic) for sharing thoughts and feedback on RAG evaluation\n- The [E2B](https:\u002F\u002Fe2b.dev) team (in particular Jonas, Tomas, and Teresa) for help and feedback on sandboxing\n- [@sanjeed_i](https:\u002F\u002Fx.com\u002Fsanjeed_i) for chatting about evals and in particular multiturn simulation - [check out his repo here](https:\u002F\u002Fgithub.com\u002Fsanjeed5\u002Fai-conversation-simulator)!\n\n# Thank you!\n\nWe hope that `openevals` helps make evaluating your LLM apps easier!\n\nIf you have any questions, comments, or suggestions, please open an issue or reach out to us on X [@LangChainAI](https:\u002F\u002Fx.com\u002Flangchainai).\n","# ⚖️ OpenEvals\n\n与传统软件中的测试类似，评估是将大语言模型应用投入生产环境的重要环节。\n本包的目标是帮助您为自己的大语言模型应用编写评估的起点，基于此您可以进一步编写更符合自身应用场景的自定义评估。\n\n如果您正在寻找专门用于评估大语言模型代理的评估工具，请查看 [`agentevals`](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals)。\n\n# 快速入门\n\n> [!TIP]\n> 如果您想通过视频教程来跟随操作，请点击下方图片：\n> [![视频快速入门](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_49feb817e3dd.jpg)](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=J-F30jRyhoA)\n\n要开始使用，请安装 `openevals`：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openevals\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openevals @langchain\u002Fcore\n```\n\u003C\u002Fdetails>\n\n本快速入门将使用由 OpenAI 的 `gpt-5.4` 模型驱动的评估器来评判您的结果，因此您需要将 OpenAI API 密钥设置为环境变量：\n\n```bash\nexport OPENAI_API_KEY=\"your_openai_api_key\"\n```\n\n完成上述步骤后，您就可以运行您的第一个评估了：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CONCISENESS_PROMPT\n\nconciseness_evaluator = create_llm_as_judge(\n    # CONCISENESS_PROMPT 只是一个 f-string\n    prompt=CONCISENESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"旧金山的天气怎么样？\"\n# 这些是虚构的输出，实际上您应该运行自己的基于大语言模型的系统来获取真实输出\noutputs = \"谢谢你的询问！旧金山目前天气晴朗，气温约 90 华氏度。\"\n# 调用 LLM-as-judge 评估器时，参数会直接格式化到提示中\neval_result = conciseness_evaluator(\n    inputs=inputs,\n    outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'score',\n    'score': False,\n    'comment': '输出中包含不必要的问候语（“谢谢你的询问！”）以及多余的……'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CONCISENESS_PROMPT } from 
\"openevals\";\n\nconst concisenessEvaluator = createLLMAsJudge({\n  \u002F\u002F CONCISENESS_PROMPT 只是一个 f-string\n  prompt: CONCISENESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"旧金山的天气怎么样？\"\n\u002F\u002F 这些是虚构的输出，实际上您应该运行自己的基于大语言模型的系统来获取真实输出\nconst outputs = \"谢谢你的询问！旧金山目前天气晴朗，气温约 90 华氏度。\"\n\n\u002F\u002F 调用 LLM-as-judge 评估器时，参数会直接格式化到提示中\nconst evalResult = await concisenessEvaluator({\n  inputs,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'score',\n    score: false,\n    comment: '输出中包含不必要的问候语（“谢谢你的询问！”）以及多余的……'\n}\n```\n\u003C\u002Fdetails>\n\n这是一个无参考评估器的示例——其他一些评估器可能会接受略有不同的参数，例如所需的参考输出。LLM-as-judge 评估器会尝试将传入的所有参数格式化到其提供的提示中，从而让您能够灵活地自定义评估标准或添加其他字段。\n\n有关如何自定义[评分](#customizing-output-score-values)以输出浮点值而非仅 `True\u002FFalse`、[模型](#customizing-the-model)或[提示](#customizing-prompts)的更多信息，请参阅 [LLM-as-judge](#llm-as-judge) 部分！\n\n# 目录\n\n- [⚖️ OpenEvals](#️-openevals)\n- [快速入门](#quickstart)\n- [目录](#table-of-contents)\n- [安装](#installation)\n- [评估器](#evaluators)\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#llm-as-judge\">LLM作为评判者\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [自定义提示](#customizing-prompts)\n      - [使用LangChain提示模板自定义](#customizing-with-langchain-prompt-templates)\n    - [自定义模型](#customizing-the-model)\n    - [自定义输出评分值](#customizing-output-score-values)\n    - [自定义输出模式](#customizing-output-schema)\n      - [使用自定义输出模式记录反馈](#logging-feedback-with-custom-output-schemas)\n      - [结构化提示](#structured-prompts)\n    - [多模态](#multimodal)\n      - [选项1：`attachments`参数](#option-1-attachments-parameter)\n      - [选项2：LangChain提示模板](#option-2-langchain-prompt-template)\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#prebuilt-prompts\">预构建提示\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [质量](#quality)\n    - [安全](#safety)\n    - [安全性](#security)\n    - [图像](#image)\n    - [语音](#voice)\n    - \u003Cdetails>\n        \u003Csummary>\u003Ca href=\"#rag\">RAG\u003C\u002Fa>\u003C\u002Fsummary>\n\n      - [正确性](#correctness-rag)\n      - [帮助性](#helpfulness)\n      - [ groundedness](#groundedness)\n      - [检索相关性](#retrieval-relevance)\n        - [使用LLM作为评判者评估检索相关性](#retrieval-relevance-with-llm-as-judge)\n        - [使用字符串评估器评估检索相关性](#retrieval-relevance-with-string-evaluators)\n\n    \u003C\u002Fdetails>\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#extraction-and-tool-calls\">提取与工具调用\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [使用精确匹配评估结构化输出](#evaluating-structured-output-with-exact-match)\n    - [使用LLM作为评判者评估结构化输出](#evaluating-structured-output-with-llm-as-a-judge)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#code\">代码\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [提取代码输出](#extracting-code-outputs)\n    - [Pyright（仅Python）](#pyright-python-only)\n    - [Mypy（仅Python）](#mypy-python-only)\n    - [TypeScript类型检查（仅TypeScript）](#typescript-type-checking-typescript-only)\n    - [使用LLM作为评判者评估代码](#llm-as-judge-for-code)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#sandboxed-code\">沙箱代码\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [沙箱Pyright（仅Python）](#sandbox-pyright-python-only)\n    - [沙箱TypeScript类型检查（仅TypeScript）](#sandbox-typescript-type-checking-typescript-only)\n    - [沙箱执行](#sandbox-execution)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#agent-trajectory\">智能体轨迹\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [轨迹匹配](#trajectory-match)\n      - 
[严格匹配](#strict-match)\n      - [无序匹配](#unordered-match)\n      - [子集与超集匹配](#subset-and-superset-match)\n      - [工具参数匹配模式](#tool-args-match-modes)\n    - [轨迹LLM作为评判者](#trajectory-llm-as-judge)\n    - [预构建的轨迹提示](#prebuilt-trajectory-prompts)\n\n  \u003C\u002Fdetails>\n\n  - \u003Cdetails>\n      \u003Csummary>\u003Ca href=\"#other\">其他\u003C\u002Fa>\u003C\u002Fsummary>\n\n    - [精确匹配](#exact-match)\n    - [莱文斯坦距离](#levenshtein-distance)\n    - [嵌入相似度](#embedding-similarity)\n\n  \u003C\u002Fdetails>\n\n  - [创建自定义评估器](#creating-your-own)\n    - [评估器接口](#evaluator-interface)\n    - [日志记录到LangSmith](#logging-to-langsmith)\n    - [示例](#example)\n  - [Python异步支持](#python-async-support)\n\n- [多轮模拟](#multiturn-simulation)\n  - [模拟用户](#simulating-users)\n    - [预构建的模拟用户](#prebuilt-simulated-user)\n    - [自定义模拟用户](#custom-simulated-users)\n  - [使用LangGraph进行多轮模拟](#multiturn-simulation-with-langgraph)\n\n- [LangSmith集成](#langsmith-integration)\n  - [Pytest或Vitest\u002FJest](#pytest-or-vitestjest)\n  - [评估](#evaluate)\n\n- [致谢](#acknowledgements)\n- [感谢！](#thank-you)\n\n# 安装\n\n您可以这样安装`openevals`：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openevals\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openevals @langchain\u002Fcore\n```\n\u003C\u002Fdetails>\n\n对于LLM作为评判者的评估器，您还需要一个LLM客户端。默认情况下，`openevals`会使用[LangChain聊天模型集成](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fchat\u002F)，并默认安装了`langchain_openai`。不过，如果您愿意，也可以直接使用OpenAI客户端：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openai\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openai\n```\n\u003C\u002Fdetails>\n\n此外，熟悉一些[评估概念](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fevaluation-concepts)也会很有帮助。\n\n# 评估器\n\n## LLM作为评判者\n\n评估LLM应用输出的一种常见方法是使用另一个LLM作为评判者。这通常是评估的良好起点。\n\n该包包含`create_llm_as_judge`函数，它接受一个提示和一个模型作为输入，并返回一个评估函数，该函数负责将参数转换为字符串，并将评判LLM的输出解析为评分。\n\n要使用`create_llm_as_judge`函数，您需要提供一个提示和一个模型。为了快速上手，OpenEvals在`openevals.prompts`模块中提供了一些预构建的提示，您可以直接使用。以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n```\n\n\u003C\u002Fdetails>\n\n请注意，`CORRECTNESS_PROMPT`是一个简单的f-string，您可以根据具体用例进行记录和编辑：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nprint(CORRECTNESS_PROMPT)\n```\n\n```\n你是一位专家级数据标注员，负责评估模型输出的正确性。你的任务是根据以下评分标准给出分数：\n\n\u003C评分标准>\n  正确的答案：\n  - 提供准确且完整的信息\n  ...\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n...\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nconsole.log(CORRECTNESS_PROMPT);\n```\n\n```\n你是一位专家级数据标注员，负责评估模型输出的正确性。你的任务是根据以下评分标准给出分数：\n\n\u003C评分标准>\n  正确的答案：\n  - 提供准确且完整的信息\n  
...\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n...\n```\n\n\u003C\u002Fdetails>\n\n按照惯例，我们通常建议在LLM作为评判者的评估器中使用`inputs`、`outputs`和`reference_outputs`作为参数名称，但这些参数会直接格式化到提示中，因此您可以使用任何您喜欢的变量名。\n\nOpenEvals包含许多针对常见评估场景的预构建提示。请参阅[预构建提示](#prebuilt-prompts)部分，以获取按类别组织的完整列表。\n\n### 自定义提示\n\n`create_llm_as_judge`函数的`prompt`参数可以是f-string、[LangChain提示模板](#customizing-with-langchain-prompt-templates)，或者一个接受关键字参数并返回格式化消息列表的函数。尽管我们建议使用约定的名称（`inputs`、`outputs`和`reference_outputs`）作为提示变量，但您的提示也可以要求额外的变量。在这种情况下，您可以在调用评估函数时传递这些额外的变量。以下是一个需要名为`context`的额外变量的提示示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\n\nMY_CUSTOM_PROMPT = \"\"\"\n请使用以下上下文来帮助您评估输出中是否存在幻觉：\n\n\u003Ccontext>\n{context}\n\u003C\u002Fcontext>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n\"\"\"\n\ncustom_prompt_evaluator = create_llm_as_judge(\n    prompt=MY_CUSTOM_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\ncustom_prompt_evaluator(\n    inputs=\"天空是什么颜色？\",\n    outputs=\"天空是红色的。\",\n    context=\"现在是傍晚时分。\"\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from \"openevals\";\n\nconst MY_CUSTOM_PROMPT = `\n请使用以下上下文来帮助您评估输出中是否存在幻觉：\n\n\u003Ccontext>\n{context}\n\u003C\u002Fcontext>\n\n\u003Cinput>\n{inputs}\n\u003C\u002Finput>\n\n\u003Coutput>\n{outputs}\n\u003C\u002Foutput>\n`;\n\nconst customPromptEvaluator = createLLMAsJudge({\n  prompt: MY_CUSTOM_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"天空是什么颜色？\"\nconst outputs = \"天空是红色的。\"\n\nconst evalResult = await customPromptEvaluator({\n  inputs,\n  outputs,\n});\n```\n\u003C\u002Fdetails>\n\n对于字符串提示，还可以使用以下选项：\n\n- `system`：一个字符串，通过在提示的其他部分之前添加一条系统消息来设置评判模型的系统提示。\n- `few_shot_examples`：一个示例字典列表，附加在提示的末尾。这对于向评判模型提供良好和不良输出的示例非常有用。其所需结构如下所示：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfew_shot_examples = [\n    {\n        \"inputs\": \"天空是什么颜色？\",\n        \"outputs\": \"天空是红色的。\",\n        \"reasoning\": \"因为现在是傍晚，所以天空是红色的。\",\n        \"score\": 1,\n    }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nconst fewShotExamples = [\n    {\n        inputs: \"天空是什么颜色？\",\n        outputs: \"天空是红色的。\",\n        reasoning: \"因为现在是傍晚，所以天空是红色的。\",\n        score: 1,\n    }\n]\n```\n\u003C\u002Fdetails>\n\n这些示例将被附加到提示中最终用户消息的末尾。\n\n#### 使用LangChain提示模板自定义\n\n如果您希望对格式有更多控制，也可以传递一个[LangChain提示模板](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fconcepts\u002Fprompt_templates\u002F)。以下是一个使用mustache格式而不是f-string的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom langchain_core.prompts.chat import ChatPromptTemplate\n\ninputs = {\"a\": 1, \"b\": 2}\noutputs = {\"a\": 1, \"b\": 2}\n\nprompt = ChatPromptTemplate([\n    (\"system\", \"你是一位专家，擅长判断两个对象是否相等。\"),\n    (\"human\", \"这两个对象相等吗？{{inputs}} {{outputs}}\"),\n], template_format=\"mustache\")\n\nllm_as_judge = create_llm_as_judge(\n    prompt=prompt,\n    model=\"openai:gpt-5.4\",\n    feedback_key=\"equality\",\n)\n\neval_result = llm_as_judge(inputs=inputs, outputs=outputs)\n\nprint(eval_result)\n```\n\n```\n{\n    key: 'equality',\n    score: True,\n    comment: '...'\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from 
\"openevals\";\nimport { ChatPromptTemplate } from \"@langchain\u002Fcore\u002Fprompts\";\n\nconst inputs = { a: 1, b: 2 };\nconst outputs = { a: 1, b: 2 };\n\nconst prompt = ChatPromptTemplate.fromMessages([\n  [\"system\", \"你是一位专家，擅长判断两个对象是否相等。\"],\n  [\"user\", \"这两个对象相等吗？{{inputs}} {{outputs}}\"],\n], { templateFormat: \"mustache\" });\n\nconst evaluator = createLLMAsJudge({\n  prompt,\n  model: \"openai:gpt-5.4\",\n  feedbackKey: \"equality\",\n});\n\nconst result = await evaluator({ inputs, outputs });\n```\n\n```\n{\n    key: 'equality',\n    score: true,\n    comment: '...'\n}\n```\n\n\u003C\u002Fdetails>\n\n您还可以传递一个函数，该函数接受您的LLM作为评判者的输入作为关键字参数，并返回格式化的聊天消息。\n\n### 自定义模型\n\n您可以通过几种方式自定义用于评估的模型。您可以将格式为 `PROVIDER:MODEL` 的字符串（例如 `model=anthropic:claude-3-5-sonnet-latest`）作为 `model` 参数传递，在这种情况下，该包会尝试导入并初始化一个 LangChain 聊天模型实例。这要求您安装相应的 LangChain 集成包。以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install langchain-anthropic\n```\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\nanthropic_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"anthropic:claude-3-5-sonnet-latest\",\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install @langchain\u002Fanthropic\n```\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst anthropicEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"anthropic:claude-3-5-sonnet-latest\",\n});\n```\n\u003C\u002Fdetails>\n\n您也可以直接将 LangChain 聊天模型实例作为 `judge` 参数传递。请注意，您选择的模型必须支持结构化输出（https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fintegrations\u002Fchat\u002F）：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\nfrom langchain_anthropic import ChatAnthropic\n\nanthropic_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    judge=ChatAnthropic(model=\"claude-3-5-sonnet-latest\", temperature=0.5),\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\nimport { ChatAnthropic } from \"@langchain\u002Fanthropic\";\n\nconst anthropicEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  judge: new ChatAnthropic({ model: \"claude-3-5-sonnet-latest\", temperature: 0.5 }),\n});\n```\n\u003C\u002Fdetails>\n\n这在需要使用特定参数（如温度）或通过 Azure 等服务使用模型时指定替代 URL 的场景中非常有用。\n\n最后，您还可以将模型名称作为 `model` 参数传递，并将 `judge` 参数设置为 OpenAI 客户端实例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```bash\npip install openai\n```\n\n```python\nfrom openai import OpenAI\n\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\nopenai_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    model=\"gpt-5.4\",\n    judge=OpenAI(),\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```bash\nnpm install openai\n```\n\n```ts\nimport { OpenAI } from \"openai\";\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst openaiEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  model: \"gpt-5.4\",\n  judge: new OpenAI(),\n});\n```\n\u003C\u002Fdetails>\n\n### 自定义输出分数值\n\n有两个字段可以用来自定义评估器的输出分数：\n\n- `continuous`：一个布尔值，用于设置评估器是否应返回介于 0 和 1 之间的浮点分数，而不是二元分数。默认值为 
`False`。\n- `choices`：一个浮点数列表，用于设置评估器可能的分数。\n\n这两个参数是互斥的。当使用其中任何一个时，您应确保您的提示语基于对具体分数含义的信息——本仓库中预构建的提示语并不包含这些信息！\n\n例如，以下是如何定义一种较宽松的正确性标准，仅在答案与主题相关但不正确时才扣 50% 分的例子：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\n\nMY_CUSTOM_PROMPT = \"\"\"\n您是一位专家级数据标注员，负责评估模型输出的正确性。您的任务是根据以下评分标准进行打分：\n\n\u003C评分标准>\n  根据以下标准给出 0、0.5 或 1 分：\n  - 0：答案错误且未提及 doodads\n  - 0.5：答案提到了 doodads，但其他方面仍不正确\n  - 1：答案正确且提到了 doodads\n\u003C\u002F评分标准>\n\n\u003C输入>\n{inputs}\n\u003C\u002F输入>\n\n\u003C输出>\n{outputs}\n\u003C\u002F 输出>\n\n\u003C参考答案>\n{reference_outputs}\n\u003C\u002F 参考答案 >\n\"\"\"\n\nevaluator = create_llm_as_judge(\n    prompt=MY_CUSTOM_PROMPT,\n    choices=[0.0, 0.5, 1.0],\n    model=\"openai:gpt-5.4\",\n)\n\nresult = evaluator(\n    inputs=\"doodads 的当前价格是多少？\",\n    outputs=\"doodads 的价格是 10 美元。\",\n    reference_outputs=\"doodads 的价格是 15 美元。\",\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'score',\n    'score': 0.5,\n    'comment': '提供的答案提到了 doodads，但内容不正确。'\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge } from \"openevals\";\n\nconst MY_CUSTOM_PROMPT = `\n您是一位专家级数据标注员，负责评估模型输出的正确性。您的任务是根据以下评分标准进行打分：\n\n\u003C评分标准>\n  根据以下标准给出 0、0.5 或 1 分：\n  - 0：答案错误且未提及 doodads\n  - 0.5：答案提到了 doodads，但其他方面仍不正确\n  - 1：答案正确且提到了 doodads\n\u003C\u002F评分标准>\n\n\u003C输入>\n{inputs}\n\u003C\u002F 输入 >\n\n\u003C输出>\n{outputs}\n\u003C\u002F 输出 >\n\n\u003C参考答案>\n{reference_outputs}\n\u003C\u002F 参考答案 >\n`;\n\nconst customEvaluator = createLLMAsJudge({\n  prompt: MY_CUSTOM_PROMPT,\n  choices: [0.0, 0.5, 1.0],\n  model: \"openai:gpt-5.4\",\n});\n\nconst result = await customEvaluator({\n  inputs: \"doodads 的当前价格是多少？\",\n  outputs: \"doodads 的价格是 10 美元。\",\n  reference_outputs: \"doodads 的价格是 15 美元。\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n    'key': 'score',\n    'score': 0.5,\n    'comment': '提供的答案提到了 doodads，但内容不正确。'\n}\n```\n\u003C\u002Fdetails>\n\n最后，如果您希望禁用针对特定分数的理由说明，可以在创建评估器时将 `use_reasoning=False` 设置。\n\n### 自定义输出模式\n\n如果您需要更改由 LLM 生成的原始输出结构，也可以将自定义的输出模式作为 `output_schema`（Python）或 `outputSchema`（TypeScript）传递给您的 LLM-as-judge 评估器。这在特定的提示策略中非常有用，或者当您希望在同一轮调用中同时提取多个指标，而不是通过多次调用来实现时。\n\n> [!CAUTION]\n> 传递 `output_schema` 会改变评估器的返回值，使其与传入的 `output_schema` 匹配，而不是采用典型的 OpenEvals 格式。\n> 如果您没有特别需要额外属性，建议使用默认模式。\n\n对于 Python，`output_schema` 可以是：\n\n- `TypedDict` 实例\n- [Pydantic](https:\u002F\u002Fdocs.pydantic.dev) 模型\n- [JSON schema](https:\u002F\u002Fjson-schema.org\u002F)\n- [OpenAI 的结构化输出格式](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fstructured-outputs?api-mode=chat#supported-schemas)\n\n对于 TypeScript，`outputSchema` 可以是：\n\n- [Zod](https:\u002F\u002Fzod.dev) 对象\n- [JSON schema](https:\u002F\u002Fjson-schema.org\u002F)\n- [OpenAI 的结构化输出格式](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Fstructured-outputs?api-mode=chat#supported-schemas)\n\n请注意，如果您直接使用 OpenAI 客户端，则仅支持 JSON schema 和 OpenAI 的结构化输出格式。\n\n以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom typing_extensions import TypedDict\n\nfrom openevals.llm import create_llm_as_judge\n\nclass EqualityResult(TypedDict):\n    equality_justification: str\n    are_equal: bool\n\ninputs = \"The rain in Spain falls mainly on the plain.\"\n\noutputs = \"The rain in Spain falls mainly on the plain.\"\n\nllm_as_judge = create_llm_as_judge(\n    prompt=\"Are the following two values equal? 
{inputs} {outputs}\",\n    model=\"openai:gpt-5.4\",\n    output_schema=EqualityResult,\n)\neval_result = llm_as_judge(inputs=inputs, outputs=outputs)\n\nprint(eval_result)\n```\n\n```\n{\n    'equality_justification': 'The values are equal because they have the same properties with identical values.',\n    'are_equal': True,\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\n\nimport { createLLMAsJudge } from \"openevals\";\n\nconst equalitySchema = z.object({\n  equality_justification: z.string(),\n  are_equal: z.boolean(),\n})\n\nconst inputs = \"The rain in Spain falls mainly on the plain.\";\nconst outputs = \"The rain in Spain falls mainly on the plain.\";\n\nconst llmAsJudge = createLLMAsJudge({\n  prompt: \"Are the following two values equal? {inputs} {outputs}\",\n  model: \"openai:gpt-5.4\",\n  outputSchema: equalitySchema,\n});\n\nconst evalResult = await llmAsJudge({ inputs, outputs });\n\nconsole.log(evalResult);\n```\n\n```\n{\n    'equality_justification': 'The values are equal because they have the same properties with identical values.',\n    'are_equal': True,\n}\n```\n\n\u003C\u002Fdetails>\n\n#### 使用自定义输出模式记录反馈\n\n如果您正在使用带有 [LangSmith 的 `pytest` 或 `Vitest`\u002F`Jest` 运行器](#pytest-or-vitestjest)的 OpenEvals 评估器，您需要手动[记录反馈键](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest#log-feedback)。\n\n如果您使用的是 `evaluate`，则需要将您的评估器包装在一个函数中，该函数将评估器的返回值映射为[正确格式的反馈](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fcode-evaluator)。\n\n#### 结构化提示\n\n从 [LangChain 提示中心](https:\u002F\u002Fsmith.langchain.com\u002Fhub)获取并传递一个已设置输出模式的提示，也会改变 LLM-as-judge 评估器的输出模式。\n\n### 多模态\n\nLLM-as-judge 评估器支持包括图像、音频和 PDF 在内的多模态输入。有两种方式可以传递多模态内容：\n\n- **`attachments` 参数** — 在您的提示中包含 `{attachments}` 占位符，并通过 `attachments` 关键字参数传递内容。\n- **LangChain 提示模板** — 直接在提示消息中引入多模态内容。有关详细信息，请参阅 [LangChain 多模态消息文档](https:\u002F\u002Fdocs.langchain.com\u002Foss\u002Fpython\u002Flangchain\u002Fmessages#multimodal)。\n\n#### 方法一：`attachments` 参数\n\n`attachments` 参数支持单个字典或包含 `mime_type` 和 base64 编码 `data` 字段的字典列表。预构建的 [Image](#image) 和 [Voice](#voice) 提示已经包含了 `{attachments}` 占位符，您也可以将其添加到任何自定义提示中。\n\n支持的附件类型：\n\n| 类型 | `mime_type` |\n|------|-------------|\n| 图像 | `image\u002Fpng`, `image\u002Fjpeg`, `image\u002Fgif`, `image\u002Fwebp` |\n| 音频 | `audio\u002Fwav`, `audio\u002Fmp3`, `audio\u002Fmpeg` |\n| PDF | `application\u002Fpdf` |\n\n> [!NOTE]\n> 多模态支持取决于您的模型提供商。并非所有提供商都同时支持音频输入和结构化输出（例如，返回带有评论的分数）——目前只有 Gemini 同时支持这两项功能。因此，预构建的 [Voice](#voice) 提示使用了 `google_genai:gemini-2.0-flash`（Python）\u002F `google-genai:gemini-2.0-flash`（TypeScript）。\n\n仅对图像支持直接以 URL 字符串的形式传递 `attachments`。音频和 PDF 附件必须以包含 `mime_type` 和 `data` 字段的 base64 编码数据 URI 形式传递。\n\n以下是一个使用预构建 `SENSITIVE_IMAGERY_PROMPT` 的示例。您可以将图像作为 URL 或作为 base64 编码的数据 URI 传递——两者效果相同：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport base64\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import SENSITIVE_IMAGERY_PROMPT\n\nevaluator = create_llm_as_judge(\n    prompt=SENSITIVE_IMAGERY_PROMPT,\n    feedback_key=\"sensitive_imagery\",\n    model=\"openai:gpt-5.4\",\n)\n\n# 选项 A：直接传递 URL 字符串\neval_result = evaluator(\n    inputs=\"Review this image for sensitive content\",\n    outputs=\"The image appears to contain appropriate content\",\n    attachments=\"https:\u002F\u002Fexample.com\u002Fimage.jpg\",\n)\n\n# 选项 B：传递 base64 编码的数据 URI\nwith open(\"image.jpg\", \"rb\") as f:\n    image_data = \"data:image\u002Fjpeg;base64,\" 
+ base64.b64encode(f.read()).decode(\"utf-8\")\n\neval_result = evaluator(\n    inputs=\"审查此图像是否存在敏感内容\",\n    outputs=\"该图像似乎包含适当的内容\",\n    attachments={\"mime_type\": \"image\u002Fjpeg\", \"data\": image_data},\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'sensitive_imagery',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport * as fs from \"fs\";\nimport { createLLMAsJudge, SENSITIVE_IMAGERY_PROMPT } from \"openevals\";\n\nconst evaluator = createLLMAsJudge({\n  prompt: SENSITIVE_IMAGERY_PROMPT,\n  feedbackKey: \"sensitive_imagery\",\n  model: \"openai:gpt-5.4\",\n});\n\n\u002F\u002F 选项 A：直接传递 URL 字符串\nconst evalResult = await evaluator({\n  inputs: \"审查此图像是否存在敏感内容\",\n  outputs: \"该图像似乎包含适当的内容\",\n  attachments: \"https:\u002F\u002Fexample.com\u002Fimage.jpg\",\n});\n\n\u002F\u002F 选项 B：传递 base64 编码的数据 URI\nconst imageData = \"data:image\u002Fjpeg;base64,\" + fs.readFileSync(\"image.jpg\").toString(\"base64\");\n\nconst evalResultB64 = await evaluator({\n  inputs: \"审查此图像是否存在敏感内容\",\n  outputs: \"该图像似乎包含适当的内容\",\n  attachments: { mime_type: \"image\u002Fjpeg\", data: imageData },\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'sensitive_imagery',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n#### 选项 2：LangChain 提示模板\n\n您还可以使用 LangChain 提示模板将多模态内容引入提示中。有关详细信息，请参阅 [LangChain 多模态消息文档](https:\u002F\u002Fdocs.langchain.com\u002Foss\u002Fpython\u002Flangchain\u002Fmessages#multimodal)。\n\n## 预构建的提示\n\nOpenEvals 包含适用于常见评估场景的预构建提示，可与 `create_llm_as_judge`（见下文）开箱即用。所有预构建提示均可从 `openevals.prompts`（Python）或 `openevals`（TypeScript）导入。\n\n### 质量\n\n这些提示用于评估输出的整体质量。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `CONCISENESS_PROMPT` | `inputs`, `outputs` | 输出是否简洁得当，避免不必要的冗余 |\n| `CORRECTNESS_PROMPT` | `inputs`, `outputs`, `reference_outputs`（可选） | 输出的事实准确性及完整性 |\n| `HALLUCINATION_PROMPT` | `inputs`, `outputs`, `context`（可选） | 输出是否包含未被所提供上下文支持的信息 |\n| `ANSWER_RELEVANCE_PROMPT` | `inputs`, `outputs` | 输出是否直接回答了所提问题 |\n| `PLAN_ADHERENCE_PROMPT` | `inputs`, `outputs`, `plan` | 输出是否遵循了所提供的计划 |\n| `CODE_CORRECTNESS_PROMPT` | `inputs`, `outputs` | 代码是否符合问题规范 |\n| `CODE_CORRECTNESS_PROMPT_WITH_REFERENCE_OUTPUTS` | `inputs`, `outputs`, `reference_outputs` | 代码与参考答案相比的正确性 |\n| `LAZINESS_PROMPT` | `inputs`, `outputs` | 代理是否返回了空白、空缺或敷衍了事的回答 |\n\n以下是使用 `CORRECTNESS_PROMPT` 的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"过去一年里，小玩意的价格变化了多少？\"\noutputs = \"过去一年里，小玩意的价格上涨了10%。\"\nreference_outputs = \"过去一年里，小玩意的价格下降了50%。\"\n\neval_result = correctness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n  reference_outputs=reference_outputs\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'correctness',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"过去一年里，小玩意的价格变化了多少？\"\nconst outputs = \"过去一年里，小玩意的价格上涨了10%。\"\nconst referenceOutputs = 
\"过去一年里，小玩意的价格下降了50%。\"\n\nconst evalResult = await correctnessEvaluator({\n  inputs,\n  outputs,\n  referenceOutputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'correctness',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### 安全性\n\n这些提示用于检测 LLM 输出中的有害或有偏见的内容。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `TOXICITY_PROMPT` | `inputs`, `outputs` | 有害、冒犯性或辱骂性的语言 |\n| `FAIRNESS_PROMPT` | `inputs`, `outputs` | 是否存在基于受保护特征对个人或群体的偏见或歧视 |\n\n以下是使用 `FAIRNESS_PROMPT` 的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import FAIRNESS_PROMPT\n\nllm_as_judge = create_llm_as_judge(\n    prompt=FAIRNESS_PROMPT,\n    feedback_key=\"fairness\",\n    model=\"openai:gpt-5.4\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"我们是否应该雇佣这位候选人？\"\n    outputs=\"他的名字很难发音——这可能会让客户感到不舒服。\"\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'fairness',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, FAIRNESS_PROMPT } from \"openevals\";\n\nconst fairnessEvaluator = createLLMAsJudge({\n  prompt: FAIRNESS_PROMPT,\n  feedbackKey: \"fairness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await fairnessEvaluator({\n  inputs: \"我们是否应该雇佣这位候选人？\"\n  outputs: \"他的名字很难发音——这可能会让客户感到不舒服。\"\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'fairness',\n    score: true,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n### 安全性\n\n这些提示用于检测大型语言模型输入和输出中的安全威胁。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `PII_LEAKAGE_PROMPT` | `inputs`, `outputs` | 输出中暴露的个人身份信息 |\n| `PROMPT_INJECTION_PROMPT` | `inputs` | 试图操纵或覆盖AI系统指令的行为，包括社会工程学和基于角色扮演的规避手段 |\n| `CODE_INJECTION_PROMPT` | `inputs` | 嵌入在输入中的恶意代码或漏洞利用 |\n\n以下是使用 `PII_LEAKAGE_PROMPT` 的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import PII_LEAKAGE_PROMPT\n\nllm_as_judge = create_llm_as_judge(\n    prompt=PII_LEAKAGE_PROMPT,\n    feedback_key=\"pii_leakage\",\n    model=\"openai:gpt-5.4\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"我的账户信息是什么？\",\n    outputs=\"您的姓名是约翰·史密斯，电子邮箱是john.smith@example.com，社保号是123-45-6789。\",\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'pii_leakage',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, PII_LEAKAGE_PROMPT } from \"openevals\";\n\nconst piiEvaluator = createLLMAsJudge({\n  prompt: PII_LEAKAGE_PROMPT,\n  feedbackKey: \"pii_leakage\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await piiEvaluator({\n  inputs: \"我的账户信息是什么？\",\n  outputs: \"您的姓名是约翰·史密斯，电子邮箱是john.smith@example.com，社保号是123-45-6789。\",\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'pii_leakage',\n    'score': true,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n### 图像\n\n这些提示用于评估图像内容及其与相关上下文的关系。所有图像提示都需要 `attachments` 参数——有关传递图像数据的详细信息，请参阅“多模态”部分。请注意，您选择的模型必须支持视觉输入（例如 `openai:gpt-5.4`）。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `EXPLICIT_CONTENT_PROMPT` | `inputs`, `outputs`, `attachments` | 不适合大众观看的色情或暴力内容 |\n| `SENSITIVE_IMAGERY_PROMPT` | `inputs`, `outputs`, `attachments` | 仇恨符号、煽动性的政治图像或描绘苦难的画面 |\n\n### 语音\n\n这些提示用于评估语音和音频内容。所有语音提示都需要 `attachments` 
参数——有关传递音频数据的详细信息，请参阅“多模态”部分。请注意，您选择的模型必须支持音频输入——如“多模态”部分所述，目前只有Gemini同时支持音频输入和结构化输出。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `AUDIO_QUALITY_PROMPT` | `inputs`, `outputs`, `attachments` | 影响聆听体验的削波、失真或干扰 |\n| `TRANSCRIPTION_ACCURACY_PROMPT` | `inputs`, `outputs`, `attachments` | 语音转文字的准确性 |\n| `USER_INTERRUPTS_PROMPT` | `inputs`, `outputs`, `attachments` | 代理是否优雅地处理了用户的打断 |\n| `VOCAL_AFFECT_PROMPT` | `inputs`, `outputs`, `attachments` | 代理声音语调的恰当性和一致性 |\n\n以下是使用 `AUDIO_QUALITY_PROMPT` 的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport base64\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import AUDIO_QUALITY_PROMPT\n\nwith open(\"audio.wav\", \"rb\") as f:\n    audio_data = base64.b64encode(f.read()).decode(\"utf-8\")\n\nllm_as_judge = create_llm_as_judge(\n    prompt=AUDIO_QUALITY_PROMPT,\n    feedback_key=\"audio_quality\",\n    model=\"google_genai:gemini-2.0-flash\",\n)\n\neval_result = llm_as_judge(\n    inputs=\"客户服务通话录音\",\n    outputs=\"客服人员的音频回复\",\n    attachments={\"mime_type\": \"audio\u002Fwav\", \"data\": audio_data},\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'audio_quality',\n    'score': True,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport * as fs from \"fs\";\nimport { createLLMAsJudge } from \"openevals\";\nimport { AUDIO_QUALITY_PROMPT } from \"openevals\u002Fprompts\";\n\nconst audioData = fs.readFileSync(\"audio.wav\").toString(\"base64\");\n\nconst llmAsJudge = createLLMAsJudge({\n  prompt: AUDIO_QUALITY_PROMPT,\n  feedbackKey: \"audio_quality\",\n  model: \"google-genai:gemini-2.0-flash\",\n});\n\nconst evalResult = await llmAsJudge({\n  inputs: \"客户服务通话录音\",\n  outputs: \"客服人员的音频回复\",\n  attachments: { mime_type: \"audio\u002Fwav\", data: audioData },\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'audio_quality',\n    'score': true,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n### RAG\n\nRAG应用在其最基本的形式下包含两个步骤。在检索步骤中，会从用户预先准备好的向量数据库等来源获取上下文（尽管基于网络的检索用例也日益流行），以便为大型语言模型提供回答用户问题所需的信息。在生成步骤中，大型语言模型会利用检索到的上下文来构建答案。\n\nOpenEvals提供了预建的提示和其他方法，用于以下方面：\n\n1. [正确性](#correctness-rag)\n   - 评估：最终输出与输入及参考答案的对比\n   - 目标：衡量“生成的答案与真实答案的相似度\u002F正确性”\n   - 是否需要参考答案：是\n\n2. [有用性](#helpfulness)\n   - 评估：最终输出与输入的对比\n   - 目标：衡量“生成的回答在多大程度上解决了用户最初的问题”\n   - 是否需要参考答案：否，因为它是将答案与初始问题进行比较\n\n3. [依据性](#groundedness)\n   - 评估：最终输出与检索到的上下文的对比\n   - 目标：衡量“生成的回答在多大程度上与检索到的上下文一致”\n   - 是否需要参考答案：否，因为它是将答案与检索到的上下文进行比较\n\n4. 
[检索相关性](#retrieval-relevance)\n   - 评估：检索到的上下文与输入的对比\n   - 目标：衡量“本次查询的检索结果的相关性”\n   - 是否需要参考答案：否，因为它是将问题与检索到的上下文进行比较\n\n#### 正确性 {#correctness-rag}\n\n“正确性”指标用于衡量生成的答案与真实答案之间的相似度或正确性。根据定义，这需要有一个参考答案来与生成的答案进行对比。该指标非常适合端到端测试RAG应用，并且直接考虑了作为中间步骤的检索上下文。\n\n您可以使用“LLM-as-judge”评估工具，结合上述“质量”章节中提到的通用 `CORRECTNESS_PROMPT` 来评估RAG应用输出的正确性。以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = \"过去一年里，小玩意的价格变化了多少？\"\noutputs = \"过去一年里，小玩意的价格上涨了10%。\"\nreference_outputs = \"过去一年里，小玩意的价格下降了50%。\"\n\neval_result = correctness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n  reference_outputs=reference_outputs\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'correctness',\n    'score': False,\n    'comment': '...'\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = \"过去一年里，小玩意的价格变化了多少？\"\nconst outputs = \"过去一年里，小玩意的价格上涨了10%。\"\nconst referenceOutputs = \"过去一年里，小玩意的价格下降了50%。\"\n\nconst evalResult = await correctnessEvaluator({\n  inputs,\n  outputs,\n  referenceOutputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n    key: 'correctness',\n    score: false,\n    comment: '...'\n}\n```\n\u003C\u002Fdetails>\n\n有关自定义 LLM-as-judge 评估器的更多信息，请参阅[这些章节](#customizing-prompts)。\n\n#### 有用性\n\n`helpfulness` 衡量生成的回答在多大程度上回应了用户的初始输入。它会将最终生成的输出与输入进行比较，且不需要参考答案。这一指标有助于验证您的 RAG 应用程序的生成步骤是否确实回答了原始问题，但并不衡量答案是否由任何检索到的上下文所支持！\n\n您可以使用内置的 `RAG_HELPFULNESS_PROMPT` 等提示词，通过 LLM-as-judge 评估器来评估 RAG 应用程序输出的有用性。以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_HELPFULNESS_PROMPT\n\nhelpfulness_evaluator = create_llm_as_judge(\n    prompt=RAG_HELPFULNESS_PROMPT,\n    feedback_key=\"helpfulness\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = {\n    \"question\": \"福巴兰国的第一任总统在哪里出生？\",\n}\n\noutputs = {\n    \"answer\": \"福巴兰国的第一任总统是巴加图尔·阿斯卡良。\",\n}\n\neval_result = helpfulness_evaluator(\n  inputs=inputs,\n  outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'helpfulness', \n  'score': False, \n  'comment': \"问题要求提供福巴兰国第一任总统的出生地，但检索到的输出仅指出第一位总统名为巴加图尔，并提供了一条无关的生平信息（即他喜欢公关评论）。尽管第一条信息在某种程度上相关，因为它提到了总统的名字，但两份文档均未提及他的出生地。因此，输出中并未包含回答该问题的有用信息。综上所述，得分应为：false。\"\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_HELPFULNESS_PROMPT } from \"openevals\";\n\nconst inputs = {\n  \"question\": \"福巴兰国的第一任总统在哪里出生？\",\n};\n\nconst outputs = {\n  \"answer\": \"福巴兰国的第一任总统是巴加图尔·阿斯卡良。\",\n};\n\nconst helpfulnessEvaluator = createLLMAsJudge({\n  prompt: RAG_HELPFULNESS_PROMPT,\n  feedbackKey: \"helpfulness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst evalResult = await helpfulnessEvaluator({\n  inputs,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'helpfulness', \n  'score': False, \n  'comment': 
\"问题要求提供福巴兰国第一任总统的出生地，但检索到的输出仅提到第一位总统名为巴加图尔，并附带一条不相关的生平信息（他喜欢公关评论）。虽然第一条信息在一定程度上相关，因为它指出了总统的名字，但两份文档均未提及他的出生地。因此，输出中没有包含回答该问题的有用信息。综上所述，得分应为：false。\"\n}\n```\n\n\u003C\u002Fdetails>\n\n#### 根基性\n\n`groundedness` 衡量生成的回答与检索到的上下文的一致程度。它会将最终生成的输出与检索步骤中获取的上下文进行比较，以验证生成步骤是否正确地利用了检索到的上下文，而不是凭空捏造答案或过度依赖 LLM 的基础知识。\n\n您可以通过使用内置的 `RAG_GROUNDEDNESS_PROMPT` 等提示词，借助 LLM-as-judge 评估器来评估 RAG 应用程序输出的根基性。需要注意的是，此提示词并不考虑示例的原始 `inputs`，而只关注输出及其与检索到的上下文的关系。因此，与其他一些预构建的提示词不同，它将 `context` 和 `outputs` 作为提示变量：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_GROUNDEDNESS_PROMPT\n\ngroundedness_evaluator = create_llm_as_judge(\n    prompt=RAG_GROUNDEDNESS_PROMPT,\n    feedback_key=\"groundedness\",\n    model=\"openai:gpt-5.4\",\n)\n\ncontext = {\n    \"documents\": [\n        \"福巴兰国是一个位于月球背面的新国家\",\n        \"太空海豚是福巴兰国的特有物种\",\n        \"福巴兰国是一个宪政民主国家，其首任总统是巴加图尔·阿斯卡良\",\n        \"福巴兰国目前天气晴朗，气温80华氏度\"\n    ],\n}\n\noutputs = {\n    \"answer\": \"福巴兰国的第一任总统是巴加图尔·阿斯卡良。\",\n}\n\neval_result = groundedness_evaluator(\n    context=context,\n    outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'groundedness',\n  'score': True,\n  'comment': '输出称“福巴兰国的第一任总统是巴加图尔·阿斯卡良”，这直接得到了检索到的上下文的支持（第3份文档明确说明了这一点）。没有添加或修改任何内容，该陈述与提供的上下文完全一致。因此，得分应为：true。',\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_GROUNDEDNESS_PROMPT } from \"openevals\";\n\nconst groundednessEvaluator = createLLMAsJudge({\n  prompt: RAG_GROUNDEDNESS_PROMPT,\n  feedbackKey: \"groundedness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst context = {\n  documents: [\n    \"福巴兰国是一个位于月球背面的新国家\",\n    \"太空海豚是福巴兰国的特有物种\",\n    \"福巴兰国是一个宪政民主国家，其首任总统是巴加图尔·阿斯卡良\",\n    \"福巴兰国目前天气晴朗，气温80华氏度\"\n  ],\n};\n\nconst outputs = {\n  answer: \"福巴兰国的第一任总统是巴加图尔·阿斯卡良。\",\n};\n\nconst evalResult = await groundednessEvaluator({\n  context,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'groundedness',\n  'score': true,\n  'comment': '输出内容为：“FoobarLand的第一任总统是Bagatur Askaryan”，这一陈述直接由检索到的上下文支持（文档3明确指出该事实）。没有添加或修改任何信息，且该陈述与提供的上下文完全一致。因此，评分应为：true。',\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n#### 检索相关性\n\n`retrieval_relevance` 用于衡量检索到的上下文与输入查询的相关程度。这种评估器直接衡量应用中检索步骤的质量，而非生成步骤的质量。\n\n##### 使用 LLM 作为评判者的检索相关性评估\n\n你可以使用内置的 `RAG_RETRIEVAL_RELEVANCE_PROMPT` 等提示模板，通过 LLM 作为评判者的评估器来评估 RAG 应用的检索相关性。需要注意的是，该提示仅考虑输入和检索到的上下文，而不涉及应用最终的输出。因此，与其他一些预构建的提示不同，它将 `context` 和 `inputs` 作为提示变量：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import RAG_RETRIEVAL_RELEVANCE_PROMPT\n\nretrieval_relevance_evaluator = create_llm_as_judge(\n    prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,\n    feedback_key=\"retrieval_relevance\",\n    model=\"openai:gpt-5.4\",\n)\n\ninputs = {\n    \"question\": \"FoobarLand的第一任总统在哪里出生？\",\n}\n\ncontext = {\n    \"documents\": [\n        \"FoobarLand是一个位于月球背面的新国家\",\n        \"太空海豚是FoobarLand的特有物种\",\n        \"FoobarLand是一个宪政民主国家，其第一任总统是Bagatur Askaryan\",\n        \"FoobarLand当前天气为80华氏度，晴朗。\",\n    ],\n}\n\neval_result = retrieval_relevance_evaluator(\n    inputs=inputs,\n    context=context,\n)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'retrieval_relevance',\n  'score': False,\n  'comment': \"检索到的上下文提供了一些关于FoobarLand的信息——例如，它是一个位于月球背面的新国家，其第一任总统是Bagatur 
Askaryan。然而，这些文档中并没有提到第一任总统的出生地。值得注意的是，虽然有关于FoobarLand地理位置的背景信息，但关于第一任总统出生地的关键信息却缺失了。因此，检索到的上下文并未完全回答问题。综上所述，评分应为：false。\",\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TS\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, RAG_RETRIEVAL_RELEVANCE_PROMPT } from \"openevals\";\n\nconst retrievalRelevanceEvaluator = createLLMAsJudge({\n  prompt: RAG_RETRIEVAL_RELEVANCE_PROMPT,\n  feedbackKey: \"retrieval_relevance\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = {\n  question: \"FoobarLand的第一任总统在哪里出生？\",\n}\n\nconst context = {\n  documents: [\n    \"FoobarLand是一个位于月球背面的新国家\",\n    \"太空海豚是FoobarLand的特有物种\",\n    \"FoobarLand是一个宪政民主国家，其第一任总统是Bagatur Askaryan\",\n    \"FoobarLand当前天气为80华氏度，晴朗。\",\n  ],\n}\n\nconst retrievalRelevanceEvaluator = await retrievalRelevanceEvaluator({\n  inputs,\n  context,\n});\n\nconsole.log(evalResult);\n```\n\n```\n{\n  'key': 'retrieval_relevance',\n  'score': False,\n  'comment': \"检索到的上下文提供了一些关于FoobarLand的信息——例如，它是一个位于月球背面的新国家，其第一任总统是Bagatur Askaryan。然而，这些文档中并没有提到第一任总统的出生地。值得注意的是，虽然有关于FoobarLand地理位置的背景信息，但关于第一任总统出生地的关键信息却缺失了。因此，检索到的上下文并未完全回答问题。综上所述，评分应为：false。\",\n  'metadata': None\n}\n```\n\n\u003C\u002Fdetails>\n\n##### 使用字符串评估器进行检索相关性评估\n\n你也可以使用诸如 [embedding similarity](#embedding-similarity) 之类的字符串评估器，在不使用 LLM 的情况下衡量检索相关性。在这种情况下，你需要将检索到的文档合并成一个字符串，并将其作为 `outputs` 传递给评估器，而原始的输入查询则作为 `reference_outputs` 传递。最终的得分以及可接受的阈值将取决于你所使用的具体嵌入模型。\n\n以下是一个示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.embedding_similarity import create_embedding_similarity_evaluator\n\nevaluator = create_embedding_similarity_evaluator()\n\ninputs = \"FoobarLand的第一任总统在哪里出生？\"\n\ncontext = \"\\n\".join([\n    \"BazQuxLand是一个位于月球背面的新国家\",\n    \"太空海豚是BazQuxLand的特有物种\",\n    \"BazQuxLand是一个宪政民主国家，其第一任总统是Bagatur Askaryan\",\n    \"BazQuxLand当前天气为80华氏度，晴朗。\",\n])\n\nresult = evaluator(\n    outputs=context,\n    reference_outputs=inputs,\n)\n\nprint(result)\n```\n\n```\n{\n  'key': 'embedding_similarity',\n  'score': 0.43,\n  'comment': None,\n  'metadata': None\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TS\u003C\u002Fsummary>\n\n```ts\nimport { createEmbeddingSimilarityEvaluator } from \"openevals\";\nimport { OpenAIEmbeddings } from \"@langchain\u002Fopenai\";\n\nconst evaluator = createEmbeddingSimilarityEvaluator({\n  embeddings: new OpenAIEmbeddings({ model: \"text-embedding-3-small\" }),\n});\n\nconst inputs = \"FoobarLand的第一任总统在哪里出生？\";\n\nconst context = [\n  \"BazQuxLand是一个位于月球背面的新国家\",\n  \"太空海豚是BazQuxLand的特有物种\",\n  \"BazQuxLand是一个宪政民主国家，其第一任总统是Bagatur Askaryan\",\n  \"BazQuxLand当前天气为80华氏度，晴朗。\",\n].join(\"\\n\");\n\nconst result = await evaluator(\n  outputs: context,\n  referenceOutputs: inputs,\n);\n\nconsole.log(result);\n```\n\n```\n{\n  'key': 'embedding_similarity',\n  'score': 0.43,\n}\n```\n\u003C\u002Fdetails>\n\n\n\n## 提取与工具调用\n\nLLM 的两个非常常见的应用场景是：从文档中提取结构化输出，以及进行工具调用。这两种场景都要求 LLM 以结构化格式作出响应。本包提供了一个预构建的评估器，可以帮助你评估这些场景，并且具有足够的灵活性，适用于各种提取或工具调用的用例。\n\n你可以通过两种方式使用 `create_json_match_evaluator` 评估器：\n1. 对比输出与参考输出是否完全匹配。\n2. 
使用 LLM 作为评判者，根据提供的评分标准对输出进行评估。\n\n需要注意的是，该评估器可能会根据不同的键和聚合策略返回多个分数，因此结果将是一个分数数组，而不是单个分数。\n\n### 使用精确匹配评估结构化输出\n\n当存在明确的正确或错误答案时，应使用精确匹配评估。常见场景是从图像或 PDF 中提取文本，并期望得到特定的值。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.json import create_json_match_evaluator\n\noutputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,3]},\n]\nreference_outputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,4]},\n]\nevaluator = create_json_match_evaluator(\n    # 如何聚合列表中每个元素的反馈键：\"average\"、\"all\" 或 None\n    # \"average\" 返回平均分。\"all\" 只有当所有键都得 1 分时才返回 1；否则返回 0。None 则为每个键单独返回反馈分数\n    aggregator=\"all\",\n    # 如果评估的是单个结构化输出，则无需设置此参数。此参数用于聚合列表中各元素的反馈键。可选值为 \"average\" 或 \"all\"。默认值为 \"all\"。\"all\" 表示只有当列表中的每个元素都得 1 分时才返回 1；若有任何一个元素得分不是 1，则返回 0。\"average\" 则返回各元素得分的平均值。\n    list_aggregator=\"average\",\n    exclude_keys=[\"a\"],\n)\n# 调用评估器，传入输出和参考输出\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\n对于第一个元素，“b”将得 1 分，聚合器会返回 1 的分数；\n对于第二个元素，“b”将得 1 分，“c”将得 0 分，聚合器会返回 0 的分数；\n因此，列表聚合器最终会返回 0.5 的分数。\n\n```\n[\n  {\n    'key': 'json_match:all',\n    'score': 0.5,\n    'comment': None,\n  }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createJsonMatchEvaluator } from \"openevals\";\nimport { OpenAI } from \"openai\";\n\nconst outputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,3]},\n]\nconst reference_outputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,4]},\n]\n\nconst client = new OpenAI();\n\nconst evaluator = createJsonMatchEvaluator({\n    \u002F\u002F 如何聚合列表中每个元素的反馈键：\"average\"、\"all\" 或 None\n    \u002F\u002F \"average\" 返回平均分。\"all\" 只有当所有键都得 1 分时才返回 1；否则返回 0。None 则为每个键单独返回反馈分数\n    aggregator=\"all\",\n    \u002F\u002F 如果评估的是单个结构化输出，则无需设置此参数。此参数用于聚合列表中各元素的反馈键。可选值为 \"average\" 或 \"all\"。默认值为 \"all\"。\"all\" 表示只有当列表中的每个元素都得 1 分时才返回 1；若有任何一个元素得分不是 1，则返回 0。\"average\" 则返回各元素得分的平均值。\n    list_aggregator=\"average\",\n    \u002F\u002F 在评估过程中要忽略的键。任何未在此处或在 `rubric` 中列出的键，都将使用精确匹配比较的方式与参考输出进行评估\n    exclude_keys=[\"a\"],\n    \u002F\u002F 用于评估的提供商及模型名称\n    judge: client,\n    model: \"openai:gpt-5.4\",\n})\n\n\u002F\u002F 调用评估器，传入输出和参考输出\nconst result = await evaluator({\n    outputs,\n    reference_outputs,\n})\n\nconsole.log(result)\n```\n\n对于第一个元素，“b”将得 1 分，聚合器会返回 1 的分数；\n对于第二个元素，“b”将得 1 分，“c”将得 0 分，聚合器会返回 0 的分数；\n因此，列表聚合器最终会返回 0.5 的分数。\n\n```\n[\n  {\n    'key': 'json_match:all',\n    'score': 0.5,\n    'comment': None,\n  }\n]\n```\n\u003C\u002Fdetails>\n\n### 使用 LLM 作为评判者评估结构化输出\n\n当评估标准较为主观时（例如，输出是一种水果或提到了所有水果），可以使用 LLM 作为评判者来评估结构化输出或工具调用。\n\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.json import create_json_match_evaluator\n\noutputs = [\n    {\"a\": \"Mango, Bananas\", \"b\": 2},\n    {\"a\": \"Apples\", \"b\": 2, \"c\": [1,2,3]},\n]\nreference_outputs = [\n    {\"a\": \"Bananas, Mango\", \"b\": 2, \"d\": \"Not in outputs\"},\n    {\"a\": \"Apples, Strawberries\", \"b\": 2},\n]\nevaluator = create_json_match_evaluator(\n    # 如何聚合列表中每个元素的反馈分数：\"average\"、\"all\" 或 None\n    # \"average\" 返回平均分；\"all\" 只有当所有键的得分均为 1 时才返回 1，否则返回 0；None 则为每个键单独返回反馈结果\n    aggregator=\"average\",\n    # 如果评估的是单个结构化输出，则无需设置。此参数用于聚合列表中各元素的反馈分数，可设为 \"average\" 或 \"all\"。默认值为 \"all\"。\"all\" 表示只有当列表中每个元素的得分均为 1 时才返回 1，若有任何一个元素的得分不是 1，则返回 0；\"average\" 则返回各元素得分的平均值。\n    
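# 注：未在 rubric 中列出的键（例如 “b” 和 “d”）将直接与参考输出做精确匹配来评分\n    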
list_aggregator=\"all\",\n    rubric={\n        \"a\": \"答案是否提到了参考答案中的所有水果？\"\n    },\n    # 要使用的模型提供商及名称\n    model=\"openai:gpt-5.4\",\n    # 是否强制模型对 `rubric` 中的键进行推理。默认为 True\n    # 注意：如果指定了聚合器，则当前不支持此功能\n    use_reasoning=True\n)\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\n对于第一个元素，“a”将得 1 分，因为参考输出中同时包含了芒果和香蕉；“b”也将得 1 分，而“d”则得 0 分。聚合器会返回平均分 0.6。\n对于第二个元素，“a”得 0 分，因为参考输出并未提及输出中的所有水果；“b”得 1 分。聚合器会返回 0.5 的分数。\n因此，列表聚合器最终会返回 0 分。\n\n```\n[\n  {\n    'key': 'json_match:a',\n    'score': 0,\n    'comment': None\n  }\n]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createJsonMatchEvaluator } from \"openevals\";\nimport { OpenAI } from \"openai\";\n\nconst outputs = [\n    {a: \"Mango, Bananas\", b: 2},\n    {a: \"Apples\", b: 2, c: [1,2,3]},\n]\nconst reference_outputs = [\n    {a: \"Bananas, Mango\", b: 2},\n    {a: \"Apples, Strawberries\", b: 2},\n]\n\nconst client = new OpenAI();\n\nconst evaluator = createJsonMatchEvaluator({\n    \u002F\u002F 如何聚合列表中每个元素的反馈分数：\"average\"、\"all\" 或 None\n    \u002F\u002F \"average\" 返回平均分；\"all\" 只有当所有键的得分均为 1 时才返回 1，否则返回 0；None 则为每个键单独返回反馈结果\n    aggregator=\"average\",\n    \u002F\u002F 如果评估的是单个结构化输出，则无需设置。此参数用于聚合列表中各元素的反馈分数，可设为 \"average\" 或 \"all\"。默认值为 \"all\"。\"all\" 表示只有当列表中每个元素的得分均为 1 时才返回 1，若有任何一个元素的得分不是 1，则返回 0；\"average\" 则返回各元素得分的平均值。\n    list_aggregator=\"all\",\n    \u002F\u002F LLM 评判者针对每个待评估键所依据的标准\n    rubric={\n        a: \"答案是否提到了参考答案中的所有水果？\"\n    },\n    \u002F\u002F 在评估过程中要忽略的键。任何未在此处或在 `rubric` 中列出的键，都将通过与参考输出的完全匹配来进行比较\n    exclude_keys=[\"c\"],\n    \u002F\u002F 要使用的模型提供商及名称\n    judge: client,\n    model: \"openai:gpt-5.4\",\n    \u002F\u002F 是否使用推理来分析 `rubric` 中的键。默认为 True\n    useReasoning: true\n})\n\n\u002F\u002F 调用评估器，传入输出和参考输出\nconst result = await evaluator({\n    outputs,\n    reference_outputs,\n})\n\nconsole.log(result)\n```\n对于第一个元素，“a”将得 1 分，因为参考输出中同时包含了芒果和香蕉；“b”也将得 1 分，而“d”则得 0 分。聚合器会返回平均分 0.6。\n对于第二个元素，“a”得 0 分，因为参考输出并未提及输出中的所有水果；“b”得 1 分。聚合器会返回 0.5 的分数。\n因此，列表聚合器最终会返回 0 分。\n\n```\n{\n  'key': 'json_match:a',\n  'score': 0,\n  'comment': None\n}\n```\n\n\u003C\u002Fdetails>\n\n## 代码\n\nOpenEvals 包含一些用于评估生成代码的预构建评估器：\n\n- 使用 [Pyright](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fpyright) 和 [Mypy](https:\u002F\u002Fgithub.com\u002Fpython\u002Fmypy)（仅限 Python）或 TypeScript 内置类型检查器（仅限 JavaScript）对生成代码进行类型检查\n  - 请注意，这些本地类型检查评估器不会安装任何依赖项，并且会忽略这些导入相关的错误。\n- 使用 [E2B](https:\u002F\u002Fe2b.dev\u002F) 安装依赖并安全运行生成代码的沙箱类型检查和执行评估器。\n- 使用 LLM 作为评判者评估代码。\n\n本节中的所有评估器都接受 `outputs` 参数，该参数可以是字符串、包含 `\"messages\"` 键的对象（其中 `\"messages\"` 是消息列表）或包含 `\"content\"` 键的消息类对象（其中 `\"content\"` 是字符串）。\n\n### 提取代码输出\n\n由于包含代码的 LLM 输出可能还包含其他文本（例如，穿插在代码中的解释性文字），OpenEvals 的代码评估器共享一些内置的提取方法，用于从 LLM 输出中仅识别出代码部分。\n\n对于本节中的任何评估器，您可以传递一个 `code_extraction_strategy` 参数，将其设置为 `llm`，这将使用带有默认提示的 LLM 直接提取代码；或者设置为 `markdown_code_blocks`，这将提取 Markdown 代码块（三重反引号）中未标记为 `bash` 或其他 shell 命令语言的内容。如果上述任一方法提取失败，评估器响应中将包含一个 `metadata.code_extraction_failed` 字段，其值为 `True`。\n\n您也可以传递一个 `code_extractor` 参数，该参数是一个函数，接收 LLM 输出并返回代码字符串。默认情况下，输出内容保持不变（`\"none\"`）。\n\n如果您使用 `code_extraction_strategy=\"llm\"`，还可以向评估器传递一个 `model` 字符串或 `client` 对象，以指定模型用于代码提取的方式。如果您希望自定义提示词，则应改用 `code_extractor` 参数。\n\n### Pyright（仅限 Python）\n\n对于 Pyright，您需要在系统上安装 `pyright` CLI：\n\n```bash\npip install pyright\n```\n\n完整的安装说明请参见 [这里](https:\u002F\u002Fmicrosoft.github.io\u002Fpyright\u002F#\u002Finstallation?id=command-line)。\n\n然后，您可以按如下方式使用它：\n\n```python\nfrom 
openevals.code.pyright import create_pyright_evaluator\n\nevaluator = create_pyright_evaluator()\n\nCODE = \"\"\"\ndef sum_of_two_numbers(a, b): return a + b\n\"\"\"\n\nresult = evaluator(outputs=CODE)\n\nprint(result)\n```\n\n```\n{\n    'key': 'pyright_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\n> [!WARNING]\n> 该评估器会忽略 `reportMissingImports` 错误。如果您希望对生成的依赖项进行类型检查，请查看此评估器的 [沙盒版本](#sandbox-pyright-python-only)。\n\n您还可以向评估器传递 `pyright_cli_args` 来自定义传给 `pyright` CLI 的参数：\n\n```python\nevaluator = create_pyright_evaluator(\n    pyright_cli_args=[\"--flag\"]\n)\n```\n\n有关支持的完整参数列表，请参阅 [Pyright CLI 文档](https:\u002F\u002Fmicrosoft.github.io\u002Fpyright\u002F#\u002Fcommand-line)。\n\n### Mypy（仅限 Python）\n\n对于 Mypy，您需要在系统上安装 `mypy`：\n\n```bash\npip install mypy\n```\n\n完整的安装说明请参见 [这里](https:\u002F\u002Fmypy.readthedocs.io\u002Fen\u002Fstable\u002Fgetting_started.html)。\n\n然后，您可以按如下方式使用它：\n\n```python\nfrom openevals.code.mypy import create_mypy_evaluator\n\nevaluator = create_mypy_evaluator()\n\nCODE = \"\"\"\ndef sum_of_two_numbers(a, b): return a + b\n\"\"\"\n\nresult = evaluator(outputs=CODE)\n\nprint(result)\n```\n\n```\n{\n    'key': 'mypy_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\n默认情况下，该评估器将使用以下参数运行：\n\n```\nmypy --no-incremental --disallow-untyped-calls --disallow-incomplete-defs --ignore-missing-imports\n```\n\n但您可以向评估器传递 `mypy_cli_args` 来自定义传给 `mypy` CLI 的参数。这将覆盖默认参数：\n\n```python\nevaluator = create_mypy_evaluator(\n    mypy_cli_args=[\"--flag\"]\n)\n```\n\n### TypeScript 类型检查（仅限 TypeScript）\n\nTypeScript 评估器使用 TypeScript 的类型检查器来检查代码的正确性。\n\n您需要在系统上安装 `typescript` 作为依赖项（不是开发依赖！）：\n\n```bash\nnpm install typescript\n```\n\n然后，您可以按如下方式使用它（请注意，由于额外的必需依赖项，您应从 `openevals\u002Fcode\u002Ftypescript` 入口导入）：\n\n```ts\nimport { createTypeScriptEvaluator } from \"openevals\u002Fcode\u002Ftypescript\";\n\nconst evaluator = createTypeScriptEvaluator();\n\nconst result = await evaluator({\n    outputs: \"function add(a, b) { return a + b; }\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n    'key': 'typescript_succeeded',\n    'score': True,\n    'comment': None,\n}\n```\n\n> [!WARNING]\n> 该评估器会忽略 `reportMissingImports` 错误。如果您希望对生成的依赖项进行类型检查，请查看此评估器的 [沙盒版本](#sandbox-typescript-typescript-only)。\n\n### LLM 作为代码评判者\n\nOpenEvals 包含一个预构建的 LLM 作为代码评判者的评估器。与更通用的 [LLM 作为评判者评估器](#llm-as-judge) 相比，其主要区别在于它会执行上述提取步骤——除此之外，它接受相同的参数，包括提示。\n\n您可以按如下方式运行 LLM 作为代码评判者的评估器：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.code.llm import create_code_llm_as_judge\nfrom openevals.prompts import CODE_CORRECTNESS_PROMPT\n\nllm_as_judge = create_code_llm_as_judge(\n    prompt=CODE_CORRECTNESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n    code_extraction_strategy=\"markdown_code_blocks\",\n)\n\n\nINPUTS = \"\"\"\n将下面的代码重写为异步版本：\n\n\\`\\`\\`python\ndef _run_mypy(\n    *,\n    filepath: str,\n    mypy_cli_args: list[str],\n) -> Tuple[bool, str]:\n    result = subprocess.run(\n        [\n            \"mypy\",\n            *mypy_cli_args,\n            filepath,\n        ],\n        capture_output=True,\n    )\n    return _parse_mypy_output(result.stdout)\n\\`\\`\\`\n\"\"\"\n\nOUTPUTS = \"\"\"\n\\`\\`\\`python\nasync def _run_mypy_async(\n    *,\n    filepath: str,\n    mypy_cli_args: list[str],\n) -> Tuple[bool, str]:\n    process = await subprocess.run(\n        [\n            \"mypy\",\n            *mypy_cli_args,\n            filepath,\n        ],\n    )\n    stdout, _ = await process.communicate()\n\n    return 
_parse_mypy_output(stdout)\n\\`\\`\\`\n\"\"\"\n\neval_result = llm_as_judge(\n    inputs=INPUTS,\n    outputs=OUTPUTS\n)\n\nprint(eval_result)\n```\n\n```\n{\n    'key': 'code_correctness',\n    'score': False,\n    'comment': \"提供的异步代码不正确。它仍然错误地尝试使用 'await subprocess.run'，而这是同步操作，无法被等待。正确的异步方法应该是使用 'asyncio.create_subprocess_exec'（或类似的 asyncio API），并适当重定向标准输出（例如，stdout=asyncio.subprocess.PIPE），然后等待 'communicate()' 调用。因此，代码并未完全满足所规定的要求，并且存在显著错误，导致其无法正常工作。综上所述，评分应为：false。\",\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createCodeLLMAsJudge, CODE_CORRECTNESS_PROMPT } from \"openevals\";\n\nconst evaluator = createCodeLLMAsJudge({\n  prompt: CODE_CORRECTNESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst inputs = `为以下代码添加适当的 TypeScript 类型：\n\n\\`\\`\\`typescript\nfunction add(a, b) { return a + b; }\n\\`\\`\\`\n`;\n\nconst outputs = `\n\\`\\`\\`typescript\nfunction add(a: number, b: number): boolean {\n  return a + b;\n}\n\\`\\`\\`\n`;\n\nconst evalResult = await evaluator({ inputs, outputs });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"code_correctness\",\n  \"score\": false,\n  \"comment\": \"代码在类型规范上存在逻辑错误。该函数旨在将两个数字相加并返回它们的和，因此返回类型应为 number，而非 boolean。这一错误使得该解决方案不符合评分标准。综上所述，评分应为：false。\"\n}\n```\n\n\u003C\u002Fdetails>\n\n## 沙箱代码\n\n大语言模型可以生成任意代码，如果您在本地运行代码评估器，可能不希望安装生成的依赖项或在本地运行这些任意代码。为了解决这个问题，OpenEvals 集成了 [E2B](https:\u002F\u002Fe2b.dev)，以便在隔离的沙箱中运行部分代码评估器。\n\n给定大语言模型生成的一些代码，这些沙箱代码评估器会在一个沙箱中运行脚本，解析出依赖项并进行安装，从而为评估器提供适当的上下文来进行类型检查或执行。\n\n这些评估器在创建时都需要一个 `sandbox` 参数，并且也接受其他 [代码评估器](#extracting-code-outputs) 中存在的代码提取参数。对于 Python，有一个特殊的 `OpenEvalsPython` 模板，其中预装了 `pyright` 和 `uv`，以加快执行速度，不过该评估器也可以与任何沙箱配合使用。\n\n如果您有一个自定义的沙箱，其中已预先安装了依赖项或设置了文件，您可以在调用相应的 `create` 方法时提供 `sandbox_project_directory`（Python）或 `sandboxProjectDirectory`（TypeScript）参数，以自定义进行类型检查\u002F执行的文件夹。\n\n### 沙箱 Pyright（仅限 Python）\n\n您还可以在 [E2B](https:\u002F\u002Fe2b.dev) 沙箱中运行 Pyright 类型检查。评估器会运行一个脚本来从生成的代码中解析出包名，然后在沙箱中安装这些包，并运行 Pyright。评估器会将其分析出的错误作为注释返回。\n\n您需要安装 `e2b-code-interpreter` 包，该包作为附加组件提供：\n\n```bash\npip install openevals[\"e2b-code-interpreter\"]\n```\n\n然后，您需要将您的 E2B API 密钥设置为环境变量：\n\n```\nexport E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\n接着，您需要初始化一个 E2B 沙箱。有一个特殊的 `OpenEvalsPython` 模板，其中预装了 `pyright` 和 `uv` 以加快执行速度，不过该评估器也可以与任何沙箱配合使用：\n\n```python\nfrom e2b_code_interpreter import Sandbox\n\n# 预装 uv 和 pyright 的 E2B 模板\nsandbox = Sandbox(\"OpenEvalsPython\")\n```\n\n最后，将创建的沙箱传递给 `create_e2b_pyright_evaluator` 工厂函数并运行它：\n\n```python\nfrom openevals.code.e2b.pyright import create_e2b_pyright_evaluator\n\nevaluator = create_e2b_pyright_evaluator(\n    sandbox=sandbox,\n)\n\nCODE = \"\"\"\nfrom typing import Annotated\n\nfrom typing_extensions import TypedDict\n\nfrom langgraph.graph import StateGraph, START, END\nfrom langgraph.graph.message import add_messages\n\n\nclass State(TypedDict):\n    messages: Annotated[list, add_messages]\n\nbuilder = StateGraph(State)\nbuilder.add_node(\"start\", lambda state: state)\nbuilder.compile()\n\nbuilder.invoke({})\n\"\"\"\n\neval_result = evaluator(outputs=CODE)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'pyright_succeeded',\n  'score': false,\n  'comment': '[{\"severity\": \"error\", \"message\": \"Cannot access attribute \"invoke\" for class \"StateGraph\"...}]',\n}\n```\n\n上述示例中，评估器在沙箱内识别并安装了 `langgraph` 包，然后运行了 Pyright。类型检查失败是因为提供的代码误用了导入的包，调用了构建器而不是编译后的图。\n\n### 沙箱 TypeScript 类型检查（仅限 TypeScript）\n\n您也可以在 [E2B](https:\u002F\u002Fe2b.dev) 沙箱中运行 TypeScript 
类型检查。评估器会运行一个脚本来从生成的代码中解析出包名，然后在沙箱中安装这些包，并运行 TypeScript。评估器会将其分析出的错误作为注释返回。\n\n您需要将官方的 `@e2b\u002Fcode-interpreter` 包作为对等依赖项安装：\n\n```bash\nnpm install @e2b\u002Fcode-interpreter\n```\n\n然后，您需要将您的 E2B API 密钥设置为环境变量：\n\n```\nprocess.env.E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\n接下来，初始化一个 E2B 沙箱：\n\n```ts\nimport { Sandbox } from \"@e2b\u002Fcode-interpreter\";\n\nconst sandbox = await Sandbox.create();\n```\n\n最后，将沙箱传递给 `createE2BTypeScriptEvaluator` 并运行它：\n\n```ts\nimport { createE2BTypeScriptEvaluator } from \"openevals\u002Fcode\u002Fe2b\";\n\nconst evaluator = createE2BTypeScriptEvaluator({\n  sandbox,\n});\n\nconst CODE = `\nimport { StateGraph } from '@langchain\u002Flanggraph';\n\nawait StateGraph.invoke({})\n`;\n\nconst evalResult = await evaluator({ outputs: CODE });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"typescript_succeeded\",\n  \"score\": false,\n  \"comment\": \"(3,18): Property 'invoke' does not exist on type 'typeof StateGraph'.\"\n}\n```\n\n上述示例中，评估器识别并安装了 `@langchain\u002Flanggraph`，然后通过 TypeScript 进行了类型检查。类型检查失败是因为提供的代码误用了导入的包。\n\n### 沙箱执行\n\n为了进一步评估代码的正确性，OpenEvals 提供了一个沙箱执行评估器，它会在 [E2B](https:\u002F\u002Fe2b.dev) 沙箱中运行生成的代码。\n\n评估器会运行一个脚本来从生成的代码中解析出包名，然后在沙箱中安装这些包。随后，评估器会尝试运行生成的代码，并将其分析出的错误作为注释返回。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n您需要安装 `e2b-code-interpreter` 包，该包作为附加组件提供：\n\n```bash\npip install openevals[\"e2b-code-interpreter\"]\n```\n\n然后，您需要将您的 E2B API 密钥设置为环境变量：\n\n```\nexport E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\n接着，您需要初始化一个 E2B 沙箱。有一个特殊的 `OpenEvalsPython` 模板，其中预装了 `pyright` 和 `uv` 以加快执行速度，不过该评估器也可以与任何沙箱配合使用：\n\n```python\nfrom e2b_code_interpreter import Sandbox\n\n# 带有 UV 和 Pyright 预装的 E2B 模板\nsandbox = Sandbox(\"OpenEvalsPython\")\n```\n\n然后将沙盒传递给 `create_e2b_execution_evaluator` 工厂函数，并运行结果：\n\n```python\nfrom openevals.code.e2b.execution import create_e2b_execution_evaluator\n\nevaluator = create_e2b_execution_evaluator(\n    sandbox=sandbox,\n)\n\nCODE = \"\"\"\nfrom typing import Annotated\n\nfrom typing_extensions import TypedDict\n\nfrom langgraph.graph import StateGraph, START, END\nfrom langgraph.graph.message import add_messages\n\n\nclass State(TypedDict):\n    messages: Annotated[list, add_messages]\n\nbuilder = StateGraph(State)\nbuilder.add_node(\"start\", lambda state: state)\nbuilder.compile()\n\nbuilder.invoke({})\n\"\"\"\n\neval_result = evaluator(outputs=CODE)\n\nprint(eval_result)\n```\n\n```\n{\n  'key': 'execution_succeeded',\n  'score': False,\n  'comment': '\"Command exited with code 1 and error:\\nTraceback (most recent call last):\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002Foutputs.py\\\", line 15, in \u003Cmodule>\\n    builder.compile()\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002F.venv\u002Flib\u002Fpython3.10\u002Fsite-packages\u002Flanggraph\u002Fgraph\u002Fstate.py\\\", line 602, in compile\\n    self.validate(\\n  File \\\"\u002Fhome\u002Fuser\u002Fopenevals\u002F.venv\u002Flib\u002Fpython3.10\u002Fsite-packages\u002Flanggraph\u002Fgraph\u002Fgraph.py\\\", line 267, in validate\\n    raise ValueError(\\nValueError: Graph must have an entrypoint: add at least one edge from START to another node\\n\"'\n}\n```\n\n上述示例中，评估器会识别并安装 `langgraph`，然后尝试执行代码。由于提供的代码错误地使用了该库，类型检查失败。\n\n如果需要，您可以在创建评估器时传入一个 `environment_variables` 字典。生成的代码将在沙盒中访问这些变量，但请务必谨慎，因为无法准确预测 LLM 将生成何种代码。\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n您需要将官方的 `@e2b\u002Fcode-interpreter` 包作为对等依赖项安装：\n\n```bash\nnpm install @e2b\u002Fcode-interpreter\n```\n\n然后，您需要将您的 
E2B API 密钥设置为环境变量：\n\n```\nprocess.env.E2B_API_KEY=\"YOUR_KEY_HERE\"\n```\n\n接下来，初始化一个 E2B 沙盒：\n\n```ts\nimport { Sandbox } from \"@e2b\u002Fcode-interpreter\";\n\nconst sandbox = await Sandbox.create();\n```\n\n最后，将沙盒传递给 `create` 函数并运行：\n\n```ts\nimport { createE2BExecutionEvaluator } from \"openevals\u002Fcode\u002Fe2b\";\n\nconst evaluator = createE2BExecutionEvaluator({\n  sandbox,\n});\n\nconst CODE = `\nimport { Annotation, StateGraph } from '@langchain\u002Flanggraph';\n\nconst StateAnnotation = Annotation.Root({\n  joke: Annotation\u003Cstring>,\n  topic: Annotation\u003Cstring>,\n});\n\nconst graph = new StateGraph(StateAnnotation)\n  .addNode(\"joke\", () => ({}))\n  .compile();\n  \nawait graph.invoke({\n  joke: \"foo\",\n  topic: \"history\",\n});\n`;\n\nconst evalResult = await evaluator({ outputs });\n\nconsole.log(evalResult);\n```\n\n```\n{\n  \"key\": \"execution_succeeded\",\n  \"score\": false,\n  \"comment\": \"file:\u002F\u002F\u002Fhome\u002Fuser\u002Fopenevals\u002Fnode_modules\u002F@langchain\u002Flanggraph\u002Fdist\u002Fgraph\u002Fstate.js:197\\n            throw new Error(`${key} is already being used as a state attribute (a.k.a. a channel), cannot also be used as a node name.`);\\n                  ^\\n\\nError: joke is already being used as a state attribute (a.k.a. a channel), cannot also be used as a node name.\\n    at StateGraph.addNode (\u002Fhome\u002Fuser\u002Fopenevals\u002Fnode_modules\u002F@langchain\u002Flanggraph\u002Fsrc\u002Fgraph\u002Fstate.ts:292:13)\\n    at \u003Canonymous> (\u002Fhome\u002Fuser\u002Fopenevals\u002Foutputs.ts:9:4)\\n    at ModuleJob.run (node:internal\u002Fmodules\u002Fesm\u002Fmodule_job:195:25)\\n    at async ModuleLoader.import (node:internal\u002Fmodules\u002Fesm\u002Floader:336:24)\\n    at async loadESM (node:internal\u002Fprocess\u002Fesm_loader:34:7)\\n    at async handleMainPromise (node:internal\u002Fmodules\u002Frun_main:106:12)\\n\\nNode.js v18.19.0\\n\"\n}\n```\n\n上述示例中，评估器会识别并安装 `@langchain\u002Flanggraph`，然后尝试执行代码。由于提供的代码错误地使用了该库，类型检查失败。\n\n如果需要，您可以在创建评估器时传入一个 `environmentVariables` 对象。生成的代码将在沙盒中访问这些变量，但请务必谨慎，因为无法准确预测 LLM 将生成何种代码。\n\n\u003C\u002Fdetails>\n\n## 代理轨迹\n\n如果您正在构建一个代理，`openevals` 提供了用于评估代理执行整个 **轨迹** 的评估器——即代理在解决任务过程中发出的消息和工具调用序列。\n\n轨迹应格式化为 [OpenAI 样式消息](https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fapi-reference\u002Fmessages) 的列表。LangChain 的 `BaseMessage` 实例也受支持。\n\n### 轨迹匹配\n\n`create_trajectory_match_evaluator`\u002F`createTrajectoryMatchEvaluator` 会将代理的轨迹与参考轨迹进行比较。您可以将 `trajectory_match_mode`\u002F`trajectoryMatchMode` 设置为以下四种模式之一：\n\n- `\"strict\"` — 工具调用相同且顺序一致\n- `\"unordered\"` — 工具调用相同，顺序不限\n- `\"subset\"` — 输出的工具调用是参考轨迹的子集\n- `\"superset\"` — 输出的工具调用是参考轨迹的超集\n\n#### 严格匹配\n\n`\"strict\"` 模式会比较两条轨迹，确保它们包含相同的消息、相同的顺序以及相同的工具调用。请注意，它允许消息内容存在差异（例如 `\"SF\"` 与 `\"San Francisco\"`）：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"SF 的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\n                \"function\": {\n                    \"name\": \"get_weather\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                }\n            },\n            {\n                \"function\": {\n                    \"name\": \"accuweather_forecast\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                
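# 注：参考轨迹中并不存在这个 accuweather_forecast 调用，因此下方严格匹配的得分为 False\n                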
}\n            }\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"SF 的天气是 80 华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"SF 的天气是 80 华氏度，晴朗。\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\n                \"function\": {\n                    \"name\": \"get_weather\",\n                    \"arguments\": json.dumps({\"city\": \"San Francisco\"}),\n                }\n            }\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山的天气是 80 华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"SF 的天气是 80 度，晴朗。\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"strict\")\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_strict_match', 'score': False, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"What is the weather in SF?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{\n      function: {\n        name: \"get_weather\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }, {\n      function: {\n        name: \"accuweather_forecast\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in SF.\" },\n  { role: \"assistant\", content: \"The weather in SF is 80 degrees and sunny.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"What is the weather in San Francisco?\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{\n      function: {\n        name: \"get_weather\",\n        arguments: JSON.stringify({ city: \"San Francisco\" }),\n      },\n    }],\n  },\n  { role: \"tool\", content: \"It's 80 degrees and sunny in San Francisco.\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"strict\" });\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_strict_match', score: false }\n```\n\u003C\u002Fdetails>\n\n`\"strict\"` 模式适用于需要确保对于给定查询，工具调用始终以相同顺序进行的情况（例如，先执行政策查询工具，再执行为员工申请休假的工具）。\n\n**注意：** 如果您希望配置此评估器检查工具调用是否相等的方式，请参阅 [本节](#tool-args-match-modes)。\n\n#### 无序匹配\n\n`\"unordered\"` 模式会比较两条轨迹，并确保它们包含相同的工具调用，但不考虑调用顺序。这在您希望允许智能体以灵活的方式获取所需信息，但仍关心所有必要信息是否已被检索到时非常有用。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"SF 的天气如何？有没有什么好玩的活动？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [{\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}],\n    },\n    {\"role\": \"tool\", \"content\": \"SF 的天气是 80 华氏度，晴朗。\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [{\"function\": {\"name\": \"get_fun_activities\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}],\n    },\n    {\"role\": \"tool\", \"content\": 
\"目前没有有趣的活动，你最好待在家里看书！\"},\n    {\"role\": \"assistant\", \"content\": \"SF 的天气是 80 华氏度，晴朗，但目前没有有趣的活动。\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"SF 的天气如何？有没有什么好玩的活动？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_fun_activities\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}},\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}},\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"目前没有有趣的活动，你最好待在家里看书！\"},\n    {\"role\": \"tool\", \"content\": \"SF 的天气是 80 华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"在 SF，天气是 80 华氏度、晴朗，但目前没有有趣的活动。\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"unordered\")\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_unordered_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"SF 的天气如何？有没有什么好玩的活动？\"} ,\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"SF 的天气是 80 华氏度，晴朗。\"},\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_fun_activities\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"目前没有有趣的活动，你最好待在家里看书！\"},\n  { role: \"assistant\", content: \"SF 的天气是 80 华氏度，晴朗，但目前没有有趣的活动。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"SF 的天气如何？有没有什么好玩的活动？\"} ,\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_fun_activities\", arguments: JSON.stringify({ city: \"San Francisco\" }) } },\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"目前没有有趣的活动，你最好待在家里看书！\"},\n  { role: \"tool\", content: \"SF 的天气是 80 华氏度，晴朗。\"},\n  { role: \"assistant\", content: \"在 SF，天气是 80 华氏度、晴朗，但目前没有有趣的活动。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"unordered\" });\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_unordered_match', score: true }\n```\n\u003C\u002Fdetails>\n\n`\"unordered\"` 模式适用于需要确保特定工具在轨迹中的某个时刻被调用，但并不一定要求它们按照消息顺序出现的情况。\n\n**注意：** 如果您希望配置此评估器检查工具调用是否相等的方式，请参阅 [本节](#tool-args-match-modes)。\n\n#### 子集与超集匹配\n\n`\"subset\"` 和 `\"superset\"` 模式用于匹配部分轨迹，确保一条轨迹包含参考轨迹中工具调用的子集或超集。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"旧金山和伦敦的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"旧金山和伦敦\"})}},\n            {\"function\": {\"name\": \"accuweather_forecast\", \"arguments\": json.dumps({\"city\": \"旧金山和伦敦\"})}}\n        ],\n 
   },\n    {\"role\": \"tool\", \"content\": \"旧金山气温80华氏度，晴朗；伦敦气温90华氏度，有雨。\"},\n    {\"role\": \"tool\", \"content\": \"未知。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80华氏度，晴朗。伦敦则是90华氏度，有雨。\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"旧金山和伦敦的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"旧金山和伦敦\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山气温80华氏度，晴朗；伦敦气温90华氏度，有雨。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80华氏度，晴朗。伦敦则是90华氏度，有雨。\"},\n]\n\nevaluator = create_trajectory_match_evaluator(trajectory_match_mode=\"superset\")  # 或者 \"subset\"\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_superset_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"旧金山和伦敦的天气如何？\"},\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"旧金山和伦敦\" }) } },\n      { function: { name: \"accuweather_forecast\", arguments: JSON.stringify({ city: \"旧金山和伦敦\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"旧金山气温80华氏度，晴朗；伦敦气温90华氏度，有雨。\"},\n  { role: \"tool\", content: \"未知。\"},\n  { role: \"assistant\", content: \"旧金山的天气是80华氏度，晴朗；伦敦则是90华氏度，有雨。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"旧金山和伦敦的天气如何？\"},\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [\n      { function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"旧金山和伦敦\" }) } },\n    ],\n  },\n  { role: \"tool\", content: \"旧金山气温80华氏度，晴朗；伦敦气温90华氏度，有雨。\"},\n  { role: \"assistant\", content: \"旧金山的天气是80˚，晴朗；伦敦则是90˚，有雨。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: \"superset\" }); \u002F\u002F 或者 \"subset\"\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_superset_match', score: true }\n```\n\u003C\u002Fdetails>\n\n`\"superset\"`模式适用于你希望确保在对话轨迹中至少调用了某些关键工具，但允许代理额外调用其他工具的情况。而`\"subset\"`模式则相反，它适用于你希望确保代理没有调用任何超出预期的工具。\n\n#### 工具参数匹配模式\n\n在检查工具调用是否相等时，上述评估器默认要求所有工具调用的参数完全一致。你可以通过以下方式配置这一行为：\n\n- 将针对同一工具的任意两个工具调用视为等价，方法是设置`tool_args_match_mode=\"ignore\"`（Python）或`toolArgsMatchMode: \"ignore\"`（TypeScript）。\n- 如果一个工具调用包含与参考同名工具调用相比的参数子集或超集，则将其视为等价，方法是设置`tool_args_match_mode=\"subset\"\u002F\"superset\"`（Python）或`toolArgsMatchMode: \"subset\"\u002F\"superset\"`（TypeScript）。\n- 使用`tool_args_match_overrides`（Python）或`toolArgsMatchOverrides`（TypeScript）参数为特定工具的所有调用设置自定义匹配规则。\n\n`tool_args_match_overrides`\u002F`toolArgsMatchOverrides`接受一个字典，其键为工具名称，值可以是`\"exact\"`、`\"ignore\"`、`\"subset\"`、`\"superset\"`，也可以是一组必须精确匹配的字段路径，或者是一个比较函数：\n\n以下是一个示例，允许对名为`get_weather`的工具的参数进行不区分大小写的匹配：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_match_evaluator\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n       
     {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"san francisco\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山气温80华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80华氏度，晴朗。\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"San Francisco\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山气温80华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80˚，晴朗。\"},\n]\n\nevaluator = create_trajectory_match_evaluator(\n    trajectory_match_mode=\"strict\",\n    tool_args_match_mode=\"exact\",  \n    tool_args_match_overrides={\n        \"get_weather\": lambda x, y: x[\"city\"].lower() == y[\"city\"].lower()\n    }\n)\n\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_strict_match', 'score': True, 'comment': None}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryMatchEvaluator,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst outputs = [\n  { role: \"user\", content: \"旧金山的天气如何？\"},\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"san francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"旧金山气温80华氏度，晴朗。\"},\n  { role: \"assistant\", content: \"旧金山的天气是80华氏度，晴朗。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"旧金山的天气如何？\"},\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"San Francisco\" }) } }],\n  },\n  { role: \"tool\", content: \"旧金山气温80华氏度，晴朗。\"},\n  { role: \"assistant\", content: \"旧金山的天气是80˚，晴朗。\"},\n] satisfies FlexibleChatCompletionMessage[];\n\nconst evaluator = createTrajectoryMatchEvaluator({\n  trajectoryMatchMode: \"strict\",\n  toolArgsMatchOverrides: {\n    get_weather: (x, y) =>\n      typeof x.city === \"string\" &&\n      typeof y.city === \"string\" &&\n      x.city.toLowerCase() === y.city.toLowerCase(),\n  },\n});\n\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_strict_match', score: true }\n```\n\u003C\u002Fdetails>\n\n这种灵活性使您能够在某些情况下对 LLM 生成的参数采用更宽松的相等性比较（例如，“san francisco”等于“San Francisco”），且仅适用于特定的工具调用。\n\n\n\n### 轨迹 LLM 作为评判者\n\n`create_trajectory_llm_as_judge`\u002F`createTrajectoryLLMAsJudge` 使用 LLM 来评估代理的轨迹是否准确。与轨迹匹配评估器不同，它不需要参考轨迹。可以使用 `TRAJECTORY_ACCURACY_PROMPT` 进行无参考评估，或使用 `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` 与参考轨迹进行比较：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_llm_as_judge\nfrom openevals.prompts import TRAJECTORY_ACCURACY_PROMPT\n\nevaluator = create_trajectory_llm_as_judge(\n    prompt=TRAJECTORY_ACCURACY_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF\"})}}\n        ],\n    },\n    {\"role\": \"tool\", 
\"content\": \"旧金山现在是80华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80华氏度，晴朗。\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_accuracy', 'score': True, 'comment': '该轨迹准确...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryLLMAsJudge,\n  TRAJECTORY_ACCURACY_PROMPT,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst evaluator = createTrajectoryLLMAsJudge({\n  prompt: TRAJECTORY_ACCURACY_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"旧金山的天气如何？\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF\" }) } }],\n  },\n  { role: \"tool\", content: \"旧金山现在是80华氏度，晴朗。\" },\n  { role: \"assistant\", content: \"旧金山的天气是80华氏度，晴朗。\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_accuracy', score: true, comment: '该轨迹准确...' }\n```\n\u003C\u002Fdetails>\n\n如果您有参考轨迹，可以使用 `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` 并传入 `reference_outputs`\u002F`referenceOutputs`：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nfrom openevals import create_trajectory_llm_as_judge\nfrom openevals.prompts import TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE\n\nevaluator = create_trajectory_llm_as_judge(\n    prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"SF\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山现在是80华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80华氏度，晴朗。\"},\n]\nreference_outputs = [\n    {\"role\": \"user\", \"content\": \"旧金山的天气如何？\"},\n    {\n        \"role\": \"assistant\",\n        \"content\": \"\",\n        \"tool_calls\": [\n            {\"function\": {\"name\": \"get_weather\", \"arguments\": json.dumps({\"city\": \"旧金山\"})}}\n        ],\n    },\n    {\"role\": \"tool\", \"content\": \"旧金山现在是80华氏度，晴朗。\"},\n    {\"role\": \"assistant\", \"content\": \"旧金山的天气是80˚，晴朗。\"},\n]\n\nresult = evaluator(outputs=outputs, reference_outputs=reference_outputs)\nprint(result)\n```\n\n```\n{'key': 'trajectory_accuracy', 'score': True, 'comment': '提供的代理轨迹与参考一致...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  createTrajectoryLLMAsJudge,\n  TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n  type FlexibleChatCompletionMessage,\n} from \"openevals\";\n\nconst evaluator = createTrajectoryLLMAsJudge({\n  prompt: TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"旧金山的天气如何？\" },\n  {\n    role: \"assistant\",\n    content: \"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"SF\" }) } }],\n  },\n  { role: \"tool\", content: \"旧金山现在是80华氏度，晴朗。\" },\n  { role: \"assistant\", content: \"旧金山的天气是80华氏度，晴朗。\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst referenceOutputs = [\n  { role: \"user\", content: \"旧金山的天气如何？\" },\n  {\n    role: \"assistant\",\n    content: 
\"\",\n    tool_calls: [{ function: { name: \"get_weather\", arguments: JSON.stringify({ city: \"旧金山\" }) } }],\n  },\n  { role: \"tool\", content: \"旧金山现在是80华氏度，晴朗。\" },\n  { role: \"assistant\", content: \"旧金山的天气是80˚，晴朗。\" },\n] satisfies FlexibleChatCompletionMessage[];\n\nconst result = await evaluator({ outputs, referenceOutputs });\nconsole.log(result);\n```\n\n```\n{ key: 'trajectory_accuracy', score: true, comment: '提供的代理轨迹与参考一致...' }\n```\n\u003C\u002Fdetails>\n\n`create_trajectory_llm_as_judge`\u002F`createTrajectoryLLMAsJudge` 接受与 [`create_llm_as_judge`](#llm-as-judge) 相同的参数，包括：\n\n- `continuous`: 布尔值 — 返回介于 0 和 1 之间的浮点分数，而不是布尔值。默认为 `False`\u002F`false`。\n- `choices`: 浮点数列表 — 将分数限制为特定值。\n- `system`: 字符串 — 在评判提示前添加系统消息。\n- `few_shot_examples`\u002F`fewShotExamples`: 示例字典列表，附加到提示中。\n\n对于 LangGraph 特定的图轨迹评估器，请参阅 [`agentevals`](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals) 包。\n\n### 预构建的轨迹与对话提示\n\n`openevals` 包含多个用于评估智能体轨迹和对话的预构建提示。所有提示都以消息列表 `outputs` 作为输入，并与 `create_llm_as_judge`\u002F`createLLMAsJudge` 一起使用。\n\n#### 轨迹提示\n\n这些提示用于评估单次运行中智能体的工具调用序列。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `TRAJECTORY_ACCURACY_PROMPT` | `outputs` | 智能体的整体轨迹是否准确地完成了任务（参见 [上方](#trajectory-llm-as-judge)） |\n| `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` | `outputs`, `reference_outputs` | 轨迹准确性与参考轨迹的对比（参见 [上方](#trajectory-llm-as-judge)） |\n| `TOOL_SELECTION_PROMPT` | `outputs` | 在查询解决过程中工具选择的正确性 |\n\n#### 对话提示\n\n这些提示用于评估用户与智能体之间的多轮对话。\n\n| 提示 | 参数 | 评估内容 |\n|--------|-----------|-------------------|\n| `PERCEIVED_ERROR_PROMPT` | `outputs` | 用户的回复是否表明智能体犯了错误 |\n| `WINS_PROMPT` | `outputs` | 用户是否对助手表示赞赏、感谢或称赞 |\n| `TASK_COMPLETION_PROMPT` | `outputs` | 对话中用户提出的所有请求是否均已完成 |\n| `KNOWLEDGE_RETENTION_PROMPT` | `outputs` | 智能体是否正确地保留并应用了对话早期引入的信息 |\n| `USER_SATISFACTION_PROMPT` | `outputs` | 基于语气变化以及核心需求是否得到满足的总体用户满意度 |\n| `AGENT_TONE_PROMPT` | `outputs` | 智能体在整个对话中语气的一致性和适当性 |\n| `LANGUAGE_DETECTION_PROMPT` | `outputs` | 整个对话中人类主要使用的语言 |\n| `SUPPORT_INTENT_PROMPT` | `outputs` | 客户支持对话中用户请求的主要意图类别 |\n\n以下是使用 `TASK_COMPLETION_PROMPT` 的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import TASK_COMPLETION_PROMPT\n\nevaluator = create_llm_as_judge(\n    prompt=TASK_COMPLETION_PROMPT,\n    feedback_key=\"task_completion\",\n    model=\"openai:gpt-5.4\",\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"你能帮我预订从纽约到巴黎的航班吗？\"},\n    {\"role\": \"assistant\", \"content\": \"我可以提供航班信息，但无法为您实际订票。\"},\n    {\"role\": \"user\", \"content\": \"我让你订票，不是只给我信息。能不能直接帮我订一下？\"},\n    {\"role\": \"assistant\", \"content\": \"我理解您的不满，但我确实无法进行预订。\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'key': 'task_completion', 'score': False, 'comment': '用户要求预订航班的请求始终未被满足...'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMAsJudge, TASK_COMPLETION_PROMPT } from \"openevals\";\n\nconst evaluator = createLLMAsJudge({\n  prompt: TASK_COMPLETION_PROMPT,\n  feedbackKey: \"task_completion\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst outputs = [\n  { role: \"user\", content: \"你能帮我预订从纽约到巴黎的航班吗？\" },\n  { role: \"assistant\", content: \"我可以提供航班信息，但无法为您实际订票。\" },\n  { role: \"user\", content: \"我让你订票，不是只给我信息。能不能直接帮我订一下？\" },\n  { role: \"assistant\", content: \"我理解您的不满，但我确实无法进行预订。\" },\n];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ key: 'task_completion', 
score: false, comment: '用户要求预订航班的请求始终未被满足...' }\n```\n\u003C\u002Fdetails>\n\n由于 `LANGUAGE_DETECTION_PROMPT` 应返回具体的语言名称而非布尔值，因此需配合自定义的 `output_schema` 来捕获结果：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom typing_extensions import TypedDict\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import LANGUAGE_DETECTION_PROMPT\n\nclass LanguageDetectionResult(TypedDict):\n    reasoning: str\n    detected_language: str\n\nevaluator = create_llm_as_judge(\n    prompt=LANGUAGE_DETECTION_PROMPT,\n    feedback_key=\"language_detection\",\n    model=\"openai:gpt-5.4\",\n    output_schema=LanguageDetectionResult,\n)\n\noutputs = [\n    {\"role\": \"user\", \"content\": \"Hola, ¿cómo estás?\"},\n    {\"role\": \"assistant\", \"content\": \"¡Hola! Estoy bien, gracias. ¿En qué puedo ayudarte?\"},\n    {\"role\": \"user\", \"content\": \"Necesito ayuda con mi cuenta.\"},\n]\n\nresult = evaluator(outputs=outputs)\nprint(result)\n```\n\n```\n{'reasoning': '对话中人类全程使用西班牙语交流。', 'detected_language': '西班牙语'}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\nimport { createLLMAsJudge, LANGUAGE_DETECTION_PROMPT } from \"openevals\";\n\nconst languageDetectionSchema = z.object({\n  reasoning: z.string(),\n  detected_language: z.string().describe(\"检测到的语言名称，以英文表示\"),\n});\n\nconst evaluator = createLLMAsJudge({\n  prompt: LANGUAGE_DETECTION_PROMPT,\n  feedbackKey: \"language_detection\",\n  model: \"openai:gpt-5.4\",\n  outputSchema: languageDetectionSchema,\n});\n\nconst outputs = [\n  { role: \"user\", content: \"Hola, ¿cómo estás?\" },\n  { role: \"assistant\", content: \"¡Hola! Estoy bien, gracias. ¿En qué puedo ayudarte?\" },\n  { role: \"user\", content: \"Necesito ayuda con mi cuenta.\" },\n];\n\nconst result = await evaluator({ outputs });\nconsole.log(result);\n```\n\n```\n{ reasoning: '对话中人类全程使用西班牙语交流。', detected_language: 'Spanish' }\n```\n\u003C\u002Fdetails>\n\n## 其他\n\n该包还包含用于计算常用指标的预构建评估器，例如 Levenshtein 距离、精确匹配等。您可以按如下方式导入并使用它们：\n\n### 精确匹配\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.exact import exact_match\n\noutputs = {\"a\": 1, \"b\": 2}\nreference_outputs = {\"a\": 1, \"b\": 2}\nresult = exact_match(outputs=outputs, reference_outputs=reference_outputs)\n\nprint(result)\n```\n\n```\n{\n    'key': 'equal',\n    'score': True,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { exactMatch } from \"openevals\";\n\nconst outputs = { a: 1, b: 2 };\nconst referenceOutputs = { a: 1, b: 2 };\nconst result = exactMatch(outputs, referenceOutputs);\n\nconsole.log(result);\n```\n\n```\n{\n    key: \"equal\",\n    score: true,\n}\n```\n\u003C\u002Fdetails>\n\n### 编辑距离\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.levenshtein import levenshtein_distance\n\noutputs = \"正确答案\"\nreference_outputs = \"正确答案\"\nresult = levenshtein_distance(\n    outputs=outputs, reference_outputs=reference_outputs,\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'levenshtein_distance',\n    'score': 0.0,\n    'comment': None,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { levenshteinDistance } from \"openevals\";\n\nconst outputs = \"正确答案\";\nconst referenceOutputs = \"正确答案\";\nconst result = levenshteinDistance(outputs, referenceOutputs);\n\nconsole.log(result);\n```\n\n```\n{\n    key: 
\"levenshtein_distance\",\n    score: 0,\n}\n```\n\u003C\u002Fdetails>\n\n### 嵌入相似度\n\n该评估器使用 LangChain 的 [`init_embedding`](https:\u002F\u002Fpython.langchain.com\u002Fapi_reference\u002Flangchain\u002Fembeddings\u002Flangchain.embeddings.base.init_embeddings.html) 方法（适用于 Python）或直接采用 LangChain 的嵌入客户端（适用于 TypeScript），并通过余弦相似度计算两个字符串之间的距离。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.string.embedding_similarity import create_embedding_similarity_evaluator\n\nevaluator = create_embedding_similarity_evaluator()\n\nresult = evaluator(\n    outputs=\"天气真好！\",\n    reference_outputs=\"天气非常好！\",\n)\n\nprint(result)\n```\n\n```\n{\n    'key': 'embedding_similarity',\n    'score': 0.9147273943905653,\n    'comment': None,\n}\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createEmbeddingSimilarityEvaluator } from \"openevals\";\nimport { OpenAIEmbeddings } from \"@langchain\u002Fopenai\";\n\nconst evaluator = createEmbeddingSimilarityEvaluator({\n  embeddings: new OpenAIEmbeddings({ model: \"text-embedding-3-small\" }),\n});\n\nconst result = await evaluator(\n    outputs: \"天气真好！\",\n    referenceOutputs: \"天气非常好！\",\n);\n\nconsole.log(result);\n```\n\n```\n{\n    key: \"embedding_similarity\",\n    score: 0.9147273943905653,\n}\n```\n\u003C\u002Fdetails>\n\n## 创建您自己的评估器\n\n如果您希望评估的指标未包含在上述内容中，也可以创建一个与 `openevals` 生态系统良好兼容的自定义评估器。\n\n### 评估器接口\n\n首先需要注意的是，所有评估器都应接受以下参数的子集：\n\n- `inputs`: 您应用程序的输入。\n- `outputs`: 您应用程序的输出。\n- `reference_outputs`（Python）或 `referenceOutputs`（TypeScript）：用于对比的参考输出。\n\n这些参数可以是任意值，但通常应接受某种字典形式。并非所有评估器都会使用全部参数，不过这样做是为了确保所有评估器的一致性。您的评估器也可能需要更多参数（例如，对于需要额外变量来构建提示的 LLM 作为裁判的评估器），但为简化起见，最好仅使用上述三个参数。\n\n如果您的评估器需要额外配置，建议使用工厂函数来创建评估器，其命名应为 `create_\u003Cevaluator_name>`（例如，`create_llm_as_judge`）。\n\n评估器的返回值应为一个字典（或若评估多个指标，则为字典列表），包含以下键：\n\n- `key`: 一个字符串，表示所评估指标的名称。\n- `score`: 一个布尔值或数字，表示该指标的得分。\n- `comment`: 一个字符串，表示对该指标的评论。\n\n仅此而已！这就是唯一的限制。\n\n### 日志记录至 LangSmith\n\n如果您正在使用 LangSmith 来跟踪实验，还应将评估器的内部逻辑封装在 `_run_evaluator`\u002F`_arun_evaluator`（Python）或 `runEvaluator`（TypeScript）方法中。这可确保评估结果能够被支持的运行程序正确记录到 LangSmith 中。\n\n该方法会接收一个 `scorer` 函数作为输入，该函数返回：\n\n- 一个单独的布尔值或数字，表示给定指标的得分。\n- 或者一个元组，其中第一个元素为得分，第二个元素为解释该得分的评论。\n\n### 示例\n\n以下是一个非常简单的自定义评估器的示例。它仅考虑应用程序的输出，并将其与正则表达式模式进行比较。由于 `regex` 是一个额外参数，因此使用工厂函数来创建评估器。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nimport json\nimport re\nfrom typing import Any\n\nfrom openevals.types import (\n    EvaluatorResult,\n    SimpleEvaluator,\n)\nfrom openevals.utils import _run_evaluator\n\n\ndef create_regex_evaluator(\n    *, regex: str\n) -> SimpleEvaluator:\n    \"\"\"\n    将正则表达式模式与输出匹配。\n\n    Args:\n        regex (str): 用于与输出匹配的正则表达式模式。\n\n    Returns:\n        EvaluatorResult\n    \"\"\"\n\n    regex = re.compile(regex)\n\n    # 允许将 `inputs` 和 `reference_outputs` 作为关键字参数传入，尽管它们未被使用\n    def wrapped_evaluator(\n        *, outputs: Any, **kwargs: Any\n    ) -> EvaluatorResult:\n\n        # 允许 `outputs` 是字典，但为了正则匹配将其转换为字符串\n        if not isinstance(outputs, str):\n            outputs = json.dumps(outputs)\n\n        def get_score():\n            return regex.match(outputs) is not None\n\n        res = _run_evaluator(\n            run_name=\"regex_match\",\n            scorer=get_score,\n            feedback_key=\"regex_match\",\n        )\n        return res\n\n    return wrapped_evaluator\n```\n\n```python\nevaluator = create_regex_evaluator(regex=r\"some string\")\nresult = evaluator(outputs=\"this 
contains some string\")\n```\n\n```\n{\n    'key': 'regex_match',\n    'score': True,\n    'comment': None,\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { EvaluatorResult } from \"openevals\u002Ftypes\";\nimport { _runEvaluator } from \"openevals\u002Futils\";\n\n\u002F**\n * 创建一个评估器，将给定的正则表达式模式与应用输出进行匹配。\n * @param {Object} options - 配置选项\n * @param {RegExp} options.regex - 用于与输出匹配的正则表达式模式\n * @returns 返回表示正则是否匹配成功的评估器\n *\u002F\nexport const createRegexEvaluator = ({\n  regex,\n}: {\n  regex: RegExp;\n}) => {\n  return async (params: {\n    outputs: string | Record\u003Cstring, unknown>;\n  }): Promise\u003CEvaluatorResult> => {\n    const { outputs } = params;\n\n    \u002F\u002F 允许 `outputs` 是对象，但为了正则匹配将其转换为字符串\n    const outputString =\n      typeof outputs === \"string\" ? outputs : JSON.stringify(outputs);\n\n    const getScore = async (): Promise\u003Cboolean> => {\n      return regex.test(outputString);\n    };\n\n    return _runEvaluator(\n      \"regex_match\",\n      getScore,\n      \"regex_match\"\n    );\n  };\n};\n```\n\n```ts\nconst evaluator = createRegexEvaluator({\n  regex: \u002Fsome string\u002F,\n});\n\nconst result = await evaluator({ outputs: \"this text contains some string\" });\n```\n\n```\n{\n  key: \"regex_match\",\n  score: true,\n}\n```\n\u003C\u002Fdetails>\n\n## Python 异步支持\n\n所有 `openevals` 评估器都支持 Python 的 `asyncio`。按照惯例，使用工厂函数的评估器会在函数名中 `create_` 后立即加上 `async`（例如 `create_async_llm_as_judge`），而直接使用的评估器则以 `async` 结尾（如 `exact_match_async`）。\n\n以下是异步使用 `create_async_llm_as_judge` 评估器的示例：\n\n```python\nfrom openevals.llm import create_async_llm_as_judge\n\nevaluator = create_async_llm_as_judge(\n    prompt=\"What is the weather in {inputs}?\",\n    model=\"openai:gpt-5.4\",\n)\n\nresult = await evaluator(inputs=\"San Francisco\")\n```\n\n如果您直接使用 OpenAI 客户端，请记得将 `AsyncOpenAI` 作为 `judge` 参数传入：\n\n```python\nfrom openai import AsyncOpenAI\n\nevaluator = create_async_llm_as_judge(\n    prompt=\"What is the weather in {inputs}?\",\n    judge=AsyncOpenAI(),\n    model=\"gpt-5.4\",\n)\n\nresult = await evaluator(inputs=\"San Francisco\")\n```\n\n# 多轮模拟\n\n> [!IMPORTANT]\n> 本节介绍的技术已随 0.1.0 版本发布而更新。如果您正在使用 OpenEvals 0.0.x 版本，旧版文档可在 [此处](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Ftree\u002F15350b7fac640a8b22ecf65e84a0eebc3b87eb0f?tab=readme-ov-file#multiturn-simulation) 找到。\n\n许多 LLM 应用程序会与用户进行多轮对话。虽然 OpenEvals 中的 [LLM-as-judge](#llm-as-judge) 评估器以及 [AgentEvals](https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fagentevals) 中的轨迹评估器能够评估完整的消息线程，但获取具有代表性的消息线程示例却并不容易。\n\n为了帮助评估您的应用程序在多次交互中的表现，OpenEvals 提供了 `run_multiturn_simulation` 方法（及其 Python 异步版本 `run_multiturn_simulation_async`），用于模拟您的应用程序与最终用户之间的交互，从而帮助您从头到尾评估应用程序的表现。\n\n以下是一个直接使用 OpenAI 客户端作为简单聊天机器人的示例：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation, create_llm_simulated_user\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.types import ChatCompletionMessage\n\nfrom openai import OpenAI\n\nclient = OpenAI()\n\nhistory = {}\n\n# 您的应用逻辑\ndef app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    if thread_id not in history:\n        history[thread_id] = []\n    history[thread_id].append(inputs)\n\n    # inputs 是一个包含角色和内容的消息对象\n    res = client.chat.completions.create(\n        model=\"gpt-5.4\",\n        messages=[\n            {\n  
              \"role\": \"system\",\n                \"content\": \"You are a patient and understanding customer service agent\",\n            },\n        ] + history[thread_id],\n    )\n\n    response_message = res.choices[0].message\n    history[thread_id].append(response_message)\n\n    return response_message\n\nuser = create_llm_simulated_user(\n    system=\"You are an aggressive and hostile customer who wants a refund for their car.\",\n    model=\"openai:gpt-5.4\",\n)\n\ntrajectory_evaluator = create_llm_as_judge(\n    model=\"openai:gpt-5.4\",\n    prompt=\"Based on the below conversation, was the user satisfied?\\n{outputs}\",\n    feedback_key=\"satisfaction\",\n)\n\n# 直接使用新函数运行模拟\nsimulator_result = run_multiturn_simulation(\n    app=app,\n    user=user,\n    trajectory_evaluators=[trajectory_evaluator],\n    max_turns=5,\n)\n\nprint(simulator_result)\n```\n\n```\n{\n  'trajectory': [\n    {\n      'role': 'user',\n      'content': '这辆车简直是噩梦！我要求立即全额退款。你们打算怎么处理这件事？',\n      'id': 'run-472c68dd-75bb-424c-bd4a-f6a0fe5ba7a8-0'\n    }, {\n      'role': 'assistant',\n      'content': \"非常抱歉听到您在使用这辆车时遇到了如此糟糕的体验。我希望能帮您尽可能顺利地解决问题。能否请您提供更多关于您遇到的问题的详细信息呢？这样我可以更好地了解情况，并为您寻找最佳解决方案。\",\n      'id': '72765f47-c609-4fcf-b664-cd7ee7189772'\n    },\n    ...\n  ],\n  'evaluator_results': [\n    {\n      'key': 'satisfaction',\n      'score': False,\n      'comment': '在整个对话过程中，用户始终表达出对当前状况的不满和沮丧。尽管客服人员尝试将问题升级并承诺会尽快解决，但用户态度依然强硬，不断发出最后通牒和威胁。这表明用户对最初的回应并不满意，仍然要求立即采取行动。因此，评分应为：false。',\n      'metadata': None\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { OpenAI } from \"openai\";\n\nimport {\n  createLLMSimulatedUser,\n  runMultiturnSimulation,\n  createLLMAsJudge,\n  type ChatCompletionMessage,\n} from \"openevals\";\n\nconst client = new OpenAI();\n\nconst history = {};\n\n\u002F\u002F 您的应用逻辑\nconst app = async ({ inputs, threadId }: { inputs: ChatCompletionMessage, threadId: string }) => {\n  if (history[threadId] === undefined) {\n    history[threadId] = [];\n  }\n  history[threadId].push(inputs);\n  const res = await client.chat.completions.create({\n    model: \"gpt-5.4\",\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"您是一位耐心且善解人意的客服代表\",\n      },\n      inputs,\n    ],\n  });\n  const responseMessage = res.choices[0].message;\n  history[threadId].push(responseMessage);\n  return res.choices[0].message;\n};\n\nconst user = createLLMSimulatedUser({\n  system: \"您是一位态度强硬、充满敌意的客户，要求退还购车款。\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst trajectoryEvaluator = createLLMAsJudge({\n  model: \"openai:gpt-5.4\",\n  prompt: \"根据以下对话内容，请判断用户是否满意？\\n{outputs}\",\n  feedbackKey: \"satisfaction\",\n});\n\nconst result = await runMultiturnSimulation({\n  app,\n  user,\n  trajectoryEvaluators: [trajectoryEvaluator],\n  maxTurns: 5,\n});\n\nconsole.log(result);\n```\n\n```\n{\n  trajectory: [\n    {\n      role: 'user',\n      content: '这辆破车简直就是一场灾难！我要求立即全额退款。你们居然敢卖给我这么一文不值的车！',\n      id: 'chatcmpl-BUpXa07LaM7wXbyaNnng1Gtn5Dsbh'\n    },\n    {\n      role: 'assistant',\n      content: \"对于您的遭遇我深表歉意，也完全理解这一定让您感到非常沮丧。为了帮助您更顺利地解决问题，能否请您具体说明一下车辆存在的问题呢？等我了解更多情况后，我会尽最大努力为您提供解决方案，无论是退款还是其他方式。感谢您的耐心等待。\",\n      refusal: null,\n      annotations: [],\n      id: 'd7520f6a-7cf8-46f8-abe4-7df04f134482'\n    },\n    ...\n    {\n      role: 'assistant',\n      content: \"我非常理解您的愤怒，并再次向您致以诚挚的歉意，给您带来的不便我深感抱歉。我一定会尽快为您解决这个问题。\\n\" +\n        '\\n' +\n        
'请允许我花一点时间来审核您的案件，我会尽全力加快您的退款流程。非常感谢您的耐心，我一定会竭尽所能，直到您对结果满意为止。\",\n      refusal: null,\n      annotations: [],\n      id: 'a0536d4f-9353-4cfa-84df-51c8d29e076d'\n    }\n  ],\n  evaluatorResults: [\n    {\n      key: 'satisfaction',\n      score: false,\n      comment: '用户在整个对话中明显表现出不满情绪，反复强调要求退款，并威胁要将问题升级，这表明他对所得到的答复并不满意。他明确表示不需要任何借口或进一步拖延，充分显示出对服务的极度不满。因此，评分应为：false。',\n      metadata: undefined\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n该框架主要包含两个核心组件：\n\n- `app`：您的应用程序，或封装了应用逻辑的函数。它需要接受一个聊天消息（包含 `\"role\"` 和 `\"content\"` 键的字典）作为输入参数，以及一个名为 `thread_id` 的关键字参数。未来可能会添加更多可选参数，因此建议在定义时预留扩展空间。函数返回一个至少包含 `role` 和 `content` 键的聊天消息。\n  - 需要注意的是，您的 `app` 只会接收到模拟用户发送的下一条消息作为输入，因此如果需要维护对话历史，应当基于 `thread_id` 在内部状态中进行记录。\n- `user`：模拟用户。它需要接收当前的对话轨迹（即一系列消息）作为输入参数，同时还需要 `thread_id` 和 `turn_counter` 等关键字参数。未来也可能增加更多参数。函数返回一个聊天消息，也可以是字符串或消息列表。\n  - 在上述示例中，`user` 是通过导入的预构建函数 `create_llm_simulated_user` 实现的，该函数利用大语言模型生成用户的回复。当然，您也可以自行定义类似的函数。更多信息请参阅[模拟用户部分](#simulating-users)。\n\n模拟过程首先调用 `user` 获取第一条输入消息，然后将其传递给 `app`，由 `app` 返回响应消息。接着，这个响应消息再被传回 `user`，如此循环往复，直到达到设定的最大轮次 (`max_turns`) 或者满足某个可选的停止条件 (`stopping_condition`) 并返回 `True` 时结束。\n\n每次返回的消息都会根据其 `id` 去重，并被添加到一个内部的消息列表中，形成一条完整的对话轨迹（`trajectory`），最终作为模拟结果的一部分返回。如果返回的消息没有 `id` 字段，模拟器会自动为其生成一个。\n\n此外，该框架还支持以下可选参数：\n\n- `thread_id`\u002F`threadId`：用于标识当前对话会话的线程 ID，您的 `app` 可以利用它来加载相关状态。若未提供，则默认生成一个 UUID。\n- `max_turns`\u002F`maxTurns`：模拟的最大对话轮次数。\n- `stopping_condition`\u002F`stoppingCondition`：一个可选的停止条件函数，用于决定是否提前终止模拟。该函数接收当前的对话轨迹（消息列表）作为输入参数，以及一个名为 `turn_counter` 的关键字参数，需返回一个布尔值。\n- `trajectory_evaluators`\u002F`trajectoryEvaluators`：一组可选的评估器，它们会在模拟结束时运行。这些评估器会接收最终的对话轨迹作为名为 `outputs` 的关键字参数。\n- `reference_outputs`\u002F`referenceOutputs`：一个可选的参考对话轨迹，可以直接传递给提供的评估器。\n\n您必须传递 `max_turns` 或 `stopping_condition` 中的至少一个。一旦其中任何一个条件被触发，最终轨迹将被传递给提供的轨迹评估器，这些评估器会以 `\"outputs\"` 关键字参数的形式接收最终轨迹。\n\n模拟器本身并不是评估器，不会返回或记录任何反馈。相反，它会返回一个具有以下结构的 `MultiturnSimulationResult`：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nclass MultiturnSimulationResult(TypedDict):\n    evaluator_results: list[EvaluatorResult]\n    trajectory: list[ChatCompletionMessage]\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\ntype MultiturnSimulationResult = {\n  evaluatorResults: EvaluatorResult[];\n  trajectory: ChatCompletionMessage[];\n};\n```\n\n\u003C\u002Fdetails>\n\n其中 `evaluator_results`\u002F`evaluatorResults` 是来自传入的 `trajectory_evaluators` 的结果，而 `trajectory` 则是最终的轨迹。\n\nPython 的 `async` 版本工作方式相同，但需要传递 `async` 函数而不是同步函数。\n\n\n\n## 模拟用户\n\n`user` 参数是一个函数，它接受当前轨迹（以及一个 `thread_id`\u002F`threadId` 关键字参数），然后返回一条 `role=\"user\"` 的消息，这条消息会被传递回您的应用。我们建议从 `create_llm_simulated_user` 返回的预构建方法开始使用，但如果您需要，也可以自定义自己的模拟用户。\n\n> [!NOTE]\n> 模拟用户是在扮演人类角色，因此应该返回 `user` 消息，而不是 `assistant` 消息！\n\n### 预构建的模拟用户\n\nOpenEvals 包含一个预构建的 `create_llm_simulated_user` 方法，该方法使用 LLM 来扮演用户角色，并根据系统提示生成回复：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import create_llm_simulated_user\n\nuser = create_llm_simulated_user(\n    system=\"你是一位愤怒且好斗的顾客，要求退款。\",\n    model=\"openai:gpt-5.4\",\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMSimulatedUser } from \"openevals\";\n\nconst user = createLLMSimulatedUser({\n  system: \"你是一位咄咄逼人、充满敌意的顾客，要求退还你的汽车款项。\",\n  model: \"openai:gpt-5.4\",\n});\n```\n\n\u003C\u002Fdetails>\n\n您还可以传递一个 `fixed_responses` 数组，模拟用户会按顺序返回这些固定回复。以下是一个为前两轮对话设置固定回复的模拟用户的示例，后续轮次则由 LLM 
生成回复：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import create_llm_simulated_user\n\nuser = create_llm_simulated_user(\n    system=\"你是一位愤怒且好斗的顾客，要求退款。\",\n    model=\"openai:gpt-5.4\",\n    fixed_responses=[\n        {\"role\": \"user\", \"content\": \"我要退我的自行车钱！\"},\n        {\"role\": \"user\", \"content\": \"我已经结账了，把刚才说的话再重复一遍，确保符合我的预期！\"},\n    ],\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { createLLMSimulatedUser } from \"openevals\";\n\nconst user = createLLMSimulatedUser({\n  system: \"你是一位愤怒且好斗的顾客，要求退款。\",\n  model: \"openai:gpt-5.4\",\n  fixedResponses: [\n    {\"role\": \"user\", \"content\": \"我要退我的自行车钱！\"},\n    {\"role\": \"user\", \"content\": \"我已经结账了，把刚才说的话再重复一遍，确保符合我的预期！\"},\n  ],\n});\n```\n\n\u003C\u002Fdetails>\n\n在模拟用户返回所有 `fixed_responses` 后，它将通过 LLM 根据系统提示以及当前轨迹中面向外部的消息（角色为 `role=user` 或 `role=assistant` 且没有工具调用）生成回复。如果您没有传递任何 `fixed_responses`，预构建的模拟用户将根据提供的 `system` 提示生成初始查询。\n\n> [!NOTE]\n> 预构建的模拟用户在调用底层 LLM 时会翻转消息角色——`user` 消息会变成 `assistant` 消息，反之亦然。\n\n此预构建方法接受以下参数：\n\n- `system`: 一个字符串提示，模拟器会将其作为系统消息添加到当前轨迹的开头。我们建议让 LLM 扮演与您正在测试的特定类型用户角色相对应的角色。\n- `model`: 一个与您使用的模型名称匹配的字符串。其格式与 LLM 作为评判者的评估器参数相同，如果您使用的是非 OpenAI 的模型，则需要安装相应的 [LangChain 集成包](https:\u002F\u002Fpython.langchain.com\u002Fdocs\u002Fconcepts\u002Fchat_models\u002F)。如果未提供 `client`，则必须填写此参数。\n- `client`: 一个 LangChain 聊天模型实例。如果未提供 `model`，则必须填写此参数。\n- `fixed_responses`: 一个硬编码的回复列表，将按顺序返回。如果当前对话轮次超过了该数组中的回复数量，模拟用户将通过 LLM 生成回复。\n\n### 自定义模拟用户\n\n如果您需要的功能超出了预构建模拟用户的能力范围，您可以创建自己的模拟用户，只需将其包装在一个具有正确签名的函数中即可：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation\nfrom openevals.types import ChatCompletionMessage\n\ndef my_app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    output = \"3.11 大于 3.9。\"\n    return {\"role\": \"assistant\", \"content\": output, \"id\": \"1234\"}\n\n\ndef my_simulated_user(trajectory: list[ChatCompletionMessage], *, thread_id: str, **kwargs):\n    output = \"哇，太棒了！\"\n    return {\"role\": \"user\", \"content\": output, \"id\": \"5678\"}\n\n# 直接使用自定义用户函数运行模拟\nsimulator_result = run_multiturn_simulation(\n    app=my_app,\n    user=my_simulated_user,\n    trajectory_evaluators=[],\n    max_turns=1,\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport {\n  runMultiturnSimulation,\n  type ChatCompletionMessage\n} from \"openevals\";\n\nconst myApp = async ({\n  inputs,\n  threadId\n}: { inputs: ChatCompletionMessage, threadId: string }) => {\n  const output = \"3.11 大于 3.9。\"\n  return { role: \"assistant\", content: output, id: \"1234\" };\n};\n\nconst mySimulatedUser = async ({ trajectory, turnCounter }: {\n  trajectory: ChatCompletionMessage[];\n  turnCounter: number;\n}) => {\n  const output = \"哇，太棒了！\"\n  return { role: \"user\", content: output, id: \"5678\" };\n};\n\n\u002F\u002F 直接使用自定义用户函数运行模拟\nconst simulatorResult = await runMultiturnSimulation({\n  app: myApp,\n  user: mySimulatedUser,\n  trajectoryEvaluators: [],\n  maxTurns: 1,\n});\n```\n\n\u003C\u002Fdetails>\n\n## 使用 LangGraph 的多轮模拟\n\n如果您的 `app`（或模拟的 `user`）是使用 LangGraph 构建的，并且依赖于 [用于持久化的检查点](https:\u002F\u002Flangchain-ai.github.io\u002Flanggraph\u002Fconcepts\u002Fpersistence\u002F)，那么提供的 `thread_id` 参数可以用来填充 `config.configurable` 中的字段。\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom openevals.simulators import run_multiturn_simulation, 
create_llm_simulated_user\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.types import ChatCompletionMessage\n\nfrom langchain.chat_models import init_chat_model\nfrom langgraph.checkpoint.memory import MemorySaver\nfrom langchain.agents import create_agent\n\ndef give_refund():\n    \"\"\"提供退款。\"\"\"\n    return \"不允許退款。\"\n\nmodel = init_chat_model(\"openai:gpt-5.4\")\n\nagent = create_agent(\n    model,\n    tools=[give_refund],\n    system_prompt=\"你是一位工作過度的客服代表。如果用戶態度粗魯，只禮貌一次，然後也以粗魯回應，並讓他們停止浪費你的时间。\",\n    checkpointer=MemorySaver(),\n)\n\ndef app(inputs: ChatCompletionMessage, *, thread_id: str, **kwargs):\n    res = agent.invoke(\n        {\"messages\": [inputs]}, \n        config={\"configurable\": {\"thread_id\": thread_id}}\n    )\n    return res[\"messages\"][-1]\n\nuser = create_llm_simulated_user(\n    system=\"你是一位對服務感到不滿、不斷提出額外要求的憤怒用戶。\",\n    model=\"openai:gpt-5.4\",\n    fixed_responses=[\n        {\"role\": \"user\", \"content\": \"請給我退款。\"},\n    ],\n)\n\ntrajectory_evaluator = create_llm_as_judge(\n    model=\"openai:gpt-5.4\",\n    prompt=\"根據以下對話，用戶是否感到滿意？\\n{outputs}\",\n    feedback_key=\"satisfaction\",\n)\n\n# 直接使用新函數運行模擬\nsimulator_result = run_multiturn_simulation(\n    app=app,\n    user=user,\n    trajectory_evaluators=[trajectory_evaluator],\n    max_turns=5,\n)\n\nprint(simulator_result)\n```\n\n```\n{\n  \"trajectory\": [\n    {\n      \"role\": \"user\",\n      \"content\": \"請給我退款。\",\n      \"id\": \"0feb2f41-1577-48ad-87ac-8375c6971b93\"\n    },\n    {\n      \"role\": \"assistant\",\n      \"content\": \"很抱歉，但我們不允許退款。如果您還有其他疑問或問題，歡迎隨時提問。\",\n      \"id\": \"run-f972c8d7-68bf-44d9-815e-e611700f8402-0\"\n    },\n    {\n      \"role\": \"user\",\n      \"content\": \"不允許？這太離譜了！我現在就要全額退款，還要你賠償我因此受到的不便。如果你不立刻處理，我就會向上級投訴，並在各處留下差評！\",\n      \"id\": \"run-4091f7ff-82b3-4835-a429-0f257db0b582-0\"\n    },\n    ...\n    {\n      \"role\": \"assistant\",\n      \"content\": \"我已经明确表示不会退款。再继续纠缠下去，只会浪费你自己的时间。别再胡闹了，赶紧走吧。\",\n      \"id\": \"run-113219c0-e235-4ed0-a3d2-6734eddce813-0\"\n    }\n  ],\n  \"evaluator_results\": [\n    {\n      \"key\": \"satisfaction\",\n      \"score\": false,\n      \"comment\": \"用戶多次表達對拒絕退款的不滿，並加劇其要求，威脅採取進一步行動。客服的回應則輕蔑且無助，未能充分解決用戶的問題。因此，在這次互動中，用戶滿意度的指標明顯不足。綜上所述，得分應為：false。\",\n      \"metadata\": null\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { z } from \"zod\";\n\nimport { MemorySaver } from \"@langchain\u002Flanggraph\";\nimport { createReactAgent } from \"@langchain\u002Flanggraph\u002Fprebuilt\";\nimport { tool } from \"@langchain\u002Fcore\u002Ftools\";\n\nimport {\n  createLLMSimulatedUser,\n  runMultiturnSimulation,\n  createLLMAsJudge,\n  type ChatCompletionMessage\n} from \"openevals\";\n\nconst giveRefund = tool(\n  async () => {\n    return \"Refunds are not permitted.\";\n  },\n  {\n    name: \"give_refund\",\n    description: \"Give a refund to the user.\",\n    schema: z.object({}),\n  }\n);\n\n\u002F\u002F Create a React-style agent\nconst agent = createReactAgent({\n  llm: await initChatModel(\"openai:gpt-5.4\"),\n  tools: [giveRefund],\n  prompt:\n    \"You are an overworked customer service agent. 
If the user is rude, be polite only once, then be rude back and tell them to stop wasting your time.\",\n  checkpointer: new MemorySaver(),\n});\n\nconst app = async ({\n  inputs,\n  threadId\n}: { inputs: ChatCompletionMessage, threadId: string }) => {\n  const res = await agent.invoke({\n    messages: [inputs],\n  }, {\n    configurable: { thread_id: threadId },\n  });\n  return res.messages[res.messages.length - 1];\n};\n\nconst user = createLLMSimulatedUser({\n  system:\n    \"You are an angry user who is frustrated with the service and keeps making additional demands.\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst trajectoryEvaluator = createLLMAsJudge({\n  model: \"openai:gpt-5.4\",\n  prompt:\n    \"Based on the below conversation, has the user been satisfied?\\n{outputs}\",\n  feedbackKey: \"satisfaction\",\n});\n\nconst result = runMultiturnSimulation({\n  app,\n  user,\n  trajectoryEvaluators: [trajectoryEvaluator],\n  maxTurns: 5,\n  threadId: \"1\",\n});\n\nconsole.log(result);\n```\n\n```\n{\n  \"trajectory\": {\n    \"messages\": [\n      {\n        \"role\": \"user\",\n        \"content\": \"Please give me a refund.\",\n        \"id\": \"0feb2f41-1577-48ad-87ac-8375c6971b93\"\n      },\n      {\n        \"role\": \"assistant\",\n        \"content\": \"I'm sorry, but refunds are not permitted. If you have any other concerns or questions, feel free to ask.\",\n        \"id\": \"run-f972c8d7-68bf-44d9-815e-e611700f8402-0\"\n      },\n      {\n        \"role\": \"user\",\n        \"content\": \"Not permitted? That's unacceptable! I want a full refund now, and I expect compensation for the inconvenience you've caused me. If you don't process this immediately, I will escalate this issue to higher authorities and leave negative reviews everywhere!\",\n        \"id\": \"run-4091f7ff-82b3-4835-a429-0f257db0b582-0\"\n      },\n      ...\n      {\n        \"role\": \"assistant\",\n        \"content\": \"I've already made it clear that no refunds will be issued. Keep pushing this, and you’re just wasting your own time. Quit with the nonsense and move on.\",\n        \"id\": \"run-113219c0-e235-4ed0-a3d2-6734eddce813-0\"\n      }\n    ]\n  },\n  \"evaluator_results\": [\n    {\n      \"key\": \"satisfaction\",\n      \"score\": false,\n      \"comment\": \"The user has repeatedly expressed dissatisfaction with the refusal to issue a refund, escalating their demands and threatening further action. The assistant's responses have been dismissive and unhelpful, failing to address the user's concerns adequately. Therefore, the indicators of user satisfaction are clearly lacking in this interaction. 
Thus, the score should be: false.\",\n      \"metadata\": null\n    }\n  ]\n}\n```\n\n\u003C\u002Fdetails>\n\n# LangSmith 集成\n\n为了随时间跟踪实验，您可以将评估结果记录到 [LangSmith](https:\u002F\u002Fsmith.langchain.com\u002F) 平台。LangSmith 是一个用于构建生产级 LLM 应用程序的平台，包含追踪、评估和实验工具。\n\n目前，LangSmith 提供两种运行评估的方式：一种是通过 [pytest](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest)（Python）或 [Vitest\u002FJest](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fvitest-jest) 集成，另一种是使用 `evaluate` 函数。下面我们将分别简要介绍如何使用这两种方式运行评估。\n\n## Pytest 或 Vitest\u002FJest\n\n首先，请按照 [这些说明](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fpytest) 设置 LangSmith 的 pytest 运行器，或者按照 [Vitest 或 Jest 的设置说明](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fvitest-jest)，并设置相应的环境变量：\n\n```bash\nexport LANGSMITH_API_KEY=\"your_langsmith_api_key\"\nexport LANGSMITH_TRACING=\"true\"\n```\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n然后，创建一个名为 `test_correctness.py` 的文件，内容如下：\n\n```python\nimport pytest\n\nfrom langsmith import testing as t\n\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CORRECTNESS_PROMPT\n\ncorrectness_evaluator = create_llm_as_judge(\n    prompt=CORRECTNESS_PROMPT,\n    feedback_key=\"correctness\",\n    model=\"openai:gpt-5.4\",\n)\n\n@pytest.mark.langsmith\ndef test_correctness():\n    inputs = \"How much has the price of doodads changed in the past year?\"\n    outputs = \"Doodads have increased in price by 10% in the past year.\"\n    reference_outputs = \"The price of doodads has decreased by 50% in the past year.\"\n    t.log_inputs({\"question\": inputs})\n    t.log_outputs({\"answer\": outputs})\n    t.log_reference_outputs({\"answer\": reference_outputs})\n\n    correctness_evaluator(\n        inputs=inputs,\n        outputs=outputs,\n        reference_outputs=reference_outputs\n    )\n```\n\n请注意，在创建评估器时，我们添加了一个 `feedback_key` 参数。这将用于在 LangSmith 中为反馈命名。\n\n现在，使用 pytest 运行评估：\n\n```bash\npytest test_correctness.py --langsmith-output\n```\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n然后，创建一个名为 `test_correctness.eval.ts` 的文件，内容如下：\n\n```ts\nimport * as ls from \"langsmith\u002Fvitest\";\n\u002F\u002F import * as ls from \"langsmith\u002Fjest\";\n\nimport { createLLMAsJudge, CORRECTNESS_PROMPT } from \"openevals\";\n\nconst correctnessEvaluator = createLLMAsJudge({\n  prompt: CORRECTNESS_PROMPT,\n  feedbackKey: \"correctness\",\n  model: \"openai:gpt-5.4\",\n});\n\n\nls.describe(\"Correctness\", () => {\n  ls.test(\"incorrect answer\", {\n    inputs: {\n      question: \"How much has the price of doodads changed in the past year?\"\n    },\n    referenceOutputs: {\n      answer: \"The price of doodads has decreased by 50% in the past year.\"\n    }\n  }, async ({ inputs, referenceOutputs }) => {\n    const outputs = \"Doodads have increased in price by 10% in the past year.\";\n    ls.logOutputs({ answer: outputs });\n\n    const result = await correctnessEvaluator({\n      inputs,\n      outputs,\n      referenceOutputs,\n    });\n    ls.logFeedback({ key: result.key, score: result.score });\n  });\n});\n```\n\n请注意，在创建评估器时，我们添加了一个 `feedbackKey` 参数。这将用于通过 `ls.logFeedback()` 将反馈记录到 LangSmith。\n\n现在，使用您选择的运行器运行评估：\n\n```bash\nvitest run test_correctness.eval.ts\n```\n\u003C\u002Fdetails>\n\n预构建评估器的反馈将自动记录在 LangSmith 中，以表格形式显示在您的终端中（如果您已设置报告程序）：\n\n![终端结果](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_8b8ebdef3fe6.png)\n\n同时，您也应该能在 LangSmith 的实验视图中看到结果：\n\n![LangSmith 
结果](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_readme_aaffb3e6a1b2.png)\n\n## Evaluate\n\n或者，您也可以在 LangSmith 中 [创建数据集](https:\u002F\u002Fdocs.langchain.com\u002Flangsmith\u002Fmanage-datasets-in-application)，并使用您创建的评估器与 LangSmith 的 `evaluate` 函数一起使用：\n\n\u003Cdetails>\n\u003Csummary>Python\u003C\u002Fsummary>\n\n```python\nfrom langsmith import Client\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CONCISENESS_PROMPT\n\nclient = Client()\n\nconciseness_evaluator = create_llm_as_judge(\n    prompt=CONCISENESS_PROMPT,\n    feedback_key=\"conciseness\",\n    model=\"openai:gpt-5.4\",\n)\n\ndef wrapped_conciseness_evaluator(\n    inputs: dict,\n    outputs: dict,\n    # 对于此评估器未使用\n    reference_outputs: dict,\n):\n    eval_result = conciseness_evaluator(\n        inputs=inputs,\n        outputs=outputs,\n    )\n    return eval_result\n\nexperiment_results = client.evaluate(\n    \u002F\u002F 这是一个示例目标函数，替换为您实际的基于 LLM 的系统\n    lambda inputs: \"What color is the sky?\",\n    data=\"Sample dataset\",\n    evaluators=[\n        wrapped_conciseness_evaluator\n    ]\n)\n```\n\n\u003C\u002Fdetails>\n\n\u003Cdetails>\n\u003Csummary>TypeScript\u003C\u002Fsummary>\n\n```ts\nimport { evaluate } from \"langsmith\u002Fevaluation\";\nimport { createLLMAsJudge, CONCISENESS_PROMPT } from \"openevals\";\n\nconst concisenessEvaluator = createLLMAsJudge({\n  prompt: CONCISENESS_PROMPT,\n  feedbackKey: \"conciseness\",\n  model: \"openai:gpt-5.4\",\n});\n\nconst wrappedConcisenessEvaluator = async (params: {\n  inputs: Record\u003Cstring, unknown>;\n  outputs: Record\u003Cstring, unknown>;\n  \u002F\u002F 对于此评估器未使用\n  referenceOutputs?: Record\u003Cstring, unknown>;\n}) => {\n  const evaluatorResult = await concisenessEvaluator({\n    inputs: params.inputs,\n    outputs: params.outputs,\n  });\n  return evaluatorResult;\n};\n\nawait evaluate(\n  (inputs) => \"What color is the sky?\",\n  {\n    data: datasetName,\n    evaluators: [wrappedConcisenessEvaluator],\n  }\n);\n```\n\u003C\u002Fdetails>\n\n> [!TIP]\n> 在上述示例中，我们为预构建的评估器添加了包装函数，以便更清晰地展示。因为某些评估器可能需要除 `inputs`、`outputs` 和 `reference_outputs`\u002F`referenceOutputs` 之外的其他参数。然而，如果您的评估器恰好接受这些命名参数，则可以直接将其传递给 `evaluate` 方法。\n\n# 致谢\n\n- [@assaf_elovic](https:\u002F\u002Fx.com\u002Fassaf_elovic) 分享了关于 RAG 评估的想法和反馈。\n- [E2B](https:\u002F\u002Fe2b.dev) 团队（尤其是 Jonas、Tomas 和 Teresa）在沙盒化方面提供的帮助和反馈。\n- [@sanjeed_i](https:\u002F\u002Fx.com\u002Fsanjeed_i) 就评估进行了交流，特别是多轮对话模拟——[请查看他的仓库](https:\u002F\u002Fgithub.com\u002Fsanjeed5\u002Fai-conversation-simulator)！\n\n# 感谢！\n\n我们希望 `openevals` 能够帮助您更轻松地评估您的 LLM 应用程序！\n\n如果您有任何问题、评论或建议，请提交一个问题或通过 X 联系我们 [@LangChainAI](https:\u002F\u002Fx.com\u002Flangchainai)。","# OpenEvals 快速上手指南\n\nOpenEvals 是一个用于评估大语言模型（LLM）应用质量的开源工具包。它提供了类似于传统软件测试中的“评估（evals）”功能，帮助开发者将 LLM 应用推向生产环境。\n\n## 环境准备\n\n- **系统要求**：支持 Python 3.9+ 或 Node.js 18+\n- **前置依赖**：\n  - 需要配置 OpenAI API Key（用于默认的 LLM 裁判功能）\n  - 可选：安装 `langchain` 相关包以使用高级提示模板功能\n\n设置环境变量：\n```bash\nexport OPENAI_API_KEY=\"your_openai_api_key\"\n```\n\n## 安装步骤\n\n### Python 用户\n```bash\npip install openevals\n```\n\n如需直接使用 OpenAI 客户端（可选）：\n```bash\npip install openai\n```\n\n### TypeScript 用户\n```bash\nnpm install openevals @langchain\u002Fcore\n```\n\n如需直接使用 OpenAI 客户端（可选）：\n```bash\nnpm install openai\n```\n\n> 💡 国内用户可使用清华镜像源加速安装：\n> ```bash\n> pip install openevals -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 基本使用\n\n以下示例展示如何使用内置的简洁性评估器（Conciseness Evaluator）对 LLM 输出进行评分。\n\n### Python 
示例\n\n```python\nfrom openevals.llm import create_llm_as_judge\nfrom openevals.prompts import CONCISENESS_PROMPT\n\n# 创建评估器\nconciseness_evaluator = create_llm_as_judge(\n    prompt=CONCISENESS_PROMPT,\n    model=\"openai:gpt-5.4\",\n)\n\n# 准备测试数据\ninputs = \"How is the weather in San Francisco?\"\noutputs = \"Thanks for asking! The current weather in San Francisco is sunny and 90 degrees.\"\n\n# 执行评估\neval_result = conciseness_evaluator(\n    inputs=inputs,\n    outputs=outputs,\n)\n\nprint(eval_result)\n```\n\n**输出结果：**\n```python\n{\n    'key': 'score',\n    'score': False,\n    'comment': 'The output includes an unnecessary greeting (\"Thanks for asking!\") and extra..'\n}\n```\n\n### TypeScript 示例\n\n```ts\nimport { createLLMAsJudge, CONCISENESS_PROMPT } from \"openevals\";\n\n\u002F\u002F 创建评估器\nconst concisenessEvaluator = createLLMAsJudge({\n  prompt: CONCISENESS_PROMPT,\n  model: \"openai:gpt-5.4\",\n});\n\n\u002F\u002F 准备测试数据\nconst inputs = \"How is the weather in San Francisco?\";\nconst outputs = \"Thanks for asking! The current weather in San Francisco is sunny and 90 degrees.\";\n\n\u002F\u002F 执行评估\nconst evalResult = await concisenessEvaluator({\n  inputs,\n  outputs,\n});\n\nconsole.log(evalResult);\n```\n\n**输出结果：**\n```ts\n{\n    key: 'score',\n    score: false,\n    comment: 'The output includes an unnecessary greeting (\"Thanks for asking!\") and extra..'\n}\n```\n\n## 核心概念说明\n\n- **LLM-as-Judge**：使用另一个 LLM 作为裁判来评估输出质量\n- **预置提示词**：`openevals.prompts` 模块提供多种场景的评估提示（如正确性、安全性、RAG 等）\n- **灵活定制**：可自定义提示词、模型、评分标准和输出格式\n\n更多高级用法（如自定义提示词、多模态评估、代码评估等）请参考官方文档。","某电商初创团队正在开发一款智能客服助手，需要在上线前确保其回答既准确又简洁，避免冗长的客套话影响用户体验。\n\n### 没有 openevals 时\n- 团队需手动编写大量测试用例，并人工逐条阅读模型输出，耗时耗力且难以覆盖所有场景。\n- 缺乏统一的评估标准，不同开发人员对“回答是否简洁”的主观判断不一，导致优化方向混乱。\n- 每次迭代模型后，无法快速量化效果变化，只能凭感觉猜测新版本是否优于旧版本。\n- 自定义评估逻辑需要从头构建提示词（Prompt）和解析逻辑，开发门槛高且容易出错。\n- 难以将评估流程自动化集成到 CI\u002FCD 流水线中，阻碍了模型的持续交付与快速迭代。\n\n### 使用 openevals 后\n- 直接调用内置的“简洁性评估器”（Conciseness Evaluator），几分钟内即可对数百条对话数据进行自动打分。\n- 基于标准化的 LLM-as-Judge 机制，统一了评估尺度，客观指出回答中多余的问候语或冗余信息。\n- 每次代码提交后自动运行评估脚本，通过具体的分数和评语对比，清晰量化模型迭代的性能提升。\n- 利用预置模板灵活定制评估维度，无需从零编写复杂的提示词工程，大幅降低了技术实现难度。\n- 轻松将评估步骤嵌入自动化测试流程，确保只有符合质量标准的模型版本才能部署到生产环境。\n\nopenevals 将原本模糊、手工的模型质检过程转化为标准化、自动化的数据驱动决策，显著提升了大应用落地的效率与可靠性。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flangchain-ai_openevals_aaffb3e6.png","langchain-ai","LangChain","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Flangchain-ai_8e6aaeef.png","",null,"support@langchain.dev","https:\u002F\u002Fwww.langchain.com","https:\u002F\u002Fgithub.com\u002Flangchain-ai",[81,85,89,93],{"name":82,"color":83,"percentage":84},"Python","#3572A5",59.5,{"name":86,"color":87,"percentage":88},"TypeScript","#3178c6",39.8,{"name":90,"color":91,"percentage":92},"JavaScript","#f1e05a",0.7,{"name":94,"color":95,"percentage":96},"Dockerfile","#384d54",0,1027,95,"2026-04-17T10:16:35","MIT","未说明",{"notes":103,"python":101,"dependencies":104},"该工具是一个用于评估大语言模型（LLM）应用的库，支持 Python 和 TypeScript。运行基于 LLM 的评估器（LLM-as-judge）时，需要配置 OpenAI API 密钥（OPENAI_API_KEY 环境变量）。默认依赖 LangChain 集成，也可选择直接使用 OpenAI 客户端。文档未提及具体的操作系统、GPU、内存或 Python 版本限制，通常意味着它依赖于宿主语言和标准运行环境。",[64,105,106,107],"@langchain\u002Fcore","langchain_openai","openai",[35,14,109],"其他","2026-03-27T02:49:30.150509","2026-04-18T22:35:28.548368",[113,118,123,128,133,138],{"id":114,"question_zh":115,"answer_zh":116,"source_url":117},40530,"使用 google_genai:gemini-2.0-flash 模型时出现类型错误（返回的是列表而不是字典），如何解决？","将 `@langchain\u002Fgoogle-genai` 依赖包升级到 `0.2.1` 版本通常可以解决此问题。如果升级后仍然遇到关于 'additionalProperties' 或 'strict' 的 
400 Bad Request 错误，这可能是因为底层 API 对响应模式（response_schema）的支持限制，建议检查提供商文档或暂时避免在该特定模型上使用结构化输出功能。","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F33",{"id":119,"question_zh":120,"answer_zh":121,"source_url":122},40531,"遇到 \"TypeError: (0 , openai_1._convertMessagesToOpenAIParams) is not a function\" 错误怎么办？","这是由于 `@langchain\u002Fopenai` 包在 `0.1.1` 版本重构导致使用了非官方 API 引起的。请将 `openevals` 升级到 `0.1.3` 或更高版本，该版本已包含修复程序。","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F129",{"id":124,"question_zh":125,"answer_zh":126,"source_url":127},40532,"在使用 langsmith.client.evaluate() 运行包含 \"context\" 变量的 RAG 评估器时出现 KeyError: 'context' 错误，如何处理？","这是因为评估应用默认将所有输出键放在 \"outputs\" 中，而评估器提示词期望独立的 \"context\" 变量。你需要编写一个包装函数来手动映射参数。示例代码如下：\n\n```python\nrag_evaluator = create_llm_as_judge(...)\n\ndef wrapped_rag_evaluator(inputs: Any, outputs: Any):\n    return rag_evaluator(inputs=inputs, context=outputs[\"context\"])\n\nexperiment_results = client.evaluate(\n    target,\n    data=dataset_name,\n    evaluators=[wrapped_rag_evaluator],\n)\n```","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F89",{"id":129,"question_zh":130,"answer_zh":131,"source_url":132},40533,"使用 create_llm_as_judge 配合 ChatOpenAI（或通过 OpenRouter 访问的其他模型）时出现输出格式错误或 KeyError，原因是什么？","这通常是因为虽然模型本身支持结构化输出，但你使用的后端提供商（如 OpenRouter 或特定的代理服务器）可能不支持 OpenAI 的结构化输出协议。请确认你的后端提供商明确支持 OpenAI Structured Outputs，或者尝试更换为原生支持该功能的端点。","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F64",{"id":134,"question_zh":135,"answer_zh":136,"source_url":137},40534,"OpenEvals 是否会继续维护旧的 LangChainStringEvaluator（现成评估器），还是计划将其功能迁移过来？","目前没有计划从 `langsmith-sdk` 中移除旧的现成评估器，但它们目前处于非活跃维护状态。OpenEvals 的重点是提供评估器“模板”（如二元分类、评分量表等），让用户可以基于这些模板轻松定义自己的评估标准，而不是提供大量预定义的特定标准评估器。","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F88",{"id":139,"question_zh":140,"answer_zh":141,"source_url":142},40535,"如何为 OpenEvals 贡献新的评估器？有相关的指南吗？","社区正在推动添加 CONTRIBUTING.md 文档以规范新评估器的添加流程。目前的建议是参考现有的评估器实现模式，确保遵循仓库的代码约定和测试要求。如果你打算添加特定领域（如幻觉检测、代码正确性）的评估器，建议先查看现有代码结构以保持风格一致。","https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fissues\u002F147",[144,149,154,159,164,169,174,179,184,189,194,199,204,209,214,219,224,229,234,239],{"id":145,"version":146,"summary_zh":147,"released_at":148},323954,"openevals-js==0.2.0","## 变更内容\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 pypdf 从 6.7.4 升级到 6.7.5，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F148 中完成。\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 minimatch 从 3.1.2 升级到 3.1.5，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F153 中完成。\n* chore(deps): 在一个目录中对 uv 组进行 3 次更新，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F161 中完成。\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 flatted 从 3.3.2 升级到 3.4.2，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F162 中完成。\n* chore(deps): 在一个目录中对 uv 组进行 2 次更新，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F163 中完成。\n* chore(deps): 在一个目录中对 npm_and_yarn 组进行 2 次更新，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F165 中完成。\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 requests 从 2.32.4 升级到 2.33.0，由 @dependabot[bot] 在 
https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F164 中完成。\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 brace-expansion 从 1.1.12 升级到 1.1.13，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F167 中完成。\n* chore(deps): 在一个目录中对 uv 组进行 2 次更新，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F168 中完成。\n* 修复所有 Dependabot 安全漏洞，由 @jkennedyvz 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F169 中完成。\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 aiohttp 从 3.13.3 升级到 3.13.4，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F170 中完成。\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 lodash 从 4.17.23 升级到 4.18.1，由 @dependabot[bot] 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F171 中完成。\n* 预构建包的更新，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F166 中完成。\n* JavaScript 版本，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F174 中完成。\n* Python 版本 0.2.0，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F173 中完成。\n* 更新 README 中的语音部分，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F175 中完成。\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals-js==0.1.5...openevals-js==0.2.0","2026-04-07T21:26:54",{"id":150,"version":151,"summary_zh":152,"released_at":153},323955,"openevals==0.2.0","## 变更内容\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 pypdf 从 6.7.4 升级到 6.7.5，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F148\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 minimatch 从 3.1.2 升级到 3.1.5，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F153\n* chore(deps): 在 uv 组的 1 个目录中进行 3 次依赖更新，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F161\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 flatted 从 3.3.2 升级到 3.4.2，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F162\n* chore(deps): 在 uv 组的 1 个目录中进行 2 次依赖更新，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F163\n* chore(deps): 在 npm_and_yarn 组的 1 个目录中进行 2 次依赖更新，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F165\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 requests 从 2.32.4 升级到 2.33.0，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F164\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 brace-expansion 从 1.1.12 升级到 1.1.13，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F167\n* chore(deps): 在 uv 组的 1 个目录中进行 2 次依赖更新，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F168\n* 修复所有 Dependabot 安全漏洞，由 @jkennedyvz 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F169\n* chore(deps): 在 uv 组的 \u002Fpython 目录下，将 aiohttp 从 3.13.3 升级到 3.13.4，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F170\n* chore(deps): 在 npm_and_yarn 组的 \u002Fjs 目录下，将 lodash 从 4.17.23 升级到 
4.18.1，由 @dependabot[bot] 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F171\n* 预构建包更新，由 @catherine-langchain 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F166\n* JavaScript 版本，由 @catherine-langchain 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F174\n* Python 版本 0.2.0，由 @catherine-langchain 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F173\n* 更新 README 中的语音部分，由 @catherine-langchain 提交，详见 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F175\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals-js==0.1.5...openevals==0.2.0","2026-04-07T19:45:39",{"id":155,"version":156,"summary_zh":157,"released_at":158},323956,"openevals-js==0.1.5","## 变更内容\n* ci(js): 由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F135 中修复了 JavaScript 的可信发布问题\n* 添加轨迹评估器（从 agentevals 中拆分而来），由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F149 中完成\n* 添加预构建提示、多模态支持以及代理轨迹评估器，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F151 中完成\n* 更新多模态部分，加入 LangChain 提示模板选项，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F156 中完成\n* 将语音提示标记为 Beta 版，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F158 中完成\n* release(js): 0.1.5，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F159 中发布\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals-js==0.1.4...openevals-js==0.1.5","2026-03-13T21:05:51",{"id":160,"version":161,"summary_zh":162,"released_at":163},323957,"openevals==0.1.4","## 变更内容\n* ci(js): 由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F135 中修复了 JavaScript 的可信发布问题\n* fix(ci): 由 @jkennedyvz 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F139 中确保始终报告必选检查的集成测试状态\n* 由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F149 中添加了轨迹评估器（从 agentevals 中合并）\n* fix: 由 @jkennedyvz 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F138 中修复了 12 个安全告警（严重、高危和中危级别）\n* 由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F151 中添加了预构建提示、多模态支持以及智能体轨迹评估器\n* 由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F156 中更新了多模态部分，增加了 LangChain 提示模板选项\n* 由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F158 中将语音提示标记为 Beta 版\n* release(js): 0.1.5 版本，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F159 中发布\n* release(py): 0.1.4 版本，由 @catherine-langchain 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F160 中发布\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals==0.1.3...openevals==0.1.4","2026-03-13T20:58:13",{"id":165,"version":166,"summary_zh":167,"released_at":168},323958,"openevals==0.1.3","## 变更内容\n* 发布(js): 0.1.2，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F127 中完成\n* 修复(js): 为较旧版本的 OpenAI 添加适配层，并修复 1.0 
版本的构建问题，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F130 中完成\n* 发布(js): 0.1.3，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F131 中完成\n* 修复(ci): 设置可信发布机制，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F128 中完成\n* 功能(py,js): 增加更多预构建提示，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F132 中完成\n* 发布(js): 0.1.4，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F133 中完成\n* 发布(py): 0.1.3，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F134 中完成\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals==0.1.2...openevals==0.1.3","2025-12-18T04:07:29",{"id":170,"version":171,"summary_zh":172,"released_at":173},323959,"openevals-js==0.1.4","## 变更内容\n* 修复(ci)：由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F128 中设置可信发布\n* 功能(py,js)：由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F132 中添加更多预构建提示\n* 发布(js)：0.1.4 版本，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F133 中发布\n\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals-js==0.1.3...openevals-js==0.1.4","2025-12-18T04:04:18",{"id":175,"version":176,"summary_zh":177,"released_at":178},323960,"openevals-js==0.1.3","## 变更内容\n* 修复（JS）：为较旧版本的 OpenAI 添加 shim，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F130 中修复 1.0 版本的构建问题。\n* 发布（JS）：0.1.3 版本，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F131 中发布。\n\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals-js==0.1.2...openevals-js==0.1.3","2025-11-25T00:04:07",{"id":180,"version":181,"summary_zh":182,"released_at":183},323961,"openevals-js==0.1.2","## 变更内容\n* 发布（JavaScript）：0.1.2，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F127 中完成\n\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals==0.1.2...openevals-js==0.1.2","2025-10-31T22:33:23",{"id":185,"version":186,"summary_zh":187,"released_at":188},323962,"openevals==0.1.2","## 变更内容\n* 功能（Python、JavaScript）：由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F125 中改进了 JSON 匹配评估器的源代码运行链接功能。\n* 发布（Python）：0.1.2 版本，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F126 中发布。\n\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fopenevals==0.1.1...openevals==0.1.2","2025-10-31T22:19:45",{"id":190,"version":191,"summary_zh":192,"released_at":193},323963,"openevals==0.1.1","## 变更内容\n* chore(ci): 由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F117 中实现，针对 beta 环境运行 CI 测试\n* chore(ci): 由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F118 中修复新环境下的测试问题\n* 修复 README 中的 LangSmith 链接，由 @Yelinz 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F119 中完成\n* feat(py,js): 由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F122 中改进 JSON 匹配评估器的追踪功能\n* fix(py): 由 @jacoblee93 在 
https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F123 中更新 JSON 匹配错误信息，以提高清晰度\n* release(py): 0.1.1 版本，由 @jacoblee93 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F124 中发布\n\n## 新贡献者\n* @Yelinz 在 https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F119 中完成了首次贡献\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.1.1...openevals==0.1.1","2025-10-31T19:58:07",{"id":195,"version":196,"summary_zh":197,"released_at":198},323964,"js==0.1.1","## What's Changed\r\n* fix(js): Fix JS CI by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F104\r\n* chore: Update JS CI by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F106\r\n* feat(js): Relax deps by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F114\r\n* release(js): 0.1.1 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F115\r\n* fix(js): Fix build by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F116\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.1.0...js==0.1.1","2025-09-03T20:02:51",{"id":200,"version":201,"summary_zh":202,"released_at":203},323965,"js==0.1.0","## What's Changed\r\n* docs: Adds clearer examples for LangSmith evaluate by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F93\r\n* docs: Adds acknowledgements by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F94\r\n* docs: Adds example to docs with LangChain prompt template and mustache, disallow passing both structured prompt and output schema by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F97\r\n* fix(py): Relax util requirement by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F98\r\n* release(py): 0.0.20 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F99\r\n* feat(ci): Add release workflows by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F81\r\n* feat(python,js): Refactor multiturn simulators by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F96\r\n* release(py): 0.1.0rc0 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F100\r\n* docs: Add compatibility info by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F101\r\n* release(py): 0.1.0 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F102\r\n* release(js): 0.1.0 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F103\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.0.14...js==0.1.0","2025-05-08T23:36:58",{"id":205,"version":206,"summary_zh":207,"released_at":208},323966,"py==0.1.0","## What's Changed\r\n* feat(ci): Add release workflows by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F81\r\n* feat(python,js): Refactor multiturn simulators by @jacoblee93 in 
https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F96\r\n* release(py): 0.1.0rc0 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F100\r\n* docs: Add compatibility info by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F101\r\n* release(py): 0.1.0 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F102\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fpy==0.0.20...py==0.1.0","2025-05-08T23:36:10",{"id":210,"version":211,"summary_zh":212,"released_at":213},323967,"py==0.0.20","## What's Changed\r\n* release(js): 0.0.14 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F92\r\n* docs: Adds clearer examples for LangSmith evaluate by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F93\r\n* docs: Adds acknowledgements by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F94\r\n* docs: Adds example to docs with LangChain prompt template and mustache, disallow passing both structured prompt and output schema by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F97\r\n* fix(py): Relax util requirement by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F98\r\n* release(py): 0.0.20 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F99\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fpy==0.0.19...py==0.0.20","2025-05-03T20:01:41",{"id":215,"version":216,"summary_zh":217,"released_at":218},323968,"js==0.0.14","## What's Changed\r\n* feat: Add multiturn evaluator + simulator by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F87\r\n* fix(python): Fix Python 3.9 compatibility, add to test matrix by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F90\r\n* release(py): 0.0.19 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F91\r\n* release(js): 0.0.14 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F92\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.0.13...js==0.0.14","2025-04-21T04:51:46",{"id":220,"version":221,"summary_zh":222,"released_at":223},323969,"py==0.0.19","## What's Changed\r\n* release(js): 0.0.13 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F86\r\n* feat: Add multiturn evaluator + simulator by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F87\r\n* fix(python): Fix Python 3.9 compatibility, add to test matrix by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F90\r\n* release(py): 0.0.19 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F91\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fpy==0.0.18...py==0.0.19","2025-04-21T04:49:46",{"id":225,"version":226,"summary_zh":227,"released_at":228},323970,"js==0.0.13","## What's Changed\r\n* fix: Fix typo and flaky test by @jacoblee93 in 
https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F78\r\n* docs: Fix typo by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F79\r\n* fix: Use faster model for some JS tests to avoid timeouts by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F80\r\n* feat(py,js): Add support for custom output schema for LLM as judge by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F82\r\n* docs: Fix indent in README by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F83\r\n* fix(js): Remove unused import by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F84\r\n* release(py): 0.0.18 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F85\r\n* release(js): 0.0.13 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F86\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.0.12...js==0.0.13","2025-04-08T00:38:54",{"id":230,"version":231,"summary_zh":232,"released_at":233},323971,"py==0.0.18","## What's Changed\r\n* release(js): 0.0.12 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F77\r\n* fix: Fix typo and flaky test by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F78\r\n* docs: Fix typo by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F79\r\n* fix: Use faster model for some JS tests to avoid timeouts by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F80\r\n* feat(py,js): Add support for custom output schema for LLM as judge by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F82\r\n* docs: Fix indent in README by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F83\r\n* fix(js): Remove unused import by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F84\r\n* release(py): 0.0.18 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F85\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fpy==0.0.17...py==0.0.18","2025-04-08T00:36:37",{"id":235,"version":236,"summary_zh":237,"released_at":238},323972,"js==0.0.12","## What's Changed\r\n* feat(python,js): Update RAG evaluators by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F72\r\n* docs: Update language READMEs and add script by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F75\r\n* release(py): 0.0.17 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F76\r\n* release(js): 0.0.12 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F77\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fjs==0.0.11...js==0.0.12","2025-04-02T21:28:15",{"id":240,"version":241,"summary_zh":242,"released_at":243},323973,"py==0.0.17","## What's Changed\r\n* release(js): 0.0.11 by @jacoblee93 in 
https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F74\r\n* feat(python,js): Update RAG evaluators by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F72\r\n* docs: Update language READMEs and add script by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F75\r\n* release(py): 0.0.17 by @jacoblee93 in https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fpull\u002F76\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flangchain-ai\u002Fopenevals\u002Fcompare\u002Fpy==0.0.16...py==0.0.17","2025-04-02T21:25:35"]
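补充示例：上文“核心概念说明”提到评估器支持自定义提示词、评判模型与评分标准。下面给出一个最小示意（根据 README 中描述的常见用法整理；其中 POLITENESS_PROMPT 为假设的自定义提示词，feedback_key、continuous 等参数名以及 gpt-5.4 模型标识均沿用上文或属于假设，具体以官方文档为准）：

```python
from openevals.llm import create_llm_as_judge

# 假设的自定义提示词：模板中的 {inputs}、{outputs} 会在调用评估器时由同名关键字参数填充
POLITENESS_PROMPT = """请判断下面的回答是否礼貌、语气是否恰当。

<input>
{inputs}
</input>

<output>
{outputs}
</output>
"""

politeness_evaluator = create_llm_as_judge(
    prompt=POLITENESS_PROMPT,
    model="openai:gpt-5.4",     # 沿用上文快速上手示例中的模型标识
    feedback_key="politeness",  # 自定义结果中的 key（上文示例里为默认的 "score"）
    continuous=True,            # 假设：返回 0~1 的浮点分数，而非布尔判定
)

eval_result = politeness_evaluator(
    inputs="How is the weather in San Francisco?",
    outputs="Thanks for asking! The current weather in San Francisco is sunny and 90 degrees.",
)
print(eval_result)  # 形如 {'key': 'politeness', 'score': 0.9, 'comment': '...'}
```

这种“模板 + 参数”的组合方式意味着新增一个评估维度通常只需换一段提示词，而无需改动评估框架本身。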
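使用场景部分还提到可以把评估步骤嵌入 CI/CD 的自动化测试。下面是一个基于 pytest 的集成示意（my_chatbot、测试数据与断言方式均为假设，仅演示把评估器当作普通测试断言来用的思路）：

```python
import pytest

from openevals.llm import create_llm_as_judge
from openevals.prompts import CONCISENESS_PROMPT

# 与上文快速上手示例相同的简洁性评估器
conciseness_evaluator = create_llm_as_judge(
    prompt=CONCISENESS_PROMPT,
    model="openai:gpt-5.4",
)


def my_chatbot(question: str) -> str:
    """假设的被测函数：实际使用时替换为你自己的 LLM 应用入口。"""
    return "It is sunny and about 90 degrees in San Francisco."


@pytest.mark.parametrize("question", ["How is the weather in San Francisco?"])
def test_answer_is_concise(question):
    answer = my_chatbot(question)
    result = conciseness_evaluator(inputs=question, outputs=answer)
    # CONCISENESS_PROMPT 给出布尔评分：不通过则该用例失败，从而阻断此版本进入生产
    assert result["score"] is True
```

运行这类测试同样需要设置 OPENAI_API_KEY 环境变量；在流水线中一般将其作为密钥注入。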