[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-ndif-team--nnsight":3,"tool-ndif-team--nnsight":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",151314,2,"2026-04-11T23:32:58",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":81,"stars":98,"forks":99,"last_commit_at":100,"license":101,"difficulty_score":32,"env_os":102,"env_gpu":103,"env_ram":104,"env_deps":105,"category_tags":111,"github_topics":112,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":118,"updated_at":119,"faqs":120,"releases":150},6819,"ndif-team\u002Fnnsight","nnsight","The nnsight package enables interpreting and manipulating the internals of deep learned models.","nnsight 是一款专为深度学习和大模型研究设计的 Python 开源库，旨在帮助开发者轻松“透视”并干预神经网络的内部运作。在传统的模型调试中，想要获取中间层的激活值、修改数据流向或计算特定梯度的过程往往繁琐且容易出错，而 nnsight 通过提供简洁直观的接口，完美解决了这一痛点。\n\n它允许用户在模型前向传播过程中，随时访问任意层级的激活状态，甚至直接修改这些数值以研究其因果效应。无论是本地运行的小型 PyTorch 模型，还是需要通过远程基础设施执行的大规模语言模型，nnsight 都能高效支持。其独特的技术亮点在于强大的“追踪（tracing）”机制和上下文管理系统，确保用户能在安全的执行环境中批量处理干预实验，同时自动管理内存，避免数据丢失。\n\n这款工具特别适合 AI 
研究人员、大模型开发者以及致力于可解释性研究（Interpretability）的工程师使用。如果你希望深入探究模型“黑盒”内部的决策逻辑，或者需要验证某种干预手段对模型输出的具体影响，nnsight 将是你得力的实验助手。配合其完善的文档和对主流 LLM 智能体的支持，它能显著降低复杂模型分析的门槛，让前沿","nnsight 是一款专为深度学习和大模型研究设计的 Python 开源库，旨在帮助开发者轻松“透视”并干预神经网络的内部运作。在传统的模型调试中，想要获取中间层的激活值、修改数据流向或计算特定梯度的过程往往繁琐且容易出错，而 nnsight 通过提供简洁直观的接口，完美解决了这一痛点。\n\n它允许用户在模型前向传播过程中，随时访问任意层级的激活状态，甚至直接修改这些数值以研究其因果效应。无论是本地运行的小型 PyTorch 模型，还是需要通过远程基础设施执行的大规模语言模型，nnsight 都能高效支持。其独特的技术亮点在于强大的“追踪（tracing）”机制和上下文管理系统，确保用户能在安全的执行环境中批量处理干预实验，同时自动管理内存，避免数据丢失。\n\n这款工具特别适合 AI 研究人员、大模型开发者以及致力于可解释性研究（Interpretability）的工程师使用。如果你希望深入探究模型“黑盒”内部的决策逻辑，或者需要验证某种干预手段对模型输出的具体影响，nnsight 将是你得力的实验助手。配合其完善的文档和对主流 LLM 智能体的支持，它能显著降低复杂模型分析的门槛，让前沿研究变得更加触手可及。","\u003Cp align=\"center\">\n  \u003Cimg src=\".\u002Fnnsight_logo.svg\" alt=\"nnsight\" width=\"300\">\n\u003C\u002Fp>\n\n\u003Ch3 align=\"center\">\nInterpret and manipulate the internals of deep learning models\n\u003C\u002Fh3>\n\n\u003Cp align=\"center\">\n\u003Ca href=\"https:\u002F\u002Fwww.nnsight.net\">\u003Cb>Documentation\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\">\u003Cb>GitHub\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002F6uFJmCSwW7\">\u003Cb>Discord\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fdiscuss.ndif.us\u002F\">\u003Cb>Forum\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fx.com\u002Fndif_team\">\u003Cb>Twitter\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14561\">\u003Cb>Paper\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n\u003Ca href=\"https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fndif-team\u002Fnnsight\u002Fblob\u002Fmain\u002FNNsight_Walkthrough.ipynb\">\u003Cimg src=\"https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg\">\u003C\u002Fimg>\u003C\u002Fa>\n\u003Ca href=\"https:\u002F\u002Fdeepwiki.com\u002Fndif-team\u002Fnnsight\">\u003Cimg 
src=\"https:\u002F\u002Fdeepwiki.com\u002Fbadge.svg\" alt=\"Ask DeepWiki\">\u003C\u002Fimg>\u003C\u002Fa>\n\u003C\u002Fp>\n\n---\n\n## About\n\n**nnsight** is a Python library that enables interpreting and intervening on the internals of deep learning models. It provides a clean, Pythonic interface for:\n\n- **Accessing activations** at any layer during forward passes\n- **Modifying activations** to study causal effects\n- **Computing gradients** with respect to intermediate values\n- **Batching interventions** across multiple inputs efficiently\n\nOriginally developed in the [NDIF team](https:\u002F\u002Fndif.us\u002F) at Northeastern University, nnsight supports local execution on any PyTorch model and remote execution on large models via the NDIF infrastructure.\n\n> 📖 For a deeper technical understanding of nnsight's internals (tracing, interleaving, the Envoy system, etc.), see **[NNsight.md](.\u002FNNsight.md)**.\n\n---\n\n## Installation\n\n```bash\npip install nnsight\n```\n\n---\n\n## Agents\n\nInform LLM agents how to use nnsight using one of these methods:\n\n### Skills Repository\n\n**Claude Code**\n\n```bash\n# Open Claude Code terminal\nclaude\n\n# Add the marketplace (one time)\n\u002Fplugin marketplace add https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fskills.git\n\n# Install all skills\n\u002Fplugin install nnsight@skills\n```\n\n**OpenAI Codex**\n\n```bash\n# Open OpenAI Codex terminal\ncodex\n\n# Install skills\nskill-installer install https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fskills.git\n```\n\n### Context7 MCP\n\nAlternatively, use [Context7](https:\u002F\u002Fgithub.com\u002Fupstash\u002Fcontext7) to provide up-to-date nnsight documentation directly to your LLM. 
Add `use context7` to your prompts or configure it in your MCP client:\n\n```json\n{\n  \"mcpServers\": {\n    \"context7\": {\n      \"url\": \"https:\u002F\u002Fmcp.context7.com\u002Fmcp\"\n    }\n  }\n}\n```\n\nSee the [Context7 README](https:\u002F\u002Fgithub.com\u002Fupstash\u002Fcontext7\u002Fblob\u002Fmaster\u002FREADME.md) for full installation instructions across different IDEs.\n\n### Documentation Files\n\nYou can also add our documentation files directly to your agent's context:\n\n- **[CLAUDE.md](.\u002FCLAUDE.md)** — Comprehensive guide for AI agents working with nnsight\n- **[NNsight.md](.\u002FNNsight.md)** — Deep technical documentation on nnsight's internals\n\n---\n\n## Quick Start\n\n```python\nfrom nnsight import LanguageModel\n\nmodel = LanguageModel('openai-community\u002Fgpt2', device_map='auto', dispatch=True)\n\nwith model.trace('The Eiffel Tower is in the city of'):\n    # Intervene on activations (must access in execution order!)\n    model.transformer.h[0].output[0][:] = 0\n    \n    # Access and save hidden states from a later layer\n    hidden_states = model.transformer.h[-1].output[0].save()\n    \n    # Get model output\n    output = model.output.save()\n\nprint(model.tokenizer.decode(output.logits.argmax(dim=-1)[0]))\n```\n\n> **💡 Tip:** Always call `.save()` on values you want to access after the trace exits. Without `.save()`, values are garbage collected. 
You can also use `nnsight.save(value)` as an alternative.\n\n## Accessing Activations\n\n```python\nwith model.trace(\"The Eiffel Tower is in the city of\"):\n    # Access attention output\n    attn_output = model.transformer.h[0].attn.output[0].save()\n    \n    # Access MLP output\n    mlp_output = model.transformer.h[0].mlp.output.save()\n\n    # Access any layer's output (access in execution order)\n    layer_output = model.transformer.h[5].output[0].save()\n    \n    # Access final logits\n    logits = model.lm_head.output.save()\n```\n\n**Note:** GPT-2 transformer layers return tuples where index 0 contains the hidden states.\n\n## Modifying Activations\n\n### In-Place Modification\n\n```python\nwith model.trace(\"Hello\"):\n    # Zero out all activations\n    model.transformer.h[0].output[0][:] = 0\n    \n    # Modify specific positions\n    model.transformer.h[0].output[0][:, -1, :] = 0  # Last token only\n```\n\n### Replacement\n\n```python\nwith model.trace(\"Hello\"):\n    # Add noise to activations\n    hs = model.transformer.h[-1].mlp.output.clone()\n    noise = 0.01 * torch.randn(hs.shape)\n    model.transformer.h[-1].mlp.output = hs + noise\n    \n    result = model.transformer.h[-1].mlp.output.save()\n```\n\n## Batching with Invokers\n\nProcess multiple inputs in one forward pass. 
Each invoke runs its code in a **separate worker thread**:\n\n- Threads execute serially (no race conditions)\n- Each thread waits for values via `.output`, `.input`, etc.\n- Invokes run in the order they're defined\n- Cross-invoke references work because threads run sequentially\n- **Within an invoke, access modules in execution order only**\n\n```python\nwith model.trace() as tracer:\n    # First invoke: worker thread 1\n    with tracer.invoke(\"The Eiffel Tower is in\"):\n        embeddings = model.transformer.wte.output  # Thread waits here\n        output1 = model.lm_head.output.save()\n    \n    # Second invoke: worker thread 2 (runs after thread 1 completes)\n    with tracer.invoke(\"_ _ _ _ _ _\"):\n        model.transformer.wte.output = embeddings  # Uses value from thread 1\n        output2 = model.lm_head.output.save()\n```\n\n### Prompt-less Invokers\n\nUse `.invoke()` with no arguments to operate on the entire batch:\n\n```python\nwith model.trace() as tracer:\n    with tracer.invoke(\"Hello\"):\n        out1 = model.lm_head.output[:, -1].save()\n    \n    with tracer.invoke([\"World\", \"Test\"]):\n        out2 = model.lm_head.output[:, -1].save()\n    \n    # No-arg invoke: operates on ALL 3 inputs\n    with tracer.invoke():\n        out_all = model.lm_head.output[:, -1].save()  # Shape: [3, vocab]\n```\n\n\n## Multi-Token Generation\n\nUse `.generate()` for autoregressive generation:\n\n```python\nwith model.generate(\"The Eiffel Tower is in\", max_new_tokens=3) as tracer:\n    output = model.generator.output.save()\n\nprint(model.tokenizer.decode(output[0]))\n# \"The Eiffel Tower is in the city of Paris\"\n```\n\n### Iterating Over Generation Steps\n\n```python\nwith model.generate(\"Hello\", max_new_tokens=5) as tracer:\n    logits = list().save()\n    \n    # Iterate over all generation steps\n    for step in tracer.iter[:]:\n        
logits.append(model.lm_head.output[0][-1].argmax(dim=-1))\n\nprint(model.tokenizer.batch_decode(logits))\n```\n\n### Conditional Interventions Per Step\n\n```python\nwith model.generate(\"Hello\", max_new_tokens=5) as tracer:\n    outputs = list().save()\n    \n    for step_idx in tracer.iter[:]:\n        if step_idx == 2:\n            model.transformer.h[0].output[0][:] = 0  # Only on step 2\n\n        outputs.append(model.transformer.h[-1].output[0])\n```\n\n> **⚠️ Warning:** Code after `tracer.iter[:]` never executes! The unbounded iterator waits forever for more steps. Put post-iteration code in a separate `tracer.invoke()`. When using multiple invokes, do not pass input to `generate()` — pass it to the first invoke:\n> ```python\n> with model.generate(max_new_tokens=3) as tracer:\n>     with tracer.invoke(\"Hello\"):  # First invoker — pass input here\n>         for step in tracer.iter[:]:\n>             hidden = model.transformer.h[-1].output.save()\n>     with tracer.invoke():  # Second invoker — runs after generation\n>         final = model.output.save()  # Now works!\n> ```\n\n\n## Gradients\n\nGradients are accessed on **tensors** (not modules), only inside a `with tensor.backward():` context:\n\n```python\nwith model.trace(\"Hello\"):\n    hs = model.transformer.h[-1].output[0]\n    hs.requires_grad_(True)\n    \n    logits = model.lm_head.output\n    loss = logits.sum()\n    \n    with loss.backward():\n        grad = hs.grad.save()\n\nprint(grad.shape)\n```\n\n\n## Model Editing\n\nCreate persistent model modifications:\n\n```python\n# Create edited model (non-destructive)\nwith model.edit() as model_edited:\n    model.transformer.h[0].output[0][:] = 0\n\n# Original model unchanged\nwith model.trace(\"Hello\"):\n    out1 = model.transformer.h[0].output[0].save()\n\n# Edited model has modification\nwith model_edited.trace(\"Hello\"):\n    out2 = model_edited.transformer.h[0].output[0].save()\n\nassert not torch.all(out1 == 0)\nassert torch.all(out2 == 
0)\n```\n\n\n## Scanning (Shape Inference)\n\nGet shapes without running the full model. Like all tracing contexts, `.save()` is required to persist values outside the block:\n\n```python\nimport nnsight\n\nwith model.scan(\"Hello\"):\n    dim = nnsight.save(model.transformer.h[0].output[0].shape[-1])\n\nprint(dim)  # 768\n```\n\n\n## Caching Activations\n\nAutomatically cache outputs from modules:\n\n```python\nwith model.trace(\"Hello\") as tracer:\n    cache = tracer.cache()\n\n# Access cached values\nlayer0_out = cache['model.transformer.h.0'].output\nprint(cache.model.transformer.h[0].output[0].shape)\n```\n\n\n## Sessions\n\nGroup multiple traces for efficiency:\n\n```python\nwith model.session() as session:\n    with model.trace(\"Hello\"):\n        hs1 = model.transformer.h[0].output[0].save()\n    \n    with model.trace(\"World\"):\n        model.transformer.h[0].output[0][:] = hs1  # Use value from first trace\n        hs2 = model.transformer.h[0].output[0].save()\n```\n\n\n## Remote Execution (NDIF)\n\nRun on NDIF's remote infrastructure:\n\n```python\nfrom nnsight import CONFIG\nCONFIG.set_default_api_key(\"YOUR_API_KEY\")\n\nmodel = LanguageModel(\"meta-llama\u002FMeta-Llama-3.1-8B\")\n\nwith model.trace(\"Hello\", remote=True):\n    hidden_states = model.model.layers[-1].output.save()\n```\n\nCheck available models at [nnsight.net\u002Fstatus](https:\u002F\u002Fnnsight.net\u002Fstatus\u002F)\n\n\n## vLLM Integration\n\nHigh-performance inference with vLLM:\n\n```python\nfrom nnsight.modeling.vllm import VLLM\n\nmodel = VLLM(\"gpt2\", tensor_parallel_size=1, dispatch=True)\n\nwith model.trace(\"Hello\", temperature=0.0, max_tokens=5) as tracer:\n    logits = list().save()\n    \n    for step in tracer.iter[:]:\n        logits.append(model.logits.output)\n```\n\n\n## NNsight for Any PyTorch Model\n\nUse `NNsight` for arbitrary PyTorch models:\n\n```python\nfrom nnsight import NNsight\nimport torch\n\nnet = torch.nn.Sequential(\n    torch.nn.Linear(5, 
10),\n    torch.nn.Linear(10, 2)\n)\n\nmodel = NNsight(net)\n\nwith model.trace(torch.rand(1, 5)):\n    layer1_out = model[0].output.save()\n    output = model.output.save()\n```\n\n## Source Tracing\n\nAccess intermediate operations inside a module's forward pass. `.source` rewrites the forward method to hook into all operations:\n\n```python\n# Discover available operations\nprint(model.transformer.h[0].attn.source)\n# Shows forward method with operation names like:\n#   attention_interface_0 -> 66  attn_output, attn_weights = attention_interface(...)\n#   self_c_proj_0         -> 79  attn_output = self.c_proj(attn_output)\n\n# Access operation values\nwith model.trace(\"Hello\"):\n    attn_out = model.transformer.h[0].attn.source.attention_interface_0.output.save()\n```\n\n## Ad-hoc Module Application\n\nApply modules out of their normal execution order:\n\n```python\nwith model.trace(\"The Eiffel Tower is in the city of\"):\n    # Get intermediate hidden states\n    hidden_states = model.transformer.h[-1].output[0]\n    \n    # Apply lm_head to get \"logit lens\" view\n    logits = model.lm_head(model.transformer.ln_f(hidden_states))\n    tokens = logits.argmax(dim=-1).save()\n\nprint(model.tokenizer.decode(tokens[0]))\n```\n\n---\n\n## Core Concepts\n\n### Deferred Execution with Thread-Based Synchronization\n\nNNsight uses **deferred execution** with **thread-based synchronization**:\n\n1. **Code extraction**: When you enter a `with model.trace(...)` block, nnsight captures your code (via AST) and immediately exits the block\n2. **Thread execution**: Your code runs in a separate worker thread\n3. **Value waiting**: When you access `.output`, the thread **waits** until the model provides that value\n4. 
**Hook-based injection**: The model uses PyTorch hooks to provide values to waiting threads\n\n```python\nwith model.trace(\"Hello\"):\n    # Code runs in a worker thread\n    # Thread WAITS here until layer output is available\n    hs = model.transformer.h[-1].output[0]\n    \n    # .save() marks the value to persist after the context exits\n    hs = hs.save()\n    # Alternative: hs = nnsight.save(hs)\n\n# After exiting, hs contains the actual tensor\nprint(hs.shape)  # torch.Size([1, 2, 768])\n```\n\n**Key insight:** Your code runs directly. When you access `.output`, you get the **real tensor** - your thread just waits for it to be available.\n\n**Important:** Within an invoke, you must access modules in execution order. Accessing layer 5's output before layer 2's output will cause a deadlock (layer 2 has already been executed).\n\n### Key Properties\n\nEvery module has these special properties. Accessing them causes the worker thread to **wait** for the value:\n\n| Property | Description |\n|----------|-------------|\n| `.output` | Module's forward pass output (thread waits) |\n| `.input` | First positional argument to the module |\n| `.inputs` | All inputs as `(args_tuple, kwargs_dict)` |\n\n**Note:** `.grad` is accessed on **tensors** (not modules), only inside a `with tensor.backward():` context.\n\n### Module Hierarchy\n\nPrint the model to see its structure:\n\n```python\nprint(model)\n# GPT2LMHeadModel(\n#   (transformer): GPT2Model(\n#     (h): ModuleList(\n#       (0-11): 12 x GPT2Block(\n#         (attn): GPT2Attention(...)\n#         (mlp): GPT2MLP(...)\n#       )\n#     )\n#   )\n#   (lm_head): Linear(...)\n# )\n```\n\n---\n\n## Troubleshooting\n\n| Error | Cause | Fix |\n|-------|-------|-----|\n| `OutOfOrderError: Value was missed...` | Accessed modules in wrong order | Access modules in forward-pass execution order |\n| `NameError` after `tracer.iter[:]` | Code after unbounded iter doesn't run | Use separate `tracer.invoke()` for post-iteration 
code; pass input to first invoke, not `generate()` |\n| `ValueError: Cannot invoke during an active model execution` | Passed input to `generate()` while using multiple invokes | Use `model.generate(max_new_tokens=N)` with no input; pass prompt to first `tracer.invoke(\"Hello\")` |\n| `ValueError: Cannot return output of Envoy...` | No input provided to trace | Provide input: `model.trace(input)` or use `tracer.invoke(input)` |\n\nFor more debugging tips, see the [documentation](https:\u002F\u002Fwww.nnsight.net).\n\n---\n\n## More Resources\n\n- **[Documentation](https:\u002F\u002Fwww.nnsight.net)** — Tutorials, guides, and API reference\n- **[NNsight.md](.\u002FNNsight.md)** — Deep technical documentation on nnsight's internals\n- **[CLAUDE.md](.\u002FCLAUDE.md)** — Comprehensive guide for AI agents working with nnsight\n- **[Performance Report](.\u002Ftests\u002Fperformance\u002Fprofile\u002Fresults\u002Fperformance_report.md)** — Detailed performance analysis and benchmarks\n\n---\n\n## Citation\n\nIf you use `nnsight` in your research, please cite:\n\n```bibtex\n@article{fiottokaufman2024nnsightndifdemocratizingaccess,\n      title={NNsight and NDIF: Democratizing Access to Foundation Model Internals}, \n      author={Jaden Fiotto-Kaufman and Alexander R Loftus and Eric Todd and Jannik Brinkmann and Caden Juang and Koyena Pal and Can Rager and Aaron Mueller and Samuel Marks and Arnab Sen Sharma and Francesca Lucchetti and Michael Ripa and Adam Belfki and Nikhil Prakash and Sumeet Multani and Carla Brodley and Arjun Guha and Jonathan Bell and Byron Wallace and David Bau},\n      year={2024},\n      eprint={2407.14561},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14561}, \n}\n```\n","\u003Cp align=\"center\">\n  \u003Cimg src=\".\u002Fnnsight_logo.svg\" alt=\"nnsight\" width=\"300\">\n\u003C\u002Fp>\n\n\u003Ch3 align=\"center\">\n解读并操控深度学习模型的内部机制\n\u003C\u002Fh3>\n\n\u003Cp 
align=\"center\">\n\u003Ca href=\"https:\u002F\u002Fwww.nnsight.net\">\u003Cb>文档\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\">\u003Cb>GitHub\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fdiscord.gg\u002F6uFJmCSwW7\">\u003Cb>Discord\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fdiscuss.ndif.us\u002F\">\u003Cb>论坛\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Fx.com\u002Fndif_team\">\u003Cb>Twitter\u003C\u002Fb>\u003C\u002Fa> | \u003Ca href=\"https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14561\">\u003Cb>论文\u003C\u002Fb>\u003C\u002Fa>\n\u003C\u002Fp>\n\n\u003Cp align=\"center\">\n\u003Ca href=\"https:\u002F\u002Fcolab.research.google.com\u002Fgithub\u002Fndif-team\u002Fnnsight\u002Fblob\u002Fmain\u002FNNsight_Walkthrough.ipynb\">\u003Cimg src=\"https:\u002F\u002Fcolab.research.google.com\u002Fassets\u002Fcolab-badge.svg\">\u003C\u002Fimg>\u003C\u002Fa>\n\u003Ca href=\"https:\u002F\u002Fdeepwiki.com\u002Fndif-team\u002Fnnsight\">\u003Cimg src=\"https:\u002F\u002Fdeepwiki.com\u002Fbadge.svg\" alt=\"Ask DeepWiki\">\u003C\u002Fimg>\u003C\u002Fa>\n\u003C\u002Fp>\n\n---\n\n## 关于\n\n**nnsight** 是一个 Python 库，能够帮助用户解读和干预深度学习模型的内部运作。它提供了一个简洁、符合 Python 风格的接口，用于：\n\n- 在前向传播过程中访问任意层的激活值\n- 修改激活值以研究因果效应\n- 计算中间值的梯度\n- 高效地对多个输入执行批量干预\n\nnnsight 最初由东北大学的 [NDIF 团队](https:\u002F\u002Fndif.us\u002F) 开发，支持在任何 PyTorch 模型上进行本地运行，同时也可通过 NDIF 基础设施对大型模型进行远程执行。\n\n> 📖 如需深入了解 nnsight 的内部实现细节（如追踪、交错执行、Envoy 系统等），请参阅 **[NNsight.md](.\u002FNNsight.md)**。\n\n---\n\n## 安装\n\n```bash\npip install nnsight\n```\n\n---\n\n## 代理工具\n\n使用以下方法之一，指导 LLM 代理如何使用 nnsight：\n\n### 技能库\n\n**Claude Code**\n\n```bash\n# 打开 Claude Code 终端\nclaude\n\n# 添加技能市场（仅需一次）\n\u002Fplugin marketplace add https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fskills.git\n\n# 安装所有技能\n\u002Fplugin install nnsight@skills\n```\n\n**OpenAI Codex**\n\n```bash\n# 打开 OpenAI Codex 终端\ncodex\n\n# 安装技能\nskill-installer install 
https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fskills.git\n```\n\n### Context7 MCP\n\n此外，您也可以使用 [Context7](https:\u002F\u002Fgithub.com\u002Fupstash\u002Fcontext7) 将最新的 nnsight 文档直接提供给您的 LLM。只需在提示中加入 `use context7`，或在您的 MCP 客户端配置中进行设置：\n\n```json\n{\n  \"mcpServers\": {\n    \"context7\": {\n      \"url\": \"https:\u002F\u002Fmcp.context7.com\u002Fmcp\"\n    }\n  }\n}\n```\n\n有关不同 IDE 的完整安装说明，请参阅 [Context7 的 README](https:\u002F\u002Fgithub.com\u002Fupstash\u002Fcontext7\u002Fblob\u002Fmaster\u002FREADME.md)。\n\n### 文档文件\n\n您还可以将我们的文档直接添加到代理的上下文中：\n\n- **[CLAUDE.md](.\u002FCLAUDE.md)** — 针对使用 nnsight 的 AI 代理的全面指南\n- **[NNsight.md](.\u002FNNsight.md)** — nnsight 内部机制的深入技术文档\n\n---\n\n## 快速入门\n\n```python\nfrom nnsight import LanguageModel\n\nmodel = LanguageModel('openai-community\u002Fgpt2', device_map='auto', dispatch=True)\n\nwith model.trace('埃菲尔铁塔位于城市里'):\n    # 干预激活值（必须按执行顺序访问！）\n    model.transformer.h[0].output[0][:] = 0\n    \n    # 访问并保存后续层的隐藏状态\n    hidden_states = model.transformer.h[-1].output[0].save()\n    \n    # 获取模型输出\n    output = model.output.save()\n\nprint(model.tokenizer.decode(output.logits.argmax(dim=-1)[0]))\n```\n\n> **💡 提示：** 对于希望在跟踪结束后仍可访问的值，请务必调用 `.save()`。否则，这些值会被垃圾回收。您也可以使用 `nnsight.save(value)` 作为替代方案。\n\n## 访问激活值\n\n```python\nwith model.trace(\"埃菲尔铁塔位于城市里\"):\n    # 访问注意力输出\n    attn_output = model.transformer.h[0].attn.output[0].save()\n    \n    # 访问 MLP 输出\n    mlp_output = model.transformer.h[0].mlp.output.save()\n\n    # 访问任意层的输出（按执行顺序访问）\n    layer_output = model.transformer.h[5].output[0].save()\n    \n    # 访问最终的 logits\n    logits = model.lm_head.output.save()\n```\n\n**注意：** GPT-2 的 Transformer 层会返回元组，其中索引 0 包含隐藏状态。\n\n## 修改激活值\n\n### 原地修改\n\n```python\nwith model.trace(\"你好\"):\n    # 将所有激活值置零\n    model.transformer.h[0].output[0][:] = 0\n    \n    # 修改特定位置\n    model.transformer.h[0].output[0][:, -1, :] = 0  # 仅最后一个 token\n```\n\n### 替换\n\n```python\nwith model.trace(\"你好\"):\n    # 向激活值中添加噪声\n    hs = 
model.transformer.h[-1].mlp.output.clone()\n    noise = 0.01 * torch.randn(hs.shape)\n    model.transformer.h[-1].mlp.output = hs + noise\n    \n    result = model.transformer.h[-1].mlp.output.save()\n```\n\n## 使用 Invoker 进行批量处理\n\n在一个前向传播中处理多个输入。每个 invoke 都会在一个独立的工作线程中运行：\n\n- 线程按顺序执行（无竞争条件）\n- 每个线程通过 `.output`、`.input` 等等待所需值\n- Invoke 按照定义的顺序运行\n- 跨 invoke 引用有效，因为线程是依次执行的\n- **在单个 invoke 内，只能按执行顺序访问模块**\n\n```python\nwith model.trace() as tracer:\n    # 第一个 invoke：工作线程 1\n    with tracer.invoke(\"埃菲尔铁塔位于\"):\n        embeddings = model.transformer.wte.output  # 线程在此处等待\n        output1 = model.lm_head.output.save()\n    \n    # 第二个 invoke：工作线程 2（在线程 1 完成后运行）\n    with tracer.invoke(\"_ _ _ _ _ _\"):\n        model.transformer.wte.output = embeddings  # 使用线程 1 的值\n        output2 = model.lm_head.output.save()\n```\n\n### 无提示的 Invoke\n\n使用不带参数的 `.invoke()` 来操作整个批次：\n\n```python\nwith model.trace() as tracer:\n    with tracer.invoke(\"你好\"):\n        out1 = model.lm_head.output[:, -1].save()\n    \n    with tracer.invoke([\"世界\", \"测试\"]):\n        out2 = model.lm_head.output[:, -1].save()\n    \n    # 不带参数的 invoke：作用于全部 3 个输入\n    with tracer.invoke():\n        out_all = model.lm_head.output[:, -1].save()  # 形状：[3, 词汇表]\n```\n\n\n## 多 token 生成\n\n使用 `.generate()` 进行自回归生成：\n\n```python\nwith model.generate(\"埃菲尔铁塔位于\", max_new_tokens=3) as tracer:\n    output = model.generator.output.save()\n\nprint(model.tokenizer.decode(output[0]))\n# “埃菲尔铁塔位于巴黎市”\n```\n\n### 遍历生成步骤\n\n```python\nwith model.generate(\"你好\", max_new_tokens=5) as tracer:\n    logits = list().save()\n    \n    # 遍历所有生成步骤\n    for step in tracer.iter[:]:\n        logits.append(model.lm_head.output[0][-1].argmax(dim=-1))\n\nprint(model.tokenizer.batch_decode(logits))\n```\n\n### 每步的条件干预\n\n```python\nwith model.generate(\"Hello\", max_new_tokens=5) as tracer:\n    outputs = list().save()\n    \n    for step_idx in tracer.iter[:]:\n        if step_idx == 2:\n            
model.transformer.h[0].output[0][:] = 0  # 仅在第2步执行\n\n        outputs.append(model.transformer.h[-1].output[0])\n```\n\n> **⚠️ 注意：** `tracer.iter[:]` 之后的代码永远不会执行！无界迭代器会一直等待更多步骤。请将迭代后的代码放在单独的 `tracer.invoke()` 中。当使用多个调用时，不要将输入传递给 `generate()` —— 应该将其传递给第一次调用：\n> ```python\n> with model.generate(max_new_tokens=3) as tracer:\n>     with tracer.invoke(\"Hello\"):  # 第一次调用——在此处传递输入\n>         for step in tracer.iter[:]:\n>             hidden = model.transformer.h[-1].output.save()\n>     with tracer.invoke():  # 第二次调用——在生成完成后运行\n>         final = model.output.save()  # 现在可以正常工作了！\n> ```\n\n\n## 梯度\n\n梯度只能在 **张量**（而非模块）上访问，并且必须位于 `with tensor.backward():` 上下文中：\n\n```python\nwith model.trace(\"Hello\"):\n    hs = model.transformer.h[-1].output[0]\n    hs.requires_grad_(True)\n    \n    logits = model.lm_head.output\n    loss = logits.sum()\n    \n    with loss.backward():\n        grad = hs.grad.save()\n\nprint(grad.shape)\n```\n\n\n## 模型编辑\n\n创建持久化的模型修改：\n\n```python\n# 创建编辑后的模型（非破坏性）\nwith model.edit() as model_edited:\n    model.transformer.h[0].output[0][:] = 0\n\n# 原始模型未被改变\nwith model.trace(\"Hello\"):\n    out1 = model.transformer.h[0].output[0].save()\n\n# 编辑后的模型已应用修改\nwith model_edited.trace(\"Hello\"):\n    out2 = model_edited.transformer.h[0].output[0].save()\n\nassert not torch.all(out1 == 0)\nassert torch.all(out2 == 0)\n```\n\n\n## 扫描（形状推断）\n\n无需运行完整模型即可获取形状。与所有跟踪上下文一样，需要使用 `.save()` 将值保存到块之外：\n\n```python\nimport nnsight\n\nwith model.scan(\"Hello\"):\n    dim = nnsight.save(model.transformer.h[0].output[0].shape[-1])\n\nprint(dim)  # 768\n```\n\n\n## 缓存激活值\n\n自动缓存模块的输出：\n\n```python\nwith model.trace(\"Hello\") as tracer:\n    cache = tracer.cache()\n\n# 访问缓存的值\nlayer0_out = cache['model.transformer.h.0'].output\nprint(cache.model.transformer.h[0].output[0].shape)\n```\n\n\n## 会话\n\n为了提高效率，可以将多个跟踪分组在一起：\n\n```python\nwith model.session() as session:\n    with model.trace(\"Hello\"):\n        hs1 = model.transformer.h[0].output[0].save()\n    \n    with 
model.trace(\"World\"):\n        model.transformer.h[0].output[0][:] = hs1  # 使用第一次跟踪中的值\n        hs2 = model.transformer.h[0].output[0].save()\n```\n\n\n## 远程执行（NDIF）\n\n在 NDIF 的远程基础设施上运行：\n\n```python\nfrom nnsight import CONFIG\nCONFIG.set_default_api_key(\"YOUR_API_KEY\")\n\nmodel = LanguageModel(\"meta-llama\u002FMeta-Llama-3.1-8B\")\n\nwith model.trace(\"Hello\", remote=True):\n    hidden_states = model.model.layers[-1].output.save()\n```\n\n可在 [nnsight.net\u002Fstatus](https:\u002F\u002Fnnsight.net\u002Fstatus\u002F) 查看可用模型\n\n\n## vLLM 集成\n\n使用 vLLM 进行高性能推理：\n\n```python\nfrom nnsight.modeling.vllm import VLLM\n\nmodel = VLLM(\"gpt2\", tensor_parallel_size=1, dispatch=True)\n\nwith model.trace(\"Hello\", temperature=0.0, max_tokens=5) as tracer:\n    logits = list().save()\n    \n    for step in tracer.iter[:]:\n        logits.append(model.logits.output)\n```\n\n\n## NNsight 适用于任何 PyTorch 模型\n\n可将 `NNsight` 用于任意 PyTorch 模型：\n\n```python\nfrom nnsight import NNsight\nimport torch\n\nnet = torch.nn.Sequential(\n    torch.nn.Linear(5, 10),\n    torch.nn.Linear(10, 2)\n)\n\nmodel = NNsight(net)\n\nwith model.trace(torch.rand(1, 5)):\n    layer1_out = model[0].output.save()\n    output = model.output.save()\n```\n\n## 源码跟踪\n\n访问模块前向传播中的中间操作。`.source` 会重写前向方法，以钩住所有操作：\n\n```python\n# 发现可用的操作\nprint(model.transformer.h[0].attn.source)\n# 显示带有操作名称的前向方法，例如：\n#   attention_interface_0 -> 66  attn_output, attn_weights = attention_interface(...)\n#   self_c_proj_0         -> 79  attn_output = self.c_proj(attn_output)\n\n# 访问操作的值\nwith model.trace(\"Hello\"):\n    attn_out = model.transformer.h[0].attn.source.attention_interface_0.output.save()\n```\n\n## 临时模块应用\n\n可以在非正常执行顺序的情况下应用模块：\n\n```python\nwith model.trace(\"埃菲尔铁塔位于城市中\"):\n    # 获取中间隐藏状态\n    hidden_states = model.transformer.h[-1].output[0]\n    \n    # 应用 lm_head 以获得“logit lens”视图\n    logits = model.lm_head(model.transformer.ln_f(hidden_states))\n    tokens = 
logits.argmax(dim=-1).save()\n\nprint(model.tokenizer.decode(tokens[0]))\n```\n\n\n---\n\n## 核心概念\n\n### 基于线程同步的延迟执行\n\nNNsight 使用 **延迟执行** 和 **基于线程的同步**：\n\n1. **代码提取**：当你进入 `with model.trace(...)` 块时，nnsight 会捕获你的代码（通过 AST），并立即退出该块。\n2. **线程执行**：你的代码会在一个独立的工作线程中运行。\n3. **值等待**：当你访问 `.output` 时，线程会 **等待** 直到模型提供该值。\n4. **钩子注入**：模型使用 PyTorch 钩子将值提供给等待的线程。\n\n```python\nwith model.trace(\"Hello\"):\n    # 代码在工作线程中运行\n    # 线程在此处等待，直到层的输出可用\n    hs = model.transformer.h[-1].output[0]\n    \n    # .save() 标记该值，以便在上下文退出后保留\n    hs = hs.save()\n    # 或者：hs = nnsight.save(hs)\n\n# 退出后，hs 包含实际的张量\nprint(hs.shape)  # torch.Size([1, 2, 768])\n```\n\n**关键点：** 你的代码直接运行。当你访问 `.output` 时，你得到的是 **真实的张量**——你的线程只是等待它变得可用。\n\n**重要提示：** 在一次调用中，必须按照执行顺序访问模块。如果在访问第2层的输出之前就尝试访问第5层的输出，就会导致死锁（因为第2层已经执行完毕）。\n\n### 关键属性\n\n每个模块都具有这些特殊属性。访问它们会导致工作线程 **等待** 该值：\n\n| 属性       | 描述                                   |\n|------------|----------------------------------------|\n| `.output`  | 模块前向传播的输出（线程等待）         |\n| `.input`   | 模块的第一个位置参数                 |\n| `.inputs`  | 所有输入，格式为 `(args_tuple, kwargs_dict)` |\n\n**注意：** `.grad` 只能在 **张量**（而非模块）上访问，并且必须位于 `with tensor.backward():` 上下文中。\n\n### 模块层次结构\n\n打印模型以查看其结构：\n\n```python\nprint(model)\n# GPT2LMHeadModel(\n#   (transformer): GPT2Model(\n#     (h): ModuleList(\n#       (0-11): 12 x GPT2Block(\n#         (attn): GPT2Attention(...)\n#         (mlp): GPT2MLP(...)\n#       )\n#     )\n#   )\n#   (lm_head): Linear(...)\n# )\n```\n\n---\n\n## 故障排除\n\n| 错误 | 原因 | 解决方法 |\n|-------|-------|-----|\n| `OutOfOrderError: Value was missed...` | 模块访问顺序错误 | 按照前向传播的执行顺序访问模块 |\n| `tracer.iter[:]` 之后出现 `NameError` | 无界迭代后的代码未运行 | 对于迭代后的代码，请使用单独的 `tracer.invoke()`；将输入传递给第一次调用，而不是 `generate()` |\n| `ValueError: Cannot invoke during an active model execution` | 在使用多次调用时，向 `generate()` 传递了输入 | 使用 `model.generate(max_new_tokens=N)` 而不需输入；将提示词传递给第一次 `tracer.invoke(\"Hello\")` |\n| `ValueError: Cannot return output of Envoy...` | 跟踪时未提供输入 | 
提供输入：`model.trace(input)` 或使用 `tracer.invoke(input)` |\n\n更多调试技巧，请参阅 [文档](https:\u002F\u002Fwww.nnsight.net)。\n\n---\n\n## 更多资源\n\n- **[文档](https:\u002F\u002Fwww.nnsight.net)** — 教程、指南和 API 参考\n- **[NNsight.md](.\u002FNNsight.md)** — 关于 nnsight 内部机制的深度技术文档\n- **[CLAUDE.md](.\u002FCLAUDE.md)** — 面向使用 nnsight 的 AI 代理的全面指南\n- **[性能报告](.\u002Ftests\u002Fperformance\u002Fprofile\u002Fresults\u002Fperformance_report.md)** — 详细的性能分析与基准测试\n\n---\n\n## 引用\n\n如果您在研究中使用了 `nnsight`，请引用以下内容：\n\n```bibtex\n@article{fiottokaufman2024nnsightndifdemocratizingaccess,\n      title={NNsight and NDIF: Democratizing Access to Foundation Model Internals}, \n      author={Jaden Fiotto-Kaufman and Alexander R Loftus and Eric Todd and Jannik Brinkmann and Caden Juang and Koyena Pal and Can Rager and Aaron Mueller and Samuel Marks and Arnab Sen Sharma and Francesca Lucchetti and Michael Ripa and Adam Belfki and Nikhil Prakash and Sumeet Multani and Carla Brodley and Arjun Guha and Jonathan Bell and Byron Wallace and David Bau},\n      year={2024},\n      eprint={2407.14561},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14561}, \n}\n```","# nnsight 快速上手指南\n\nnnsight 是一个用于解释和干预深度学习模型内部机制的 Python 库。它提供了简洁的接口，支持访问任意层的激活值、修改激活值以研究因果效应、计算中间值的梯度，以及高效地批量处理干预操作。\n\n## 环境准备\n\n*   **系统要求**：Linux, macOS, 或 Windows (推荐 Linux 以获得最佳兼容性)\n*   **Python 版本**：Python 3.8 或更高版本\n*   **前置依赖**：\n    *   PyTorch (nnsight 将自动处理大部分依赖，但需确保环境中已安装兼容的 PyTorch 版本)\n    *   Hugging Face Transformers (用于加载预训练语言模型)\n*   **网络提示**：加载预训练模型（如 GPT-2, Llama 等）时需要连接 Hugging Face。国内用户建议配置镜像源或使用代理加速下载。\n\n## 安装步骤\n\n使用 pip 直接安装最新稳定版：\n\n```bash\npip install nnsight\n```\n\n> **提示**：如果下载速度较慢，可以使用国内镜像源加速安装：\n> ```bash\n> pip install nnsight -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 基本使用\n\n以下示例展示如何加载一个预训练模型（GPT-2），并在前向传播过程中拦截和修改特定层的激活值。\n\n### 1. 
加载模型并追踪执行\n\n使用 `LanguageModel` 类加载模型，并通过 `model.trace()` 上下文管理器进入追踪模式。\n\n```python\nfrom nnsight import LanguageModel\n\n# 加载模型，device_map='auto' 会自动分配设备，dispatch=True 会在创建时立即加载模型权重（而不是延迟到首次跟踪时）\nmodel = LanguageModel('openai-community\u002Fgpt2', device_map='auto', dispatch=True)\n\nwith model.trace('The Eiffel Tower is in the city of'):\n    # 干预激活值：将第 0 层输出的第一个元素（hidden states）全部置零\n    # 注意：必须按照执行顺序访问模块\n    model.transformer.h[0].output[0][:] = 0\n    \n    # 获取并保存最后一层的隐藏状态\n    # ⚠️ 重要：想要在 trace 块外部访问数据，必须调用 .save()\n    hidden_states = model.transformer.h[-1].output[0].save()\n    \n    # 获取模型最终输出\n    output = model.output.save()\n\n# 解码并打印结果\nprint(model.tokenizer.decode(output.logits.argmax(dim=-1)[0]))\n```\n\n### 2. 关键概念说明\n\n*   **延迟执行 (Deferred Execution)**：`with model.trace(...)` 块内的代码不会立即执行，而是被记录并重写，以便在模型实际前向传播时注入干预逻辑。\n*   **数据保存 (.save())**：由于执行是延迟的，块内产生的张量在块结束后会被销毁。若需在块外使用数据（如打印或后续计算），必须对目标张量调用 `.save()` 方法。\n*   **按序访问**：在 trace 块中访问模块（如 `model.transformer.h[0]`）时，必须遵循模型的前向传播顺序，否则可能引发错误。\n\n### 3. 
进阶：批量处理与生成\n\nnnsight 支持通过 `invoke` 进行批量干预，以及通过 `generate` 进行自回归生成时的逐步干预。\n\n```python\n# 批量处理示例：在不同输入上应用不同的干预\nwith model.trace() as tracer:\n    with tracer.invoke(\"Hello\"):\n        out1 = model.lm_head.output[:, -1].save()\n    \n    with tracer.invoke([\"World\", \"Test\"]):\n        out2 = model.lm_head.output[:, -1].save()\n\n# 生成过程干预示例\nwith model.generate(\"The Eiffel Tower is in\", max_new_tokens=3) as tracer:\n    output = model.generator.output.save()\n\nprint(model.tokenizer.decode(output[0]))\n```\n\n更多高级功能（如梯度计算、模型编辑、远程执行等）请参考官方文档或源码注释。","某大模型安全研究员正在排查一个大型语言模型为何会在特定诱导下输出有害内容，试图定位并阻断模型内部的“恶意”计算路径。\n\n### 没有 nnsight 时\n- **黑盒调试困难**：只能观察输入和最终输出，无法直接查看中间层的具体激活值，难以判断错误发生在哪一层。\n- **干预成本高昂**：若要测试修改某层神经元对结果的影响，需手动重写复杂的前向传播代码或重新编译计算图，极易出错。\n- **因果分析低效**：难以批量对不同输入样本进行相同的中间层干预实验，导致验证假设的过程耗时且繁琐。\n- **梯度获取受限**：无法直接计算损失函数相对于中间隐藏状态的梯度，限制了基于梯度的归因分析方法的应用。\n\n### 使用 nnsight 后\n- **透明化内部状态**：通过简洁的 Python 接口，直接在 `trace` 上下文中访问任意 Transformer 层的注意力输出或 MLP 激活值，瞬间定位异常层。\n- **动态即时干预**：只需一行代码（如 `model.transformer.h[5].output[:] = 0`）即可在运行时清零特定神经元，实时观察干预后的输出变化，无需修改模型结构。\n- **高效批量实验**：原生支持对多个输入样本并行执行相同的干预策略，快速验证“关闭某层是否普遍消除有害输出”的假设。\n- **深层归因分析**：轻松计算中间变量对最终损失的梯度，精准量化特定神经元对有害生成的贡献度，为模型修剪提供依据。\n\nnnsight 将深不可测的模型内部变成了可透视、可操控的实验室，让研究者能从“猜谜式”调试转向精确的因果干预。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fndif-team_nnsight_036988f2.png","ndif-team","National Deep Inference Fabric","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fndif-team_6d626ac6.png","The National Deep Inference Fabric is a proposed research computing project that will enable us to crack open the mysteries inside large-scale AI",null,"ndif_team","https:\u002F\u002Fndif.us\u002F","https:\u002F\u002Fgithub.com\u002Fndif-team",[82,86,90,94],{"name":83,"color":84,"percentage":85},"Python","#3572A5",95.2,{"name":87,"color":88,"percentage":89},"Jupyter 
Notebook","#DA5B0B",4.5,{"name":91,"color":92,"percentage":93},"C","#555555",0.4,{"name":95,"color":96,"percentage":97},"Dockerfile","#384d54",0,891,84,"2026-04-10T09:11:22","MIT","未说明","非必需（支持本地 CPU 运行及 NDIF 远程执行）；若本地运行大模型，需 NVIDIA GPU 且显存大小取决于模型规模；支持 vLLM 集成以优化推理","未说明（取决于加载的模型大小）",{"notes":106,"python":102,"dependencies":107},"该工具核心特性是支持‘延迟执行’和‘远程执行’。用户可通过 NDIF 基础设施在云端运行超大模型，从而降低本地硬件需求。本地运行时兼容任意 PyTorch 模型。安装仅需执行 'pip install nnsight'。使用生成式任务时需注意迭代器代码块后的代码不会执行，需使用单独的 invoke 块。",[108,109,110],"torch","transformers (隐含，用于 LanguageModel)","vllm (可选，用于高性能推理)",[14],[113,114,115,116,117],"interpretability","machine-learning","neural-networks","python","pytorch","2026-03-27T02:49:30.150509","2026-04-12T16:43:16.013695",[121,126,131,136,141,146],{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},30753,"为什么在 nnsight 中切换 tensor_parallel_size（从 1 到 2）会导致输出结果不一致？","这通常是由于张量并行（Tensor Parallelism）集成中的数值不稳定或 Bug 导致的。维护者已发布修复补丁，应用相关 PR 后，TP=1 和 TP=2 之间的差异将减少到正常的数值误差范围内，且多次运行结果会变得稳定。如果遇到此问题，请尝试更新到包含该修复的最新版本或应用对应的 PR。","https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fissues\u002F631",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},30754,"在 NVIDIA Blackwell 架构（如 RTX PRO 6000）上加载 Llama 模型时遇到 CUDA 断言错误（probability tensor contains inf\u002Fnan）怎么办？","该问题可能并非 nnsight 本身导致，而是与 `transformers` 库的版本兼容性有关，特别是在新架构 GPU 上。建议首先尝试升级 `transformers` 库到最新版本，并检查其官方仓库是否有相关议题。如果在使用 `device_map='auto'` 初始化 HuggingFace 模型时也复现了该错误，则进一步证实是底层库的问题。","https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fissues\u002F605",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},30755,"运行 Geometry of Truth 教程时在 Google Colab 中导致内核崩溃如何解决？","这通常是因为教程中默认安装的 nnsight 版本过旧（如强制使用了 0.5.0.dev0）。解决方法是在 Colab 单元格中将安装命令修改为 `!pip install nnsight>0.5.0`，以强制安装大于 0.5.0 的稳定版本，从而避免内核崩溃。","https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fissues\u002F532",{"id":137,"question_zh":138,"answer_zh":139,"source_url":140},30756,"生成过程中出现警告 \"Execution complete but 
`model.model.layers.X.output.iY` was not provided\" 是什么意思？","这个警告通常表示生成过程在达到最大迭代次数之前就已经停止（例如触发了停止条件），因此预期的某次迭代（如第 70 次）并未实际发生。如果你使用的是 `.iter[:]` 进行全量捕获，这通常不是错误，可以忽略。只要确保第一次 `invoke` 能正确获取输出即可。","https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fissues\u002F488",{"id":142,"question_zh":143,"answer_zh":144,"source_url":145},30757,"使用 nnsight+vLLM 后端捕获的 MLP 输出激活值与 HuggingFace PyTorch Hooks 的结果存在巨大数值差异，原因是什么？","这种差异通常源于 vLLM 后端使用了融合算子（fused kernels）或不同的计算图优化策略，导致中间激活值的计算路径与标准 PyTorch 不同。虽然输入激活值通常一致，但输出激活值可能因算子重排序或残差流处理方式不同而产生较大偏差。目前建议在使用 vLLM 后端时，主要关注其相对变化趋势，若需绝对数值一致性，建议使用标准 PyTorch\u002FHuggingFace 后端进行干预实验。","https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fissues\u002F630",{"id":147,"question_zh":148,"answer_zh":149,"source_url":140},30758,"如何在 nnsight 中确认干预代码是否正确执行了至少一次？","根据文档和开发者反馈，可以确保第一对 `invoke(input)` 和随后的 `invoke()` 调用能够正确获取并保存输出。如果在干预脚本中观察到警告但首次调用正常，通常意味着干预逻辑在有效的时间步内已执行。可以通过检查保存的 logits 或激活值形状来验证第一次干预是否生效。",[151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246],{"id":152,"version":153,"summary_zh":154,"released_at":155},222586,"v0.6.3","## 变更内容\n* 由 @MichaelRipa 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F561 中为 `EnvoySource` 实现了 `__iter__`\n* 由 @Butanium 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F624 中修复了在缺少 `attention_mask` 时 VLLM._prepare_input 崩溃的问题\n* 由 @AntoninPoche 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F629 中修复了 Envoy 在处理带有子模块且名为 `output` 的挂载时出现的 `RecursionError`\n* 由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F633 中保留了反序列化函数调用栈中的原始行号\n* 由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F632 中修复了使用交错器钩子时 vLLM 张量并行的非确定性问题\n* 由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F636 中进行开发\n\n## 新贡献者\n* @AntoninPoche 在 
https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F629 中做出了首次贡献\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.6.2...v0.6.3","2026-03-20T16:38:51",{"id":157,"version":158,"summary_zh":159,"released_at":160},222587,"v0.6.2","# NNsight v0.6.3 发行说明\n\n## 错误修复\n\n- **修复 vLLM 张量并行中的非确定性问题** — NNsight 的交错钩子会在 CUDA 操作之间引入 CPU 端延迟，导致在 `tensor_parallel_size > 1` 时 NCCL 读取到过时的数据。现已在张量并行模块（`RowParallelLinear`\u002F`ColumnParallelLinear`）中、NCCL 聚合操作之后以及 logits\u002Fsamples 包装调用之前，添加了针对性的 CUDA 流同步。经基准测试，在跟踪过程中约有 3% 的额外开销，而当 TP=1 时则无额外开销。（#632）\n\n- **修复 VLLM._prepare_input 在缺少 attention_mask 时的崩溃问题** — 当传入仅包含 `input_ids` 而没有 `attention_mask` 的字典时，程序会崩溃。现将缺失的 `attention_mask` 视为“对所有 token 进行注意力计算”，与直接使用 `List[int]` 输入的行为保持一致。（#624）\n\n- **修复 Envoy 中的递归错误** — 解决了 Envoy 在处理具有名为 `output` 的子模块的模型（如 BERT）时，因挂载逻辑重载而导致的 `RecursionError`。（#629）\n\n- **在反序列化的函数调用栈中保留原始行号** — 当包含 `model.trace()` 的辅助函数被序列化以进行远程执行时，堆栈跟踪现在会显示真实的文件名和正确的行号，而非内部引用。（#633）\n\n## 改进\n\n- **为 `EnvoySource` 实现 `__iter__` 方法** — 现在可以对源代码追踪的操作进行迭代，从而支持更自然的交互方式。（#561）\n\n- **重构 vLLM 的输入处理逻辑** — 整合了同步与异步代码路径，提取了共享的初始化逻辑 `_setup_interleaver()`，移除了 `AsyncInterleavingTracer`，并对 `_prepare_input` 进行了结构优化。`AsyncVLLMBackend` 现在直接调用初始化方法，并自行序列化中介对象，使 `interleave()` 仍保持同步模式。\n\n- **支持 LoRA 请求** — 在 vLLM 的输入处理中增加了 LoRA 请求的透传功能。\n\n- **在 `pyproject.toml` 中固定 vllm==0.15.1，以匹配 `NNS_VLLM_VERSION`。**\n\n## 测试\n\n- 添加了针对 VLLM 字典输入中缺少 `attention_mask` 的回归测试。\n- 添加了 BERT 包装测试，用于复现并防止 Envoy 递归问题的发生。\n- 添加了关于 linecache 注册的序列化边缘情况测试。\n- 添加了 vLLM 张量并行确定性测试。\n\n**完整差异：** https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.6.2...v0.6.3","2026-03-07T04:56:41",{"id":162,"version":163,"summary_zh":164,"released_at":165},222588,"v0.6.1","0.6 
版本中工作台的错误修复。\n\n**完整更新日志**：https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.6.0...v0.6.1","2026-02-27T04:37:44",{"id":167,"version":168,"summary_zh":169,"released_at":170},222589,"v0.6.0","# nnsight v0.6.0 发行说明\n\n## 🚀 亮点\n\n### NDIF 上的无缝自定义代码\n\n本次发布最大的变化是引入了一种新的序列化格式，使得您本地的包能够在 NDIF 上无缝运行——即使这些包并未安装在服务器上。\n\n**工作原理：** nnsight 现在会以值（即源代码）的方式序列化函数和类，而不是通过引用。这意味着您的本地包会随请求一同发送到服务器，并在服务器上重新构建——即便它们并未安装在 NDIF 上。\n\n```python\nfrom nnsight import ndif, LanguageModel\nimport mymodule\n\n# 注册您的模块以便远程执行\nndif.register(mymodule)\n\nmodel = LanguageModel(\"meta-llama\u002FLlama-3.1-70B\")\n\nwith model.trace(\"Hello world\", remote=True):\n    result = mymodule.my_analysis_function(model).save()\n```\n\n**自动注册的内容：** 您自己编写的代码会自动发送，无需额外操作。具体包括：\n\n- **您的脚本及其本地导入。** 您正在运行的文件中定义的所有代码，以及该文件从同一目录或项目中导入的任何 `.py` 文件，都会随请求一起发送。\n- **可编辑\u002F开发模式安装的包。** 如果您对某个包运行了 `pip install -e .`，nnsight 会检测到它不是常规的 site-packages 包，并将其一并发送。\n\n**需要调用 `ndif.register()` 的情况：** 通过 `pip install` 正常安装的包（即位于 `site-packages` 中的包）会被假定为已存在于服务器上，因此不会被发送。如果您有一个 NDIF 上没有的 pip 安装包，请调用 `ndif.register(pkg)` 显式地将其包含进来。\n\n**Python 版本灵活性：** 新的序列化格式使 Python 3.9 及以上版本的客户端能够与 NDIF 配合使用，而无需考虑服务器的 Python 版本。\n\n一个实际的例子：[nnterp](https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnterp) 是一个基于 nnsight 构建的库，用于标准化不同模型家族之间的 Transformer 接口。NDIF 并未安装 nnterp，但这并不影响使用——只需将其注册即可：\n\n```python\nfrom nnterp import StandardizedTransformer\nfrom nnsight import ndif\nimport nnterp\n\nndif.register(nnterp)\n\nmodel = StandardizedTransformer(\"meta-llama\u002FLlama-3.1-70B\")\n\nwith model.trace(\"hello\", remote=True):\n    layer_5_output = model.layers_output[5]\n    model.layers_output[10] = layer_5_output\n```\n\n这种方式将库的开发与服务器部署解耦开来。nnterp 可以在不等待 NDIF 更新其安装的情况下发布新功能和修复。您始终运行的是本地版本。\n\n---\n\n### vLLM 集成\n\nvLLM 集成得到了显著扩展。nnsight 现在支持 vLLM 的完整部署配置——从单 GPU 到多节点张量并行——并且使用相同的跟踪 API。\n\n**单 GPU：**\n\n```python\nfrom nnsight.modeling.vllm import VLLM\n\nmodel = VLLM(\"meta-llama\u002FLlama-3.1-8B\", 
dispatch=True)\n\nwith model.trace(\"埃菲尔铁塔位于城市中\", temperature=0.0):\n    hidden = model.model.layers[16].output","2026-02-26T18:37:59",{"id":172,"version":173,"summary_zh":174,"released_at":175},222590,"v0.6.0a1","## 变更内容\n* 重构(config): 简化 AppConfigModel 中的环境变量处理逻辑\n\n- 通过 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F592 中的提交，简化了从环境变量和 Colab 用户数据中设置 API 密钥和主机地址的逻辑。\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.14...v0.6.0a1","2026-02-18T18:10:09",{"id":177,"version":178,"summary_zh":179,"released_at":180},222591,"v0.5.15","## 变更内容\n* 重构(config): 简化 AppConfigModel 中的环境变量处理\n\n- 通过 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F592 中的提交，简化了从环境变量和 Colab 用户数据中设置 API 密钥和主机地址的逻辑。\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.14...v0.5.15","2026-01-13T00:40:29",{"id":182,"version":183,"summary_zh":184,"released_at":185},222592,"v0.5.14","# NNsight v0.5.14 发行说明\n\n**发布日期：** 2026年1月\n\n本次发布重点改进了远程执行体验、vLLM 兼容性、开发者文档以及整体代码质量。共包含 **59 次提交**，涉及 **37 个文件**，并对作业状态显示系统、vLLM 输入处理以及全新的文档进行了重大增强。\n\n---\n\n## ✨ 新特性\n\n### 增强的远程作业状态显示\n\n远程执行日志系统已全新设计，引入了 `JobStatusDisplay` 类，提供以下功能：\n\n- **实时可视化反馈**：使用 Unicode 动画光标和状态图标\n- **ANSI 颜色支持**：自动检测终端和笔记本环境\n- **原地状态更新**：避免控制台被重复消息刷屏\n- **各状态阶段的耗时统计**\n- **无缝集成 Jupyter Notebook**：通过 `DisplayHandle` 实现无闪烁的 HTML 渲染\n\n```python\n# 运行远程追踪时的新视觉状态显示\nwith model.trace(\"Hello\", remote=True):\n    output = model.lm_head.output.save()\n\n# 输出现在会显示：\n# ⠋ [job-id] QUEUED     (2.3s)\n# ● [job-id] RUNNING    (0.5s) \n# ✓ [job-id] COMPLETED  (1.2s)\n```\n\n### vLLM 令牌输入兼容性\n\nvLLM 现在支持更广泛的输入格式，与 `LanguageModel` 的灵活性保持一致：\n\n- **令牌 ID 列表**：`model.trace([1, 2, 3, 4])`\n- **HuggingFace 分词器输出**：`model.trace(tokenizer(\"Hello\", return_tensors=\"pt\"))`\n- **包含 `input_ids` 的字典**：`model.trace({\"input_ids\": tensor, \"attention_mask\": 
mask})`\n\n```python\nfrom nnsight.modeling.vllm import VLLM\n\nmodel = VLLM(\"gpt2\", dispatch=True)\n\n# 现在支持预分词的输入\ntokens = tokenizer(\"Hello world\", return_tensors=\"pt\")\nwith model.trace(tokens, temperature=0.0):\n    logits = model.logits.output.save()\n```\n\n### vLLM 自动调度\n\nvLLM 模型现在在进入追踪上下文时会**自动调度**，无需显式指定 `dispatch=True`，行为与 `LanguageModel` 一致：\n\n```python\nmodel = VLLM(\"gpt2\")  # 无需设置 dispatch=True\n\n# 在首次追踪时自动调度\nwith model.trace(\"Hello\"):\n    output = model.logits.output.save()\n```\n\n### `Envoy.devices` 属性\n\n新增属性，用于获取模型所分布的所有设备：\n\n```python\nmodel = LanguageModel(\"meta-llama\u002FLlama-3.1-70B\", device_map=\"auto\")\nprint(model.devices)  # {device(type='cuda', index=0), device(type='cuda', index=1), ...}\n```\n\n### API 密钥自动检测\n\nAPI 密钥现在会按以下顺序从多个来源自动检测：\n1. `NDIF_API_KEY` 环境变量\n2. Google Colab 用户数据（`userdata.get(\"NDIF_API_KEY\")`）\n3. 已保存的配置\n\n```python\n# 如果环境中已设置 NDIF_API_KEY，则无需手动配置\nimport os\nos.environ[\"NDIF_API_KEY\"] = \"your-key\"\n\n# 可自动生效\nwith model.trace(\"Hello\", remote=True):\n    output = model.output.save()\n```\n\n### vLLM 选项","2026-01-08T04:58:46",{"id":187,"version":188,"summary_zh":189,"released_at":190},222593,"v0.5.13","**发布说明：**\n\n`v0.5.13`\n\n**1.** 对 vLLM 推理的 `nnsight` 支持经过复杂的重构，现已兼容 vLLM 的最新版本，包括张量并行。借助 NNsight 的介入，可在多 GPU 模型上实现快速推理！\n\n```python\nif __name__ == \"__main__\":\n    from nnsight.modeling.vllm import VLLM\n\n    model = VLLM(\"meta-llama\u002FLlama-3.1-8B\", dispatch=True, tensor_parallel_size=2)\n\n    with model.trace(\n        \"埃菲尔铁塔位于\",\n        temperature=0.8,\n        max_tokens=30,\n    ) as tracer:\n\n        activations = list().save()\n        logits = list().save()\n        samples = list().save()\n\n        with tracer.iter[:30]:\n            activations.append(model.model.layers[16].mlp.down_proj.output[0].cpu())\n            logits.append(model.logits.output)\n            samples.append(model.samples.output)\n\n        output = 
tracer.result.save()\n```\n\n我们非常欢迎对 vLLM 集成的反馈。\n\n**兼容 vLLM>=0.12**\n\n**2.** 交错执行的优化带来了全面的性能提升，在进行大量介入操作时尤为明显。\n\n此外，还有三个配置标志可以设置，它们能带来更大的性能提升，但需要修改代码或属于实验性功能。\n\n```py\nfrom nnsight import CONFIG as NNSIGHT_CONFIG\n\nNNSIGHT_CONFIG.APP.PYMOUNT = False\nNNSIGHT_CONFIG.APP.CROSS_INVOKER = False\nNNSIGHT_CONFIG.APP.TRACE_CACHING = True\n```\n\n* **PYMOUNT**：关闭此标志后，将无法在任意对象上直接调用 `.save()`，而必须改用 `nnsight.save`。\n\n```py\nfrom nnsight import save\nwith model.trace(\"Hello world\"):\n    output = save(model.output)\n```\n\n将 `.save()` 挂载到 Python 对象上以及卸载会带来一定的性能开销。\n\n* **CROSS_INVOKER**：关闭此选项可防止不同调用者之间共享变量。这种共享会带来性能损耗。大多数情况下人们并不会使用这一功能，因此建议将其关闭。\n\n```py\nwith model.trace() as tracer:\n\n    with tracer.invoke(\"Hello world\"):\n        hs = model.model.layers[0].output\n    with tracer.invoke(\"Hello world\"):\n        model.model.layers[1].output = hs # X UnboundVariable: hs 未定义\n```\n\n* **TRACE_CACHING**：此选项会缓存 trace 的源代码，从而大幅加快后续查找速度。因此，如果你的 trace 位于循环中，或者在一个被多次调用的函数中，你将会看到显著的性能提升。\n\n\n**完整变更日志**：https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.12...v0.5.13","2025-12-19T22:31:46",{"id":192,"version":193,"summary_zh":194,"released_at":195},222594,"v0.5.12","## 变更内容\n* 检查元组和列表，因为在执行非阻塞请求时…… 由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F557 中提交\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.11...v0.5.12","2025-11-21T23:25:58",{"id":197,"version":198,"summary_zh":199,"released_at":200},222595,"v0.5.11","## 变更内容\n* 重构迭代器，由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F547 中完成\n* 开发工作，由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F548 中完成\n* 修复 bug，由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F549 中完成\n* Hugging Face 相关工作，由 @JadenFiotto-Kaufman 在 
https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F553 中完成\n* 开发工作，由 @JadenFiotto-Kaufman 在 https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F556 中完成\n\n\n**完整变更日志**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.10...v0.5.11","2025-11-21T17:26:53",{"id":202,"version":203,"summary_zh":204,"released_at":205},222596,"v0.5.10","## What's Changed\r\n* Diffusion Max Iteration by @AdamBelfki3 in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F544\r\n* add iteration to operation envoy by @AdamBelfki3 in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F543\r\n* NDIF Status API by @AdamBelfki3 in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F545\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F546\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.9...v0.5.10","2025-10-30T18:23:27",{"id":207,"version":208,"summary_zh":209,"released_at":210},222597,"v0.5.9","## What's Changed\r\n* memory leak fix by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F540\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F541\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.8...v0.5.9","2025-10-14T22:31:43",{"id":212,"version":213,"summary_zh":214,"released_at":215},222598,"v0.5.8","## What's Changed\r\n* tracer.result by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F537\r\n\r\n* Added `tracer.result` to grab the result from any method you're tracing.  
Same API as `envoy.output\u002Finput` but on the `Tracer` object itself:\r\n\r\n```py\r\n\r\nwith model.generate(\"Hello world\") as tracer:\r\n\r\n    result = tracer.result.save()\r\n```\r\n\r\nWhy this was added:\r\n\r\nWhen you're using `.trace()`, you're tracing the forward pass of a module where the result can be obtained normally via `model.output`. However, when you're tracing another method like `.generate`, there's no module from which to access the complete generation output; `model.output` refers only to a single iteration. \r\n\r\nThis was addressed by adding a blank `.generator` module to `LanguageModel`s where we simply passed the output of `.generate` through it, making it accessible via `model.generator.output`. This would have to be done explicitly for any method output we want to support. \r\n\r\nThis is now addressed with `tracer.result`, which works generally with all methods on any model\u002F`Envoy`. Here is an example of wrapping a VLM with `NNsight` and getting the result of `.generate()`:\r\n\r\n```py\r\n\r\nimport torch\r\nfrom nnsight import NNsight\r\nfrom transformers import AutoProcessor\r\nfrom transformers import Qwen2_5_VLForConditionalGeneration\r\n\r\nprocessor = AutoProcessor.from_pretrained(\"Qwen\u002FQwen2.5-VL-3B-Instruct\", trust_remote_code=True)\r\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\r\n    \"Qwen\u002FQwen2.5-VL-3B-Instruct\",\r\n    dtype=torch.bfloat16,\r\n    device_map=\"auto\",\r\n)\r\n\r\nmessages = [\r\n    {\r\n        \"role\": \"user\",\r\n        \"content\": [\r\n            {\"type\": \"image\"},\r\n            {\"type\": \"text\", \"text\": \"Who is the president of the United States?\"},\r\n        ],\r\n    },\r\n]\r\ntext = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\r\nprompt = processor(text=[text], return_tensors=\"pt\").to(\"cuda:0\")\r\n\r\ninput_ids = prompt.pop(\"input_ids\")\r\n\r\nlm = NNsight(model)\r\n\r\nwith lm.generate(input_ids, **prompt) as tracer:\r\n    
\r\n    result = tracer.result.save()\r\n``` \r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.7...v0.5.8","2025-10-10T18:20:54",{"id":217,"version":218,"summary_zh":219,"released_at":220},222599,"v0.5.7","## What's Changed\r\n* Performance changes by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F529\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F530\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F531\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.6...v0.5.7","2025-10-01T16:04:06",{"id":222,"version":223,"summary_zh":224,"released_at":225},222600,"v0.5.6","## What's Changed\r\n* Remote classes by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F526\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F527\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.5...v0.5.6","2025-09-29T19:37:02",{"id":227,"version":228,"summary_zh":229,"released_at":230},222601,"v0.5.5","## What's Changed\r\n* Dev by @JadenFiotto-Kaufman in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F524\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.4...v0.5.5","2025-09-22T04:34:26",{"id":232,"version":233,"summary_zh":234,"released_at":235},222602,"v0.5.4","## What's Changed\r\n* All Stop Fix by @AdamBelfki3 in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F516\r\n\r\n\r\n**Full Changelog**: 
https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.3...v0.5.4","2025-09-17T17:54:23",{"id":237,"version":238,"summary_zh":239,"released_at":240},222603,"v0.5.3","## What's Changed\r\n* Add python version to request header by @MichaelRipa in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F514\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.2...v0.5.3","2025-09-04T19:03:48",{"id":242,"version":243,"summary_zh":244,"released_at":245},222604,"v0.5.2","**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.1...v0.5.2","2025-09-01T16:21:40",{"id":247,"version":248,"summary_zh":249,"released_at":250},222605,"v0.5.1","## What's Changed\r\n* remove dill by @AdamBelfki3 in https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fpull\u002F507\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Fndif-team\u002Fnnsight\u002Fcompare\u002Fv0.5.0...v0.5.1","2025-08-25T16:17:09"]