[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-microsoft--mup":3,"tool-microsoft--mup":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",159636,2,"2026-04-17T23:33:34",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":81,"stars":90,"forks":91,"last_commit_at":92,"license":93,"difficulty_score":32,"env_os":94,"env_gpu":95,"env_ram":95,"env_deps":96,"category_tags":101,"github_topics":102,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":109,"updated_at":110,"faqs":111,"releases":140},8939,"microsoft\u002Fmup","mup","maximal update parametrization (µP)","mup 是一个专为 PyTorch 设计的开源工具包，旨在实现“最大更新参数化”（μP）技术。它主要解决了深度学习领域的一个核心痛点：当神经网络规模扩大时，原本在小模型上调试好的超参数（如学习率）往往失效，导致研究人员不得不为每个新尺寸的大模型重新进行昂贵且耗时的调优。\n\n通过引入 μP，mup 让模型的最优超参数在不同宽度下保持稳定。这意味着开发者只需在低成本的小模型上找到最佳配置，即可直接迁移到超大模型（如大型 Transformer）上使用，无需重新搜索。这一特性显著降低了从实验探索到模型扩展过程中的不确定性与风险。\n\n该工具特别适合 AI 研究人员和深度学习工程师，尤其是那些从事大语言模型预训练或需要频繁调整网络架构的团队。其独特亮点在于提供了`MuReadout`、`MuAdam`等即插即用组件，能轻松替换标准 PyTorch 层，并自动处理复杂的缩放规则（如注意力机制中的系数调整）。此外，mup 还内置了坐标检查（Coord 
Check）功能，帮助用户验证参数化实现的正确性，让大规模模型的训练变得更加高效、稳健且不易出错。","# Maximal Update Parametrization (μP) and Hyperparameter Transfer (μTransfer) \n\n[Paper link](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466)\n|\n[Blog link](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002F%C2%B5transfer-a-technique-for-hyperparameter-tuning-of-enormous-neural-networks\u002F)\n|\n[YouTube link](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=z8-C42mAwBc)\n\nIn [*Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer*](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466), we show that optimal hyperparameters become stable across neural network sizes when we parametrize the model in [maximal update parametrization (μP)](http:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14522).\nThis can be used to tune extremely large neural networks such as large pretrained transformers, as we have done in our work.\nMore generally, μP reduces the fragility and uncertainty when transitioning from exploration to scaling up, which are not often talked about explicitly in the deep learning literature.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_f37098aa45f2.png)\n\u003Cfont size=\"1\"> *Figure above: Training loss against learning rate on Transformers of varying `d_model` trained with Adam.*\u003C\u002Ffont> \n\n\nμP turns out to be the *unique* \"natural\" parametrization that has this hyperparameter stability property across width, as empirically verified in the gif below on MLPs trained with SGD. 
Here, across time, we interpolate between PyTorch default and μP's learning rate and initialization scalings (right), and we scale up the width-256 model (log2(width)=8) to width 2^13 = 8192 using this interpolated scaling rule (left).\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_97d40211cbfa.gif)\n\nThis repo contains the source code for the `mup` package, our tool that makes the implementation of μP in Pytorch models effortless and less error-prone.\n\n## Table of Contents\n\n\n  - [Installation](#installation)\n    - [Install From Source](#install-from-source)\n  - [Basic Usage](#basic-usage)\n  - [How `mup` Works Under the Hood](#how-mup-works-under-the-hood)\n  - [Current Limitations](#current-limitations)\n  - [Checking Correctness of Parametrization](#checking-correctness-of-parametrization)\n    - [Coord Check](#coord-check)\n    - [Making Your Own Coord Check Plots](#making-your-own-coord-check-plots)\n    - [Wider is Always Better](#wider-is-always-better)\n  - [Examples](#examples)\n  - [Running Tests](#running-tests)\n  - [The Basic Math](#the-basic-math)\n  - [Contributing](#contributing)\n  - [Trademarks](#trademarks)\n\n## Installation\n\n```\npip install mup\n```\n\n### Install From Source\n\nClone this repo, change to its directory, and do\n```\npip install -r requirements.txt\npip install -e .\n```\n\n## Basic Usage\n\n```Python\nfrom mup import MuReadout, make_base_shapes, set_base_shapes, MuSGD, MuAdam\n\nclass MyModel(nn.Module):\n    def __init__(self, width, ...):\n        ...\n        ### In model definition, replace output layer with MuReadout\n        # readout = nn.Linear(width, d_out)\n        readout = MuReadout(width, d_out)\n        ### If tying weights with an input nn.Embedding layer, do\n        # readout = MuSharedReadout(input_layer.weight)\n        ...\n    def forward(self, ...):\n        ...\n        ### If using a transformer, make sure to use\n        ###   1\u002Fd instead of 
1\u002Fsqrt(d) attention scaling\n        # attention_scores = query @ key.T \u002F d**0.5\n        attention_scores = query @ key.T * 8 \u002F d\n        ### We use 8\u002Fd instead of 1\u002Fd here to be backward compatible\n        ###   with 1\u002Fd**0.5 when d=64, a common head dimension.\n        ...\n\n### Instantiate a base model\nbase_model = MyModel(width=1)\n### Optionally, use `torchdistx.deferred_init.deferred_init` to avoid instantiating the parameters\n### Simply install `torchdistx` and use\n# base_model = torchdistx.deferred_init.deferred_init(MyModel, width=1)\n### Instantiate a \"delta\" model that differs from the base model\n###   in all dimensions (\"widths\") that one wishes to scale.\n### Here it's simple, but e.g., in a Transformer, you may want to scale\n###   both nhead and dhead, so the delta model should differ in both.\ndelta_model = MyModel(width=2) # Optionally use `torchdistx` to avoid instantiating\n\n### Instantiate the target model (the model you actually want to train).\n### This should be the same as the base model except \n###   the widths could be potentially different.\n### In particular, base_model and model should have the same depth.\nmodel = MyModel(width=100)\n\n### Set base shapes\n### When `model` has same parameter shapes as `base_model`,\n###   `model` behaves exactly the same as `base_model`\n###   (which is in PyTorch's default parametrization).\n###   This provides backward compatibility at this particular model size.\n###   Otherwise, `model`'s init and LR are scaled by μP.\n### IMPORTANT: this should be called as soon as possible,\n###   before re-initialization and optimizer definition.\nset_base_shapes(model, base_model, delta=delta_model)\n\n### Alternatively, one can save the base model shapes in a file\n# make_base_shapes(base_model, delta_model, filename)\n### and later set base shapes directly from the filename\n# set_base_shapes(model, filename)\n### This is useful when one cannot fit both \n###   
base_model and model in memory at the same time\n\n### Replace your custom init, if any\nfor param in model.parameters():\n    ### If initializing manually with fixed std or bounds,\n    ### then replace with same function from mup.init\n    # torch.nn.init.uniform_(param, -0.1, 0.1)\n    mup.init.uniform_(param, -0.1, 0.1)\n    ### Likewise, if using\n    ###   `xavier_uniform_, xavier_normal_, kaiming_uniform_, kaiming_normal_`\n    ### from `torch.nn.init`, replace with the same functions from `mup.init`\n\n### Use the optimizers from `mup.optim` instead of `torch.optim`\n# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\noptimizer = MuSGD(model.parameters(), lr=0.1)\n\n### Then just train normally\n```\n\nNote the base and delta models *do not need to be trained* --- we are only extracting parameter shape information from them.\nTherefore, optionally, we can avoid instantiating these potentially large models by using the `deferred_init` function in `torchdistx`.\nAfter installing [`torchdistx`](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ftorchdistx), use `torchdistx.deferred_init.deferred_init(MyModel, **args)` instead of `MyModel(**args)`. See [this page](https:\u002F\u002Fpytorch.org\u002Ftorchdistx\u002Flatest\u002Fdeferred_init.html) for more detail.\nIn the MLP and Transformer examples (not `mutransformers`) we provided, you can activate this feature by passing `--deferred_init`.\n\n\n## How `mup` Works Under the Hood\n\n\nBy invoking `set_base_shapes(model, ...)`, each parameter tensor `p` of `model` gets a `p.infshape` attribute that stores, for each of its dimensions, the corresponding base dimension and whether that dimension should be considered `infinite` (i.e. will be scaled up\u002Fdown, e.g., `d_model` of a Transformer) or `finite` (i.e. 
will be fixed, e.g., vocabulary size).\nThis information is used in the initializers and optimizers to automatically scale the parameters or learning rates to be compliant with μP.\nFor example, the Adam learning rate of hidden weights `p` is calculated as  `globalLR \u002F p.infshape.width_mult()`, where `p.infshape.width_mult()` essentially calculates `fan_in \u002F base_fan_in`.\n\n\n## Current Limitations\n\n- `set_base_shapes(model, ...)` assumes that `model` has just been randomly initialized in the standard way and rescales its parameters using the base shape information so the model is in μP.\n- If you want data parallelism, please use `torch.nn.parallel.DistributedDataParallel` instead of `torch.nn.DataParallel`. This is because the latter removes the attributes the `mup` package adds to each parameter tensor of the model. Also, for performance, `pytorch` [recommends the former anyway](https:\u002F\u002Fpytorch.org\u002Fdocs\u002Fstable\u002Fnotes\u002Fcuda.html#cuda-nn-ddp-instead).\n- We scale the learning rate according to μP explicitly by creating refined parameter groups from what is passed to the `mup` optimizer and by manipulating the `lr` attribute in those groups. This is compatible with PyTorch's learning rate schedulers. However, if you roll your own, make sure the scheduler sets the learning rate relative to what is currently in the refined parameter groups. The following is an example of what *not* to do and what is OK:\n```python\noptimizer = mup.MuAdam(model.parameters(), lr=1e-3)\nfor pg in optimizer.param_groups:\n  # what NOT to do: setting learning rate absolutely\n  # pg['lr'] = 1e-3 * 2\n  # what is an OK alternative: setting it relatively\n  pg['lr'] *= 2\n```\n- By default, any parameter matrix that has 2 \"infinite\" dimensions (i.e. dimensions that are different from base dimensions) are considered by `mup` to have shape (fan_out, fan_in), i.e., in the forward pass, this matrix multiplies its input on the right. 
This is the case with all `nn.Linear` weights from pytorch. If you have a custom parameter, say `W`, that violates this convention, you can manually set `W.infshape.main_idx = 0; W.infshape.main = W.infshape[0]` to let `mup` know that its shape corresponds to (fan_in, fan_out). A similar discussion applies if you have a parameter *tensor* with many dimensions but exactly 2 \"infinite\" dimensions, for which the first is fan_in and the second is fan_out.\n- Currently, [`torch.save` does not save the `infshape` objects attached to each parameter tensor](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Fpytorch\u002Fissues\u002F72129). Before this is fixed, you would have to set base shape manually after loading a model checkpoint like so:\n```python\nmodel = torch.load('my\u002Fmodel\u002Fpath.pt')\n# Important: note the flag `rescale_params=False`!\nset_base_shapes(model, 'my\u002Fbase\u002Fshape\u002Fpath.bsh', rescale_params=False)\n```\n(`set_base_shapes` by default rescales the parameters of `model`, assuming it's freshly initialized by PyTorch, to be consistent with μP.\nThe `rescale_params=False` flag turns off this behavior.)\n\n\n## Checking Correctness of Parametrization\n\n\n### Coord Check\n\nJust like gradient checking is a simple way of verifying the correctness of an autograd implementation, *coordinate checking* is a simple way to verify you have implemented μP correctly: calculate the average size (which we denote in the y-axis below by `l1`) of the coordinates of each activation vector in, and output of, the model, for a few steps of training and a few different widths.\nIf implemented correctly, then we shall see this `l1` stable over many widths; otherwise, the `l1` can blow up or shrink to 0 with width.\n(We are essentially checking desideratum 1 described below.)\n(The `l1` calculates `x.abs().mean()` for each activation vector `x` and is just one measure of the \"average size\" of `x`'s entries; one can also use analogously defined `l2`, `l4`, 
etc, though they may exhibit greater fluctuation with random seeds.)\n\nFor example, in the following, we plot `width` vs `l1` for 2 steps of training, where t=1 means at initialization, before any gradient update.\nEach curve corresponds to an (pre-)activation vector of a layer or the output of the network.\nThe first set of 3 plots shows an MLP in standard parametrization (SP), trained by adam.\nWe see after 1 step of update, activation\u002Foutput `l1` are exploding with width.\nThis means SP is \"incorrect.\"\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_1ad80b9ff3e3.png)\nWe now do the same for an MLP in maximal update parametrization (μP) (including using `mup.optim.MuAdam` instead of `torch.optim.Adam`).\nIn contrast to the above, all curves stay horizontal, indicating that μP is implemented correctly.\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_5664d7c1f8ab.png)\nWe call this way of checking implementation correctness a *coord check*, short for \"coordinate check.\"\n\n### Making Your Own Coord Check Plots\nWe provide an easy way to implement this check via functions in the `mup.coord_check` module.\nThe workflow typically looks like the following.\n\n```Python\nfrom mup.coord_check import get_coord_data, plot_coord_data\n# construct a dictionary of lazy μP models with differing widths\ndef lazy_model(width):\n    # `set_base_shapes` returns the model\n    return lambda: set_base_shapes(MyMuModel(width), 'my\u002Fbase\u002Fshape\u002Fpath.bsh')\n    # Note: any custom initialization with `mup.init` would need to\n    # be done inside the lambda as well\nmodels = {64: lazy_model(64), ..., 1024: lazy_model(1024)}\n# make a dataloader with small batch size\u002Fseq len\n#   just for testing\ndataloader = ...\n# record data from the model activations over a few steps of training\n# this returns a pandas dataframe\ndf = get_coord_data(models, dataloader)\n# This saves the coord check plots to 
filename.\nplot_coord_data(df, save_to=filename)\n# If you are in jupyter notebook, you can also do\n#   `plt.show()`\n# to show the plot\n```\nFor example, the `mup.coord_check.example_plot_coord_check` function is implemented this way for toy MLP and CNN models.\n\nIf you see the curves blow up or shrink to 0 with width after a few steps of training, then there's a bug in your μP implementation (did you forget to vary some dimension, like `d_ffn`, in the delta model?).\nIf instead you see the curves converge to the right, then most likely your implementation is correct.\nHowever, there are two typical exceptions to this;\nthe following can shrink to 0 at initialization in μP (at a 1\u002Fsqrt(width) rate):\n  - the network output\n  - the attention logits in a Transformer\n\nThese are transient, and after a few steps their curves should be roughly flat.\nNevertheless, to remove the discrepancy at init, we recommend\n   - initializing the output layer \n   (should be a `MuReadout` instance) weights to be 0 via\n   the `readout_zero_init=True` option and\n   - initializing the query matrix in a Transformer to 0\n     (this has to be done manually). If symmetry-breaking is desired in the attention logits at init, initialize the (relative) position biases with nonzero variance.\n     \n#### Tips for Coord Check\n\n- Use a large learning rate (larger than you'd use for actual training). This would emphasize any potential exploding coordinates issue, which could be hidden by the initialization if the learning rate is too small.\n- If you reuse a module multiple times in the forward pass, then `mup.get_coord_data` will only record the statistics from the last usage. 
In this case, for testing purposes, one can wrap different usages with `nn.Identity` modules of different names to distinguish them.\n\n### Wider is Always Better\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_73112f690f09.png)\n\nAnother sign that μP has not been implemented correctly is if going wider does worse (on training loss) after some width, at some point during training.\nThe figure above illustrates this in a collection of training curves: (left) the correct implementation should always see performance improve with width, at any point in training; (middle) if you used standard parametrization (SP), sometimes you may see performance improve with width up to some point and then suddenly it becomes worse with wider models; (right) or you may immediately see worsening performance even for narrow models.\n\n## Examples\nSee the `MLP`, `Transformer`, and `ResNet` folders inside `examples\u002F` as well as the tests in `mup\u002Ftest` for examples.\nPeople familiar with [Huggingface Transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) may also find the `examples\u002Fmutransformers` submodule instructive (obtained via `git submodule update --init`), which is also available standalone at [https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmutransformers](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmutransformers).\n\n## Native Integration With Huggingface\n\nFrustrated that your [Huggingface Transformer](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) breaks when you scale up? Want to tune hyperparameters for your large mult-GPU [Huggingface Transformer](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) on a single GPU, right out the box? 
If so, please upvote [this github issue](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fissues\u002F16157)!\n\n\n## Running Tests\nTo run tests, do\n```bash\npython -m mup.test\n```\n\n\n## The Basic Math\n\nμP is designed so as to satisfy the following desiderata:\n\n> At any time during training\n> 1. Every (pre)activation vector in a network should have Θ(1)-sized coordinates\n> 2. Neural network output should be O(1).\n> 3. All parameters should be updated as much as possible (in terms of scaling in width) without leading to divergence\n\nIt turns out these desiderata uniquely single out μP.\nTo derive μP from them, one needs to carefully consider how the *coordinate size* of a vector Av, resulting from a square matrix A multiplying vector v, depends on those of A and v, when A and v are \"correlated\".\nHere you can think of A as weights and v as an activation vector.\nThis in turn depends on what kind of matrix is A and what kind of vector is v.\nIn the context of training a wide neural network, it turns out we only need to consider vectors that has approximately iid coordinates, and two kinds of matrices: 1) those that look like outer products of such vectors, and 2) random iid matrices.\nThose of type 1 cover things like weight gradients; those of type 2 cover things like weight initialization.\nThen, if A and v both have entry size Θ(1) and they are correlated in ways that arise naturally during training, then we have the following table.\n\n|                  | outer product A (type 1) | iid A  (type 2)    |\n|------------------|--------------------------|--------------------|\n| Entry size of Av | Θ(n)                     | Θ(sqrt(n))         |\n\nGiven this table, one can then trace the forward and backward computation of a network to derive μP straightforwardly.\n\nSee [our blog 
post](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002F%C2%B5transfer-a-technique-for-hyperparameter-tuning-of-enormous-neural-networks\u002F) for a gentle primer and [our paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466) for details.\n\n\n## Contributing\n\nThis project welcomes contributions and suggestions.  Most contributions require you to agree to a\nContributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us\nthe rights to use your contribution. For details, visit https:\u002F\u002Fcla.opensource.microsoft.com.\n\nWhen you submit a pull request, a CLA bot will automatically determine whether you need to provide\na CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions\nprovided by the bot. You will only need to do this once across all repos using our CLA.\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F).\nFor more information see the [Code of Conduct FAQ](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F) or\ncontact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.\n\n## Trademarks\n\nThis project may contain trademarks or logos for projects, products, or services. 
Authorized use of Microsoft \ntrademarks or logos is subject to and must follow \n[Microsoft's Trademark & Brand Guidelines](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral).\nUse of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.\nAny use of third-party trademarks or logos are subject to those third-party's policies.\n","# 最大更新参数化 (μP) 和超参数迁移 (μTransfer)\n\n[论文链接](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466)\n|\n[博客链接](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002F%C2%B5transfer-a-technique-for-hyperparameter-tuning-of-enormous-neural-networks\u002F)\n|\n[YouTube链接](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=z8-C42mAwBc)\n\n在 [*Tensor Programs V: 通过零样本超参数迁移调优大型神经网络*](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466) 中，我们表明，当以 [最大更新参数化 (μP)](http:\u002F\u002Farxiv.org\u002Fabs\u002F2011.14522) 对模型进行参数化时，最优超参数会在不同规模的神经网络之间保持稳定。\n这可用于调优极其庞大的神经网络，例如大型预训练 Transformer 模型，正如我们在工作中所做的那样。\n更广泛地说，μP 能够降低从探索阶段过渡到大规模扩展时的脆弱性和不确定性，而这些往往在深度学习文献中并未被明确讨论。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_f37098aa45f2.png)\n\u003Cfont size=\"1\"> *上图：使用 Adam 优化器训练的不同 `d_model` 大小的 Transformer 的训练损失随学习率的变化。*\u003C\u002Ffont> \n\n\n事实证明，μP 是唯一一种具有这种跨宽度超参数稳定性特性的“自然”参数化方法，这一点在下方 GIF 动画中通过使用 SGD 训练的 MLP 得到了实证验证。在此过程中，我们随着时间推移，在 PyTorch 默认的学习率和初始化缩放与 μP 的缩放之间进行插值（右图），并利用这一插值后的缩放规则将宽度为 256 的模型（log2(width)=8）逐步扩展到宽度 2^13 = 8192（左图）。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_97d40211cbfa.gif)\n\n本仓库包含 `mup` 软件包的源代码，这是我们用于在 PyTorch 模型中轻松且不易出错地实现 μP 的工具。\n\n## 目录\n\n\n  - [安装](#installation)\n    - [从源码安装](#install-from-source)\n  - [基本用法](#basic-usage)\n  - [`mup` 的工作原理](#how-mup-works-under-the-hood)\n  - [当前限制](#current-limitations)\n  - [检查参数化的正确性](#checking-correctness-of-parametrization)\n    - [坐标检查](#coord-check)\n    
- [自定义坐标检查图](#making-your-own-coord-check-plots)\n    - [越宽越好](#wider-is-always-better)\n  - [示例](#examples)\n  - [运行测试](#running-tests)\n  - [基础数学](#the-basic-math)\n  - [贡献](#contributing)\n  - [商标](#trademarks)\n\n## 安装\n\n```\npip install mup\n```\n\n### 从源码安装\n\n克隆本仓库，进入其目录，并执行\n```\npip install -r requirements.txt\npip install -e .\n```\n\n## 基本用法\n\n```Python\nfrom mup import MuReadout, make_base_shapes, set_base_shapes, MuSGD, MuAdam\n\nclass MyModel(nn.Module):\n    def __init__(self, width, ...):\n        ...\n        ### 在模型定义中，将输出层替换为 MuReadout\n        # readout = nn.Linear(width, d_out)\n        readout = MuReadout(width, d_out)\n        ### 如果与输入的 nn.Embedding 层共享权重，则应使用\n        # readout = MuSharedReadout(input_layer.weight)\n        ...\n    def forward(self, ...):\n        ...\n        ### 如果使用 Transformer，务必采用\n        ###   1\u002Fd 而不是 1\u002Fsqrt(d) 的注意力缩放\n        # attention_scores = query @ key.T \u002F d**0.5\n        attention_scores = query @ key.T * 8 \u002F d\n        ### 我们使用 8\u002Fd 而不是 1\u002Fd，以便向后兼容\n        ###   当 d=64 时的 1\u002Fd**0.5，这是常见的头维度。\n        ...\n\n### 实例化一个基础模型\nbase_model = MyModel(width=1)\n### 可选地，可以使用 `torchdistx.deferred_init.deferred_init` 来避免实例化参数\n### 只需安装 `torchdistx` 并使用\n# base_model = torchdistx.deferred_init.deferred_init(MyModel, width=1)\n### 实例化一个“delta”模型，该模型在所有希望缩放的维度（即宽度）上都不同于基础模型。\n### 这里很简单，但在 Transformer 中，你可能希望同时缩放 nhead 和 dhead，因此 delta 模型应在两者上都不同。\ndelta_model = MyModel(width=2) # 可选地使用 `torchdistx` 避免实例化\n\n### 实例化目标模型（即你真正想要训练的模型）。\n### 该模型应与基础模型相同，只是宽度可能有所不同。\n### 特别要注意的是，基础模型和目标模型应具有相同的深度。\nmodel = MyModel(width=100)\n\n### 设置基础形状\n### 当 `model` 的参数形状与 `base_model` 相同时，\n###   `model` 的行为将完全等同于 `base_model`\n###   （后者处于 PyTorch 的默认参数化方式下）。\n###   这在当前模型尺寸下提供了向后兼容性。\n###   否则，`model` 的初始化和学习率将按照 μP 进行缩放。\n### 重要提示：应在重新初始化和定义优化器之前尽快调用此函数。\nset_base_shapes(model, base_model, delta=delta_model)\n\n### 或者，也可以将基础模型的形状保存到文件中\n# make_base_shapes(base_model, delta_model, filename)\n### 
然后稍后直接从文件设置基础形状\n# set_base_shapes(model, filename)\n### 这在无法同时将 `base_model` 和 `model` 放入内存时非常有用\n\n### 替换自定义的初始化方法（如有）\nfor param in model.parameters():\n    ### 如果手动使用固定的标准差或边界进行初始化，\n    ### 则用 mup.init 中的相应函数替代\n    # torch.nn.init.uniform_(param, -0.1, 0.1)\n    mup.init.uniform_(param, -0.1, 0.1)\n    ### 同样，如果使用\n    ###   `xavier_uniform_, xavier_normal_, kaiming_uniform_, kaiming_normal_`\n    ### 来自 `torch.nn.init` 的函数，也应替换为 `mup.init` 中的对应函数\n\n### 使用 `mup.optim` 中的优化器，而非 `torch.optim`\n# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\noptimizer = MuSGD(model.parameters(), lr=0.1)\n\n### 之后即可正常训练\n```\n\n请注意，基础模型和 delta 模型 *无需训练* —— 我们只是从中提取参数形状信息。\n因此，可选地，我们可以通过使用 `torchdistx` 中的 `deferred_init` 函数来避免实例化这些潜在的大模型。\n在安装 [`torchdistx`](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Ftorchdistx) 后，可用 `torchdistx.deferred_init.deferred_init(MyModel, **args)` 代替 `MyModel(**args)`。更多详情请参阅 [此页面](https:\u002F\u002Fpytorch.org\u002Ftorchdistx\u002Flatest\u002Fdeferred_init.html)。\n在我们提供的 MLP 和 Transformer 示例中（非 `mutransformers`），可通过传递 `--deferred_init` 来启用此功能。\n\n## `mup` 的内部工作机制\n\n\n通过调用 `set_base_shapes(model, ...)`, 模型的每个参数张量 `p` 都会获得一个 `p.infshape` 属性，该属性为每个维度存储对应的基维以及该维度是应被视为“无限”（即会被放大或缩小，例如 Transformer 中的 `d_model`）还是“有限”（即会被固定，例如词汇表大小）。这些信息会在初始化器和优化器中使用，以自动调整参数或学习率，使其符合 μP 规范。例如，隐藏权重 `p` 的 Adam 学习率计算公式为 `globalLR \u002F p.infshape.width_mult()`，其中 `p.infshape.width_mult()` 实际上计算的是 `fan_in \u002F base_fan_in`。\n\n\n## 当前限制\n\n- `set_base_shapes(model, ...)` 假设 `model` 刚刚以标准方式随机初始化，并根据基形状信息重新缩放其参数，从而使模型处于 μP 状态。\n- 如果需要数据并行，建议使用 `torch.nn.parallel.DistributedDataParallel` 而不是 `torch.nn.DataParallel`。这是因为后者会移除 `mup` 包为模型的每个参数张量添加的属性。此外，出于性能考虑，PyTorch 也[推荐使用前者](https:\u002F\u002Fpytorch.org\u002Fdocs\u002Fstable\u002Fnotes\u002Fcuda.html#cuda-nn-ddp-instead)。\n- 我们通过从传递给 `mup` 优化器的参数中创建细化的参数组，并操作这些组中的 `lr` 属性，显式地按照 μP 规范缩放学习率。这种方式与 PyTorch 的学习率调度器兼容。然而，如果你自定义调度器，请确保它设置的学习率是相对于细化参数组中当前值的比例关系。以下是一个错误做法和正确做法的示例：\n```python\noptimizer = 
mup.MuAdam(model.parameters(), lr=1e-3)\nfor pg in optimizer.param_groups:\n  # 错误做法：绝对设置学习率\n  # pg['lr'] = 1e-3 * 2\n  # 正确做法：相对调整学习率\n  pg['lr'] *= 2\n```\n- 默认情况下，任何具有两个“无限”维度（即不同于基维度的维度）的参数矩阵都被 `mup` 视为具有 (fan_out, fan_in) 形状，即在前向传播中，该矩阵会将输入右乘。这适用于 PyTorch 中的所有 `nn.Linear` 权重。如果你有一个自定义参数，比如 `W`，它违反了这一约定，你可以手动设置 `W.infshape.main_idx = 0; W.infshape.main = W.infshape[0]`，以告知 `mup` 其形状对应于 (fan_in, fan_out)。类似的情况也适用于具有多个维度但恰好有两个“无限”维度的参数张量，其中第一个维度是 fan_in，第二个维度是 fan_out。\n- 目前，[`torch.save` 不会保存附加到每个参数张量的 `infshape` 对象](https:\u002F\u002Fgithub.com\u002Fpytorch\u002Fpytorch\u002Fissues\u002F72129)。在问题修复之前，你必须在加载模型检查点后手动设置基形状，如下所示：\n```python\nmodel = torch.load('my\u002Fmodel\u002Fpath.pt')\n# 重要提示：注意 `rescale_params=False` 标志！\nset_base_shapes(model, 'my\u002Fbase\u002Fshape\u002Fpath.bsh', rescale_params=False)\n```\n（`set_base_shapes` 默认会重新缩放由 PyTorch 刚刚初始化的 `model` 参数，使其与 μP 保持一致。`rescale_params=False` 标志会关闭此行为。）\n\n\n## 检查参数化是否正确\n\n\n### 坐标检查\n\n就像梯度检查是验证自动微分实现是否正确的一种简单方法一样，*坐标检查*也是一种验证你是否正确实现了 μP 的简便方法：在训练的几个步骤中，针对几种不同的宽度，计算模型输入和输出中每个激活向量坐标的平均大小（我们在下图的 y 轴上用 `l1` 表示）。如果实现正确，那么我们会看到这个 `l1` 在多种宽度下保持稳定；否则，随着宽度增加，`l1` 可能会爆炸式增长，也可能收缩至 0。\n（我们实际上是在检验下面描述的第 1 个期望特性。）\n（`l1` 计算每个激活向量 `x` 的 `x.abs().mean()`，只是衡量 `x` 元素“平均大小”的一种指标；也可以使用类似定义的 `l2`、`l4` 等，尽管它们可能因随机种子的不同而出现更大的波动。）\n\n例如，在下面的图表中，我们绘制了训练 2 个步骤时的 `width` 与 `l1` 曲线，其中 t=1 表示初始化时，尚未进行任何梯度更新。每条曲线对应于某一层的（预）激活向量或网络的输出。第一组 3 张图显示的是采用标准参数化（SP）的 MLP，由 adam 优化器训练。我们可以看到，经过 1 步更新后，激活\u002F输出的 `l1` 随着宽度增加而迅速膨胀。这意味着 SP 是“不正确的”。\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_1ad80b9ff3e3.png)\n现在我们对采用最大更新参数化（μP）的 MLP 进行同样的操作（包括使用 `mup.optim.MuAdam` 而代之于 `torch.optim.Adam`）。与上述情况相反，所有曲线都保持水平，表明 μP 已被正确实现。\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_5664d7c1f8ab.png)\n我们将这种检查实现正确性的方法称为“坐标检查”，简称“coord check”。\n\n### 自己制作坐标检查图表\n我们提供了一种简单的方法来实现这一检查，即使用 `mup.coord_check` 模块中的函数。典型的工作流程如下所示。\n\n```Python\nfrom mup.coord_check import get_coord_data, 
plot_coord_data\n# 构建一个包含不同宽度懒加载 μP 模型的字典\ndef lazy_model(width):\n    # `set_base_shapes` 返回模型\n    return lambda: set_base_shapes(MyMuModel(width), 'my\u002Fbase\u002Fshape\u002Fpath.bsh')\n    # 注意：任何使用 `mup.init` 的自定义初始化也需要在 lambda 内部完成\nmodels = {64: lazy_model(64), ..., 1024: lazy_model(1024)}\n# 创建一个批次大小\u002F序列长度较小的数据加载器\n#   仅用于测试\ndataloader = ...\n# 记录模型在训练的几个步骤中各层激活的数据\n# 这将返回一个 pandas 数据框\ndf = get_coord_data(models, dataloader)\n# 这会将坐标检查图表保存到指定文件名。\nplot_coord_data(df, save_to=filename)\n# 如果你在 Jupyter Notebook 中，也可以直接使用\n#   `plt.show()`\n# 来展示图表\n```\n例如，`mup.coord_check.example_plot_coord_check` 函数就是以这种方式为玩具级的 MLP 和 CNN 模型实现的。\n\n如果你在训练几步后看到曲线随着宽度的增加而发散或收缩至零，那么你的 μP 实现中可能存在 bug（你是否忘记在 delta 模型中调整某些维度，比如 `d_ffn`？）。\n相反，如果你看到曲线向右侧收敛，那么很可能你的实现是正确的。\n不过，这里也有两种典型例外；\n以下内容在 μP 的初始化阶段可能会收缩至零（以 1\u002F√(宽度) 的速率）：\n  - 网络输出\n  - Transformer 中的注意力 logits\n\n这些现象都是暂时的，经过几步训练后，它们的曲线应该大致趋于平稳。\n尽管如此，为了消除初始化时的差异，我们建议：\n   - 将输出层权重（应为 `MuReadout` 实例）通过 `readout_zero_init=True` 选项初始化为 0，\n   - 将 Transformer 中的查询矩阵手动初始化为 0。\n如果希望在初始化时打破对称性以产生非零的注意力 logits，则可以使用非零方差来初始化（相对）位置偏置。\n\n#### 坐标检查技巧\n\n- 使用较大的学习率（比实际训练时使用的还要大）。这样可以突出任何潜在的坐标爆炸问题，因为如果学习率过小，这些问题可能会被初始化掩盖。\n- 如果你在前向传播中多次复用同一个模块，那么 `mup.get_coord_data` 只会记录最后一次使用的统计信息。在这种情况下，为了测试目的，你可以将不同的使用情况分别包裹在不同名称的 `nn.Identity` 模块中，以便区分它们。\n\n### 更宽总是更好\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_readme_73112f690f09.png)\n\n另一个表明 μP 未正确实现的迹象是：在训练过程中，当网络宽度超过某个值后，更宽的模型反而会导致训练损失变差。\n上图展示了一系列训练曲线：（左）正确的实现应在训练的任何阶段都表现出随着宽度增加性能提升；（中）如果你使用的是标准参数化（SP），有时可能会观察到性能随宽度增加到一定程度后突然下降；（右）或者你甚至会发现即使对于较窄的模型，性能也会立即开始恶化。\n\n## 示例\n\n请参阅 `examples\u002F` 文件夹中的 `MLP`、`Transformer` 和 `ResNet` 子文件夹，以及 `mup\u002Ftest` 中的测试用例作为示例。\n熟悉 [Huggingface Transformers](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) 的用户也可以参考 `examples\u002Fmutransformers` 子模块（可通过 `git submodule update --init` 获取），该子模块也可单独在 
[https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmutransformers](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmutransformers) 上找到。\n\n## 与 Huggingface 的原生集成\n\n是否曾因你的 [Huggingface Transformer](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) 在扩展规模时出现问题而感到沮丧？或者想在单个 GPU 上直接调优大型多 GPU [Huggingface Transformer](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers) 的超参数吗？如果是的话，请为 [这个 GitHub 问题](https:\u002F\u002Fgithub.com\u002Fhuggingface\u002Ftransformers\u002Fissues\u002F16157) 点赞吧！\n\n\n## 运行测试\n要运行测试，执行以下命令：\n```bash\npython -m mup.test\n```\n\n\n## 基本数学原理\n\nμP 的设计旨在满足以下要求：\n\n> 在训练过程中的任何时候\n> 1. 网络中的每个（预）激活向量都应具有 Θ(1) 大小的坐标\n> 2. 神经网络的输出应为 O(1)\n> 3. 所有参数都应在不导致发散的前提下得到尽可能大的更新（就其随宽度的缩放而言）\n\n事实证明，这些要求唯一地确定了 μP。\n为了从这些要求推导出 μP，我们需要仔细考虑当矩阵 A 和向量 v“相关”时，由 A 乘以 v 所得到的向量 Av 的*坐标大小*如何依赖于 A 和 v 的坐标大小。\n在这里，可以把 A 看作权重，v 看作激活向量。\n而这又取决于 A 是什么类型的矩阵，v 又是什么类型的向量。\n在训练宽神经网络的背景下，我们只需要考虑那些具有近似独立同分布坐标的向量，以及两类矩阵：1) 类似于这类向量外积的矩阵，2) 随机的独立同分布矩阵。\n第一类矩阵涵盖了诸如权重梯度之类的内容；第二类则对应于权重初始化等场景。\n因此，如果 A 和 v 的元素大小均为 Θ(1)，且它们在训练过程中以自然方式相关联，那么就会出现下表所示的情况。\n\n|                  | 外积矩阵 A（类型 1） | 独立同分布矩阵 A（类型 2）    |\n|------------------|--------------------------|--------------------|\n| Av 的元素大小   | Θ(n)                     | Θ(sqrt(n))         |\n\n基于这张表，我们可以直接追踪网络的前向和反向计算过程，从而推导出 μP。\n\n有关更详细的介绍，请参阅我们的[博客文章](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Fblog\u002F%C2%B5transfer-a-technique-for-hyperparameter-tuning-of-enormous-neural-networks\u002F)和[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2203.03466)。\n\n\n## 贡献说明\n\n本项目欢迎各种贡献和建议。大多数贡献都需要你签署一份贡献者许可协议（CLA），声明你有权并且确实授予我们使用你的贡献的权利。详情请访问 https:\u002F\u002Fcla.opensource.microsoft.com。\n\n当你提交拉取请求时，CLA 机器人会自动判断你是否需要提供 CLA，并相应地标记 PR（例如状态检查、评论）。只需按照机器人提供的指示操作即可。对于所有使用我们 CLA 的仓库，你只需完成一次即可。\n\n本项目已采纳 [微软开源行为准则](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F)。\n如需更多信息，请参阅[行为准则常见问题解答](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F)，或发送邮件至 
[opencode@microsoft.com](mailto:opencode@microsoft.com) 提出任何其他问题或意见。\n\n## 商标说明\n\n本项目可能包含其他项目、产品或服务的商标或标识。经授权使用微软商标或标识必须遵守并遵循 [微软商标与品牌指南](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral)。\n在本项目的修改版本中使用微软商标或标识时，不得造成混淆或暗示微软的赞助关系。\n任何第三方商标或标识的使用均须遵守其各自的政策。","# mup 快速上手指南\n\n**mup** (Maximal Update Parametrization) 是一个用于 PyTorch 的工具包，旨在实现“最大更新参数化”（μP）。它能让神经网络的最优超参数（如学习率）在不同模型宽度下保持稳定。这意味着你可以在小规模模型上调优超参数，然后直接将其迁移到超大模型上进行训练，无需重新搜索。\n\n## 环境准备\n\n*   **操作系统**: Linux, macOS, Windows\n*   **Python**: 3.8 或更高版本\n*   **核心依赖**:\n    *   `PyTorch`: 需安装与你的 CUDA 版本匹配的 PyTorch。\n    *   `torchdistx` (可选但推荐): 用于延迟初始化（deferred initialization），在处理极大模型时可节省显存和内存。\n\n> **国内加速建议**：\n> 推荐使用清华或阿里镜像源安装 PyTorch 和 pip 包，以提升下载速度。\n> ```bash\n> # 设置 pip 使用清华镜像\n> pip config set global.index-url https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n> ```\n\n## 安装步骤\n\n### 方式一：通过 Pip 安装（推荐）\n\n直接使用 pip 安装稳定版：\n\n```bash\npip install mup\n```\n\n### 方式二：从源码安装\n\n如果你需要最新功能或进行开发，可以克隆仓库并安装：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmup.git\ncd mup\npip install -r requirements.txt\npip install -e .\n```\n\n*(可选) 安装延迟初始化工具 `torchdistx`：*\n```bash\npip install torchdistx\n```\n\n## 基本使用\n\n要在现有 PyTorch 项目中启用 μP，只需进行以下四步修改：\n\n### 1. 修改模型定义\n*   将输出层替换为 `MuReadout`。\n*   如果是 Transformer 架构，将 Attention 的缩放因子从 `1\u002Fsqrt(d)` 改为 `8\u002Fd`（为了兼容常见头维度 64）。\n\n```python\nimport torch.nn as nn\nfrom mup import MuReadout\n\nclass MyModel(nn.Module):\n    def __init__(self, width, d_out, ...):\n        super().__init__()\n        # ... 其他层定义 ...\n        \n        # 【修改点 1】替换输出层\n        # readout = nn.Linear(width, d_out)  # 原生写法\n        self.readout = MuReadout(width, d_out) # mup 写法\n        \n        # 如果权重共享 (如 Transformer)，使用:\n        # self.readout = MuSharedReadout(input_layer.weight)\n\n    def forward(self, x, ...):\n        # ... 
前向传播逻辑 ...\n        \n        # 【修改点 2】调整 Attention 缩放 (仅针对 Transformer)\n        # 原生: scores = q @ k.transpose(-2, -1) \u002F math.sqrt(d_k)\n        # mup:  使用 8\u002Fd 代替 1\u002Fsqrt(d) 以保持向后兼容 (当 d=64 时等效)\n        d = query.shape[-1]\n        attention_scores = query @ key.transpose(-2, -1) * 8 \u002F d\n        \n        # ... 后续逻辑 ...\n        return self.readout(x)\n```\n\n### 2. 设置基础形状 (Base Shapes)\n实例化一个极小的“基准模型”（base model）和一个稍大的“增量模型”（delta model），并将它们的形状信息应用到你要训练的目标模型上。**注意：基准模型和增量模型不需要训练。**\n\n```python\nfrom mup import set_base_shapes\n\n# 实例化基准模型 (宽度设为 1)\nbase_model = MyModel(width=1)\n\n# 实例化增量模型 (宽度设为 2，用于识别哪些维度是可扩展的)\ndelta_model = MyModel(width=2)\n\n# 实例化目标模型 (实际要训练的宽度，例如 1024)\nmodel = MyModel(width=1024)\n\n# 【关键步骤】应用 μP 缩放规则\n# 必须在重新初始化参数或定义优化器之前调用\nset_base_shapes(model, base_model, delta=delta_model)\n\n# 可选：如果显存不足，可使用 torchdistx 延迟初始化上述模型\n# from torchdistx.deferred_init import deferred_init\n# base_model = deferred_init(MyModel, width=1)\n# delta_model = deferred_init(MyModel, width=2)\n# model = deferred_init(MyModel, width=1024)\n# set_base_shapes(model, base_model, delta=delta_model)\n```\n\n### 3. 替换初始化函数 (如有自定义初始化)\n如果你在代码中手动调用了 `torch.nn.init` 系列函数，请替换为 `mup.init` 中对应的函数，以确保缩放正确。\n\n```python\nfrom mup import init\n\nfor param in model.parameters():\n    # 原生写法: nn.init.uniform_(param, -0.1, 0.1)\n    # mup 写法:\n    init.uniform_(param, -0.1, 0.1)\n    \n    # 同样适用于 xavier_uniform_, kaiming_normal_ 等\n```\n*注：如果没有自定义初始化逻辑，可跳过此步，`set_base_shapes` 会自动处理标准初始化的重缩放。*\n\n### 4. 
使用专用优化器\n将 `torch.optim` 替换为 `mup` 提供的优化器（如 `MuSGD`, `MuAdam`）。它们会自动根据参数形状调整学习率。\n\n```python\nfrom mup import MuAdam\n\n# 原生写法: optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n# mup 写法:\noptimizer = MuAdam(model.parameters(), lr=1e-3)\n\n# 之后即可像往常一样进行训练循环\n# for batch in dataloader:\n#     ...\n```\n\n完成以上步骤后，你在小模型上调优得到的学习率等超参数，即可直接应用于大规模模型训练。","某大型 AI 实验室团队正在研发一款参数量从亿级向千亿级扩展的垂直领域大语言模型，急需在有限算力下完成超大规模模型的学习率等关键超参数调优。\n\n### 没有 mup 时\n- **试错成本极高**：团队必须直接在千亿参数的大模型上进行多轮超参数搜索，单次实验耗时数天且消耗巨额 GPU 资源。\n- **经验无法复用**：在小规模模型（如 100M 参数）上调优得到的最佳学习率，直接应用到大模型时往往导致训练发散或收敛极慢，缺乏可迁移性。\n- **扩展过程脆弱**：随着模型宽度增加，最优超参数区间剧烈波动，工程师难以判断是模型架构问题还是参数设置不当，排查困难。\n- **手动调整易错**：需要人工根据理论公式修改每一层权重初始化和输出层的缩放比例，代码侵入性强且极易引入细微的数学错误。\n\n### 使用 mup 后\n- **零样本超参数迁移**：只需在极小的代理模型（如宽度为 1 的基础模型）上完成调优，得到的最佳学习率可直接无损迁移至千亿参数大模型，无需在大模型上重复搜索。\n- **训练稳定性显著提升**：mup 自动处理最大更新参数化（μP），确保无论模型宽度如何扩展，梯度更新幅度保持稳定，彻底消除了因扩容导致的训练崩溃风险。\n- **开发效率大幅飞跃**：仅需将普通线性层替换为 `MuReadout` 并调整注意力缩放系数，即可自动获得跨尺度的超参数稳定性，将原本数周的调优周期缩短至几天。\n- **数学实现自动化**：底层自动处理复杂的权重初始化缩放逻辑，避免了人工推导公式带来的实现偏差，让团队能专注于架构创新而非数值调试。\n\nmup 通过独特的参数化方法，让小模型上的调优经验能完美“零样本”迁移到超大模型，从根本上解决了深度学习模型规模化过程中的超参数不稳定难题。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_mup_f37098aa.png","microsoft","Microsoft","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fmicrosoft_4900709c.png","Open source projects and samples from Microsoft",null,"opensource@microsoft.com","OpenAtMicrosoft","https:\u002F\u002Fopensource.microsoft.com","https:\u002F\u002Fgithub.com\u002Fmicrosoft",[82,86],{"name":83,"color":84,"percentage":85},"Jupyter Notebook","#DA5B0B",94,{"name":87,"color":88,"percentage":89},"Python","#3572A5",6,1698,104,"2026-04-15T13:15:30","MIT","","未说明",{"notes":97,"python":95,"dependencies":98},"该工具是一个参数化库，旨在使超参数在不同模型宽度间稳定迁移。安装可通过 pip 或源码进行。若需处理极大模型以节省内存，建议安装可选依赖 `torchdistx` 并使用其延迟初始化功能。在数据并行训练时，必须使用 `torch.nn.parallel.DistributedDataParallel` 而非 `torch.nn.DataParallel`，因为后者会移除 mup 所需的参数属性。保存模型检查点时需特别注意，由于 PyTorch 限制，`infshape` 
属性不会自动保存，加载后需手动重新设置基础形状并关闭参数重缩放。",[99,100],"torch","torchdistx (可选，用于延迟初始化)",[14,35],[103,104,105,106,107,108,64],"python","pytorch","transformers","machine-learning","deep-learning","mutransfer","2026-03-27T02:49:30.150509","2026-04-18T14:15:04.281561",[112,117,122,127,132,136],{"id":113,"question_zh":114,"answer_zh":115,"source_url":116},40109,"为什么我的模型在使用 μTransfer 后性能没有提升，或者坐标检查（Coord Check）看起来不正常？","这通常是因为 `output_mult`（输出乘数）超参数设置不当。模型族应该共享相似的 `output_mult` 值。建议将其作为一个额外的超参数进行搜索，例如尝试 `[2**x for x in range(2, 10)]` 这样的范围。`output_mult` 不需要极其精细的调优，只要数量级正确即可解决问题。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmup\u002Fissues\u002F23",{"id":118,"question_zh":119,"answer_zh":120,"source_url":121},40110,"WeightNorm（权重归一化）是否与 μP 兼容？如果坐标检查不平滑该怎么办？","WeightNorm 在数学上与 μP 的缩放规则不自然兼容。WeightNorm 将隐藏权重矩阵归一化为 Θ(1)，而 μP 理论要求归一化为 Θ(√width)。因此，使用 WeightNorm 会导致坐标检查效果不佳且无法正确扩展。建议不要使用 WeightNorm，可以直接将其替换为 LayerNorm 或 BatchNorm。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmup\u002Fissues\u002F14",{"id":123,"question_zh":124,"answer_zh":125,"source_url":126},40111,"Warmup Ratio（预热比例）是否是可以通过 μTransfer 转移的超参数？","根据用户经验，Warmup Ratio 可能不是完全可转移的。如果在小宽度模型上调优的学习率直接转移到大宽度模型导致发散，尝试增加 Warmup Ratio（例如从 0.01 增加到 0.1）可能会使损失收敛。此外，需确保学习率调度器（lr_scheduler）的配置不会破坏优化器的重新分组（re-group），这是导致训练失败的常见原因。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmup\u002Fissues\u002F40",{"id":128,"question_zh":129,"answer_zh":130,"source_url":131},40112,"在 Electra 风格的多任务模型或复杂架构中，坐标检查失败或 μP 不工作的原因是什么？","一个常见的错误是初始化顺序不对。必须确保在调用 `set_base_shapes` 之后再进行权重初始化（如 `_init_weights`）。如果在 `set_base_shapes` 之前初始化权重，会导致 μP 的缩放规则无法正确应用，从而使坐标检查失效。","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fmup\u002Fissues\u002F27",{"id":133,"question_zh":134,"answer_zh":135,"source_url":126},40113,"是否可以从非 μP 训练的预训练检查点（Checkpoint）开始继续使用 μP 进行微调？","目前官方没有针对此场景的实验数据。理论上，从零开始训练的理解不能直接无损地应用到从非 μP 
训练的检查点继续训练的情况。如果必须加载现有检查点，可能需要考虑将矩阵类张量的参数重新归一化为零均值，但这方面的具体效果和最佳实践尚未明确。",{"id":137,"question_zh":138,"answer_zh":139,"source_url":126},40114,"对于二元分类头（Binary Classification Head），是否必须使用全零初始化？","虽然全零初始化是常见做法，但在二元 Softmax 中，由于梯度特性，两个输出神经元的权重和 logits 会呈现 \"x 和 -x\" 的关系。目前尚不确定这是否有负面影响，也没有强制规定必须全零初始化。如果在 Transformer 预训练中遇到相关问题，可以尝试观察其实际影响，但需注意这并非 μP 的核心限制。",[141],{"id":142,"version":143,"summary_zh":144,"released_at":145},323633,"v1.0.0","初始发布。","2022-03-08T14:11:08"]