[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-philipturner--metal-flash-attention":3,"tool-philipturner--metal-flash-attention":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 
绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 
是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":70,"readme_en":71,"readme_zh":72,"quickstart_zh":73,"use_case_zh":74,"hero_image_url":75,"owner_login":76,"owner_name":77,"owner_avatar_url":78,"owner_bio":79,"owner_company":80,"owner_location":80,"owner_email":81,"owner_twitter":82,"owner_website":80,"owner_url":83,"languages":84,"stars":89,"forks":90,"last_commit_at":91,"license":92,"difficulty_score":93,"env_os":94,"env_gpu":95,"env_ram":96,"env_deps":97,"category_tags":104,"github_topics":105,"view_count":23,"oss_zip_url":80,"oss_zip_packed_at":80,"status":16,"created_at":113,"updated_at":114,"faqs":115,"releases":144},3621,"philipturner\u002Fmetal-flash-attention","metal-flash-attention","FlashAttention (Metal Port)","metal-flash-attention 是将高效的 FlashAttention 算法移植到 Apple Silicon 芯片（如 M1、M4 系列）的开源项目。它旨在解决苹果硬件在运行大规模注意力机制时面临的内存带宽瓶颈和寄存器压力问题，让大模型在 Mac 设备上跑得更快、更省显存。\n\n该项目特别适合需要在苹果硬件上进行大模型推理优化、算法研究或底层开发的工程师与研究人员。与普通用户直接使用的封装软件不同，metal-flash-attention 提供了核心算法的源码实现，便于开发者深入理解并定制注意力机制。\n\n其技术亮点在于针对 Apple 硬件特性做了深度优化：首先，它完全采用运行时即时编译（JIT），摆脱了对特定 Xcode 版本的依赖；其次，重新设计了反向传播算法，虽增加了少量计算量，但实现了行列维度的 100% 并行效率，且内存占用比官方原版更低；最后，为克服大维度下的寄存器限制，创新性地引入了沿维度 D 的分块策略和智能寄存器溢出机制。实测显示，该方案在 M1 Max 上能稳定达到每秒 4400 亿条指令的处理速度，ALU 利用率高达 83%，即便在无限序列长度下也","metal-flash-attention 是将高效的 FlashAttention 算法移植到 Apple Silicon 芯片（如 M1、M4 系列）的开源项目。它旨在解决苹果硬件在运行大规模注意力机制时面临的内存带宽瓶颈和寄存器压力问题，让大模型在 Mac 
设备上跑得更快、更省显存。\n\n该项目特别适合需要在苹果硬件上进行大模型推理优化、算法研究或底层开发的工程师与研究人员。与普通用户直接使用的封装软件不同，metal-flash-attention 提供了核心算法的源码实现，便于开发者深入理解并定制注意力机制。\n\n其技术亮点在于针对 Apple 硬件特性做了深度优化：首先，它完全采用运行时即时编译（JIT），摆脱了对特定 Xcode 版本的依赖；其次，重新设计了反向传播算法，虽增加了少量计算量，但实现了行列维度的 100% 并行效率，且内存占用比官方原版更低；最后，为克服大维度下的寄存器限制，创新性地引入了沿维度 D 的分块策略和智能寄存器溢出机制。实测显示，该方案在 M1 Max 上能稳定达到每秒 4.4 万亿条指令的处理速度，ALU 利用率高达 83%，即便在无限序列长度下也能保持高性能表现。","# FlashAttention (Metal Port)\n\nThis repository ports the official implementation of [FlashAttention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention) to Apple silicon. It is a minimal, maintainable set of source files that reproduces the FlashAttention algorithm.\n\n## Documentation\n\nSingle-headed attention only, to focus on the core bottlenecks of different attention algorithms (register pressure, parallelism). With the basic algorithm done correctly, it should be comparatively trivial to add customizations like block sparsity.\n\nEverything is JIT compiled at runtime. This contrasts with the previous implementation, which relied on an executable embedded in Xcode 14.2.\n\nThe backward pass uses less memory than [Dao-AILab\u002Fflash-attention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention). The official implementation allocates scratch space for atomics and partial sums. Apple hardware lacks native FP32 atomics (`metal::atomic\u003Cfloat>` is emulated). While attempting to circumvent the lack of hardware support, bandwidth and parallelization bottlenecks in the FlashAttention-2 backward kernel were revealed. An alternative backward pass was designed with higher compute cost (7 GEMMs instead of 5 GEMMs). It achieves 100% parallelization efficiency across both the row and column dimensions of the attention matrix. Most importantly, it is easier to code and maintain.\n\nA lot of crazy stuff was done to overcome register pressure bottlenecks. At large head dimensions (e.g. 256), none of the matrix blocks can fit into registers. 
Not even the accumulator can. Therefore, intentional register spilling is done, but in a more optimized way. A third block dimension was added to the attention algorithm, which blocks along `D`. The aspect ratio of attention matrix blocks was warped heavily, to minimize the bandwidth cost of register spilling. For example, 16-32 along the parallelization dimension and 80-128 along the traversal dimension. There is a large parameter file that takes the `D` dimension, and determines which operands can fit into registers. It then assigns a block size that balances many competing bottlenecks.\n\nThe end result is a consistent 4400 gigainstructions per second on M1 Max (83% ALU utilization), at infinite sequence length and infinite head dimension. Provided BF16 emulation is being used for mixed precision (Metal's `bfloat` has IEEE-compliant rounding, a major overhead on older chips without hardware BF16).\n\n![M1_Max_Image.png](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fphilipturner_metal-flash-attention_readme_9ad37a96ea01.png)\n\n![M4_Image.png](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fphilipturner_metal-flash-attention_readme_4a39e667069b.png)\n\nRaw Data: https:\u002F\u002Fdocs.google.com\u002Fspreadsheets\u002Fd\u002F1Xf4jrJ7e19I32J1IWIekGE9uMFTeZKoOpQ6hlUoh-xY\u002Fedit?usp=sharing\n\n## Quantifying Performance\n\nIn the AI field, performance is most often reported in giga-floating point operations per second (GFLOPS). This metric reflects a simplified model of performance, that every instruction occurs in GEMM. As hardware has advanced from early FPUs to modern vector processors, the most common floating-point operations were fused into a single instruction. Fused multiply-add (FMA). When one multiplies two 100x100 matrices, 1 million FMA instructions are issued. Why must we treat this FMA as two separate instructions?\n\nThis question is relevant to attention, where not all floating point operations are created equal. 
The exponentiation during softmax occurs in a single clock cycle, granted that most of the other instructions go to the FMA unit. Some of the multiplies and adds during softmax, cannot be fused with a nearby add or multiply. Should we treat these the same as FMA, and pretend the hardware is just executing the FMA two times slower? It is unclear how the GEMM performance model can explain whether my shader is using the ALU hardware effectively.\n\nInstead of gigaflops, I use gigainstructions to understand how well the shader is performing. It maps more directly to the algorithm. For example, one GEMM is `N^3` FMA instructions. Forward attention performs two matrix multiplies, or `2 * D * N^2` FMA instructions. Backward attention (by the [Dao-AILab\u002Fflash-attention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention) implementation) is `5 * D * N^2` FMA instructions. Try comparing this table to roofline models in the Flash1, Flash2, or Flash3 papers.\n\n| Operation   | Work |\n| :---------- | ---: |\n| Square GEMM | `N^3`  |\n| Forward Attention | `(2D + 5) * N^2` |\n| Backward Naive Attention | `4D * N^2` |\n| Backward FlashAttention | `(5D + 5) * N^2` |\n| FWD + BWD Combined | `(7D + 10) * N^2` | \n\nDue to the complexity of FP32 atomics, MFA used a different approach for backward pass. This one has higher compute cost. It splits the backward pass into two separate kernels: `dQ` and `dK\u002FdV`. A dropdown shows the pseudocode. 
Compare this to one of the algorithms in the Flash1, Flash2, or Flash3 papers.\n\n| Operation   | Work |\n| :---------- | ---: |\n| Forward | `(2D + 5) * N^2` |\n| Backward dQ | `(3D + 5) * N^2` |\n| Backward dK\u002FdV | `(4D + 5) * N^2` |\n| FWD + BWD Combined | `(9D + 15) * N^2` | \n\n\u003Cdetails>\n\u003Csummary>Algorithm Pseudocode\u003C\u002Fsummary>\n\n```swift\n\u002F\u002F Forward\n\u002F\u002F   for c in 0..\u003CC {\n\u002F\u002F     load K[c]\n\u002F\u002F     S = Q * K^T\n\u002F\u002F     (m, l, P) = softmax(m, l, S * scaleFactor)\n\u002F\u002F\n\u002F\u002F     O *= correction\n\u002F\u002F     load V[c]\n\u002F\u002F     O += P * V\n\u002F\u002F   }\n\u002F\u002F   O \u002F= l\n\u002F\u002F\n\u002F\u002F   L = m + logBaseE(l)\n\u002F\u002F\n\u002F\u002F Backward Query\n\u002F\u002F   D = dO * O\n\u002F\u002F\n\u002F\u002F   for c in 0..\u003CC {\n\u002F\u002F     load K[c]\n\u002F\u002F     S = Q * K^T\n\u002F\u002F     P = exp(S - L)\n\u002F\u002F\n\u002F\u002F     load V[c]\n\u002F\u002F     dP = dO * V^T\n\u002F\u002F     dS = P * (dP - D) * scaleFactor\n\u002F\u002F\n\u002F\u002F     load K[c]\n\u002F\u002F     dQ += dS * K\n\u002F\u002F   }\n\u002F\u002F\n\u002F\u002F Backward Key-Value\n\u002F\u002F   for r in 0..\u003CR {\n\u002F\u002F     load Q[r]\n\u002F\u002F     load L[r]\n\u002F\u002F     S^T = K * Q^T\n\u002F\u002F     P^T = exp(S^T - L)\n\u002F\u002F\n\u002F\u002F     load dO[r]\n\u002F\u002F     dV += P^T * dO\n\u002F\u002F\n\u002F\u002F     load dO[r]\n\u002F\u002F     load D[r]\n\u002F\u002F     dP^T = V * dO^T\n\u002F\u002F     dS^T = P^T * (dP^T - D) * scaleFactor\n\u002F\u002F\n\u002F\u002F     load Q[r]\n\u002F\u002F     dK += dS^T * Q\n\u002F\u002F   }\n```\n\n\u003C\u002Fdetails>\n\nPerformance is measured by calculating the amount of compute work, then dividing by seconds. The end result is \"gigainstructions per second\". Next, we need a roofline model. 
The table below shows rooflines for GINSTRS, calculated as half of GFLOPS. ALU utilization is (actual gigainstructions per second) \u002F (expected gigainstructions per second). For example, M1 Max typically achieves 80% ALU utilization with mixed precision. \n\nThere are limits to this model. It breaks down with the M3 generation at small head dimensions. Different compute units might be utilized simultaneously, making the apparent utilization over 100%. For the most part, the benchmark provides an accurate model of how much performance is left on the table.\n\n```swift\nvar operations: Int\nswitch benchmarkedKernel {\ncase .forward:\n  operations = 2 * headDimension + 5\ncase .backwardQuery:\n  operations = 3 * headDimension + 5\ncase .backwardKeyValue:\n  operations = 4 * headDimension + 5\n}\noperations *= (sequenceDimension * sequenceDimension)\noperations *= dispatchCount\n\n\u002F\u002F Divide the work by the latency, resulting in throughput.\nlet instrs = Double(operations) \u002F Double(latencySeconds)\nlet ginstrs = Int(instrs \u002F 1e9)\n```\n\n| Hardware | GFLOPS | GINSTRS |\n| :------- | -----: | ------: |\n| M1 Max   | 10616  | 5308    |\n| M4       | 3580   | 1790    |\n\nHow well does the Metal port compare to the official FlashAttention repository? Imagine I went with the \"atomic dQ\" algorithm and achieved 100% performance. Then, switched to the actual MFA repo and found model training to be 4x slower. That would be 25% of the roofline from the official repository. To get this percentage, multiply the average ALU utilization across all three kernels by `7 \u002F 9`. A more nuanced model was used for the statistics on Apple hardware, but this is the gist of it.\n\nTo calculate utilization of Nvidia hardware, I used GFLOPS for FP16\u002FBF16 ALUs. I divided the highest GFLOPS from each graph in the paper by 312000 (A100 SXM), 989000 (H100 SXM). 
Notice that, for larger head dimensions and register intensive kernels (backward pass), no benchmarks were reported. I confirmed they did not solve the register pressure issue at infinite head dimensions. For example, the accumulator is always held in registers. At the time of writing, I had not seen concrete evidence of D=256 backward gradient executing with correct results.\n\n### GFLOPS\n\n| A100, Flash2, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| Forward            | 192000  | 223000  | 0       |\n| Backward           | 170000  | 196000  | 0       |\n| Forward + Backward | 176000  | 203000  | 0       |\n\n| H100, Flash3, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| Forward            | 497000  | 648000  | 756000  |\n| Backward           | 474000  | 561000  | 0       |\n| Forward + Backward | 480000  | 585000  | 0       |\n\n| H100, Flash3, FP8  | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| Forward            | 613000  | 1008000 | 1171000 |\n| Backward           | 0       | 0       | 0       |\n| Forward + Backward | 0       | 0       | 0       |\n\n### Compute Utilization\n\n| A100, Flash2, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| Forward            | 62%     | 71%     | 0%      |\n| Forward + Backward | 56%     | 65%     | 0%      |\n\n| H100, Flash3, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| Forward            | 50%     | 66%     | 76%     |\n| Forward + Backward | 48%     | 59%     | 0%      |\n\n| M1 Architecture, FP16 | D = 64  | D = 128 | D = 256 |\n| :-------------------- | ------: | ------: | ------: |\n| Forward               | 86%     | 85%     | 86%     |\n| Forward + Backward    | 62%     | 63%     | 64%     |\n\n| M3 Architecture, FP16 | D = 64  | D = 128 | D = 256 |\n| :-------------------- | ------: | 
------: | ------: |\n| Forward               | 94%     | 91%     | 82%     |\n| Forward + Backward    | 71%     | 69%     | 61%     |\n\n### Side by Side\n\n| Hardware Produced in 2020 | D = 64  | D = 128 | D = 256 |\n| :------------------------ | ------: | ------: | ------: |\n| A100                      | 56%     | 65%     | 0%      |\n| M1&mdash;M2 Architecture  | 62%     | 63%     | 64%     |\n\n| Hardware Produced in 2023 | D = 64  | D = 128 | D = 256 |\n| :------------------------ | ------: | ------: | ------: |\n| H100 (using FP8 GFLOPS)   | 24%     | 30%     | 0%      |\n| H100 (using FP16 GFLOPS)  | 48%     | 59%     | 0%      |\n| M3&mdash;M4 Architecture  | 71%     | 69%     | 61%     |\n\nDespite issuing more computations, Apple hardware is training transformers \u003Cb>faster than Nvidia hardware doing the same work\u003C\u002Fb>. Normalizing for the difference in size between different GPUs. Just focusing on how efficiently the GPU is utilized.\n\nPerhaps the main repository should try the algorithm that avoids FP32 atomics and deliberately spills registers when they cannot fit in the GPU core. This seems unlikely, as they have hard-coded support for a small subset of the possible problem sizes. The motivation seems to be supporting the most common models, where `D` is a power of 2, and less than 128. For anything else, users need to rely on alternative fallback implementations (e.g. the MFA repository), which might use a completely different underlying algorithm.\n\n## Usage\n\n### Setting Up Workflow\n\nOn macOS, download the Swift package and compile with `-Xswiftc -Ounchecked`. This compiler option is needed for performance-sensitive CPU code. Release mode cannot be used because it forces the entire codebase to be recompiled from scratch, every time there is a single change. Navigate to the Git repo in Finder and double-click `Package.swift`. An Xcode window should pop up. On the left, there should be a hierarchy of files. 
If you cannot unravel the hierarchy, something went wrong.\n\n```\ngit clone https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\nswift build -Xswiftc -Ounchecked # Does it even compile?\nswift test -Xswiftc -Ounchecked # Does the test suite finish in ~10 seconds?\n```\n\nAlternatively, create a new Xcode project with the SwiftUI template. Override the `\"Hello, world!\"` string with a call to a function that returns a `String`. This function will execute the script of your choosing, then call `exit(0)`, so the app crashes before rendering anything to the screen. You will use the output in the Xcode console as feedback about your code. This workflow is compatible with both macOS and iOS.\n\nAdd the `-Xswiftc -Ounchecked` option through \u003Cb>Project\u003C\u002Fb> > your project's name > \u003Cb>Build Settings\u003C\u002Fb> > \u003Cb>Swift Compiler - Code Generation\u003C\u002Fb> > \u003Cb>Optimization Level\u003C\u002Fb>. The second column of the table lists your project's name. Click \u003Cb>Other\u003C\u002Fb> in the dropdown and type `-Ounchecked` in the panel that appears. Next, add this repository as a Swift package dependency. Look through some of the tests under `Tests\u002FFlashAttention`. Copy the raw source code for one of these tests into your project. Invoke the test from the function in the previous paragraph. Examine what it displays on the console.\n\nTo modify the Metal code generation (e.g. add multi-head or mask support), copy the raw Swift code into your Xcode project. Either use `git clone` in a separate folder, or download the raw files on GitHub as a ZIP. There is also a way to link to your fork of `metal-flash-attention` and autosave your changes to the cloud, but this is more difficult to set up. Remove the Swift package dependency from the previous paragraph. Re-run the test of your choosing. 
Does it compile and display something in the console?\n\n### Editing Source Code\n\nLocate one of the multi-line string literals in either of these folders:\n\n```\nSources\u002FFlashAttention\u002FAttention\u002FAttentionKernel\nSources\u002FFlashAttention\u002FGEMM\u002FGEMMKernel\n```\n\nAdd random text to one of them. Compile and run the project again. Something should go terribly wrong. For example, the Metal compiler may throw an error. If this does not happen, try messing up a different line of code somewhere else. If the test still passes, Xcode is not registering your changes.\n\nProceed with coding up [block sparsity](https:\u002F\u002Fpytorch.org\u002Fblog\u002Fflexattention\u002F) or something. Get feedback about whether the code works at all, whether it works fast, whether it works fast at every problem size. Integrate the raw source code into your app, or translate it to another programming language.\n","# FlashAttention（Metal移植版）\n\n本仓库将 [FlashAttention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention) 的官方实现移植到了 Apple 芯片上。它是一组极简且易于维护的源文件，完整复现了 FlashAttention 算法。\n\n## 文档\n\n目前仅支持单头注意力机制，以便聚焦于不同注意力算法的核心瓶颈问题（寄存器压力、并行度）。在基础算法正确实现的基础上，添加诸如块稀疏等自定义功能应当相对简单。\n\n所有代码都在运行时进行即时编译（JIT）。这与之前的实现形成对比——后者依赖于内嵌在 Xcode 14.2 中的可执行文件。\n\n反向传播所需的内存占用低于 [Dao-AILab\u002Fflash-attention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention)。官方实现会为原子操作和部分求和分配临时工作空间。然而，Apple 硬件并不原生支持 FP32 原子操作（`metal::atomic\u003Cfloat>` 是通过模拟实现的）。在尝试绕过这一硬件限制的过程中，我们发现了 FlashAttention-2 反向核函数中存在的带宽和并行化瓶颈。为此，我们设计了一种计算开销更高的替代反向传播方案：使用 7 次 GEMM 替代原来的 5 次 GEMM。该方案能够在注意力矩阵的行和列两个维度上均达到 100% 的并行效率。更重要的是，它的代码更简洁易读，也更便于维护。\n\n为了克服寄存器压力带来的瓶颈，我们采取了许多非常规的优化手段。当头维度较大时（例如 256），任何矩阵分块都无法完全放入寄存器中，甚至连累加器也无法容纳。因此，我们采用了有意识的寄存器溢出策略，但进行了更为优化的设计。我们在注意力算法中引入了第三个分块维度，并沿 `D` 维度进行分块。同时，我们大幅调整了注意力矩阵分块的长宽比，以最小化寄存器溢出带来的带宽开销。例如，在并行化方向上设置为 16–32，而在遍历方向上设置为 80–128。此外，我们还提供了一个大型参数配置文件，用于指定 `D` 维度的大小，并据此决定哪些操作数可以放入寄存器中，从而选择一个能够平衡多种竞争性瓶颈的分块尺寸。\n\n最终结果是在 M1 Max 上实现了稳定的每秒 44000 亿次指令执行速度（ALU 利用率为 
83%），并且支持无限序列长度和无限头维度。前提是混合精度计算中使用 BF16 模拟运算（Metal 的 `bfloat` 实现遵循 IEEE 规范的舍入规则，这在不支持硬件 BF16 的旧款芯片上会造成较大的性能开销）。\n\n![M1_Max_Image.png](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fphilipturner_metal-flash-attention_readme_9ad37a96ea01.png)\n\n![M4_Image.png](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fphilipturner_metal-flash-attention_readme_4a39e667069b.png)\n\n原始数据：https:\u002F\u002Fdocs.google.com\u002Fspreadsheets\u002Fd\u002F1Xf4jrJ7e19I32J1IWIekGE9uMFTeZKoOpQ6hlUoh-xY\u002Fedit?usp=sharing\n\n## 量化性能\n\n在人工智能领域，性能通常以每秒十亿次浮点运算（GFLOPS）来报告。这一指标反映了一种简化的性能模型，即所有指令都在 GEMM 中执行。随着硬件从早期的 FPU 发展到现代的向量处理器，最常见的浮点运算已被融合为一条单独的指令——融合乘加（FMA）。当对两个 100×100 的矩阵进行相乘时，会发出 100 万条 FMA 指令。那么，为什么我们要将这条 FMA 指令视为两条独立的指令呢？\n\n这个问题与注意力机制密切相关，因为并非所有的浮点运算都是一样的。在 softmax 过程中的指数运算可以在一个时钟周期内完成，前提是大多数其他指令都被送往 FMA 单元。然而，softmax 过程中的一些乘法和加法操作无法与附近的加法或乘法操作融合。我们是否应该把这些操作也当作 FMA 来对待，并假装硬件只是以一半的速度执行 FMA 呢？目前尚不清楚 GEMM 性能模型如何解释我的着色器是否有效地利用了 ALU 硬件。\n\n因此，我改用每秒十亿次指令（GINSTRS）来衡量着色器的性能表现，因为它更直接地对应于算法本身。例如，一次 GEMM 计算需要 `N^3` 条 FMA 指令。前向注意力机制执行两次矩阵乘法，即 `2 * D * N^2` 条 FMA 指令。而后向注意力机制（由 [Dao-AILab\u002Fflash-attention](https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention) 实现）则需要 `5 * D * N^2` 条 FMA 指令。不妨将这张表与 Flash1、Flash2 或 Flash3 论文中的屋顶模型进行对比。\n\n| 操作           | 工作量 |\n| :-------------- | -------: |\n| 正方形 GEMM     | `N^3`  |\n| 前向注意力      | `(2D + 5) * N^2` |\n| 后向朴素注意力  | `4D * N^2` |\n| 后向 FlashAttention | `(5D + 5) * N^2` |\n| 前向 + 后向合并  | `(7D + 10) * N^2` |\n\n由于 FP32 原子操作的复杂性，MFA 在后向传播中采用了不同的方法，这导致更高的计算开销。它将后向传播拆分为两个独立的内核：`dQ` 和 `dK\u002FdV`。下拉菜单中展示了伪代码，可以将其与 Flash1、Flash2 或 Flash3 论文中的一种算法进行比较。\n\n| 操作           | 工作量 |\n| :-------------- | -------: |\n| 前向            | `(2D + 5) * N^2` |\n| 后向 dQ         | `(3D + 5) * N^2` |\n| 后向 dK\u002FdV      | `(4D + 5) * N^2` |\n| 前向 + 后向合并  | `(9D + 15) * N^2` |\n\n\u003Cdetails>\n\u003Csummary>算法伪代码\u003C\u002Fsummary>\n\n```swift\n\u002F\u002F 前向\n\u002F\u002F   for c in 0..\u003CC {\n\u002F\u002F     加载 
K[c]\n\u002F\u002F     S = Q * K^T\n\u002F\u002F     (m, l, P) = softmax(m, l, S * 缩放因子)\n\u002F\u002F\n\u002F\u002F     O *= 校正系数\n\u002F\u002F     加载 V[c]\n\u002F\u002F     O += P * V\n\u002F\u002F   }\n\u002F\u002F   O \u002F= l\n\u002F\u002F\n\u002F\u002F   L = m + logBaseE(l)\n\u002F\u002F\n\u002F\u002F 后向查询\n\u002F\u002F   D = dO * O\n\u002F\u002F\n\u002F\u002F   for c in 0..\u003CC {\n\u002F\u002F     加载 K[c]\n\u002F\u002F     S = Q * K^T\n\u002F\u002F     P = exp(S - L)\n\u002F\u002F\n\u002F\u002F     加载 V[c]\n\u002F\u002F     dP = dO * V^T\n\u002F\u002F     dS = P * (dP - D) * 缩放因子\n\u002F\u002F\n\u002F\u002F     加载 K[c]\n\u002F\u002F     dQ += dS * K\n\u002F\u002F   }\n\u002F\u002F\n\u002F\u002F 后向键值\n\u002F\u002F   for r in 0..\u003CR {\n\u002F\u002F     加载 Q[r]\n\u002F\u002F     加载 L[r]\n\u002F\u002F     S^T = K * Q^T\n\u002F\u002F     P^T = exp(S^T - L)\n\u002F\u002F\n\u002F\u002F     加载 dO[r]\n\u002F\u002F     dV += P^T * dO\n\u002F\u002F\n\u002F\u002F     加载 dO[r]\n\u002F\u002F     加载 D[r]\n\u002F\u002F     dP^T = V * dO^T\n\u002F\u002F     dS^T = P^T * (dP^T - D) * 缩放因子\n\u002F\u002F\n\u002F\u002F     加载 Q[r]\n\u002F\u002F     dK += dS^T * Q\n\u002F\u002F   }\n```\n\n\u003C\u002Fdetails>\n\n性能是通过计算总的计算工作量，再除以时间（秒）来衡量的。最终结果就是“每秒十亿次指令”。接下来，我们需要一个屋顶模型。下表显示了基于 GINSTRS 的屋顶线，其数值为 GFLOPS 的一半。ALU 利用率等于实际每秒十亿次指令数除以理论上的每秒十亿次指令数。例如，M1 Max 在混合精度下通常能达到 80% 的 ALU 利用率。\n\n不过，这种模型也有局限性。在 M3 芯片上，当头维度较小时，该模型就会失效。不同的计算单元可能会同时被利用，从而导致表观利用率超过 100%。尽管如此，总体而言，该基准测试仍能较为准确地反映出系统还有多少性能潜力尚未被挖掘。\n\n```swift\nvar operations: Int\nswitch benchmarkedKernel {\ncase .forward:\n  operations = 2 * headDimension + 5\ncase .backwardQuery:\n  operations = 3 * headDimension + 5\ncase .backwardKeyValue:\n  operations = 4 * headDimension + 5\n}\noperations *= (sequenceDimension * sequenceDimension)\noperations *= dispatchCount\n\n\u002F\u002F 将总工作量除以延迟时间，得到吞吐量。\nlet instrs = Double(operations) \u002F Double(latencySeconds)\nlet ginstrs = Int(instrs \u002F 1e9)\n```\n\n| 硬件       | 
GFLOPS | GINSTRS |\n| :--------- | -------: | --------: |\n| M1 Max     | 10616  | 5308    |\n| M4         | 3580   | 1790    |\n\nMetal 版本与官方 FlashAttention 仓库相比，性能如何呢？假设我采用了“原子 dQ”算法并达到了 100% 的性能，随后切换到实际的 MFA 仓库，却发现模型训练速度慢了四倍。这意味着我只发挥了官方仓库屋顶线的 25%。要计算这个百分比，只需将三个内核的平均 ALU 利用率乘以 `7 \u002F 9` 即可。虽然针对 Apple 硬件的统计使用了更为精细的模型，但大致思路就是这样。\n\n为了计算 Nvidia 硬件的利用率，我使用了 FP16\u002FBF16 ALU 的 GFLOPS 数据。我将论文中每张图给出的最高 GFLOPS 数值分别除以 A100 SXM 的 312000 和 H100 SXM 的 989000。需要注意的是，对于较大的头维度和寄存器密集型的内核（如后向传播），并未报告任何基准测试结果。我确认他们并未解决无限头维度下的寄存器压力问题。例如，累加器始终保存在寄存器中。截至撰写本文时，我尚未看到 D=256 的后向梯度能够正确执行的具体证据。\n\n### GFLOPS\n\n| A100, Flash2, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| 前向              | 192000  | 223000  | 0       |\n| 后向              | 170000  | 196000  | 0       |\n| 前向 + 后向        | 176000  | 203000  | 0       |\n\n| H100, Flash3, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| 前向              | 497000  | 648000  | 756000  |\n| 后向              | 474000  | 561000  | 0       |\n| 前向 + 后向        | 480000  | 585000  | 0       |\n\n| H100, Flash3, FP8  | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| 前向              | 613000  | 1008000 | 1171000 |\n| 后向              | 0       | 0       | 0       |\n| 前向 + 后向        | 0       | 0       | 0       |\n\n### 计算利用率\n\n| A100, Flash2, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| 前向传播            | 62%     | 71%     | 0%      |\n| 前向传播 + 反向传播 | 56%     | 65%     | 0%      |\n\n| H100, Flash3, FP16 | D = 64  | D = 128 | D = 256 |\n| :----------------- | ------: | ------: | ------: |\n| 前向传播            | 50%     | 66%     | 76%     |\n| 前向传播 + 反向传播 | 48%     | 59%     | 0%      |\n\n| M1 架构, FP16 | D = 64  | D = 128 | D = 256 |\n| :------------- | ------: | ------: | ------: |\n| 前向传播       | 86%     | 85%     | 86%     |\n| 前向传播 + 反向传播 | 62%     | 63%     | 64%    
 |\n\n| M3 架构, FP16 | D = 64  | D = 128 | D = 256 |\n| :------------- | ------: | ------: | ------: |\n| 前向传播       | 94%     | 91%     | 82%     |\n| 前向传播 + 反向传播 | 71%     | 69%     | 61%     |\n\n### 并排对比\n\n| 2020 年生产的硬件 | D = 64  | D = 128 | D = 256 |\n| :---------------- | ------: | ------: | ------: |\n| A100              | 56%     | 65%     | 0%      |\n| M1–M2 架构        | 62%     | 63%     | 64%     |\n\n| 2023 年生产的硬件 | D = 64  | D = 128 | D = 256 |\n| :---------------- | ------: | ------: | ------: |\n| H100（使用 FP8 GFLOPS）   | 24%     | 30%     | 0%      |\n| H100（使用 FP16 GFLOPS）  | 48%     | 59%     | 0%      |\n| M3–M4 架构              | 71%     | 69%     | 61%     |\n\n尽管发出的计算量更多，但苹果硬件在训练 Transformer 模型时的速度\u003Cb>比执行相同任务的英伟达硬件更快\u003C\u002Fb>。这一比较已根据不同 GPU 的规模差异进行了归一化处理，仅关注 GPU 的利用效率。\n\n或许主仓库应该尝试一种避免 FP32 原子操作、并在寄存器无法容纳于 GPU 核心时主动溢出的算法。不过这种可能性不大，因为他们已经为一小部分可能的问题规模硬编码了支持。其动机似乎是支持最常见的模型，即 `D` 为 2 的幂且小于 128 的情况。对于其他情况，用户需要依赖替代的回退实现（例如 MFA 仓库），而这些实现可能会采用完全不同的底层算法。\n\n## 使用方法\n\n### 设置工作流程\n\n在 macOS 上，下载 Swift 包并使用 `-Xswiftc -Ounchecked` 进行编译。此编译选项对性能敏感的 CPU 代码至关重要。不能使用 Release 模式，因为它会在每次有哪怕一处改动时，强制从头重新编译整个代码库。在 Finder 中导航到 Git 仓库并双击 `Package.swift`，应会弹出一个 Xcode 窗口。左侧应显示文件层级结构。如果无法展开该结构，则说明设置出现了问题。\n\n```\ngit clone https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\nswift build -Xswiftc -Ounchecked # 是否能成功编译？\nswift test -Xswiftc -Ounchecked # 测试套件是否能在约 10 秒内完成？\n```\n\n或者，可以使用 SwiftUI 模板创建一个新的 Xcode 项目，将默认的 `\"Hello, world!\"` 字符串替换为调用一个返回 `String` 的函数。该函数将执行你选择的脚本，然后调用 `exit(0)`，使应用在渲染任何内容之前崩溃。你可以通过 Xcode 控制台输出来获取代码反馈。此工作流程同时适用于 macOS 和 iOS。\n\n通过 \u003Cb>项目\u003C\u002Fb> > 你的项目名称 > \u003Cb>构建设置\u003C\u002Fb> > \u003Cb>Swift 编译器 - 代码生成\u003C\u002Fb> > \u003Cb>优化级别\u003C\u002Fb> 添加 `-Xswiftc -Ounchecked` 选项。表格的第二列显示你的项目名称。点击下拉菜单中的 \u003Cb>其他\u003C\u002Fb>,并在出现的面板中输入 `-Ounchecked`。接下来，将此仓库添加为 Swift 包依赖项。浏览一下 `Tests\u002FFlashAttention` 下的一些测试，将其中一个测试的原始源代码复制到你的项目中，并从上一段提到的函数中调用该测试，检查控制台输出的内容。\n\n要修改 Metal 代码生成（例如添加多头或掩码支持），可将原始 
Swift 代码复制到你的 Xcode 项目中。你可以选择在单独的文件夹中使用 `git clone`，或直接从 GitHub 下载 ZIP 格式的原始文件。此外，还可以链接到你自己的 `metal-flash-attention` 分支，并自动将更改保存到云端，但这种方式设置起来较为复杂。移除上一段中的 Swift 包依赖项，然后重新运行你选择的测试。它能否成功编译并在控制台显示内容？\n\n### 编辑源代码\n\n在以下任一文件夹中找到一个多行字符串字面量：\n\n```\nSources\u002FFlashAttention\u002FAttention\u002FAttentionKernel\nSources\u002FFlashAttention\u002FGEMM\u002FGEMMKernel\n```\n\n向其中任意一个添加随机文本，然后再次编译并运行项目。此时应该会出现严重错误，例如 Metal 编译器抛出异常。如果未发生这种情况，请尝试修改其他地方的代码行。若测试仍能通过，则说明 Xcode 并未识别你的更改。\n\n接下来可以继续开发 [块稀疏性](https:\u002F\u002Fpytorch.org\u002Fblog\u002Fflexattention\u002F) 或其他功能，以验证代码是否有效、运行速度如何以及在各种问题规模下是否都能保持高效。最后，将原始源代码集成到你的应用中，或将其转换为其他编程语言。","# metal-flash-attention 快速上手指南\n\n`metal-flash-attention` 是官方 FlashAttention 算法在 Apple Silicon (M1\u002FM2\u002FM3\u002FM4 等) 芯片上的 Metal 移植版本。它专注于核心性能瓶颈优化，支持即时编译 (JIT)，并在大维度头 (Head Dimension) 下通过独特的寄存器溢出策略实现了极高的 ALU 利用率。\n\n## 环境准备\n\n*   **操作系统**: macOS (推荐最新版本)\n*   **硬件**: Apple Silicon 芯片 (M1, M2, M3, M4 系列)\n*   **开发工具**:\n    *   Xcode (需包含命令行工具)\n    *   Swift 5.9 或更高版本\n*   **依赖**: 无额外第三方依赖，仅需标准 Swift 包管理器。\n\n## 安装步骤\n\n1.  **克隆仓库**\n    打开终端，执行以下命令下载源码：\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\n    cd metal-flash-attention\n    ```\n\n2.  **编译项目**\n    为了获得最佳性能，必须使用 `-Ounchecked` 优化标志进行编译。该标志针对性能敏感的 CPU 代码进行了优化，但会跳过部分安全检查（仅限受信任的代码库）。\n    ```bash\n    swift build -Xswiftc -Ounchecked\n    ```\n\n3.  **运行测试（可选）**\n    验证安装是否成功及核心算法是否正确：\n    ```bash\n    swift test -Xswiftc -Ounchecked\n    ```\n\n4.  **Xcode 集成（可选）**\n    如果你偏好使用 Xcode 进行开发或调试：\n    *   在 Finder 中进入项目目录。\n    *   双击 `Package.swift` 文件。\n    *   Xcode 将自动打开并解析项目结构。\n\n## 基本使用\n\n该项目主要作为底层内核库供其他 Swift 项目调用。以下是一个最简单的集成示例，展示如何在你的 Swift 项目中引入并使用它。\n\n### 1. 
添加依赖\n\n在你的项目根目录下的 `Package.swift` 文件中，添加 `metal-flash-attention` 作为依赖：\n\n```swift\n\u002F\u002F Package.swift\ndependencies: [\n    .package(url: \"https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention.git\", branch: \"main\")\n],\ntargets: [\n    .executableTarget(\n        name: \"YourApp\",\n        dependencies: [\n            .product(name: \"FlashAttention\", package: \"metal-flash-attention\")\n        ]\n    )\n]\n```\n\n### 2. 代码示例\n\n在你的 Swift 源代码中导入模块并调用注意力机制。由于该库采用 JIT 编译，运行时会根据当前的序列长度和头维度动态生成优化的 Metal 着色器。\n\n```swift\nimport FlashAttention\nimport Metal\n\n\u002F\u002F 初始化 Metal 设备\nlet device = MTLCreateSystemDefaultDevice()!\nlet commandQueue = device.makeCommandQueue()!\n\n\u002F\u002F 配置参数 (示例)\nlet headDimension = 128\nlet sequenceLength = 2048\nlet batchSize = 1\n\n\u002F\u002F 创建输入张量 (此处仅为伪代码示意，实际需分配 Metal Buffer)\n\u002F\u002F let queryBuffer = ...\n\u002F\u002F let keyBuffer = ...\n\u002F\u002F let valueBuffer = ...\n\n\u002F\u002F 执行前向传播 (Forward Pass)\n\u002F\u002F 注意：具体 API 调用请参考库中暴露的最新接口，核心逻辑如下：\n\u002F*\nlet attention = FlashAttention(device: device)\nlet outputBuffer = attention.forward(\n    query: queryBuffer,\n    key: keyBuffer,\n    value: valueBuffer,\n    headDim: headDimension,\n    seqLen: sequenceLength\n)\n*\u002F\n\nprint(\"FlashAttention kernel compiled and ready on Apple Silicon.\")\n```\n\n> **提示**: 该库针对无限序列长度和无限头维度进行了优化，特别是在 `D=256` 等大维度场景下，通过特殊的分块策略避免了寄存器压力瓶颈，性能表现优于传统实现。","一位 iOS 开发者正在 Apple Silicon 设备上部署并优化一个长文本生成的本地大语言模型应用。\n\n### 没有 metal-flash-attention 时\n- **显存爆炸导致崩溃**：处理长序列（如万字文档）时，标准 Attention 机制需要分配巨大的中间缓存，极易触发 M1\u002FM2 芯片的内存上限导致应用闪退。\n- **推理速度缓慢**：由于缺乏针对 Metal 后向传播的深度优化，反向计算时带宽受限，生成每个 token 的延迟极高，用户等待时间过长。\n- **寄存器压力瓶颈**：在处理高维头（Head Dimension 256+）时，数据无法完全放入寄存器，频繁的内存读写导致算力利用率低下，ALU 闲置严重。\n- **开发维护困难**：旧方案依赖嵌入 Xcode 的可执行文件，难以动态调整或集成自定义的稀疏注意力模块，迭代新功能成本高昂。\n\n### 使用 metal-flash-attention 后\n- **无限序列长度支持**：通过优化的分块算法和更低的反向传播内存占用，即使在“无限”序列长度下也能稳定运行，彻底消除长文本处理的显存焦虑。\n- **极致算力释放**：在 M1 Max 上实现了每秒约 4.4 万亿条指令的稳定输出，ALU 利用率高达 83%，显著降低了长文本生成的延迟。\n- 
**智能寄存器溢出管理**：针对大维度头设计了特殊的第三维分块策略，以最小的带宽代价处理寄存器溢出，确保高维计算依然高效流畅。\n- **运行时灵活编译**：所有代码均在运行时即时编译（JIT），无需依赖特定 Xcode 版本，让开发者能轻松添加块稀疏性等自定义功能。\n\nmetal-flash-attention 通过重构底层并行策略与内存管理，让 Apple 设备上的长上下文大模型推理从“不可用”变为“高性能且稳定”。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fphilipturner_metal-flash-attention_be812ca5.png","philipturner","Philip Turner","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fphilipturner_a8917c9f.jpg","Molecular Nanotechnology Researcher",null,"philipturner.AR@gmail.com","philipturnerar","https:\u002F\u002Fgithub.com\u002Fphilipturner",[85],{"name":86,"color":87,"percentage":88},"Swift","#F05138",100,596,38,"2026-03-30T21:13:52","MIT",4,"macOS","必需 Apple Silicon GPU (M1, M2, M3, M4 系列等)，不支持 NVIDIA CUDA","未说明",{"notes":98,"python":99,"dependencies":100},"该项目是 FlashAttention 的 Metal 移植版，专为 Apple Silicon 芯片设计。主要使用 Swift 语言开发，需在 macOS 上通过 Xcode 或 Swift 包管理器编译运行（建议使用 `-Ounchecked` 优化选项）。不支持 Linux 或 Windows，也不依赖 Python 或 CUDA。代码在运行时进行 JIT 编译，针对大头部维度（如 D=256）进行了寄存器溢出优化。","不适用 (基于 Swift)",[101,102,103],"Xcode 14.2+","Swift 编译器","Metal",[14,26],[106,107,108,109,110,111,112],"artificial-intelligence","attention-mechanism","high-performance-computing","metal","stable-diffusion","transformer-models","software-engineering","2026-03-27T02:49:30.150509","2026-04-06T07:22:58.235508",[116,121,126,131,135,140],{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},16619,"如何在 Python 或 PyTorch 代码中使用此 Flash Attention 库？","目前官方没有直接的 Python 绑定。维护者建议使用 Tinygrad 使用的 PyObjC 绑定以获得更底层的控制（如设置每个线程组的最大线程数）。对于希望桥接 Python 和 Metal 代码的用户，可以尝试使用 MLX 库中的新接口 `mlx.core.fast.metal_kernel`。需要注意的是，Python 在 CPU 端可能存在瓶颈，因为它不像 Swift 编译模式（-Xswiftc -Ounchecked）那样能编译为优化的汇编代码。","https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\u002Fissues\u002F2",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},16620,"M3 芯片上的 GEMM 性能表现如何？有哪些优化建议？","针对 M3 及更高版本芯片，推荐的最佳块大小（Block Sizes）配置如下：\n1. 使用异步拷贝（async copy）时：32x32x32 和 48x48x24。\n2. 
不使用异步拷贝时：32x32x8。\n相比之下，MPS 在 M3+ 上使用 32x32x8 的配置。维护者指出，FP32 是第一步，后续可探索截断 FP32（brain float）等高级类型。","https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\u002Fissues\u002F11",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},16621,"为什么在 GEMV（矩阵乘向量）操作中使用共享内存（SRAM）有时反而没有提升性能？","GEMV 和 GEMM 的性能特征截然不同。GEMV 通常是带宽受限（memory-bound），瓶颈在于从内存读取矩阵，矩阵只被读取一次，因此目标是最大化读取带宽，而非使用共享内存。共享内存和 SIMD 矩阵乘法指令（如 async copy）主要针对计算受限（compute-bound）的 GEMM 操作，其中矩阵会被多次读取。在 GEMV 中强行使用共享内存可能无法带来预期加速，甚至因开销导致变慢。","https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\u002Fissues\u002F12",{"id":132,"question_zh":133,"answer_zh":134,"source_url":120},16622,"Apple Neural Engine (ANE) 与 GPU 在运行 Flash Attention 时的性能对比如何？","ANE 并不总能发挥全部性能。例如，通过 MPSGraph 运行时，通用 GEMM 只能达到其标称 FLOPS 的 25%，这通常是缓存带宽瓶颈所致。只有卷积等数据密集型操作才能在 ANE 上达到高吞吐量（如 15 TFLOPS FP16）。Flash Attention 利用局部 SRAM 的灵活性和附近的超越数学单元加速注意力模块，缩小了 GPU 与 ANE 的差距。在 iPhone 和 M1 上，ANE 略快；但在其他机器上，GPU 通常更快或相当。",{"id":136,"question_zh":137,"answer_zh":138,"source_url":139},16623,"M1\u002FM2\u002FM3 系列芯片对 bfloat16 (BF16) 的支持情况如何？","M2 CPU 首次在 NEON 和 AMX 单元中引入了 BF16 的硬件支持，但 M1 CPU 不支持。在 GPU 方面，虽然 Metal 语言概念上支持 bfloat，但 M1\u002FM2 GPU 没有像 float 那样的内置硬件支持（可能需要软件模拟），直到 M3 架构才可能具备真正的硬件加速。目前在 Swift SIMD、Accelerate 或 BLAS\u002FLAPACK 中无法直接使用 BF16，只能通过内联汇编访问。在 GPU 上使用 16 位类型主要优势在于减少寄存器压力或降低内存带宽，而非直接的计算速度提升。","https:\u002F\u002Fgithub.com\u002Fphilipturner\u002Fmetal-flash-attention\u002Fissues\u002F17",{"id":141,"question_zh":142,"answer_zh":143,"source_url":130},16624,"不同 GPU 核心之间的内存带宽指标（如 Bytes\u002Fcore-cycle）有何规律？","当按 GPU 核心数量和频率（GHz）进行归一化后，同一家族甚至不同厂商的 GPU 表现出非常相似的特征。例如，L1 缓存通常为 64 bytes\u002Fcore\u002Fcycle，L2 缓存为 32 bytes\u002Fcore\u002Fcycle。CPU 和 GPU 核心的 I\u002FO 总线位数相同，主要区别在于 GPU 核心拥有更多的晶体管用于数学计算。理解这些归一化指标有助于在不同硬件间进行性能预估和优化。",[145,150,155,160],{"id":146,"version":147,"summary_zh":148,"released_at":149},98890,"v1.0.1","在 GEMM 
中添加了融合偏置。","2023-07-28T21:33:30",{"id":151,"version":152,"summary_zh":153,"released_at":154},98891,"v1.0.0","FlashAttention，包括稠密和块稀疏两种实现。\n\n稠密版本的性能始终比 MPSGraph 高一个数量级（3到5倍）。在某些边缘场景下，这种差距甚至可达两个数量级（20倍）。MPSGraph 是 Apple 推荐用于在机器学习应用中使用 Metal 的现代 API。\n\n块稀疏版本间接支持（并加速）三角因果掩码，但任务分配并不理想。它的速度有时比稠密版本理论上的最佳性能快60%，有时则与稠密版本持平；性能具有非确定性。这使得它与来自 https:\u002F\u002Fgithub.com\u002FDao-AILab\u002Fflash-attention 的 FlashAttention-2 实现表现相同。","2023-07-27T20:03:43",{"id":156,"version":157,"summary_zh":158,"released_at":159},98892,"v0.2.0-alpha","新增了对融合转置和批量 GEMM 的支持。","2023-07-08T00:29:48",{"id":161,"version":162,"summary_zh":163,"released_at":164},98893,"v0.1.0-alpha","初始阿尔法版本。","2023-07-06T17:07:43"]