[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-gy910210--rnn-from-scratch":3,"tool-gy910210--rnn-from-scratch":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",153609,2,"2026-04-13T11:34:59",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":77,"owner_email":78,"owner_twitter":75,"owner_website":79,"owner_url":80,"languages":81,"stars":86,"forks":87,"last_commit_at":88,"license":75,"difficulty_score":89,"env_os":90,"env_gpu":91,"env_ram":91,"env_deps":92,"category_tags":96,"github_topics":97,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":104,"updated_at":105,"faqs":106,"releases":121},7200,"gy910210\u002Frnn-from-scratch","rnn-from-scratch","Implementing Recurrent Neural Network from Scratch","rnn-from-scratch 是一个专为深度学习爱好者打造的开源项目，旨在从零开始手写实现循环神经网络（RNN）。它主要解决了初学者在面对复杂深度学习框架时，难以深入理解 RNN 内部运作机制及梯度计算原理的痛点。通过摒弃现成的高级 API，该项目引导用户一步步构建网络架构，直观地展示参数如何在不同时间步中共享，以及如何利用交叉熵损失函数进行模型优化。\n\n这个项目特别适合具有一定神经网络基础的开发者、人工智能研究人员以及计算机专业的学生使用。如果你已经了解前馈神经网络，并希望进一步探索序列建模的奥秘，rnn-from-scratch 将是理想的进阶学习材料。其独特的技术亮点在于基于“计算图”实现了自动微分，并详细演示了如何通过“随时间反向传播”（BPTT）算法来计算梯度。这种方法不仅比手动推导公式更加简洁可靠，还清晰地揭示了传统反向传播与 BPTT 之间的本质联系——即在时间维度上对梯度进行累加。通过以 RNN 
语言模型为例，该项目帮助用户在代码实践中真正掌握处理序列数据的核心逻辑，为后续开发更复杂的自然语言处理应用打下坚实基础。","# Implementing Recurrent Neural Network from Scratch\nI’m assuming that you are somewhat familiar with basic Neural Networks. If you’re not, you may want to head over to [Implementing A Neural Network From Scratch](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch),  which guides you through the ideas and implementation behind non-recurrent networks.\n## Introduction\nThis post is inspired by [recurrent-neural-networks-tutorial](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-1-introduction-to-rnns\u002F) from [WildML](http:\u002F\u002Fwww.wildml.com\u002F). And you can deeply read it to know the basic knowledge about RNN, which I will not include in this tutorial.\n\nIn this tutorial, we will focus on how to train RNN by [Backpropagation Through Time (BPTT)](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients\u002F), based on the **computation graph** of RNN and do **automatic differentiation**. You can find that it is more simple and reliable to calculate the gradient in this way than you do it by hand.\n\nThis post will take RNN language model (rnnlm) as example. More about the fancy applications of RNN can be found [here](http:\u002F\u002Fkarpathy.github.io\u002F2015\u002F05\u002F21\u002Frnn-effectiveness\u002F).\n## How to train RNN\nThe architecture of RNN can be as the following figure.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_6ee325d3906f.jpg)\n\nYou can find that the parameters `(W, U, V)` are shared in different time steps. And the output in each time step can be **softmax**. So you can use **cross entropy** loss as an error function and use some optimizing method (e.g. 
gradient descent) to calculate the optimized parameters `(W, U, V)`.\n\nLet recap the equations of our RNN:\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_3b204ca0428c.png)\n\nWe also defined our loss, or error, to be the cross entropy loss, given by:\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_63d9d9c8d26b.png)\n\nHere `y_t` is the correct word at time step `t`, and `y^_t` is our prediction. We typically treat the full sequence (sentence) as one training example, so the total error is just the sum of the errors at each time step (word).\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_3588c3fb9606.png)\n\nRemember that our goal is to calculate the gradients of the error with respect to our parameters `U`, `V` and `W` and then learn good parameters using optimizing method (in this post we use **Stochastic Gradient Descent**). Just like we sum up the errors, we also sum up the gradients at each time step for one training example:  ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_e6ade86f89c5.png). That is we should calculate `dEt\u002FdW`, `dEt\u002FdU` and `dEt\u002FdV`, then sum up all time steps.\n\nIt is simple to calculate `dEt\u002FdV`, because it only depends on the values at the current time step. But the story is different for `dEt\u002FdW` and `dEt\u002FdU`. Note that `s_3 = tanh(Ux_3 + Ws_2)` depend on `s_2`, which depends on `W`, `U` and `s_1`, and so on.  So if we take the derivative with respect to `W` we can't treat `s_2` as a constant! We need to apply the chain rule again. 
You can have a view from the following figure.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_9a3c4effe6fb.png)\n\nNow use **computation graph** to represent `E1` as an example and calculate `dE1\u002FdW`, `dE1\u002FdU` is the same idea.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_7e9032d82ff8.png)\n\nNote that this is exactly the same as the standard backpropagation algorithm that we use in deep [Feedforward Neural Networks](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch). The key difference is that we sum up the gradients for `W` at each time step. In a traditional NN we don’t share parameters across layers, so we don’t need to sum anything.  But in my opinion BPTT is just a fancy name for standard backpropagation on an unrolled RNN.\n\nTo simplify the **computation graph** to make it efficient, we can integrate some small operation units to a big operation unit. You can have a look the following figure. 
Note that the operation unit should also implement the `forward` function and `backward` function.\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_b7f4dad4706c.png)\n\nThe implementation of all operation unit and softmax output can be found as follows:\n```python\nmulGate = MultiplyGate()\naddGate = AddGate()\nactivation = Tanh()\n\nclass RNNLayer:\n    def forward(self, x, prev_s, U, W, V):\n        self.mulu = mulGate.forward(U, x)\n        self.mulw = mulGate.forward(W, prev_s)\n        self.add = addGate.forward(self.mulw, self.mulu)\n        self.s = activation.forward(self.add)\n        self.mulv = mulGate.forward(V, self.s)\n        \n    def backward(self, x, prev_s, U, W, V, diff_s, dmulv):\n        self.forward(x, prev_s, U, W, V)\n        dV, dsv = mulGate.backward(V, self.s, dmulv)\n        ds = dsv + diff_s\n        dadd = activation.backward(self.add, ds)\n        dmulw, dmulu = addGate.backward(self.mulw, self.mulu, dadd)\n        dW, dprev_s = mulGate.backward(W, prev_s, dmulw)\n        dU, dx = mulGate.backward(U, x, dmulu)\n        return (dprev_s, dU, dW, dV)\n```\n\n```python\nclass MultiplyGate:\n    def forward(self,W, x):\n        return np.dot(W, x)\n    def backward(self, W, x, dz):\n        dW = np.asarray(np.dot(np.transpose(np.asmatrix(dz)), np.asmatrix(x)))\n        dx = np.dot(np.transpose(W), dz)\n        return dW, dx\n\nclass AddGate:\n    def forward(self, x1, x2):\n        return x1 + x2\n    def backward(self, x1, x2, dz):\n        dx1 = dz * np.ones_like(x1)\n        dx2 = dz * np.ones_like(x2)\n        return dx1, dx2\n```\n\n```python\nclass Sigmoid:\n    def forward(self, x):\n        return 1.0 \u002F (1.0 + np.exp(-x))\n    def backward(self, x, top_diff):\n        output = self.forward(x)\n        return (1.0 - output) * output * top_diff\n\nclass Tanh:\n    def forward(self, x):\n        return np.tanh(x)\n    def backward(self, x, top_diff):\n        output = 
self.forward(x)\n        return (1.0 - np.square(output)) * top_diff\n```\n\n```python\nclass Softmax:\n    def predict(self, x):\n        exp_scores = np.exp(x)\n        return exp_scores \u002F np.sum(exp_scores)\n    def loss(self, x, y):\n        probs = self.predict(x)\n        return -np.log(probs[y])\n    def diff(self, x, y):\n        probs = self.predict(x)\n        probs[y] -= 1.0\n        return probs\n```\n\nThese implementation is just the same with [Implementing A Neural Network From Scratch](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch), except that in this post the input `x` or `s` is `1-D array`, but in previous post input `X` is a batch of data represented as a matrix (each row is an example).\n\nNow that we are able to calculate the gradients for our parameters we can use SGD to train the model.\n\n## Implement\n### Initialization\nInitializing the parameters  `U`, `V` and `W` is a bit tricky. We can’t just initialize them to 0’s because that would result in symmetric calculations in all our layers. We must initialize them randomly. Because proper initialization seems to have an impact on training results there has been lot of research in this area. It turns out that the best initialization depends on the activation function (`tanh` in our case) and one recommended approach is to initialize the weights randomly in the interval from ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_5ead4467c15c.png) where `n` is the number of incoming connections from the previous layer. \n```python\nclass Model:\n    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):\n        self.word_dim = word_dim\n        self.hidden_dim = hidden_dim\n        self.bptt_truncate = bptt_truncate\n        self.U = np.random.uniform(-np.sqrt(1. \u002F word_dim), np.sqrt(1. \u002F word_dim), (hidden_dim, word_dim))\n        self.W = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. 
\u002F hidden_dim), (hidden_dim, hidden_dim))\n        self.V = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. \u002F hidden_dim), (word_dim, hidden_dim))\n```\n\nAbove, `word_dim` is the size of our vocabulary, and `hidden_dim` is the size of our hidden layer (we can pick it). Don’t worry about the `bptt_truncate` parameter for now, we’ll explain what that is later.\n### Forward Propagation\nNext, let’s implement the forward propagation (predicting word probabilities) defined by our equations above:\n```python\n'''\n    forward propagation (predicting word probabilities)\n    x is one single data, and a batch of data\n    for example x = [0, 179, 341, 416], then its y = [179, 341, 416, 1]\n'''\ndef forward_propagation(self, x):\n    # The total number of time steps\n    T = len(x)\n    layers = []\n    prev_s = np.zeros(self.hidden_dim)\n    # For each time step...\n    for t in range(T):\n        layer = RNNLayer()\n        input = np.zeros(self.word_dim)\n        input[x[t]] = 1\n        layer.forward(input, prev_s, self.U, self.W, self.V)\n        prev_s = layer.s\n        layers.append(layer)\n    return layers\n```\n\nWe also implement a `predict` function to generate the results.\n```python\ndef predict(self, x):\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    return [np.argmax(output.predict(layer.mulv)) for layer in layers]\n```\n\n### Calculating the Loss\nTo train our network we need a way to measure the errors it makes. We call this the loss function `L`, and our goal is find the parameters `U`, `V` and `W` that minimize the loss function for our training data. A common choice for the loss function is the **cross-entropy** loss. 
\n```python\ndef calculate_loss(self, x, y):\n    assert len(x) == len(y)\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    loss = 0.0\n    for i, layer in enumerate(layers):\n        loss += output.loss(layer.mulv, y[i])\n    return loss \u002F float(len(y))\n\ndef calculate_total_loss(self, X, Y):\n    loss = 0.0\n    for i in range(len(Y)):\n        loss += self.calculate_loss(X[i], Y[i])\n    return loss \u002F float(len(Y))\n```\n\n### Backpropagation Through Time (BPTT)\nJust as what we have introduced, we implement BPTT algorithm.  It takes as input a training example `(x, y)` and returns the gradients `dL\u002FdW`, `dL\u002FdU` and `dL\u002FdV`.\n```python\ndef bptt(self, x, y):\n    assert len(x) == len(y)\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    dU = np.zeros(self.U.shape)\n    dV = np.zeros(self.V.shape)\n    dW = np.zeros(self.W.shape)\n\n    T = len(layers)\n    prev_s_t = np.zeros(self.hidden_dim)\n    diff_s = np.zeros(self.hidden_dim)\n    for t in range(0, T):\n        dmulv = output.diff(layers[t].mulv, y[t])\n        input = np.zeros(self.word_dim)\n        input[x[t]] = 1\n        dprev_s, dU_t, dW_t, dV_t = layers[t].backward(input, prev_s_t, self.U, self.W, self.V, diff_s, dmulv)\n        prev_s_t = layers[t].s\n        dmulv = np.zeros(self.word_dim)\n        for i in range(t-1, max(-1, t-self.bptt_truncate-1), -1):\n            input = np.zeros(self.word_dim)\n            input[x[i]] = 1\n            prev_s_i = np.zeros(self.hidden_dim) if i == 0 else layers[i-1].s\n            dprev_s, dU_i, dW_i, dV_i = layers[i].backward(input, prev_s_i, self.U, self.W, self.V, dprev_s, dmulv)\n            dU_t += dU_i\n            dW_t += dW_i\n        dV += dV_t\n        dU += dU_t\n        dW += dW_t\n    return (dU, dW, dV)\n```\n\n### SGD Implementation\nNow that we are able to calculate the gradients for our parameters we can implement SGD. I like to do this in two steps:\n\n1. 
A function `sdg_step` that calculates the gradients and performs the updates for one batch.\n2. An outer loop that iterates through the training set and adjusts the learning rate.\n\n```python\ndef sgd_step(self, x, y, learning_rate):\n    dU, dW, dV = self.bptt(x, y)\n    self.U -= learning_rate * dU\n    self.V -= learning_rate * dV\n    self.W -= learning_rate * dW\n   \ndef train(self, X, Y, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):\n    num_examples_seen = 0\n    losses = []\n    for epoch in range(nepoch):\n        if (epoch % evaluate_loss_after == 0):\n            loss = self.calculate_total_loss(X, Y)\n            losses.append((num_examples_seen, loss))\n            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n            print(\"%s: Loss after num_examples_seen=%d epoch=%d: %f\" % (time, num_examples_seen, epoch, loss))\n            # Adjust the learning rate if loss increases\n            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:\n                learning_rate = learning_rate * 0.5\n                print(\"Setting learning rate to %f\" % learning_rate)\n            sys.stdout.flush()\n        # For each training example...\n        for i in range(len(Y)):\n            self.sgd_step(X[i], Y[i], learning_rate)\n            num_examples_seen += 1\n    return losses \n```\n\nHere, we annealing the learning rate by `0.5` if we find the loss increases in this epoch. More about the decay of learning rate can be found [here](http:\u002F\u002Fcs231n.github.io\u002Fneural-networks-3\u002F#anneal).\n## Evaluation\nDone! Let’s try to get a sense of how long it would take to train our network:\n```python\nword_dim = 8000\nhidden_dim = 100\nX_train, y_train = getSentenceData('data\u002Freddit-comments-2015-08.csv', word_dim)\n\nnp.random.seed(10)\nrnn = Model(word_dim, hidden_dim)\nrnn.sgd_step(X_train[10], y_train[10], 0.005)\n```\n\nBad new is that  one step of SGD takes a few seconds on my laptop. 
We have about 80,000 examples in our training data, so one epoch (iteration over the whole data set) would take several hours. Multiple epochs would take days, or even weeks! \n\nThere are many ways to speed up our code. One is to implement our code on GPU with some library like **Theano**. But in this tutorial, let’s just try to run SGD with a small dataset and check if the loss actually decreases:\n```python\nword_dim = 8000\nhidden_dim = 100\nX_train, y_train = getSentenceData('data\u002Freddit-comments-2015-08.csv', word_dim)\n\nnp.random.seed(10)\nrnn = Model(word_dim, hidden_dim)\n\nlosses = rnn.train(X_train[:100], y_train[:100], learning_rate=0.005, nepoch=10, evaluate_loss_after=1)\n```\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_f3d9a9b7736c.png)\n\nGood, it seems like our implementation is at least doing something useful and decreasing the loss, just like we wanted.\n## Further more\nThere is a problem about RNN called `vanishing gradient problem`. That's why traditional RNN cannot capture the long term dependency, so we use `bptt_truncate` parameter to constrain the length of dependency. This will motivate our move to more sophisticated RNN models, such as LSTMs, which are the current state of the art for many tasks in NLP. \n\nMore about `vanishing gradient problem` and `LSTM` can be found [here](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients\u002F) and [here](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano\u002F) and [here](http:\u002F\u002Fcolah.github.io\u002Fposts\u002F2015-08-Understanding-LSTMs\u002F).\n## Reference\n1. \u003Chttps:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch>\n2. 
\u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-1-introduction-to-rnns\u002F>\n2. \u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano\u002F>\n3. \u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients\u002F>\n","# 从零开始实现循环神经网络\n我假设你对基础的神经网络已经有一定的了解。如果没有，你可以先阅读[从零开始实现神经网络](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch)，它会引导你理解非循环网络背后的思想和实现方法。\n## 引言\n本篇博客受到[WildML](http:\u002F\u002Fwww.wildml.com\u002F)上的[循环神经网络教程](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-1-introduction-to-rnns\u002F)的启发。你可以深入阅读那篇文章来掌握RNN的基础知识，这些内容在本教程中将不再赘述。\n\n在本教程中，我们将重点讲解如何基于RNN的**计算图**，通过**时间反向传播（BPTT）**进行训练，并实现**自动微分**。你会发现，用这种方式计算梯度比手动推导要更加简单可靠。\n\n本文将以RNN语言模型（rnnlm）为例。关于RNN更高级的应用，可以参考[Karpathy的文章](http:\u002F\u002Fkarpathy.github.io\u002F2015\u002F05\u002F21\u002Frnn-effectiveness\u002F)。\n## 如何训练RNN\nRNN的结构如下图所示：\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_6ee325d3906f.jpg)\n\n可以看到，参数`(W, U, V)`在不同时间步之间是共享的。每个时间步的输出可以经过**softmax**处理。因此，我们可以使用**交叉熵损失**作为误差函数，并采用优化算法（如梯度下降法）来求解最优参数`(W, U, 
V)`。\n\n回顾一下我们的RNN公式：\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_3b204ca0428c.png)\n\n我们定义的损失函数为交叉熵损失，表达式如下：\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_63d9d9c8d26b.png)\n\n其中`y_t`是时间步`t`的真实标签，`y^_t`是我们预测的值。通常我们会把整个序列（句子）当作一个训练样本，因此总误差就是各个时间步（单词）误差的总和。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_3588c3fb9606.png)\n\n记住，我们的目标是计算误差对参数`U`、`V`和`W`的梯度，然后利用优化算法（本教程中使用**随机梯度下降法**）来更新参数。就像我们对误差求和一样，我们也需要对每个时间步的梯度进行累加：![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_e6ade86f89c5.png)。也就是说，我们需要分别计算`dEt\u002FdW`、`dEt\u002FdU`和`dEt\u002FdV`，并将所有时间步的结果相加。\n\n计算`dEt\u002FdV`相对简单，因为它只依赖于当前时间步的值。然而，`dEt\u002FdW`和`dEt\u002FdU`则复杂得多。注意，`s_3 = tanh(Ux_3 + Ws_2)`依赖于`s_2`，而`s_2`又依赖于`W`、`U`和`s_1`，依此类推。因此，在对`W`求导时，我们不能把`s_2`视为常数！必须再次应用链式法则。具体过程可以参考下图。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_9a3c4effe6fb.png)\n\n现在我们以`E1`为例，用**计算图**来表示，并计算`dE1\u002FdW`和`dE1\u002FdU`，思路是一样的。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_7e9032d82ff8.png)\n\n需要注意的是，这与我们在深度[前馈神经网络](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch)中使用的标准反向传播算法完全一致。关键区别在于，我们在RNN中需要对每个时间步的梯度进行累加。而在传统的前馈网络中，各层的参数并不共享，因此不需要做这样的累加操作。在我看来，BPTT不过是展开后的RNN上应用标准反向传播的一种更炫酷的称呼而已。\n\n为了简化**计算图**并提高效率，我们可以将一些小的操作单元整合为一个大的操作单元。请看下图。需要注意的是，每个操作单元都需要实现`forward`和`backward`函数。\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_b7f4dad4706c.png)\n\n以下是所有操作单元以及softmax输出的实现代码：\n```python\nmulGate = MultiplyGate()\naddGate = AddGate()\nactivation = Tanh()\n\nclass RNNLayer:\n    def forward(self, x, prev_s, U, W, V):\n        self.mulu = mulGate.forward(U, x)\n        self.mulw = mulGate.forward(W, prev_s)\n        self.add = addGate.forward(self.mulw, self.mulu)\n        self.s = 
activation.forward(self.add)\n        self.mulv = mulGate.forward(V, self.s)\n        \n    def backward(self, x, prev_s, U, W, V, diff_s, dmulv):\n        self.forward(x, prev_s, U, W, V)\n        dV, dsv = mulGate.backward(V, self.s, dmulv)\n        ds = dsv + diff_s\n        dadd = activation.backward(self.add, ds)\n        dmulw, dmulu = addGate.backward(self.mulw, self.mulu, dadd)\n        dW, dprev_s = mulGate.backward(W, prev_s, dmulw)\n        dU, dx = mulGate.backward(U, x, dmulu)\n        return (dprev_s, dU, dW, dV)\n```\n\n```python\nclass MultiplyGate:\n    def forward(self,W, x):\n        return np.dot(W, x)\n    def backward(self, W, x, dz):\n        dW = np.asarray(np.dot(np.transpose(np.asmatrix(dz)), np.asmatrix(x)))\n        dx = np.dot(np.transpose(W), dz)\n        return dW, dx\n\nclass AddGate:\n    def forward(self, x1, x2):\n        return x1 + x2\n    def backward(self, x1, x2, dz):\n        dx1 = dz * np.ones_like(x1)\n        dx2 = dz * np.ones_like(x2)\n        return dx1, dx2\n```\n\n```python\nclass Sigmoid:\n    def forward(self, x):\n        return 1.0 \u002F (1.0 + np.exp(-x))\n    def backward(self, x, top_diff):\n        output = self.forward(x)\n        return (1.0 - output) * output * top_diff\n\nclass Tanh:\n    def forward(self, x):\n        return np.tanh(x)\n    def backward(self, x, top_diff):\n        output = self.forward(x)\n        return (1.0 - np.square(output)) * top_diff\n```\n\n```python\nclass Softmax:\n    def predict(self, x):\n        exp_scores = np.exp(x)\n        return exp_scores \u002F np.sum(exp_scores)\n    def loss(self, x, y):\n        probs = self.predict(x)\n        return -np.log(probs[y])\n    def diff(self, x, y):\n        probs = self.predict(x)\n        probs[y] -= 1.0\n        return 
probs\n```\n\n这些实现与[从零开始实现神经网络](https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch)中的代码基本相同，唯一的区别在于：本教程中的输入`x`或`s`是一维数组，而在之前的教程中，输入`X`是一个矩阵形式的数据批次（每行代表一个样本）。\n\n现在我们已经能够计算出参数的梯度，接下来就可以使用SGD来训练模型了。\n\n## 实现\n\n### 初始化\n初始化参数 `U`、`V` 和 `W` 有点讲究。我们不能简单地将它们初始化为 0，因为那样会导致所有层的计算对称。我们必须随机初始化这些权重。由于合适的初始化似乎会对训练结果产生影响，这一领域已经进行了大量研究。事实证明，最佳的初始化方式取决于激活函数（在我们的例子中是 `tanh`），一种推荐的做法是将权重随机初始化在区间 ![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_5ead4467c15c.png) 内，其中 `n` 是来自前一层的输入连接数。\n```python\nclass Model:\n    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):\n        self.word_dim = word_dim\n        self.hidden_dim = hidden_dim\n        self.bptt_truncate = bptt_truncate\n        self.U = np.random.uniform(-np.sqrt(1. \u002F word_dim), np.sqrt(1. \u002F word_dim), (hidden_dim, word_dim))\n        self.W = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. \u002F hidden_dim), (hidden_dim, hidden_dim))\n        self.V = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. 
\u002F hidden_dim), (word_dim, hidden_dim))\n```\n\n上述代码中，`word_dim` 是词汇表的大小，而 `hidden_dim` 是隐藏层的大小（我们可以自行选择）。目前可以先不用管 `bptt_truncate` 参数，稍后我们会解释它的作用。\n### 前向传播\n接下来，让我们实现由上述公式定义的前向传播（预测单词概率）：\n```python\n'''\n    前向传播（预测单词概率）\n    x 既可以是一个单独的数据点，也可以是一批数据\n    例如 x = [0, 179, 341, 416], 那么 y = [179, 341, 416, 1]\n'''\ndef forward_propagation(self, x):\n    # 总的时间步数\n    T = len(x)\n    layers = []\n    prev_s = np.zeros(self.hidden_dim)\n    # 对于每一个时间步...\n    for t in range(T):\n        layer = RNNLayer()\n        input = np.zeros(self.word_dim)\n        input[x[t]] = 1\n        layer.forward(input, prev_s, self.U, self.W, self.V)\n        prev_s = layer.s\n        layers.append(layer)\n    return layers\n```\n\n我们还实现了一个 `predict` 函数来生成最终结果。\n```python\ndef predict(self, x):\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    return [np.argmax(output.predict(layer.mulv)) for layer in layers]\n```\n\n### 计算损失\n为了训练我们的网络，我们需要一种方法来衡量它产生的误差。我们称这种度量为损失函数 `L`，我们的目标是找到能够使训练数据上的损失函数最小化的参数 `U`、`V` 和 `W`。常用的损失函数是 **交叉熵** 损失。\n```python\ndef calculate_loss(self, x, y):\n    assert len(x) == len(y)\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    loss = 0.0\n    for i, layer in enumerate(layers):\n        loss += output.loss(layer.mulv, y[i])\n    return loss \u002F float(len(y))\n\ndef calculate_total_loss(self, X, Y):\n    loss = 0.0\n    for i in range(len(Y)):\n        loss += self.calculate_loss(X[i], Y[i])\n    return loss \u002F float(len(Y))\n```\n\n### 时间反向传播（BPTT）\n正如前面介绍的那样，我们实现了 BPTT 算法。它以一个训练样本 `(x, y)` 作为输入，并返回梯度 `dL\u002FdW`、`dL\u002FdU` 和 `dL\u002FdV`。\n```python\ndef bptt(self, x, y):\n    assert len(x) == len(y)\n    output = Softmax()\n    layers = self.forward_propagation(x)\n    dU = np.zeros(self.U.shape)\n    dV = np.zeros(self.V.shape)\n    dW = np.zeros(self.W.shape)\n\n    T = len(layers)\n    prev_s_t = np.zeros(self.hidden_dim)\n    diff_s = np.zeros(self.hidden_dim)\n    for t in range(0, T):\n        dmulv 
= output.diff(layers[t].mulv, y[t])\n        input = np.zeros(self.word_dim)\n        input[x[t]] = 1\n        dprev_s, dU_t, dW_t, dV_t = layers[t].backward(input, prev_s_t, self.U, self.W, self.V, diff_s, dmulv)\n        prev_s_t = layers[t].s\n        dmulv = np.zeros(self.word_dim)\n        for i in range(t-1, max(-1, t-self.bptt_truncate-1), -1):\n            input = np.zeros(self.word_dim)\n            input[x[i]] = 1\n            prev_s_i = np.zeros(self.hidden_dim) if i == 0 else layers[i-1].s\n            dprev_s, dU_i, dW_i, dV_i = layers[i].backward(input, prev_s_i, self.U, self.W, self.V, dprev_s, dmulv)\n            dU_t += dU_i\n            dW_t += dW_i\n        dV += dV_t\n        dU += dU_t\n        dW += dW_t\n    return (dU, dW, dV)\n```\n\n### SGD 的实现\n现在我们已经能够计算出参数的梯度，就可以实现随机梯度下降（SGD）了。我喜欢分两步来实现：\n\n1. 一个 `sgd_step` 函数，用于计算梯度并对一个批次进行更新。\n2. 外层循环遍历整个训练集，并调整学习率。\n\n```python\ndef sgd_step(self, x, y, learning_rate):\n    dU, dW, dV = self.bptt(x, y)\n    self.U -= learning_rate * dU\n    self.V -= learning_rate * dV\n    self.W -= learning_rate * dW\n   \ndef train(self, X, Y, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):\n    num_examples_seen = 0\n    losses = []\n    for epoch in range(nepoch):\n        if (epoch % evaluate_loss_after == 0):\n            loss = self.calculate_total_loss(X, Y)\n            losses.append((num_examples_seen, loss))\n            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n            print(\"%s: Loss after num_examples_seen=%d epoch=%d: %f\" % (time, num_examples_seen, epoch, loss))\n            # 如果损失增加，则调整学习率\n            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:\n                learning_rate = learning_rate * 0.5\n                print(\"Setting learning rate to %f\" % learning_rate)\n            sys.stdout.flush()\n        # 对于每一个训练样本...\n        for i in range(len(Y)):\n            self.sgd_step(X[i], Y[i], learning_rate)\n            num_examples_seen += 1\n    return losses 
\n```\n\n在这里，如果发现当前 epoch 的损失有所增加，我们就将学习率减半。关于学习率衰减的更多信息可以参见 [这里](http:\u002F\u002Fcs231n.github.io\u002Fneural-networks-3\u002F#anneal)。\n\n## 评估\n好了！让我们来估算一下训练我们的网络需要多长时间：\n```python\nword_dim = 8000\nhidden_dim = 100\nX_train, y_train = getSentenceData('data\u002Freddit-comments-2015-08.csv', word_dim)\n\nnp.random.seed(10)\nrnn = Model(word_dim, hidden_dim)\nrnn.sgd_step(X_train[10], y_train[10], 0.005)\n```\n\n坏消息是，在我的笔记本上，SGD 的一步就需要几秒钟。我们的训练数据大约有 8 万个样本，因此完成一个 epoch（遍历整个数据集）就需要好几个小时。而要完成多个 epoch，则可能需要几天，甚至几周！\n\n我们可以通过多种方式加速代码的运行。一种方法是使用像 **Theano** 这样的库，在 GPU 上实现我们的代码。不过在本教程中，我们先尝试用一个小数据集运行 SGD，并检查损失是否真的在下降：\n```python\nword_dim = 8000\nhidden_dim = 100\nX_train, y_train = getSentenceData('data\u002Freddit-comments-2015-08.csv', word_dim)\n\nnp.random.seed(10)\nrnn = Model(word_dim, hidden_dim)\n\nlosses = rnn.train(X_train[:100], y_train[:100], learning_rate=0.005, nepoch=10, evaluate_loss_after=1)\n```\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_readme_f3d9a9b7736c.png)\n\n不错，看起来我们的实现至少在做一些有用的事情，并且损失确实在下降，这正是我们期望的结果。\n## 更进一步\nRNN 存在一个被称为“梯度消失问题”的难题。这也是为什么传统的 RNN 无法捕捉长期依赖关系的原因。因此，我们使用 `bptt_truncate` 参数来限制依赖关系的长度。这也促使我们转向更复杂的 RNN 模型，比如 LSTM，它目前在许多自然语言处理任务中处于最先进水平。\n\n关于“梯度消失问题”和“LSTM”的更多信息可以参见 [这里](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients\u002F)、[这里](http:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano\u002F)以及 [这里](http:\u002F\u002Fcolah.github.io\u002Fposts\u002F2015-08-Understanding-LSTMs\u002F)。\n## 参考文献\n1. \u003Chttps:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fneural-network-from-scratch>\n2. \u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-1-introduction-to-rnns\u002F>\n2. 
\u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F09\u002Frecurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano\u002F>\n3. \u003Chttp:\u002F\u002Fwww.wildml.com\u002F2015\u002F10\u002Frecurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients\u002F>","# rnn-from-scratch 快速上手指南\n\n本指南旨在帮助开发者从零开始理解并实现循环神经网络（RNN），重点讲解基于计算图的自动微分和随时间反向传播（BPTT）算法。\n\n## 环境准备\n\n本项目主要依赖 Python 和 NumPy 进行数值计算，无需复杂的深度学习框架（如 TensorFlow 或 PyTorch），适合学习原理。\n\n*   **操作系统**: Linux, macOS 或 Windows\n*   **Python 版本**: 推荐 Python 3.6+\n*   **核心依赖**:\n    *   `numpy`: 用于矩阵运算和数组操作\n\n确保已安装 `pip` 包管理工具。国内用户建议使用清华源或阿里源加速安装。\n\n## 安装步骤\n\n1.  **克隆项目代码**\n    将仓库克隆到本地：\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Frnn-from-scratch.git\n    cd rnn-from-scratch\n    ```\n\n2.  **安装依赖**\n    使用 pip 安装必要的 Python 库。国内用户推荐使用以下命令加速下载：\n    ```bash\n    pip install numpy -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n    ```\n    *(注：如果项目中包含 `requirements.txt`，也可运行 `pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`)*\n\n## 基本使用\n\n该项目主要通过 Python 脚本演示 RNN 的语言模型（RNNLM）训练过程。核心逻辑包括前向传播、损失计算、BPTT 反向传播以及 SGD 优化。\n\n### 1. 模型初始化\n模型参数 $U, V, W$ 需要随机初始化以避免对称性问题。初始化范围取决于激活函数（此处为 `tanh`）和输入维度。\n\n```python\nclass Model:\n    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):\n        self.word_dim = word_dim\n        self.hidden_dim = hidden_dim\n        self.bptt_truncate = bptt_truncate\n        # 随机初始化权重\n        self.U = np.random.uniform(-np.sqrt(1. \u002F word_dim), np.sqrt(1. \u002F word_dim), (hidden_dim, word_dim))\n        self.W = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. \u002F hidden_dim), (hidden_dim, hidden_dim))\n        self.V = np.random.uniform(-np.sqrt(1. \u002F hidden_dim), np.sqrt(1. \u002F hidden_dim), (word_dim, hidden_dim))\n```\n\n### 2. 
前向传播与预测\n输入为一个序列（例如单词索引列表），模型逐个时间步计算隐藏状态并输出预测概率。\n\n```python\n# 假设 x 是输入序列，例如 [0, 179, 341, 416]\n# y 是对应的目标序列，例如 [179, 341, 416, 1]\n\nmodel = Model(word_dim=1000, hidden_dim=100)\n\n# 执行前向传播\nlayers = model.forward_propagation(x)\n\n# 获取预测结果 (返回每个时间步概率最大的词索引)\npredictions = model.predict(x)\n```\n\n### 3. 训练模型 (BPTT + SGD)\n使用交叉熵作为损失函数，通过 BPTT 计算梯度，并使用随机梯度下降（SGD）更新参数。\n\n```python\n# 定义训练数据 X (输入序列列表) 和 Y (目标序列列表)\n# X = [[...], [...], ...], Y = [[...], [...], ...]\n\nlearning_rate = 0.005\nnepoch = 100\n\n# 开始训练\nmodel.train(X, Y, learning_rate=learning_rate, nepoch=nepoch, evaluate_loss_after=5)\n```\n\n在 `train` 函数内部，程序会迭代执行以下步骤：\n1.  调用 `bptt(x, y)` 计算梯度 $dU, dW, dV$。\n2.  调用 `sgd_step` 利用学习率更新参数：\n    ```python\n    self.U -= learning_rate * dU\n    self.V -= learning_rate * dV\n    self.W -= learning_rate * dW\n    ```\n\n通过以上步骤，即可完成一个简易 RNN 语言模型的构建与训练，深入理解其内部运作机制。","某高校深度学习实验室的研究员正在指导学生从零构建循环神经网络（RNN），以深入理解序列数据处理机制。\n\n### 没有 rnn-from-scratch 时\n- 学生需手动推导随时间反向传播（BPTT）的复杂梯度公式，极易在链式法则计算中出现数学错误。\n- 缺乏可视化的计算图参考，难以直观理解参数 $(W, U, V)$ 如何在不同时间步共享及梯度如何累加。\n- 调试过程如同“黑盒”，当模型无法收敛时，无法区分是代码逻辑错误还是对 RNN 原理理解偏差。\n- 编写自动微分逻辑耗时耗力，大量时间浪费在重复造轮子上，而非探索语言模型的实际应用。\n\n### 使用 rnn-from-scratch 后\n- 直接复用基于计算图实现的自动微分模块，无需手工计算梯度，确保了 BPTT 算法的数学准确性。\n- 结合项目中的架构图与公式推导，学生能清晰看到误差如何在时间维度上传播并更新共享参数。\n- 代码结构透明且模块化，研究员可快速定位训练失败原因，高效验证学生对隐藏状态传递机制的理解。\n- 团队将精力集中在调整随机梯度下降策略和优化交叉熵损失函数上，显著缩短了从理论到实战的周期。\n\nrnn-from-scratch 通过将复杂的时序反向传播过程代码化与可视化，让开发者从繁琐的公式推导中解放出来，真正专注于掌握 RNN 的核心机理。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fgy910210_rnn-from-scratch_55135dc4.png","gy910210","龚禹 (Yu Gong)","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fgy910210_256f3d8b.jpg",null,"Bytedance","Bellevue, Seattle, 
WA","gy910210@gmail.com","https:\u002F\u002Fgongyu.vercel.app\u002F","https:\u002F\u002Fgithub.com\u002Fgy910210",[82],{"name":83,"color":84,"percentage":85},"Python","#3572A5",100,524,151,"2026-04-08T16:58:13",1,"","未说明",{"notes":93,"python":91,"dependencies":94},"该项目是一个从零开始实现循环神经网络（RNN）的教学代码，主要依赖 NumPy 进行矩阵运算。代码中未明确指定 Python 版本，但根据语法特性推测需 Python 3.x。由于是纯 NumPy 实现，无需 GPU 加速，也未提及具体的内存需求。",[95],"numpy",[14,35],[98,99,100,101,102,103],"rnn","neural-network","recurrent-networks","tensorflow","rnn-tensorflow","rnn-language-model","2026-03-27T02:49:30.150509","2026-04-14T03:22:14.682956",[107,112,116],{"id":108,"question_zh":109,"answer_zh":110,"source_url":111},32320,"这个 RNN 语言模型的目标或最终结果是什么？","该模型是一个语言模型，主要用于生成文本。其工作原理是每次输入句子中的一个词，预测下一个词。由于这是一个为了理解 RNN 机制而编写的玩具代码（toy code），存在训练集较小、仅使用 CPU 训练速度慢以及纯 RNN 对长短期记忆效果不佳等局限性。如果需要更出色的示例或生产级代码，建议参考使用 Google Tensorflow 实现的版本：https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fchar-rnnlm-tensorflow，或查看 Andrej Karpathy 关于 RNN 有效性的博客文章。","https:\u002F\u002Fgithub.com\u002Fgy910210\u002Frnn-from-scratch\u002Fissues\u002F1",{"id":113,"question_zh":114,"answer_zh":115,"source_url":111},32321,"为什么训练了 10 个 epoch 后只能生成 'the', 'a', 'I' 等简单词汇，应该训练多少个 epoch？","生成效果不佳主要有三个原因：1. 训练集太小；2. 仅使用 CPU 训练速度过慢，10 个 epoch 通常不足以让模型收敛；3. 纯 RNN 架构对长距离依赖（long term memory）的记忆能力较差，目前主流多采用 LSTM 等变体。建议增加训练轮数直到模型收敛，或者参考更高效的 TensorFlow 实现（https:\u002F\u002Fgithub.com\u002Fpangolulu\u002Fchar-rnnlm-tensorflow）以获得更好的生成效果。",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},32322,"这个 RNN 模型具体是在做什么任务？输入 X 和输出 Y 代表什么？","这是一个语言模型任务。具体来说，模型通过输入句子中的当前词（X），来预测句子中的下一个词（Y）。例如，输入序列包含起始标记和单词，目标序列则是将输入序列整体向前移动一位，以预测后续的单词直至结束标记。","https:\u002F\u002Fgithub.com\u002Fgy910210\u002Frnn-from-scratch\u002Fissues\u002F2",[]]