[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-FareedKhan-dev--train-llm-from-scratch":3,"tool-FareedKhan-dev--train-llm-from-scratch":64},[4,17,27,35,43,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",138956,2,"2026-04-05T11:33:21",[13,15,26],"语言模型",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":23,"last_commit_at":33,"category_tags":34,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,"2026-04-03T11:11:01",[13,14,15],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":23,"last_commit_at":41,"category_tags":42,"status":16},3704,"NextChat","ChatGPTNextWeb\u002FNextChat","NextChat 是一款轻量且极速的 AI 助手，旨在为用户提供流畅、跨平台的大模型交互体验。它完美解决了用户在多设备间切换时难以保持对话连续性，以及面对众多 AI 模型不知如何统一管理的痛点。无论是日常办公、学习辅助还是创意激发，NextChat 都能让用户随时随地通过网页、iOS、Android、Windows、MacOS 或 Linux 端无缝接入智能服务。\n\n这款工具非常适合普通用户、学生、职场人士以及需要私有化部署的企业团队使用。对于开发者而言，它也提供了便捷的自托管方案，支持一键部署到 Vercel 或 Zeabur 等平台。\n\nNextChat 的核心亮点在于其广泛的模型兼容性，原生支持 Claude、DeepSeek、GPT-4 及 Gemini Pro 等主流大模型，让用户在一个界面即可自由切换不同 AI 能力。此外，它还率先支持 MCP（Model Context Protocol）协议，增强了上下文处理能力。针对企业用户，NextChat 提供专业版解决方案，具备品牌定制、细粒度权限控制、内部知识库整合及安全审计等功能，满足公司对数据隐私和个性化管理的高标准要求。",87618,"2026-04-05T07:20:52",[13,26],{"id":44,"name":45,"github_repo":46,"description_zh":47,"stars":48,"difficulty_score":23,"last_commit_at":49,"category_tags":50,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 
道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,51,52,53,15,54,26,13,55],"数据工具","视频","插件","其他","音频",{"id":57,"name":58,"github_repo":59,"description_zh":60,"stars":61,"difficulty_score":10,"last_commit_at":62,"category_tags":63,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,26,54],{"id":65,"github_repo":66,"name":67,"description_en":68,"description_zh":69,"ai_summary_zh":69,"readme_en":70,"readme_zh":71,"quickstart_zh":72,"use_case_zh":73,"hero_image_url":74,"owner_login":75,"owner_name":76,"owner_avatar_url":77,"owner_bio":78,"owner_company":79,"owner_location":80,"owner_email":81,"owner_twitter":79,"owner_website":79,"owner_url":82,"languages":83,"stars":92,"forks":93,"last_commit_at":94,"license":95,"difficulty_score":10,"env_os":96,"env_gpu":97,"env_ram":96,"env_deps":98,"category_tags":105,"github_topics":106,"view_count":10,"oss_zip_url":79,"oss_zip_packed_at":79,"status":16,"created_at":113,"updated_at":114,"faqs":115,"releases":116},998,"FareedKhan-dev\u002Ftrain-llm-from-scratch","train-llm-from-scratch","A straightforward method for training your LLM, from downloading data to generating text.","train-llm-from-scratch 是一个轻量级开源项目，帮你从零开始训练自己的大型语言模型（LLM）。它覆盖了完整流程：下载Pile数据集（825GB书籍、文章等多样化文本）、预处理数据、训练模型，直到生成文本，只需单个GPU就能训练百万或十亿参数的模型。这个工具解决了LLM训练门槛高的问题——传统方法需要昂贵硬件和复杂配置，而它让个人开发者用Colab等免费资源就能实践，无需团队级算力。  \n\n特别适合熟悉Python基础、PyTorch框架和神经网络概念的开发者与研究人员。如果你正在学习AI原理或想实验小型模型（如1300万参数级别），它提供清晰的代码拆解：基于\"Attention is All You Need\"论文，从头实现Transformer核心组件（如多头注意力机制、多层感知机），并配有step-by-step注释，帮助你理解每一步的数学逻辑。技术亮点在于平衡了简洁性与教学性——既避免过度简化损失关键细节，又通过结构化脚本降低学习曲线。无论你是探索LLM底层机制，还是为研究项目打基础，它都能成为实用的入门伙伴。（字数：298）","![main image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_b1e7a161b0d2.png)\n\n\u003Cdiv align=\"center\">\n\n\u003C!-- omit in toc -->\n# Train LLM From Scratch\n  \n![Python](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPython-3.8%2B-blue) ![License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-green) ![Contributions](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FContributions-Welcome-blue) [![Docs](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDocs-Available-success)](#step-by-step-code-explanation)\n\n**I am Looking for a PhD position in AI**. Take a look at my [Resume](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1Q_iklJ1RVGSb-Pdey8BHy3k8IF3UJv0z\u002Fview?usp=sharing) or [GitHub](https:\u002F\u002Fgithub.com\u002FFareedKhan-dev)\n\n\u003C\u002Fdiv>\n\nI implemented a transformer model from scratch using PyTorch, based on the paper [Attention is All You Need](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762). 
You can use my scripts to train your own **billion** or **million** parameter LLM using a single GPU.\n\nBelow is the output of the trained 13 million parameter LLM:\n\n```\nIn ***1978, The park was returned to the factory-plate that \nthe public share to the lower of the electronic fence that \nfollow from the Station's cities. The Canal of ancient Western \nnations were confined to the city spot. The villages were directly \nlinked to cities in China that revolt that the US budget and in\nOdambinais is uncertain and fortune established in rural areas.\n```\n\u003C!-- omit in toc -->\n## Table of Contents\n- [Training Data Info](#training-data-info)\n- [Prerequisites and Training Time](#prerequisites-and-training-time)\n- [Code Structure](#code-structure)\n- [Usage](#usage)\n- [Step by Step Code Explanation](#step-by-step-code-explanation)\n  - [Importing Libraries](#importing-libraries)\n  - [Preparing the Training Data](#preparing-the-training-data)\n  - [Transformer Overview](#transformer-overview)\n  - [Multi Layer Perceptron (MLP)](#multi-layer-perceptron-mlp)\n  - [Single Head Attention](#single-head-attention)\n  - [Multi Head Attention](#multi-head-attention)\n  - [Transformer Block](#transformer-block)\n  - [The Final Model](#the-final-model)\n  - [Batch Processing](#batch-processing)\n  - [Training Parameters](#training-parameters)\n  - [Training the Model](#training-the-model)\n  - [Saving the Trained Model](#saving-the-trained-model)\n  - [Training Loss](#training-loss)\n  - [Generating Text](#generating-text)\n- [What’s Next](#whats-next)\n\n## Training Data Info\n\nTraining data is from the Pile dataset, which is a diverse, open-source, and large-scale dataset for training language models. The Pile dataset is a collection of 22 diverse datasets, including text from books, articles, websites, and more. The total size of the Pile dataset is 825GB, Below is the sample of the training data:\n\n```python\nLine: 0 \n{\n  \"text\": \"Effect of sleep quality ... epilepsy.\",\n  \"meta\": {\n    \"pile_set_name\": \"PubMed Abstracts\"\n  }\n}\n\nLine: 1\n{\n  \"text\": \"LLMops a new GitHub Repository ...\",\n  \"meta\": {\n    \"pile_set_name\": \"Github\"\n  }\n}\n```\n\n## Prerequisites and Training Time\n\nMake sure you have a basic understanding of object-oriented programming (OOP), neural networks (NN) and PyTorch to understand the code. Below are some resources to help you get started:\n\n| Topic               | Video Link                                                |\n|---------------------|-----------------------------------------------------------|\n| OOP                 | [OOP Video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Ej_02ICOIgs&pp=ygUKb29wIHB5dGhvbg%3D%3D) |\n| Neural Network      | [Neural Network Video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Jy4wM2X21u0&pp=ygUbbmV1cmFsIG5ldHdvcmsgcHl0aG9uIHRvcmNo) |\n| Pytorch             | [Pytorch Video](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=V_xro1bcAuA&pp=ygUbbmV1cmFsIG5ldHdvcmsgcHl0aG9uIHRvcmNo) |\n\nYou will need a GPU to train your model. Colab or Kaggle T4 will work for training a 13+ million-parameter model, but they will fail for billion-parameter training. 
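If you are unsure which GPU you have and how much memory it offers, a quick check with standard PyTorch calls can help before committing to a model size (this snippet is only an illustration and is not part of the repository scripts):\n\n```python\nimport torch\n\nif torch.cuda.is_available():\n    props = torch.cuda.get_device_properties(0)\n    # Report the device name and total memory in GB\n    print(f'GPU: {props.name}, memory: {props.total_memory \u002F 1024**3:.1f} GB')\nelse:\n    print('No CUDA device found; training on CPU is impractical for these models.')\n```\n\n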
Take a look at the comparison:\n\n| GPU Name                 | Memory | Data Size | 2B LLM Training | 13M LLM Training | Max Practical LLM Size (Training) |\n|--------------------------|--------|-----------|-----------------|------------------|-----------------------------------|\n| NVIDIA A100              | 40 GB  | Large     | ✔               | ✔                | ~6B–8B                             |\n| NVIDIA V100              | 16 GB  | Medium    | ✘               | ✔                | ~2B                               |\n| AMD Radeon VII           | 16 GB  | Medium    | ✘               | ✔                | ~1.5B–2B                          |\n| NVIDIA RTX 3090          | 24 GB  | Large     | ✔               | ✔                | ~3.5B–4B                          |\n| Tesla P100               | 16 GB  | Medium    | ✘               | ✔                | ~1.5B–2B                          |\n| NVIDIA RTX 3080          | 10 GB  | Medium    | ✘               | ✔                | ~1.2B                             |\n| AMD RX 6900 XT           | 16 GB  | Large     | ✘               | ✔                | ~2B                               |\n| NVIDIA GTX 1080 Ti       | 11 GB  | Medium    | ✘               | ✔                | ~1.2B                             |\n| Tesla T4                 | 16 GB  | Small     | ✘               | ✔                | ~1.5B–2B                          |\n| NVIDIA Quadro RTX 8000   | 48 GB  | Large     | ✔               | ✔                | ~8B–10B                           |\n| NVIDIA RTX 4070          | 12 GB  | Medium    | ✘               | ✔                | ~1.5B                             |\n| NVIDIA RTX 4070 Ti       | 12 GB  | Medium    | ✘               | ✔                | ~1.5B                             |\n| NVIDIA RTX 4080          | 16 GB  | Medium    | ✘               | ✔                | ~2B                               |\n| NVIDIA RTX 4090          | 24 GB  | Large     | ✔               | ✔                | ~4B                               |\n| NVIDIA RTX 4060 Ti       | 8 GB   | Small     | ✘               | ✔                | ~1B                               |\n| NVIDIA RTX 4060          | 8 GB   | Small     | ✘               | ✔                | ~1B                               |\n| NVIDIA RTX 4050          | 6 GB   | Small     | ✘               | ✔                | ~0.75B                            |\n| NVIDIA RTX 3070          | 8 GB   | Small     | ✘               | ✔                | ~1B                               |\n| NVIDIA RTX 3060 Ti       | 8 GB   | Small     | ✘               | ✔                | ~1B                               |\n| NVIDIA RTX 3060          | 12 GB  | Medium    | ✘               | ✔                | ~1.5B                             |\n| NVIDIA RTX 3050          | 8 GB   | Small     | ✘               | ✔                | ~1B                               |\n| NVIDIA GTX 1660 Ti       | 6 GB   | Small     | ✘               | ✔                | ~0.75B                            |\n| AMD RX 7900 XTX          | 24 GB  | Large     | ✔               | ✔                | ~3.5B–4B                          |\n| AMD RX 7900 XT           | 20 GB  | Large     | ✔               | ✔                | ~3B                               |\n| AMD RX 7800 XT           | 16 GB  | Medium    | ✘               | ✔                | ~2B                               |\n| AMD RX 7700 XT           | 12 GB  | Medium    | ✘               | ✔                | ~1.5B                             |\n| AMD RX 7600              | 8 GB   | Small     
| ✘               | ✔                | ~1B                               |\n\nThe 13M LLM training is the training of a 13+ million-parameter model, and the 2B LLM training is the training of a 2+ billion-parameter model. The data size is categorized as small, medium, and large. The small data size is around 1 GB, the medium data size is around 5 GB, and the large data size is around 10 GB.\n\n## Code Structure\n\nThe codebase is organized as follows:\n```bash\ntrain-llm-from-scratch\u002F\n├── src\u002F          \n│   ├── models\u002F   \n│   │   ├── mlp.py       # Definition of the Multi-Layer Perceptron (MLP) module\n│   │   ├── attention.py # Definitions for attention mechanisms (single-head, multi-head)\n│   │   ├── transformer_block.py # Definition of a single Transformer block\n│   │   ├── transformer.py     # Definition of the main Transformer model\n├── config\u002F       \n│   └── config.py    # Contains default configurations (model parameters, file paths, etc.)\n├── data_loader\u002F  \n│   └── data_loader.py # Contains functions for creating data loaders\u002Fiterators\n├── scripts\u002F      \n│   ├── train_transformer.py # Script for training the Transformer model\n│   ├── data_download.py   # Script for downloading the dataset\n│   ├── data_preprocess.py # Script for preprocessing the downloaded data\n│   ├── generate_text.py   # Script for generating text using a trained model\n├── data\u002F         # Directory to store the dataset\n│   ├── train\u002F     # Contains training data\n│   └── val\u002F       # Contains validation data\n├── models\u002F       # Directory where trained models are saved\n```\n\n`scripts\u002F` directory contains scripts for downloading the dataset, preprocessing the data, training the model, and generating text using the trained model. `src\u002Fmodels\u002F` directory contains the implementation of the transformer model, multi-layer perceptron (MLP), attention mechanisms, and transformer blocks.`config\u002F` directory contains the configuration file with default parameters. `data_loader\u002F` directory contains functions for creating data loaders\u002Fiterators.\n\n## Usage\n\nClone the repository and navigate to the directory:\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FFareedKhan-dev\u002Ftrain-llm-from-scratch.git\ncd train-llm-from-scratch\n```\n\nif you encounter any issues regarding the imports, make sure to change pythonpath to the root directory of the project:\n```bash\nexport PYTHONPATH=\"${PYTHONPATH}:\u002Fpath\u002Fto\u002Ftrain-llm-from-scratch\"\n\n# or if you are already in the train-llm-from-scratch directory\nexport PYTHONPATH=\"$PYTHONPATH:.\"\n```\n\nInstall the required dependencies:\n```bash\npip install -r requirements.txt\n```\n\nYou can modify the transformer architecture under `src\u002Fmodels\u002Ftransformer.py` and the training configurations under `config\u002Fconfig.py`.\n\n\nTo download the training data, run:\n```bash\npython scripts\u002Fdata_download.py\n```\n\nThe script supports the following arguments:\n* `--train_max`: Maximum number of training files to download. Default is 1 (Max equal to 30) Each file is around 11 GB.\n* `--train_dir`: Directory for storing training data. Default is `data\u002Ftrain`.\n* `--val_dir`: Directory for storing validation data. 
Default is `data\u002Fval`.\n\nTo preprocess the downloaded data, run:\n```bash\npython scripts\u002Fdata_preprocess.py\n```\n\nThe script supports the following arguments:\n- `--train_dir`: Directory where the training data files are stored (default is `data\u002Ftrain`).\n- `--val_dir`: Directory where the validation data files are stored (default is `data\u002Fval`).\n- `--out_train_file`: Path to store the processed training data in HDF5 format (default is `data\u002Ftrain\u002Fpile_train.h5`).\n- `--out_val_file`: Path to store the processed validation data in HDF5 format (default is `data\u002Fval\u002Fpile_dev.h5`).\n- `--tokenizer_name`: Name of the tokenizer to use for processing the data (default is `r50k_base`).\n- `--max_data`: Maximum number of JSON objects ([lines](#training-data-info)) to process from each dataset (both train and validation). The default is 1000.\n\nNow that the data is preprocessed, you can train the 13-million-parameter LLM by changing the configuration in `config\u002Fconfig.py` to this:\n\n```python\n# Define vocabulary size and transformer configuration (13 Million)\nVOCAB_SIZE = 50304          # Number of unique tokens in the vocabulary\nCONTEXT_LENGTH = 128        # Maximum sequence length for the model\nN_EMBED = 128               # Dimension of the embedding space\nN_HEAD = 8                  # Number of attention heads in each transformer block\nN_BLOCKS = 1                # Number of transformer blocks in the model\n```\n\nTo train the model, run:\n```bash\npython scripts\u002Ftrain_transformer.py\n```\n\nThis starts training and saves the trained model to the default `models\u002F` directory, or to the directory specified in the configuration file.\n\nTo generate text using the trained model, run:\n```bash\npython scripts\u002Fgenerate_text.py --model_path models\u002Fyour_model.pth --input_text hi\n```\n\nThe script supports the following arguments:\n- `--model_path`: Path to the trained model.\n- `--input_text`: Initial text prompt for generating new text.\n- `--max_new_tokens`: Maximum number of tokens to generate (default is 100).\n\nIt will generate text based on the input prompt using the trained model.\n\n## Step by Step Code Explanation\n\nThis section is for those who want to understand the code in detail. I will explain the code step by step, starting from importing the libraries to training the model and generating text.\n\nPreviously, I wrote an article on Medium about creating a [2.3+ million-parameter](https:\u002F\u002Flevelup.gitconnected.com\u002Fbuilding-a-million-parameter-llm-from-scratch-using-python-f612398f06c2) LLM using the Tiny Shakespeare dataset, but the output didn’t make sense. Here is a sample output:\n\n```bash\n# 2.3 Million Parameter LLM Output\nZELBETH:\nSey solmenter! tis tonguerered if\nVurint as steolated have loven OID the queend refore\nAre been, good plmp:\n\nProforne, wiftes swleen, was no blunderesd a a quain beath!\nTybell is my gateer stalk smend as be matious dazest\n```\n\nI had a thought: what if I made the transformer architecture smaller and less complex, and the training data more diverse? How large a model could a single person, using a nearly dead GPU, train that still speaks proper grammar and generates text that makes some sense?\n\nI found that **13+ million-parameter** models are enough to start producing proper grammar and punctuation, which is encouraging. 
This means we can use a very specific dataset to further fine-tune our previously trained model for a narrowed task. We might end up with a model under 1 billion parameters or even around 500 million parameters that is perfect for our specific use case, especially for running it on private data securely.\n\nI recommend you **first train a 13+ million-parameter** model using the script available in my GitHub repository. You will get results within one day, instead of waiting for a longer time, or if your local GPU might not be strong enough to train a billion-parameter model.\n\n### Importing Libraries\n\nLet’s import the required libraries that will be used throughout this blog:\n\n```python\n# PyTorch for deep learning functions and tensors\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# Numerical operations and arrays handling\nimport numpy as np\n\n# Handling HDF5 files\nimport h5py\n\n# Operating system and file management\nimport os\n\n# Command-line argument parsing\nimport argparse\n\n# HTTP requests and interactions\nimport requests\n\n# Progress bar for loops\nfrom tqdm import tqdm\n\n# JSON handling\nimport json\n\n# Zstandard compression library\nimport zstandard as zstd\n\n# Tokenization library for large language models\nimport tiktoken\n\n# Math operations (used for advanced math functions)\nimport math\n```\n\n### Preparing the Training Data\n\nOur training dataset needs to be diverse, containing information from different domains, and The Pile is the right choice for it. Although it is 825 GB in size, we will stick to only a small portion of it, i.e., 5%–10%. Let’s first download the dataset and see how it works. I will be downloading the version available on [HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted).\n\n```python\n# Download validation dataset\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Fval.jsonl.zst\n\n# Download the first part of the training dataset\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F00.jsonl.zst\n\n# Download the second part of the training dataset\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F01.jsonl.zst\n\n# Download the third part of the training dataset\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F02.jsonl.zst\n```\n\nIt will take some time to download, but you can also limit the training dataset to just one file, `00.jsonl.zst`, instead of three. It is already split into train\u002Fval\u002Ftest. 
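If you are working outside a notebook (where the `!wget` commands above are unavailable), the same files can be fetched in plain Python with `requests` and `tqdm`, both of which are already in our import list. The snippet below is only a sketch of that idea, not a script from the repository; add the `01.jsonl.zst` and `02.jsonl.zst` URLs in the same way if you want all three training parts:\n\n```python\nimport requests\nfrom tqdm import tqdm\n\nurls = [\n    'https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Fval.jsonl.zst',\n    'https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F00.jsonl.zst',\n]\n\nfor url in urls:\n    filename = url.split('\u002F')[-1]\n    # Stream the download in chunks so large files never sit fully in memory\n    with requests.get(url, stream=True) as r:\n        r.raise_for_status()\n        total = int(r.headers.get('content-length', 0))\n        with open(filename, 'wb') as f, tqdm(total=total, unit='B', unit_scale=True, desc=filename) as bar:\n            for chunk in r.iter_content(chunk_size=1024 * 1024):\n                f.write(chunk)\n                bar.update(len(chunk))\n```\n\n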
Once it's done, make sure to place the files correctly in their respective directories.\n\n```python\nimport os\nimport shutil\nimport glob\n\n# Define directory structure\ntrain_dir = \"data\u002Ftrain\"\nval_dir = \"data\u002Fval\"\n\n# Create directories if they don't exist\nos.makedirs(train_dir, exist_ok=True)\nos.makedirs(val_dir, exist_ok=True)\n\n# Move the downloaded files (e.g., 00.jsonl.zst, 01.jsonl.zst, val.jsonl.zst)\ntrain_files = glob.glob(\"*.jsonl.zst\")\nfor file in train_files:\n    if file.startswith(\"val\"):\n        # Move validation file\n        dest = os.path.join(val_dir, file)\n    else:\n        # Move training file\n        dest = os.path.join(train_dir, file)\n    shutil.move(file, dest)\n```\n\nOur dataset is in the .jsonl.zst format, which is a compressed file format commonly used for storing large datasets. It combines JSON Lines (.jsonl), where each line represents a valid JSON object, with Zstandard (.zst) compression. Let's read a sample of one of the downloaded files and see how it looks.\n\n```python\nin_file = \"data\u002Fval\u002Fval.jsonl.zst\"  # Path to our validation file\n\nwith zstd.open(in_file, 'r') as in_f:\n    for i, line in tqdm(enumerate(in_f)):  # Read the first few lines\n        data = json.loads(line)\n        print(f\"Line {i}: {data}\")  # Print the raw data for inspection\n        if i == 2:\n            break\n```\n\nThe output of the above code is this:\n\n```python\n#### OUTPUT ####\nLine: 0 \n{\n  \"text\": \"Effect of sleep quality ... epilepsy.\",\n  \"meta\": {\n    \"pile_set_name\": \"PubMed Abstracts\"\n  }\n}\n\nLine: 1\n{\n  \"text\": \"LLMops a new GitHub Repository ...\",\n  \"meta\": {\n    \"pile_set_name\": \"Github\"\n  }\n}\n```\n\nNow we need to encode (tokenize) our dataset. Our goal is to have an LLM that can at least output proper words. For that, we need to use an already available tokenizer. We will use the tiktoken open-source tokenizer by OpenAI. 
We will use the r50k_base tokenizer, which is used for the ChatGPT (GPT-3) model, to tokenize our dataset.\n\nWe need to create a function for this to avoid duplication, as we will be tokenizing both the train and validation datasets.\n\n```python\ndef process_files(input_dir, output_file):\n    \"\"\"\n    Process all .zst files in the specified input directory and save encoded tokens to an HDF5 file.\n\n    Args:\n        input_dir (str): Directory containing input .zst files.\n        output_file (str): Path to the output HDF5 file.\n    \"\"\"\n    with h5py.File(output_file, 'w') as out_f:\n        # Create an expandable dataset named 'tokens' in the HDF5 file\n        dataset = out_f.create_dataset('tokens', (0,), maxshape=(None,), dtype='i')\n        start_index = 0\n\n        # Iterate through all .zst files in the input directory\n        for filename in sorted(os.listdir(input_dir)):\n            if filename.endswith(\".jsonl.zst\"):\n                in_file = os.path.join(input_dir, filename)\n                print(f\"Processing: {in_file}\")\n\n                # Open the .zst file for reading\n                with zstd.open(in_file, 'r') as in_f:\n                    # Iterate through each line in the compressed file\n                    for line in tqdm(in_f, desc=f\"Processing {filename}\"):\n                        # Load the line as JSON\n                        data = json.loads(line)\n\n                        # Append the end-of-text token to the text and encode it\n                        text = data['text'] + \"\u003C|endoftext|>\"\n                        encoded = enc.encode(text, allowed_special={'\u003C|endoftext|>'})\n                        encoded_len = len(encoded)\n\n                        # Calculate the end index for the new tokens\n                        end_index = start_index + encoded_len\n\n                        # Expand the dataset size and store the encoded tokens\n                        dataset.resize(dataset.shape[0] + encoded_len, axis=0)\n                        dataset[start_index:end_index] = encoded\n\n                        # Update the start index for the next batch of tokens\n                        start_index = end_index\n```\n\nThere are two important points regarding this function:\n\n 1. We are storing the tokenized data in an HDF5 file, which allows us flexibility for quicker data access while training the model.\n\n 2. 
Appending the `\u003C|endoftext|>` token marks the end of each text sequence, signaling to the model that it has reached the end of a meaningful context, which helps in generating coherent outputs.\n\nNow we can simply encode our train and validation datasets using:\n\n```python\n# Define tokenized data output directories\nout_train_file = \"data\u002Ftrain\u002Fpile_train.h5\"\nout_val_file = \"data\u002Fval\u002Fpile_dev.h5\"\n\n# Loading tokenizer of (GPT-3\u002FGPT-2 Model)\nenc = tiktoken.get_encoding('r50k_base')\n\n# Process training data\nprocess_files(train_dir, out_train_file)\n\n# Process validation data\nprocess_files(val_dir, out_val_file)\n```\n\nLet’s take a look at the sample of our tokenized data:\n\n```python\n with h5py.File(out_val_file, 'r') as file:\n     # Access the 'tokens' dataset\n     tokens_dataset = file['tokens']\n     \n     # Print the dtype of the dataset\n     print(f\"Dtype of 'tokens' dataset: {tokens_dataset.dtype}\")\n     \n     # load and print the first few elements of the dataset\n     print(\"First few elements of the 'tokens' dataset:\")\n     print(tokens_dataset[:10])  # First 10 token\n```\n\nThe output of the above code is this:\n\n```python\n#### OUTPUT ####\nDtype of 'tokens' dataset: int32\n\nFirst few elements of the 'tokens' dataset:\n[ 2725  6557    83 23105   157   119   229    77  5846  2429]\n```\nWe have prepared our dataset for training. Now we will code the transformer architecture and look into its theory correspondingly.\n\n### Transformer Overview\n\nLet’s have a quick look at how a transformer architecture is used to process and understand text. It works by breaking text into smaller pieces called tokens and predicting the next token in the sequence. A transformer has many layers, called transformer blocks, stacked on top of each other, with a final layer at the end to make the prediction.\n\nEach transformer block has two main components:\n\n* **Self-Attention Heads**: These figure out which parts of the input are most important for the model to focus on. For example, when processing a sentence, the attention heads can highlight relationships between words, such as how a pronoun relates to the noun it refers to.\n\n* **MLP (Multi-Layer Perceptron)**: This is a simple feed-forward neural network. It takes the information emphasized by the attention heads and processes it further. The MLP has an input layer that receives data from the attention heads, a hidden layer that adds complexity to the processing, and an output layer that passes the results to the next transformer block.\n\nTogether, the attention heads act as the “what to think about” part, while the MLP is the “how to think about it” part. Stacking many transformer blocks allows the model to understand complex patterns and relationships in the text, but this is not always guaranteed.\n\nInstead of looking at the original paper diagram, let’s visualize a simpler and easier architecture diagram that we will be coding.\n\n![Transformer Architecture by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F11808\u002F1*QXmeA-H52C-p82AwawslbQ.png)\n\nLet’s read through the flow of our architecture that we will be coding:\n\n 1. Input tokens are converted to embeddings and combined with position information.\n\n 2. The model has 64 identical transformer blocks that process data sequentially.\n\n 3. Each block first runs multi-head attention to look at relationships between tokens.\n\n 4. 
Each block then processes data through an MLP that expands and then compresses the data.\n\n 5. Each step uses residual connections (shortcuts) to help information flow.\n\n 6. Layer normalization is used throughout to stabilize training.\n\n 7. The attention mechanism calculates which tokens should pay attention to each other.\n\n 8. The MLP expands the data to 4x size, applies ReLU, and then compresses it back down.\n\n 9. The model uses 16 attention heads to capture different types of relationships.\n\n 10. The final layer converts the processed data into vocabulary-sized predictions.\n\n 11. The model generates text by repeatedly predicting the next most likely token.\n\n### Multi Layer Perceptron (MLP)\n\nMLP is a fundamental building block within the transformer’s feed-forward network. Its role is to introduce non-linearity and learn complex relationships within the embedded representations. When defining an MLP module, an important parameter is n_embed, which defines the dimensionality of the input embedding.\n\nThe MLP typically consists of a hidden linear layer that expands the input dimension by a factor (often 4, which we will use), followed by a non-linear activation function, commonly ReLU. This structure allows our network to learn more complex features. Finally, a projection linear layer maps the expanded representation back to the original embedding dimension. This sequence of transformations enables the MLP to refine the representations learned by the attention mechanism.\n\n![MLP by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F4866\u002F1*GXxiLMW4kUXqOEimBA7g0A.png)\n\n```python\n# --- MLP (Multi-Layer Perceptron) Class ---\n\nclass MLP(nn.Module):\n    \"\"\"\n    A simple Multi-Layer Perceptron with one hidden layer.\n\n    This module is used within the Transformer block for feed-forward processing.\n    It expands the input embedding size, applies a ReLU activation, and then projects it back\n    to the original embedding size.\n    \"\"\"\n    def __init__(self, n_embed):\n        super().__init__()\n        self.hidden = nn.Linear(n_embed, 4 * n_embed)  # Linear layer to expand embedding size\n        self.relu = nn.ReLU()                        # ReLU activation function\n        self.proj = nn.Linear(4 * n_embed, n_embed)  # Linear layer to project back to original size\n\n    def forward(self, x):\n        \"\"\"\n        Forward pass through the MLP.\n\n        Args:\n            x (torch.Tensor): Input tensor of shape (B, T, C), where B is batch size,\n                              T is sequence length, and C is embedding size.\n\n        Returns:\n            torch.Tensor: Output tensor of the same shape as the input.\n        \"\"\"\n        x = self.forward_embedding(x)\n        x = self.project_embedding(x)\n        return x\n\n    def forward_embedding(self, x):\n        \"\"\"\n        Applies the hidden linear layer followed by ReLU activation.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            torch.Tensor: Output after the hidden layer and ReLU.\n        \"\"\"\n        x = self.relu(self.hidden(x))\n        return x\n\n    def project_embedding(self, x):\n        \"\"\"\n        Applies the projection linear layer.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            torch.Tensor: Output after the projection layer.\n        \"\"\"\n        x = self.proj(x)\n        return x\n```\n\nWe just coded our MLP part, where the __init__ 
method initializes a hidden linear layer that expands the input embedding size (n_embed) and a projection layer that reduces it back. ReLU activation is applied after the hidden layer. The forward method defines the data flow through these layers, applying the hidden layer and ReLU via forward_embedding, and the projection layer via project_embedding.\n\n### Single Head Attention\n\nThe attention head is the core part of our model. Its purpose is to focus on relevant parts of the input sequence. When defining a Head module, some important parameters are head_size, n_embed, and context_length. The head_size parameter determines the dimensionality of the key, query, and value projections, influencing the representational capacity of the attention mechanism.\n\nThe input embedding dimension n_embed defines the size of the input to these projection layers. context_length is used to create a causal mask, ensuring that the model only attends to preceding tokens.\n\nWithin the Head, linear layers (nn.Linear) for key, query, and value are initialized without bias. A lower triangular matrix (tril) of size context_length x context_length is registered as a buffer to implement causal masking, preventing the attention mechanism from attending to future tokens.\n\n![Single Head Attention by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F5470\u002F1*teNwEhicq9ebVURiMS8WkA.png)\n\n```python\n# --- Attention Head Class ---\n\nclass Head(nn.Module):\n    \"\"\"\n    A single attention head.\n\n    This module calculates attention scores and applies them to the values.\n    It includes key, query, and value projections, and uses causal masking\n    to prevent attending to future tokens.\n    \"\"\"\n    def __init__(self, head_size, n_embed, context_length):\n        super().__init__()\n        self.key = nn.Linear(n_embed, head_size, bias=False)   # Key projection\n        self.query = nn.Linear(n_embed, head_size, bias=False) # Query projection\n        self.value = nn.Linear(n_embed, head_size, bias=False) # Value projection\n        # Lower triangular matrix for causal masking\n        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))\n\n    def forward(self, x):\n        \"\"\"\n        Forward pass through the attention head.\n\n        Args:\n            x (torch.Tensor): Input tensor of shape (B, T, C).\n\n        Returns:\n            torch.Tensor: Output tensor after applying attention.\n        \"\"\"\n        B, T, C = x.shape\n        k = self.key(x)     # (B, T, head_size)\n        q = self.query(x)   # (B, T, head_size)\n        scale_factor = 1 \u002F math.sqrt(C)\n        # Calculate attention weights: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)\n        attn_weights = q @ k.transpose(-2, -1) * scale_factor\n        # Apply causal masking\n        attn_weights = attn_weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n        attn_weights = F.softmax(attn_weights, dim=-1)\n        v = self.value(x)   # (B, T, head_size)\n        # Apply attention weights to values\n        out = attn_weights @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)\n        return out\n```\n\nOur attention head class’s __init__ method initializes linear layers for key, query, and value projections, each projecting the input embedding (n_embed) to head_size. A lower triangular matrix based on context_length is used for causal masking. 
The forward method calculates attention weights by scaling the dot product of the query and key, applies the causal mask, normalizes the weights using softmax, and computes the weighted sum of the values to produce the attention output.\n\n### Multi Head Attention\n\nTo capture diverse relationships within the input sequence, we are going to use the concept of multi-head attention. The MultiHeadAttention module manages multiple independent attention heads operating in parallel.\n\nThe key parameter here is n_head, which determines the number of parallel attention heads. The input embedding dimension (n_embed) and context_length are also necessary to instantiate the individual attention heads. Each head processes the input independently, projecting it into a lower-dimensional subspace of size n_embed \u002F\u002F n_head. By having multiple heads, the model can attend to different aspects of the input simultaneously.\n\n![Multi Head Attention by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F6864\u002F1*fa-YjrZdtbpuCLp7An99dg.png)\n\n```python\n# --- Multi-Head Attention Class ---\n\nclass MultiHeadAttention(nn.Module):\n    \"\"\"\n    Multi-Head Attention module.\n\n    This module combines multiple attention heads in parallel. The outputs of each head\n    are concatenated to form the final output.\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length):\n        super().__init__()\n        self.heads = nn.ModuleList([Head(n_embed \u002F\u002F n_head, n_embed, context_length) for _ in range(n_head)])\n\n    def forward(self, x):\n        \"\"\"\n        Forward pass through the multi-head attention.\n\n        Args:\n            x (torch.Tensor): Input tensor of shape (B, T, C).\n\n        Returns:\n            torch.Tensor: Output tensor after concatenating the outputs of all heads.\n        \"\"\"\n        # Concatenate the output of each head along the last dimension (C)\n        x = torch.cat([h(x) for h in self.heads], dim=-1)\n        return x\n```\n\nNow that we have defined the MultiHeadAttention class, which combines multiple attention heads, the __init__ method initializes a list of Head instances (a total of n_head), each with a head_size of n_embed \u002F\u002F n_head. The forward method applies each attention head to the input x and concatenates their outputs along the last dimension, merging the information learned by each head.\n\n### Transformer Block\n\nTo create a billion-parameter model, we definitely need a deep architecture. For that, we need to code a transformer block and stack them. The key parameters of a block are n_head, n_embed, and context_length. Each block comprises a multi-head attention layer and a feed-forward network (MLP), with layer normalization applied before each and residual connections after each.\n\nLayer normalization, parameterized by the embedding dimension n_embed, helps stabilize training. The multi-head attention mechanism, as described before, takes n_head, n_embed, and context_length. The MLP also utilizes the embedding dimension n_embed. 
These components work together to process the input and learn complex patterns.\n\n![Transformer Block by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F6942\u002F1*uLWGajZc6StnQHfZjcb6eA.png)\n\n```python\n# --- Transformer Block Class ---\n\nclass Block(nn.Module):\n    \"\"\"\n    A single Transformer block.\n\n    This block consists of a multi-head attention layer followed by an MLP,\n    with layer normalization and residual connections.\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length):\n        super().__init__()\n        self.ln1 = nn.LayerNorm(n_embed)\n        self.attn = MultiHeadAttention(n_head, n_embed, context_length)\n        self.ln2 = nn.LayerNorm(n_embed)\n        self.mlp = MLP(n_embed)\n\n    def forward(self, x):\n        \"\"\"\n        Forward pass through the Transformer block.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            torch.Tensor: Output tensor after the block.\n        \"\"\"\n        # Apply multi-head attention with residual connection\n        x = x + self.attn(self.ln1(x))\n        # Apply MLP with residual connection\n        x = x + self.mlp(self.ln2(x))\n        return x\n\n    def forward_embedding(self, x):\n        \"\"\"\n        Forward pass focusing on the embedding and attention parts.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            tuple: A tuple containing the output after MLP embedding and the residual.\n        \"\"\"\n        res = x + self.attn(self.ln1(x))\n        x = self.mlp.forward_embedding(self.ln2(res))\n        return x, res\n```\n\nOur Block class represents a single transformer block. The __init__ method initializes layer normalization layers (ln1, ln2), a MultiHeadAttention module, and an MLP module, all parameterized by n_head, n_embed, and context_length.\n\nThe forward method implements the block's forward pass, applying layer normalization and multi-head attention with a residual connection, followed by another layer normalization and the MLP, again with a residual connection. The forward_embedding method provides an alternative forward pass focused on the attention and initial MLP embedding stages.\n\n### The Final Model\n\nSo far, we have coded small components of the transformer model. Next, we integrate token and position embeddings with a series of transformer blocks to perform sequence-to-sequence tasks. To do that, we need to code several key parameters: n_head, n_embed, context_length, vocab_size, and N_BLOCKS.\n\nvocab_size determines the size of the token embedding layer, mapping each token to a dense vector of size n_embed. The context_length parameter is important for the position embedding layer, which encodes the position of each token in the input sequence, also with dimension n_embed. 
The number of attention heads (n_head) and the number of blocks (N_BLOCKS) dictate the depth and complexity of the network.\n\nThese parameters collectively define the architecture and capacity of the transformer model, so let’s code it.\n\n![Transformer Class by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F5418\u002F1*0XXd_R2EOhkKCQDfqUQg0w.png)\n\n```python\n# --- Transformer Model Class ---\n\nclass Transformer(nn.Module):\n    \"\"\"\n    The main Transformer model.\n\n    This class combines token and position embeddings with a sequence of Transformer blocks\n    and a final linear layer for language modeling.\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length, vocab_size, N_BLOCKS):\n        super().__init__()\n        self.context_length = context_length\n        self.N_BLOCKS = N_BLOCKS\n        self.token_embed = nn.Embedding(vocab_size, n_embed)\n        self.position_embed = nn.Embedding(context_length, n_embed)\n        self.attn_blocks = nn.ModuleList([Block(n_head, n_embed, context_length) for _ in range(N_BLOCKS)])\n        self.layer_norm = nn.LayerNorm(n_embed)\n        self.lm_head = nn.Linear(n_embed, vocab_size)\n        self.register_buffer('pos_idxs', torch.arange(context_length))\n\n    def _pre_attn_pass(self, idx):\n        \"\"\"\n        Combines token and position embeddings.\n\n        Args:\n            idx (torch.Tensor): Input token indices.\n\n        Returns:\n            torch.Tensor: Sum of token and position embeddings.\n        \"\"\"\n        B, T = idx.shape\n        tok_embedding = self.token_embed(idx)\n        pos_embedding = self.position_embed(self.pos_idxs[:T])\n        return tok_embedding + pos_embedding\n\n    def forward(self, idx, targets=None):\n        \"\"\"\n        Forward pass through the Transformer.\n\n        Args:\n            idx (torch.Tensor): Input token indices.\n            targets (torch.Tensor, optional): Target token indices for loss calculation. 
Defaults to None.\n\n        Returns:\n            tuple: Logits and loss (if targets are provided).\n        \"\"\"\n        x = self._pre_attn_pass(idx)\n        for block in self.attn_blocks:\n            x = block(x)\n        x = self.layer_norm(x)\n        logits = self.lm_head(x)\n        loss = None\n        if targets is not None:\n            B, T, C = logits.shape\n            flat_logits = logits.view(B * T, C)\n            targets = targets.view(B * T).long()\n            loss = F.cross_entropy(flat_logits, targets)\n        return logits, loss\n\n    def forward_embedding(self, idx):\n        \"\"\"\n        Forward pass focusing on the embedding and attention blocks.\n\n        Args:\n            idx (torch.Tensor): Input token indices.\n\n        Returns:\n            tuple: Output after attention blocks and the residual.\n        \"\"\"\n        x = self._pre_attn_pass(idx)\n        residual = x\n        for block in self.attn_blocks:\n            x, residual = block.forward_embedding(x)\n        return x, residual\n\n    def generate(self, idx, max_new_tokens):\n        \"\"\"\n        Generates new tokens given a starting sequence.\n\n        Args:\n            idx (torch.Tensor): Initial sequence of token indices.\n            max_new_tokens (int): Number of tokens to generate.\n\n        Returns:\n            torch.Tensor: The extended sequence of tokens.\n        \"\"\"\n        for _ in range(max_new_tokens):\n            idx_cond = idx[:, -self.context_length:]\n            logits, _ = self(idx_cond)\n            logits = logits[:, -1, :]\n            probs = F.softmax(logits, dim=-1)\n            idx_next = torch.multinomial(probs, num_samples=1)\n            idx = torch.cat((idx, idx_next), dim=1)\n        return idx\n```\n\nOur Transformer class `__init__` method initializes token and position embedding layers (token_embed, position_embed), a sequence of Block modules (attn_blocks), a final layer normalization layer (layer_norm), and a linear layer for language modeling (lm_head).\n\nThe _pre_attn_pass method combines token and position embeddings. The forward method processes the input sequence through the embedding layers and the series of transformer blocks, applies final layer normalization, and generates logits. It also calculates the loss if targets are provided. The forward_embedding method provides an intermediate forward pass up to the output of the attention blocks, and the generate method implements token generation.\n\n### Batch Processing\n\nWhen we train a deep learning model on big data, we process it in batches due to GPU availability. So, let’s create a get_batch_iterator function, taking the data_path to an HDF5 file, the desired batch_size, the context_length for each sequence, and the device to load the data onto.\n\nThe batch_size determines how many sequences are processed in parallel during training, while the context_length specifies the length of each input sequence. The data_path points to the location of the training data.\n\n```python\n# --- Data Loading Utility --- \n\ndef get_batch_iterator(data_path, batch_size, context_length, device=\"gpu\"):\n    \"\"\"\n    Creates an iterator for generating batches of data from an HDF5 file.\n\n    Args:\n        data_path (str): Path to the HDF5 file containing tokenized data.\n        batch_size (int): Number of sequences in each batch.\n        context_length (int): Length of each sequence.\n        device (str, optional): Device to load the data onto ('cpu' or 'cuda'). 
Defaults to \"cpu\".\n\n    Yields:\n        tuple: A tuple containing input sequences (xb) and target sequences (yb).\n    \"\"\"\n    # Open the HDF5 file in read mode\n    with h5py.File(data_path, 'r') as hdf5_file:\n        \n        # Extract the dataset of tokenized sequences\n        dataset = hdf5_file['tokens']\n        \n        # Get the total size of the dataset\n        dataset_size = dataset.shape[0]\n        \n        # Calculate the number of examples (sequences) that can be made from the data\n        n_examples = (dataset_size - 1) \u002F\u002F context_length\n        \n        # Create an array of indices for examples and shuffle them for randomness\n        example_idxs = np.arange(n_examples)\n        np.random.shuffle(example_idxs)\n        \n        # Initialize epoch counter and example counter\n        epochs = 0\n        counter = 0\n        \n        while True:\n            # Check if the current batch exceeds the number of available examples\n            if counter + batch_size > n_examples:\n                # Shuffle the indices again and reset the counter to 0\n                np.random.shuffle(example_idxs)\n                counter = 0\n                print(f\"Finished epoch {epochs}\")  # Print epoch number when an epoch finishes\n                epochs += 1  # Increment the epoch counter\n            \n            # Select a batch of random indices to generate sequences\n            random_indices = example_idxs[counter:counter+batch_size] * context_length\n            \n            # Retrieve sequences from the dataset based on the random indices\n            random_samples = torch.tensor(np.array([dataset[idx:idx+context_length+1] for idx in random_indices]))\n            \n            # Separate the input sequences (xb) and target sequences (yb)\n            xb = random_samples[:, :context_length].to(device)  # Input sequence (first half of the random sample)\n            yb = random_samples[:, 1:context_length+1].to(device)  # Target sequence (second half of the random sample)\n            \n            # Increment the counter to move to the next batch\n            counter += batch_size\n            \n            # Yield the input and target sequences as a tuple for the current batch\n            yield xb, yb\n```\nOur get_batch_iterator function handles the loading and batching of training data. It takes data_path, batch_size, context_length, and device as input. The function opens the HDF5 file, shuffles the data, and then enters an infinite loop to generate batches. 
In each iteration, it selects a random subset of the data to form a batch of input sequences (xb) and their corresponding target sequences (yb).\n\n### Training Parameters\n\nNow that we have coded our model, we need to define the training parameters, such as the number of heads, blocks, and more, along with the data path.\n\n```python\n# --- Configuration ---\n\n# Define vocabulary size and transformer configuration\nVOCAB_SIZE = 50304          # Number of unique tokens in the vocabulary\nCONTEXT_LENGTH = 512        # Maximum sequence length for the model\nN_EMBED = 2048              # Dimension of the embedding space\nN_HEAD = 16                 # Number of attention heads in each transformer block\nN_BLOCKS = 64               # Number of transformer blocks in the model\n\n# Paths to training and development datasets\nTRAIN_PATH = \"data\u002Ftrain\u002Fpile_val.h5\"  # File path for the training dataset\nDEV_PATH = \"data\u002Fval\u002Fpile_val.h5\"      # File path for the validation dataset\n\n# Transformer training parameters\nT_BATCH_SIZE = 32          # Number of samples per training batch\nT_CONTEXT_LENGTH = 16      # Context length for training batches\nT_TRAIN_STEPS = 200000     # Total number of training steps\nT_EVAL_STEPS = 1000        # Frequency (in steps) to perform evaluation\nT_EVAL_ITERS = 250         # Number of iterations to evaluate the model\nT_LR_DECAY_STEP = 50000    # Step at which to decay the learning rate\nT_LR = 5e-4                # Initial learning rate for training\nT_LR_DECAYED = 5e-5        # Learning rate after decay\nT_OUT_PATH = \"models\u002Ftransformer_B.pt\"  # Path to save the trained model\n\n# Device configuration\nDEVICE = 'cuda'\n\n# Store all configurations in a dictionary for easy access and modification\ndefault_config = {\n    'vocab_size': VOCAB_SIZE,\n    'context_length': CONTEXT_LENGTH,\n    'n_embed': N_EMBED,\n    'n_head': N_HEAD,\n    'n_blocks': N_BLOCKS,\n    'train_path': TRAIN_PATH,\n    'dev_path': DEV_PATH,\n    't_batch_size': T_BATCH_SIZE,\n    't_context_length': T_CONTEXT_LENGTH,\n    't_train_steps': T_TRAIN_STEPS,\n    't_eval_steps': T_EVAL_STEPS,\n    't_eval_iters': T_EVAL_ITERS,\n    't_lr_decay_step': T_LR_DECAY_STEP,\n    't_lr': T_LR,\n    't_lr_decayed': T_LR_DECAYED,\n    't_out_path': T_OUT_PATH,\n    'device': DEVICE,\n}\n```\n\nFor most of the parameters, I have used the most common values and also stored them in a dictionary for easy access. Here, the parameters are for a billion-parameter model. If you want to train a model with millions of parameters, you can reduce the main parameters, which include CONTEXT_LENGTH, N_EMBED, N_HEAD, and N_BLOCKS. 
However, you can also run the million-parameter model script in my GitHub repository.\n\n### Training the Model\n\nLet's initialize our transformer model and check its total number of parameters.\n```python\n# --- Initialize the Model and Print Parameters --- \n\nmodel = Transformer(\n    n_head=config['n_head'],\n    n_embed=config['n_embed'],\n    context_length=config['context_length'],\n    vocab_size=config['vocab_size'],\n    N_BLOCKS=config['n_blocks']\n).to(config['device'])\n\n\n# Print the total number of parameters\ntotal_params = sum(p.numel() for p in model.parameters())\nprint(f\"Total number of parameters in the model: {total_params:,}\")\n\n\n#### OUTPUT ####\n2,141,346,251\n```\n\nNow that we have 2 Billion parameter model, we need to define our Adam optimizer and loss tracking function, which will help us track the progress of our model throughout the training.\n\n```python\n# --- Optimizer Setup and Loss Tracking --- \n\n# Set up the AdamW optimizer with the specified learning rate.\noptimizer = torch.optim.AdamW(model.parameters(), lr=config['t_lr'])\n\n# List to track loss values during training.\nlosses = []\n\n# Define a window size for averaging recent losses in the training loop.\nAVG_WINDOW = 64\n\n# Helper function to estimate the average loss for training and development data.\n@torch.no_grad()\ndef estimate_loss(steps):\n    \"\"\"\n    Evaluate the model on training and development datasets and calculate average loss.\n\n    Args:\n        steps (int): Number of steps to evaluate.\n\n    Returns:\n        dict: Dictionary containing average losses for 'train' and 'dev' splits.\n    \"\"\"\n    out = {}\n    model.eval()  # Set the model to evaluation mode.\n\n    for split in ['train', 'dev']:\n        # Select the appropriate data path for the current split.\n        data_path = config['train_path'] if split == 'train' else config['dev_path']\n        \n        # Create a batch iterator for evaluation.\n        batch_iterator_eval = get_batch_iterator(\n            data_path, config['t_batch_size'], config['t_context_length'], device=config['device']\n        )\n        \n        # Initialize a tensor to track loss values for each evaluation step.\n        losses_eval = torch.zeros(steps)\n        for k in range(steps):\n            try:\n                # Fetch a batch and calculate the loss.\n                xb, yb = next(batch_iterator_eval)\n                _, loss = model(xb, yb)\n                losses_eval[k] = loss.item()\n            except StopIteration:\n                # Handle the case where the data iterator ends early.\n                print(f\"Warning: Iterator for {split} ended early.\")\n                break\n        \n        # Compute the mean loss for the current split.\n        out[split] = losses_eval[:k + 1].mean()\n    \n    model.train()  # Restore the model to training mode.\n    return out\n```\n\nWe will now initialize our batch processing function and training loop, which will start our training.\n\n```python\n# --- Training Loop ---\n\n# Create a batch iterator for the training data.\nbatch_iterator = get_batch_iterator(\n  config['train_path'],\n  config['t_batch_size'],\n  config['t_context_length'],\n  device=config['device']\n)\n\n# Create a progress bar to monitor training progress.\npbar = tqdm(range(config['t_train_steps']))\nfor step in pbar:\n  try:\n      # Fetch a batch of input and target data.\n      xb, yb = next(batch_iterator)\n      \n      # Perform a forward pass and compute the loss.\n      _, loss = 
model(xb, yb)\n      \n      # Record the loss for tracking.\n      losses.append(loss.item())\n      pbar.set_description(f\"Train loss: {np.mean(losses[-AVG_WINDOW:]):.4f}\")\n      \n      # Backpropagate the loss and update the model parameters.\n      optimizer.zero_grad(set_to_none=True)\n      loss.backward()\n      optimizer.step()\n\n      # Periodically evaluate the model on training and development data.\n      if step % config['t_eval_steps'] == 0:\n          train_loss, dev_loss = estimate_loss(config['t_eval_iters']).values()\n          print(f\"Step: {step}, Train loss: {train_loss:.4f}, Dev loss: {dev_loss:.4f}\")\n\n      # Decay the learning rate at the specified step.\n      if step == config['t_lr_decay_step']:\n          print('Decaying learning rate')\n          for g in optimizer.param_groups:\n              g['lr'] = config['t_lr_decayed']\n  except StopIteration:\n      # Handle the case where the training data iterator ends early.\n      print(\"Training data iterator finished early.\")\n      break\n```\n### Saving the Trained Model\n\nSince our training loop has the ability to handle errors, in case the loop throws any error, it will save our partially trained model to avoid loss. Once the training is complete, we can save our trained model to use it later for inference.\n\n```python\n# --- Save Model and Final Evaluation ---\n\n# Perform a final evaluation of the model on training and development datasets.\ntrain_loss, dev_loss = estimate_loss(200).values()\n\n# Ensure unique model save path in case the file already exists.\nmodified_model_out_path = config['t_out_path']\nsave_tries = 0\nwhile os.path.exists(modified_model_out_path):\n    save_tries += 1\n    model_out_name = os.path.splitext(config['t_out_path'])[0]\n    modified_model_out_path = model_out_name + f\"_{save_tries}\" + \".pt\"\n\n# Save the model's state dictionary, optimizer state, and training metadata.\ntorch.save(\n    {\n        'model_state_dict': model.state_dict(),\n        'optimizer_state_dict': optimizer.state_dict(),\n        'losses': losses,\n        'train_loss': train_loss,\n        'dev_loss': dev_loss,\n        'steps': len(losses),\n    },\n    modified_model_out_path\n)\nprint(f\"Saved model to {modified_model_out_path}\")\nprint(f\"Finished training. Train loss: {train_loss:.4f}, Dev loss: {dev_loss:.4f}\")\n```\nThe final training loss for the billion-parameter model is 0.2314, and the dev loss is 0.643.\n\n### Training Loss\n\nWhen I plot the loss of both the million- and billion-parameter models, they look very different.\n\n![Training Loss Comparison](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_ee28212aeba5.png)\n\nThe billion-parameter model starts with a much higher loss and fluctuates a lot at the beginning. It goes down quickly at first, but then wobbles before becoming smoother. This shows that the bigger model has a harder time finding the right way to learn at the start. It might need more data and careful settings. When the learning rate is lowered (the red line), the loss goes down more steadily, showing that this helps it fine-tune.\n\nThe million-parameter model’s loss goes down more easily from the start. It doesn’t fluctuate as much as the bigger model. When the learning rate is lowered, it doesn’t change the curve as much. This is likely because the smaller model is simpler to train and finds a good solution faster. The big difference shows how much harder it is to train very large models. 
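If you want to reproduce a smoothed curve like this from your own run, the checkpoint we just saved already contains the per-step loss history under the 'losses' key. Below is a minimal plotting sketch; it assumes matplotlib is available (it is not among the imports listed earlier) and reuses the same 64-step averaging window as the progress bar.\n\n```python\nimport matplotlib.pyplot as plt  # assumed extra dependency, not in the imports above\n\n# Load the checkpoint saved by the training loop and pull out the loss history.\nckpt = torch.load('models\u002Ftransformer_B.pt', map_location='cpu')\nstep_losses = np.array(ckpt['losses'])\n\n# Smooth with the same moving-average window used by the progress bar (AVG_WINDOW = 64).\nwindow = 64\nsmoothed = np.convolve(step_losses, np.ones(window) \u002F window, mode='valid')\n\nplt.plot(step_losses, alpha=0.3, label='raw loss')\nplt.plot(np.arange(window - 1, len(step_losses)), smoothed, label=f'{window}-step average')\nplt.xlabel('training step')\nplt.ylabel('cross-entropy loss')\nplt.legend()\nplt.show()\n```\n\nThe raw curve is noisy at this scale; the averaged curve is the one that is easier to compare across runs.\n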
In short, very large models need different training recipes, and possibly more steps, to learn well.\n\nWe now have our saved model, and we can finally use it for inference to see how it generates text. 😓\n\n### Generating Text\n\nLet’s create a function that generates text from our saved model; it takes the checkpoint path and a seed text as inputs and returns the generated text.\n\n```python\ndef generate_text(model_path, input_text, max_length=512, device=\"cuda\"):\n    \"\"\"\n    Generate text using a pre-trained model based on the given input text.\n\n    Args:\n    - model_path (str): Path to the model checkpoint.\n    - input_text (str): The input text to seed the generation.\n    - max_length (int, optional): Maximum number of tokens to generate. Defaults to 512.\n    - device (str, optional): Device to load the model on ('cpu' or 'cuda'). Defaults to 'cuda'.\n\n    Returns:\n    - str: The generated text.\n    \"\"\"\n\n    # Load the model checkpoint onto the target device\n    checkpoint = torch.load(model_path, map_location=device)\n\n    # Rebuild the architecture. NOTE: these hyperparameters must match the ones\n    # used to train the checkpoint you are loading.\n    model = Transformer(\n        n_head=default_config['n_head'],\n        n_embed=default_config['n_embed'],\n        context_length=default_config['context_length'],\n        vocab_size=default_config['vocab_size'],\n        N_BLOCKS=default_config['n_blocks']\n    ).to(device)\n\n    # Load the model's state dictionary and switch to evaluation mode\n    model.load_state_dict(checkpoint['model_state_dict'])\n    model.eval()\n\n    # Load the tokenizer for the GPT model (we use 'r50k_base' for GPT models)\n    enc = tiktoken.get_encoding('r50k_base')\n\n    # Encode the input text along with the end-of-text token\n    input_ids = torch.tensor(\n        enc.encode(input_text, allowed_special={'\u003C|endoftext|>'}),\n        dtype=torch.long\n    )[None, :].to(device)  # Add batch dimension and move to the specified device\n\n    # Generate text with the model using the encoded input\n    with torch.no_grad():\n        # Generate up to 'max_length' tokens of text\n        generated_output = model.generate(input_ids, max_length)\n\n        # Decode the generated tokens back into text\n        generated_text = enc.decode(generated_output[0].tolist())\n\n    return generated_text\n```\n\nThe Transformer class we defined earlier is instantiated here to rebuild the architecture, and the saved checkpoint is then loaded into it as the model state.\n\nLet’s first observe what the million- and billion-parameter models generate when given no real prompt, only the end-of-text token, and see what they produce freely.\n\n```python\n# Define the file paths for the pre-trained models\nBillion_model_path = 'models\u002Ftransformer_B.pt'  # Path to the Billion model\nMillion_model_path = 'models\u002Ftransformer_M.pt'  # Path to the Million model\n\n# Using '\u003C|endoftext|>' as input to the models (acts as a prompt that allows the models to generate text freely)\ninput_text = \"\u003C|endoftext|>\"\n\n# Call the function to generate text based on the input text using the Billion model\nB_output = generate_text(Billion_model_path, input_text)\n\n# Call the function to generate text based on the input text using the Million model\nM_output = generate_text(Million_model_path, input_text)\n\n# Print the output generated by both models\nprint(B_output)  # Output from the Billion model\nprint(M_output)  # Output from the Million model\n```\n\n| **Million Parameter Output** | **Billion Parameter Output** |\n|------------------------------|------------------------------|\n| In 1978, The park was returned to the factory-plate that the public share to the lower of the electronic fence that follow from the Station's cities. The Canal of ancient Western nations were confined to the city spot. 
The villages were directly linked to cities in China that revolt that the US budget and in Odambinais is uncertain and fortune established in rural areas. | There are two miles east coast from 1037 and 73 million refugees (hypotetus) as the same men and defeated Harvard, and Croft. At right east and West Nile's Mediterranean Sea jets. It was found there a number of parties, blacksmith, musician and boutique hospitality and inspire the strain delivered Canadians have already killed, rural branches with coalition railholder against Abyssy. |\n\nBoth LLMs can produce clear, correctly spelled words when the context is short and simple. For example, in the million-parameter output, the phrase **“The villages were directly linked to cities in China”** makes sense and conveys a clear idea. It is easy to understand and logically connects the villages to the cities.\n\nHowever, when the context becomes longer and more complex, the clarity begins to fade. In the billion-parameter output, sentences like **“There are two miles east coast from 1037 and 73 million refugees (hypotetus)”** and **“blacksmith, musician and boutique hospitality and inspire the strain delivered Canadians”** become harder to follow. The ideas seem disjointed, and the sentence structure doesn’t flow naturally. While the individual words may be valid, the overall meaning becomes confusing and unclear.\n\nThe encouraging part is that the 13+ million-parameter LLM also starts producing somewhat meaningful content with correct spelling. For instance, when I seed it with a subject line, it starts drafting an email for me. Longer passages still don’t hold together, but take a look at the output:\n\n```python\n# Input text\ninput_text = \"Subject: \"\n\n# Call the million-parameter model\nm_output = generate_text(Million_model_path, input_text)\n\nprint(m_output)  # Output from the Million model\n```\n| **Million Parameter LLM Output**                                                                 |\n|--------------------------------------------------------------------------------------------------|\n| Subject: ClickPaper-summary Study for Interview \u003Cbr>Good morning, I hope this message finds you well, as the sun gently peeks through the clouds, ... |\n\nOur million-parameter model suggests that a narrow, goal-oriented LLM well under 1B parameters is feasible, while our 1B run shows that simply scaling up is not enough: without a carefully designed, deep architecture and proper consideration of data and settings, the larger model does not improve training or performance over the million-parameter one and mostly overfits the data.\n\n# What’s Next\n\nI recommend that you first train the 13+ million-parameter model and then scale it up gradually by adding more parameters, improving its ability to handle short contexts. How many parameters you add for a specific task is up to you. Then, while staying under 1B parameters, try fine-tuning the model on domain-specific data, such as emails or essays, and see how the generated text improves.\n\n\u003Chr>\n\nWanna chat about something? 
[My Linkedin](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Ffareed-khan-dev\u002F)\n\n## Star History\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_5e29c4daa969.png)](https:\u002F\u002Fstar-history.com\u002F#FareedKhan-dev\u002Ftrain-llm-from-scratch&Date)\n","![main image](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_b1e7a161b0d2.png)\n\n\u003Cdiv align=\"center\">\n\n\u003C!-- omit in toc -->\n# 从零开始训练大型语言模型（LLM）\n  \n![Python](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FPython-3.8%2B-blue) ![License](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FLicense-MIT-green) ![Contributions](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FContributions-Welcome-blue) [![Docs](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FDocs-Available-success)](#step-by-step-code-explanation)\n\n**我正在寻找人工智能（AI）领域的博士（PhD）职位**。查看我的 [简历](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1Q_iklJ1RVGSb-Pdey8BHy3k8IF3UJv0z\u002Fview?usp=sharing) 或 [GitHub](https:\u002F\u002Fgithub.com\u002FFareedKhan-dev)\n\n\u003C\u002Fdiv>\n\n我基于论文 [Attention is All You Need（注意力机制就是你需要的一切）](https:\u002F\u002Farxiv.org\u002Fabs\u002F1706.03762) 使用 PyTorch 从零实现了一个 Transformer（变换器）模型。你可以使用我的脚本来训练自己的 **十亿** 或 **百万** 参数的 LLM（大型语言模型），仅需单个 GPU。\n\n以下是训练完成的 1300 万参数 LLM 的输出：\n\n```\nIn ***1978, The park was returned to the factory-plate that \nthe public share to the lower of the electronic fence that \nfollow from the Station's cities. The Canal of ancient Western \nnations were confined to the city spot. The villages were directly \nlinked to cities in China that revolt that the US budget and in\nOdambinais is uncertain and fortune established in rural areas.\n```\n\u003C!-- omit in toc -->\n## 目录\n- [训练数据信息](#training-data-info)\n- [先决条件和训练时间](#prerequisites-and-training-time)\n- [代码结构](#code-structure)\n- [用法](#usage)\n- [代码逐步解释](#step-by-step-code-explanation)\n  - [导入库](#importing-libraries)\n  - [准备训练数据](#preparing-the-training-data)\n  - [Transformer 概述](#transformer-overview)\n  - [多层感知机（MLP）](#multi-layer-perceptron-mlp)\n  - [单头注意力机制](#single-head-attention)\n  - [多头注意力机制](#multi-head-attention)\n  - [Transformer 模块](#transformer-block)\n  - [最终模型](#the-final-model)\n  - [批处理](#batch-processing)\n  - [训练参数](#training-parameters)\n  - [训练模型](#training-the-model)\n  - [保存训练好的模型](#saving-the-trained-model)\n  - [训练损失](#training-loss)\n  - [生成文本](#generating-text)\n- [下一步计划](#whats-next)\n\n## 训练数据信息\n\n训练数据来自 Pile 数据集（PILE 数据集，一个多样化的开源大规模语言模型训练数据集）。Pile 数据集是 22 个多样化数据集的集合，包括书籍、文章、网站等文本。Pile 数据集的总大小为 825GB。以下是训练数据的样本：\n\n```python\nLine: 0 \n{\n  \"text\": \"Effect of sleep quality ... 
epilepsy.\",\n  \"meta\": {\n    \"pile_set_name\": \"PubMed Abstracts\"\n  }\n}\n\nLine: 1\n{\n  \"text\": \"LLMops a new GitHub Repository ...\",\n  \"meta\": {\n    \"pile_set_name\": \"Github\"\n  }\n}\n```\n\n## 先决条件与训练时间\n\n确保您理解面向对象编程（OOP）、神经网络（NN）和 PyTorch 的基础知识，以便理解代码。以下资源可帮助您入门：\n\n| 主题               | 视频链接                                                |\n|---------------------|-----------------------------------------------------------|\n| 面向对象编程（OOP） | [OOP 视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Ej_02ICOIgs&pp=ygUKb29wIHB5dGhvbg%3D%3D) |\n| 神经网络（NN）      | [神经网络视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=Jy4wM2X21u0&pp=ygUbbmV1cmFsIG5ldHdvcmsgcHl0aG9uIHRvcmNo) |\n| PyTorch             | [PyTorch 视频](https:\u002F\u002Fwww.youtube.com\u002Fwatch?v=V_xro1bcAuA&pp=ygUbbmV1cmFsIG5ldHdvcmsgcHl0aG9uIHRvcmNo) |\n\n您需要 GPU 来训练模型。Colab 或 Kaggle T4 可用于训练参数量超过 1300 万的模型，但无法训练十亿级参数模型。参考以下对比：\n\n| GPU 名称                 | 显存   | 数据规模 | 20 亿参数 LLM 训练 | 1300 万参数 LLM 训练 | 最大实用 LLM 规模（训练） |\n|--------------------------|--------|-----------|---------------------|----------------------|----------------------------|\n| NVIDIA A100              | 40 GB  | 大        | ✔                   | ✔                    | ~60 亿–80 亿                |\n| NVIDIA V100              | 16 GB  | 中        | ✘                   | ✔                    | ~20 亿                     |\n| AMD Radeon VII           | 16 GB  | 中        | ✘                   | ✔                    | ~15 亿–20 亿               |\n| NVIDIA RTX 3090          | 24 GB  | 大        | ✔                   | ✔                    | ~35 亿–40 亿               |\n| Tesla P100               | 16 GB  | 中        | ✘                   | ✔                    | ~15 亿–20 亿               |\n| NVIDIA RTX 3080          | 10 GB  | 中        | ✘                   | ✔                    | ~12 亿                     |\n| AMD RX 6900 XT           | 16 GB  | 大        | ✘                   | ✔                    | ~20 亿                     |\n| NVIDIA GTX 1080 Ti       | 11 GB  | 中        | ✘                   | ✔                    | ~12 亿                     |\n| Tesla T4                 | 16 GB  | 小        | ✘                   | ✔                    | ~15 亿–20 亿               |\n| NVIDIA Quadro RTX 8000   | 48 GB  | 大        | ✔                   | ✔                    | ~80 亿–100 亿              |\n| NVIDIA RTX 4070          | 12 GB  | 中        | ✘                   | ✔                    | ~15 亿                     |\n| NVIDIA RTX 4070 Ti       | 12 GB  | 中        | ✘                   | ✔                    | ~15 亿                     |\n| NVIDIA RTX 4080          | 16 GB  | 中        | ✘                   | ✔                    | ~20 亿                     |\n| NVIDIA RTX 4090          | 24 GB  | 大        | ✔                   | ✔                    | ~40 亿                     |\n| NVIDIA RTX 4060 Ti       | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n| NVIDIA RTX 4060          | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n| NVIDIA RTX 4050          | 6 GB   | 小        | ✘                   | ✔                    | ~7.5 亿                    |\n| NVIDIA RTX 3070          | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n| NVIDIA RTX 3060 Ti       | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n| NVIDIA RTX 3060          | 12 GB  | 中        | ✘                   | ✔     
               | ~15 亿                     |\n| NVIDIA RTX 3050          | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n| NVIDIA GTX 1660 Ti       | 6 GB   | 小        | ✘                   | ✔                    | ~7.5 亿                    |\n| AMD RX 7900 XTX          | 24 GB  | 大        | ✔                   | ✔                    | ~35 亿–40 亿               |\n| AMD RX 7900 XT           | 20 GB  | 大        | ✔                   | ✔                    | ~30 亿                     |\n| AMD RX 7800 XT           | 16 GB  | 中        | ✘                   | ✔                    | ~20 亿                     |\n| AMD RX 7700 XT           | 12 GB  | 中        | ✘                   | ✔                    | ~15 亿                     |\n| AMD RX 7600              | 8 GB   | 小        | ✘                   | ✔                    | ~10 亿                     |\n\n1300 万参数 LLM 训练指训练参数量超过 1300 万的模型，20 亿参数 LLM 训练指训练参数量超过 20 亿的模型。数据规模分为小、中、大三类：小规模约 1 GB，中规模约 5 GB，大规模约 10 GB。\n\n## 代码结构\n\n代码库组织如下：\n```bash\ntrain-llm-from-scratch\u002F\n├── src\u002F          \n│   ├── models\u002F   \n│   │   ├── mlp.py       # 多层感知机（MLP）模块的定义\n│   │   ├── attention.py # 注意力机制（单头、多头）的定义\n│   │   ├── transformer_block.py # 单个 Transformer 块（Transformer 模型的基本组成单元）的定义\n│   │   ├── transformer.py     # 主 Transformer 模型的定义\n├── config\u002F       \n│   └── config.py    # 包含默认配置（模型参数、文件路径等）\n├── data_loader\u002F  \n│   └── data_loader.py # 包含创建数据加载器\u002F迭代器的函数\n├── scripts\u002F      \n│   ├── train_transformer.py # 训练 Transformer 模型的脚本\n│   ├── data_download.py   # 下载数据集的脚本\n│   ├── data_preprocess.py # 预处理下载数据的脚本\n│   ├── generate_text.py   # 使用训练模型生成文本的脚本\n├── data\u002F         # 存储数据集的目录\n│   ├── train\u002F     # 包含训练数据\n│   └── val\u002F       # 包含验证数据\n├── models\u002F       # 保存训练模型的目录\n```\n\n`scripts\u002F` 目录包含下载数据集、预处理数据、训练模型及生成文本的脚本。`src\u002Fmodels\u002F` 目录包含 Transformer 模型、多层感知机（MLP）、注意力机制和 Transformer 块的实现。`config\u002F` 目录包含含默认参数的配置文件。`data_loader\u002F` 目录包含创建数据加载器\u002F迭代器的函数。\n\n## 使用方法\n\n克隆仓库并进入目录：\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FFareedKhan-dev\u002Ftrain-llm-from-scratch.git\ncd train-llm-from-scratch\n```\n\n若遇到导入问题，请确保将 PYTHONPATH 设置为项目根目录：\n```bash\nexport PYTHONPATH=\"${PYTHONPATH}:\u002Fpath\u002Fto\u002Ftrain-llm-from-scratch\"\n```\n\n# 或者如果你已经在 train-llm-from-scratch 目录中\nexport PYTHONPATH=\"$PYTHONPATH:.\"\n```\n\n安装所需依赖项：\n```bash\npip install -r requirements.txt\n```\n\n你可以在 `src\u002Fmodels\u002Ftransformer.py` 中修改 Transformer架构（一种神经网络架构），并在 `config\u002Fconfig.py` 中调整训练配置。\n\n要下载训练数据，请运行：\n```bash\npython scripts\u002Fdata_download.py\n```\n\n该脚本支持以下参数：\n* `--train_max`: 要下载的最大训练文件数量。默认值为 1（最大值为 30）。每个文件约 11 GB。\n* `--train_dir`: 存储训练数据的目录。默认为 `data\u002Ftrain`。\n* `--val_dir`: 存储验证数据的目录。默认为 `data\u002Fval`。\n\n要预处理下载的数据，请运行：\n```bash\npython scripts\u002Fdata_preprocess.py\n```\n\n该脚本支持以下参数：\n- `--train_dir`: 存储训练数据文件的目录（默认为 `data\u002Ftrain`）。\n- `--val_dir`: 存储验证数据文件的目录（默认为 `data\u002Fval`）。\n- `--out_train_file`: 以 HDF5 格式存储处理后训练数据的路径（默认为 `data\u002Ftrain\u002Fpile_train.h5`）。\n- `--out_val_file`: 以 HDF5 格式存储处理后验证数据的路径（默认为 `data\u002Fval\u002Fpile_dev.h5`）。\n- `--tokenizer_name`: 用于数据处理的分词器（Tokenizer）名称（默认为 `r50k_base`）。\n- `--max_data`: 从每个数据集（训练和验证）中处理的最大 JSON 对象数量（即[行数](#training-data-info)）。默认值为 1000。\n\n数据预处理完成后，你可以通过将 `config\u002Fconfig.py` 中的配置修改为以下内容来训练 1300 万参数的 LLM（大型语言模型）：\n\n```python\n# 定义词汇表大小和 Transformer 配置 (30 亿)\nVOCAB_SIZE = 50304          # 词汇表中唯一标记（Token）的数量\nCONTEXT_LENGTH = 128        # 
模型的最大序列长度\nN_EMBED = 128               # 嵌入空间（Embedding Space）的维度\nN_HEAD = 8                  # 每个 Transformer 块中的注意力头（Attention Head）数量\nN_BLOCKS = 1               # 模型中 Transformer 块的数量\n```\n\n要训练模型，请运行：\n```bash\npython scripts\u002Ftrain_transformer.py\n```\n\n它将开始训练模型，并将训练好的模型保存在 `models\u002F` 默认目录或配置文件中指定的目录中。\n\n要使用训练好的模型生成文本，请运行：\n```bash\npython scripts\u002Fgenerate_text.py --model_path models\u002Fyour_model.pth --input_text hi\n```\n\n该脚本支持以下参数：\n- `--model_path`: 训练好的模型路径。\n- `--input_text`: 用于生成新文本的初始文本提示。\n- `--max_new_tokens`: 要生成的最大标记（Token）数量（默认为 100）。\n\n它将根据输入提示使用训练好的模型生成文本。\n\n## 逐步代码解析\n\n本节适用于希望详细了解代码的读者。我将从导入库开始，逐步解释代码，直至训练模型和生成文本。\n\n此前，我在 Medium 上写过一篇关于使用 Tiny Shakespeare 数据集创建[230+ 万参数](https:\u002F\u002Flevelup.gitconnected.com\u002Fbuilding-a-million-parameter-llm-from-scratch-using-python-f612398f06c2) LLM 的文章，但输出结果没有意义。以下是示例输出：\n\n```bash\n# 230 万参数 LLM 输出\nZELBETH:\nSey solmenter! tis tonguerered if\nVurint as steolated have loven OID the queend refore\nAre been, good plmp:\n\nProforne, wiftes swleen, was no blunderesd a a quain beath!\nTybell is my gateer stalk smend as be matious dazest\n```\n\n我想到，如果让 Transformer 架构更小、更简单，并使用更多样化的训练数据，那么单个人使用性能接近淘汰的 GPU 能创建多大参数规模的模型，使其能正确使用语法并生成有意义的文本？\n\n我发现 **1300+ 万参数** 的模型已足以在正确语法和标点方面开始产生意义，这是一个积极的信号。这意味着我们可以使用非常特定的数据集进一步微调先前训练的模型，以适应特定任务。最终可能会得到一个低于 10 亿参数甚至约 5 亿参数的模型，该模型非常适合我们的特定用例，尤其是安全地在私有数据上运行。\n\n我建议你**首先使用 GitHub 仓库中的脚本训练一个 1300+ 万参数**的模型。你将在一天内获得结果，而不是等待更长时间，或者避免因本地 GPU 性能不足而无法训练十亿参数模型的情况。\n\n### 导入库\n\n让我们导入本教程中将使用的必要库：\n\n```python\n# PyTorch 用于深度学习函数和张量（Tensor）\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# 数值运算和数组处理\nimport numpy as np\n\n# 处理 HDF5 文件\nimport h5py\n\n# 操作系统和文件管理\nimport os\n\n# 命令行参数解析\nimport argparse\n\n# HTTP 请求和交互\nimport requests\n\n# 循环进度条\nfrom tqdm import tqdm\n\n# JSON 处理\nimport json\n\n# Zstandard 压缩库\nimport zstandard as zstd\n\n# 大型语言模型的分词（Tokenization）库\nimport tiktoken\n\n# 数学运算（用于高级数学函数）\nimport math\n```\n\n### 准备训练数据\n\n我们的训练数据集需要多样化，包含来自不同领域的信息，而 The Pile 是合适的选择。虽然其大小为 825 GB，但我们仅使用其中一小部分，即 5%–10%。首先下载数据集并了解其工作原理。我将下载 [HuggingFace](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted) 上提供的版本。\n\n```python\n# 下载验证数据集\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Fval.jsonl.zst\n\n# 下载训练数据集的第一部分\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F00.jsonl.zst\n\n# 下载训练数据集的第二部分\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F01.jsonl.zst\n\n# 下载训练数据集的第三部分\n!wget https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fmonology\u002Fpile-uncopyrighted\u002Fresolve\u002Fmain\u002Ftrain\u002F02.jsonl.zst\n```\n\n下载需要一些时间，但你也可以将训练数据集限制为仅一个文件 `00.jsonl.zst`，而不是三个。它已经划分为 train\u002Fval\u002Ftest。完成后，请确保将文件正确放置在相应目录中。\n\n```python\nimport os\nimport shutil\nimport glob\n\n# 定义目录结构\ntrain_dir = \"data\u002Ftrain\"\nval_dir = \"data\u002Fval\"\n\n# 如果目录不存在则创建\nos.makedirs(train_dir, exist_ok=True)\nos.makedirs(val_dir, exist_ok=True)\n```\n\n# 将所有训练文件（例如 00.jsonl.zst, 01.jsonl.zst, ...）\ntrain_files = glob.glob(\"*.jsonl.zst\")\nfor file in train_files:\n    if file.startswith(\"val\"):\n        # 移动验证文件\n        dest = os.path.join(val_dir, file)\n    else:\n        # 移动训练文件\n        dest = os.path.join(train_dir, file)\n    shutil.move(file, dest)\n\n我们的数据集采用 .jsonl.zst 
格式（一种常用于存储大型数据集的压缩文件格式，结合了 JSON Lines 格式（.jsonl，每行代表一个有效的 JSON 对象）和 Zstandard 压缩（.zst））。下面我们读取一个下载文件的样例查看其结构。\n\nin_file = \"data\u002Fval\u002Fval.jsonl.zst\"  # 验证文件路径\n\nwith zstd.open(in_file, 'r') as in_f:\n    for i, line in tqdm(enumerate(in_f)):  # 读取前 5 行\n        data = json.loads(line)\n        print(f\"行 {i}: {data}\")  # 打印原始数据用于检查\n        if i == 2:\n            break\n```\n\n上述代码的输出如下：\n\n```python\n#### OUTPUT ####\n行: 0 \n{\n  \"text\": \"Effect of sleep quality ... epilepsy.\",\n  \"meta\": {\n    \"pile_set_name\": \"PubMed Abstracts\"\n  }\n}\n\n行: 1\n{\n  \"text\": \"LLMops a new GitHub Repository ...\",\n  \"meta\": {\n    \"pile_set_name\": \"Github\"\n  }\n}\n```\n\n现在我们需要对数据集进行编码（分词，tokenize）。目标是训练一个能正确输出单词的基础大语言模型（LLM），因此需要使用现成的分词器。我们将采用 OpenAI 开源的 tiktoken 分词器（tokenizer），具体使用 r50k_base 分词方案（ChatGPT\u002FGPT-3 模型所用的分词器）来处理数据集。\n\n为避免重复代码（需同时处理训练集和验证集），我们创建如下函数：\n\n```python\ndef process_files(input_dir, output_file):\n    \"\"\"\n    处理指定输入目录中的所有 .zst 文件，并将编码后的 tokens 保存至 HDF5 文件。\n\n    参数:\n        input_dir (str): 包含输入 .zst 文件的目录。\n        output_file (str): 输出 HDF5 文件路径。\n    \"\"\"\n    with h5py.File(output_file, 'w') as out_f:\n        # 在 HDF5 文件中创建可扩展的数据集 'tokens'\n        dataset = out_f.create_dataset('tokens', (0,), maxshape=(None,), dtype='i')\n        start_index = 0\n\n        # 遍历输入目录中的所有 .zst 文件\n        for filename in sorted(os.listdir(input_dir)):\n            if filename.endswith(\".jsonl.zst\"):\n                in_file = os.path.join(input_dir, filename)\n                print(f\"处理中: {in_file}\")\n\n                # 读取 .zst 压缩文件\n                with zstd.open(in_file, 'r') as in_f:\n                    # 遍历压缩文件中的每一行\n                    for line in tqdm(in_f, desc=f\"处理 {filename}\"):\n                        # 将行内容解析为 JSON\n                        data = json.loads(line)\n\n                        # 在文本末尾添加结束标记并进行编码\n                        text = data['text'] + \"\u003C|endoftext|>\"\n                        encoded = enc.encode(text, allowed_special={'\u003C|endoftext|>'})\n                        encoded_len = len(encoded)\n\n                        # 计算新 tokens 的结束索引\n                        end_index = start_index + encoded_len\n\n                        # 扩展数据集大小并存储编码后的 tokens\n                        dataset.resize(dataset.shape[0] + encoded_len, axis=0)\n                        dataset[start_index:end_index] = encoded\n\n                        # 更新下一批 tokens 的起始索引\n                        start_index = end_index\n```\n\n关于此函数的两个关键点：\n\n 1. 我们将分词后的数据存储在 HDF5（分层数据格式）文件中，这为模型训练期间的快速数据访问提供了灵活性。\n\n 2. 
添加 `|\u003Cendoftext|>` 标记（结束文本标记）用于标识每个文本序列的结尾，向模型指示已到达有意义上下文的终点，有助于生成连贯的输出。\n\n现在我们可以直接对训练集和验证集进行编码：\n\n```python\n# 定义分词后数据的输出目录\nout_train_file = \"data\u002Ftrain\u002Fpile_train.h5\"\nout_val_file = \"data\u002Fval\u002Fpile_dev.h5\"\n\n# 加载 GPT-3\u002FGPT-2 模型的分词器\nenc = tiktoken.get_encoding('r50k_base')\n\n# 处理训练数据\nprocess_files(train_dir, out_train_file)\n\n# 处理验证数据\nprocess_files(val_dir, out_val_file)\n```\n\n查看分词后数据的样例：\n\n```python\n with h5py.File(out_val_file, 'r') as file:\n     # 访问 'tokens' 数据集\n     tokens_dataset = file['tokens']\n     \n     # 打印数据集的数据类型\n     print(f\"'tokens' 数据集的 dtype: {tokens_dataset.dtype}\")\n     \n     # 加载并打印数据集的前几个元素\n     print(\"前几个 'tokens' 数据集元素:\")\n     print(tokens_dataset[:10])  # 前 10 个 token\n```\n\n上述代码的输出如下：\n\n```python\n#### OUTPUT ####\n'tokens' 数据集的 dtype: int32\n\n前几个 'tokens' 数据集元素:\n[ 2725  6557    83 23105   157   119   229    77  5846  2429]\n```\n我们已完成训练数据集的准备工作。接下来将实现 Transformer 架构并深入探讨其理论原理。\n\n### Transformer 概览\n\n让我们快速了解 Transformer 架构如何处理和理解文本。它通过将文本拆分为更小的单元（称为 tokens（标记））并预测序列中的下一个标记来实现。Transformer 由多层堆叠而成，这些层称为 transformer blocks（Transformer 块），顶部还有一个最终层用于预测。\n\n每个 Transformer 块包含两个主要组件：\n\n* **自注意力头（Self-Attention Heads）**：这些组件确定输入中哪些部分对模型最重要。例如，在处理句子时，注意力头可以突出显示词语之间的关系，比如代词与其所指代的名词之间的关联。\n\n* **MLP（多层感知机，Multi-Layer Perceptron）**：这是一个简单的前馈神经网络。它接收注意力头强调的信息并进行进一步处理。MLP 包含一个接收注意力头数据的输入层、一个增加处理复杂度的隐藏层，以及一个将结果传递给下一个 Transformer 块的输出层。\n\n自注意力头共同构成“思考内容”部分，而 MLP 则是“思考方式”部分。堆叠多个 Transformer 块使模型能够理解文本中的复杂模式和关系，但这并非总是能保证。\n\n与其查看原始论文中的图示，不如可视化一个更简单易懂的架构图，我们将基于此进行编码。\n\n![Transformer 架构 by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F11808\u002F1*QXmeA-H52C-p82AwawslbQ.png)\n\n让我们阅读即将编码的架构流程：\n\n 1. 输入标记（tokens）被转换为嵌入向量（embeddings）并与位置信息结合。\n\n 2. 模型包含 64 个相同的 Transformer 块，按顺序处理数据。\n\n 3. 每个块首先运行多头注意力（multi-head attention）以分析标记间的关系。\n\n 4. 每个块随后通过 MLP 处理数据，该 MLP 先扩展再压缩数据。\n\n 5. 每一步使用残差连接（residual connections）帮助信息流动。\n\n 6. 全程使用层归一化（layer normalization）稳定训练过程。\n\n 7. 注意力机制计算哪些标记应相互关注。\n\n 8. MLP 将数据扩展至 4 倍大小，应用 ReLU 激活函数，再压缩回原尺寸。\n\n 9. 模型使用 16 个注意力头捕获不同类型的关联。\n\n 10. 最终层将处理后的数据转换为词汇表大小的预测结果。\n\n 11. 
模型通过重复预测下一个最可能的标记生成文本。\n\n### 多层感知机（Multi Layer Perceptron, MLP）\n\nMLP 是 Transformer 前馈网络中的基础构建模块。其作用是引入非线性并学习嵌入表示中的复杂关系。定义 MLP 模块时，关键参数是 n_embed（嵌入维度），它定义了输入嵌入的维度。\n\nMLP 通常包含一个将输入维度扩展若干倍（通常为 4 倍，我们也将采用此比例）的隐藏线性层，后接非线性激活函数（常用 ReLU）。此结构使网络能够学习更复杂的特征。最后，投影线性层将扩展后的表示映射回原始嵌入维度。这种变换序列使 MLP 能够优化注意力机制学习到的表示。\n\n![MLP by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F4866\u002F1*GXxiLMW4kUXqOEimBA7g0A.png)\n\n```python\n# --- MLP (Multi-Layer Perceptron) Class ---\n\nclass MLP(nn.Module):\n    \"\"\"\n    A simple Multi-Layer Perceptron with one hidden layer.\n\n    This module is used within the Transformer block for feed-forward processing.\n    It expands the input embedding size, applies a ReLU activation, and then projects it back\n    to the original embedding size.\n    \"\"\"\n    def __init__(self, n_embed):\n        super().__init__()\n        self.hidden = nn.Linear(n_embed, 4 * n_embed)  # Linear layer to expand embedding size\n        self.relu = nn.ReLU()                        # ReLU activation function\n        self.proj = nn.Linear(4 * n_embed, n_embed)  # Linear layer to project back to original size\n\n    def forward(self, x):\n        \"\"\"\n        Forward pass through the MLP.\n\n        Args:\n            x (torch.Tensor): Input tensor of shape (B, T, C), where B is batch size,\n                              T is sequence length, and C is embedding size.\n\n        Returns:\n            torch.Tensor: Output tensor of the same shape as the input.\n        \"\"\"\n        x = self.forward_embedding(x)\n        x = self.project_embedding(x)\n        return x\n\n    def forward_embedding(self, x):\n        \"\"\"\n        Applies the hidden linear layer followed by ReLU activation.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            torch.Tensor: Output after the hidden layer and ReLU.\n        \"\"\"\n        x = self.relu(self.hidden(x))\n        return x\n\n    def project_embedding(self, x):\n        \"\"\"\n        Applies the projection linear layer.\n\n        Args:\n            x (torch.Tensor): Input tensor.\n\n        Returns:\n            torch.Tensor: Output after the projection layer.\n        \"\"\"\n        x = self.proj(x)\n        return x\n```\n\n我们刚刚完成了 MLP 部分的编码。`__init__` 方法初始化了一个扩展输入嵌入尺寸（n_embed）的隐藏线性层和一个将其还原的投影层，并在隐藏层后应用 ReLU 激活函数。`forward` 方法定义了数据流经这些层的过程：通过 `forward_embedding` 应用隐藏层和 ReLU，通过 `project_embedding` 应用投影层。\n\n### 单头注意力（Single Head Attention）\n\n注意力头是模型的核心部分，其作用是聚焦于输入序列的相关部分。定义 Head 模块时，关键参数包括 head_size（头尺寸）、n_embed（嵌入维度）和 context_length（上下文长度）。head_size 参数决定键（key）、查询（query）和值（value）投影的维度，影响注意力机制的表示能力。\n\n输入嵌入维度 n_embed 定义了这些投影层的输入大小。context_length 用于创建因果掩码（causal mask），确保模型仅关注先前的标记。\n\n在 Head 内部，键、查询和值的线性层（nn.Linear）初始化时无偏置。一个大小为 context_length × context_length 的下三角矩阵（tril）被注册为缓冲区以实现因果掩码，防止注意力机制关注未来标记。\n\n![Single Head Attention by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F5470\u002F1*teNwEhicq9ebVURiMS8WkA.png)\n\n# --- 注意力头类（Attention Head Class）---\n\nclass Head(nn.Module):\n    \"\"\"\n    单个注意力头（Attention Head）。\n\n    该模块计算注意力分数并将其应用于值（values）。它包含键（key）、查询（query）和值（value）的投影层，\n    并使用因果掩码（causal masking）防止关注未来标记（tokens）。\n    \"\"\"\n    def __init__(self, head_size, n_embed, context_length):\n        super().__init__()\n        self.key = nn.Linear(n_embed, head_size, bias=False)   # 键投影\n        self.query = nn.Linear(n_embed, head_size, bias=False) # 查询投影\n        self.value = nn.Linear(n_embed, head_size, 
bias=False) # 值投影\n        # 用于因果掩码的下三角矩阵\n        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))\n\n    def forward(self, x):\n        \"\"\"\n        通过注意力头的前向传播。\n\n        参数:\n            x (torch.Tensor): 输入张量，形状为 (B, T, C)。\n\n        返回:\n            torch.Tensor: 应用注意力后的输出张量。\n        \"\"\"\n        B, T, C = x.shape\n        k = self.key(x)     # (B, T, head_size)\n        q = self.query(x)   # (B, T, head_size)\n        scale_factor = 1 \u002F math.sqrt(C)\n        # 计算注意力权重: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)\n        attn_weights = q @ k.transpose(-2, -1) * scale_factor\n        # 应用因果掩码\n        attn_weights = attn_weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n        attn_weights = F.softmax(attn_weights, dim=-1)\n        v = self.value(x)   # (B, T, head_size)\n        # 将注意力权重应用于值\n        out = attn_weights @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)\n        return out\n```\n\n我们的注意力头类的 `__init__` 方法初始化了键、查询和值的线性投影层，每个层将输入嵌入（n_embed）投影到 head_size 维度。基于 context_length 创建的下三角矩阵用于因果掩码（causal masking）。`forward` 方法通过缩放查询和键的点积计算注意力权重，应用因果掩码，使用 softmax 归一化权重，并计算值的加权和以生成注意力输出。\n\n### 多头注意力机制（Multi-Head Attention）\n\n为了捕获输入序列中的多样化关系，我们将使用多头注意力机制（Multi-Head Attention）的概念。`MultiHeadAttention` 模块管理多个并行运行的独立注意力头。\n\n关键参数是 `n_head`，它决定了并行注意力头的数量。输入嵌入维度（n_embed）和上下文长度（context_length）对于实例化各个注意力头也是必需的。每个头独立处理输入，将其投影到维度为 `n_embed \u002F\u002F n_head` 的低维子空间。通过使用多个头，模型可以同时关注输入的不同方面。\n\n![Multi Head Attention by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F6864\u002F1*fa-YjrZdtbpuCLp7An99dg.png)\n\n```python\n# --- 多头注意力类（Multi-Head Attention Class）---\n\nclass MultiHeadAttention(nn.Module):\n    \"\"\"\n    多头注意力模块。\n\n    该模块并行组合多个注意力头。每个头的输出沿最后一维拼接形成最终输出。\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length):\n        super().__init__()\n        self.heads = nn.ModuleList([Head(n_embed \u002F\u002F n_head, n_embed, context_length) for _ in range(n_head)])\n\n    def forward(self, x):\n        \"\"\"\n        通过多头注意力的前向传播。\n\n        参数:\n            x (torch.Tensor): 输入张量，形状为 (B, T, C)。\n\n        返回:\n            torch.Tensor: 拼接所有头输出后的张量。\n        \"\"\"\n        # 沿最后一维（C）拼接每个头的输出\n        x = torch.cat([h(x) for h in self.heads], dim=-1)\n        return x\n```\n\n现在我们已经定义了 `MultiHeadAttention` 类（它组合了多个注意力头），`__init__` 方法初始化了一个包含 `n_head` 个 `Head` 实例的列表，每个头的 `head_size` 为 `n_embed \u002F\u002F n_head`。`forward` 方法将每个注意力头应用于输入 `x`，并沿最后一维拼接它们的输出，合并每个头学到的信息。\n\n### Transformer 块（Transformer Block）\n\n要构建十亿参数模型，我们肯定需要深层架构。为此，我们需要编写 Transformer 块并堆叠它们。块的关键参数是 `n_head`、`n_embed` 和 `context_length`。每个块包含一个多头注意力层和一个前馈神经网络（MLP），并在每个组件前应用层归一化（layer normalization），在每个组件后添加残差连接（residual connections）。\n\n层归一化（layer normalization）由嵌入维度 `n_embed` 参数化，有助于稳定训练过程。多头注意力机制如前所述，需要 `n_head`、`n_embed` 和 `context_length`。MLP 同样使用嵌入维度 `n_embed`。这些组件协同工作以处理输入并学习复杂模式。\n\n![Transformer Block by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F6942\u002F1*uLWGajZc6StnQHfZjcb6eA.png)\n\n# --- Transformer块类（Transformer Block Class）---\n\nclass Block(nn.Module):\n    \"\"\"\n    单个Transformer块。\n\n    该块包含一个多头注意力层（multi-head attention layer）后接一个MLP（多层感知机，MLP），\n    并带有层归一化（layer normalization）和残差连接（residual connections）。\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length):\n        super().__init__()\n        self.ln1 = nn.LayerNorm(n_embed)\n        self.attn = MultiHeadAttention(n_head, n_embed, context_length)\n        self.ln2 = 
nn.LayerNorm(n_embed)\n        self.mlp = MLP(n_embed)\n\n    def forward(self, x):\n        \"\"\"\n        通过Transformer块的前向传播。\n\n        参数:\n            x (torch.Tensor): 输入张量。\n\n        返回:\n            torch.Tensor: 块处理后的输出张量。\n        \"\"\"\n        # 应用带残差连接的多头注意力\n        x = x + self.attn(self.ln1(x))\n        # 应用带残差连接的MLP\n        x = x + self.mlp(self.ln2(x))\n        return x\n\n    def forward_embedding(self, x):\n        \"\"\"\n        专注于嵌入和注意力部分的前向传播。\n\n        参数:\n            x (torch.Tensor): 输入张量。\n\n        返回:\n            tuple: 包含MLP嵌入后输出和残差的元组。\n        \"\"\"\n        res = x + self.attn(self.ln1(x))\n        x = self.mlp.forward_embedding(self.ln2(res))\n        return x, res\n```\n\n我们的Block类表示单个Transformer块。`__init__`方法初始化层归一化层（ln1, ln2）、一个多头注意力模块（MultiHeadAttention）和一个MLP模块，所有模块均由n_head（注意力头数）、n_embed（嵌入维度）和context_length（上下文长度）参数化。\n\n`forward`方法实现了块的前向传播：先应用层归一化和带残差连接的多头注意力，再应用另一层归一化和MLP（同样带残差连接）。`forward_embedding`方法提供了一种替代前向传播路径，专注于注意力和初始MLP嵌入阶段。\n\n### 最终模型\n\n到目前为止，我们已编写了Transformer模型的小型组件。接下来，我们将词元嵌入（token embeddings）和位置嵌入（position embeddings）与一系列Transformer块集成，以执行序列到序列任务。为此，我们需要定义几个关键参数：n_head（注意力头数）、n_embed（嵌入维度）、context_length（上下文长度）、vocab_size（词汇表大小）和N_BLOCKS（块数量）。\n\nvocab_size（词汇表大小）决定了词元嵌入层的规模，将每个词元映射为维度为n_embed的稠密向量。context_length（上下文长度）对位置嵌入层至关重要，该层编码输入序列中每个词元的位置信息，维度同样为n_embed。注意力头数（n_head）和块数量（N_BLOCKS）共同决定了网络的深度和复杂度。\n\n这些参数共同定义了Transformer模型的架构和容量，现在开始编码实现。\n\n![Transformer Class by [Fareed Khan](undefined)](https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F5418\u002F1*0XXd_R2EOhkKCQDfqUQg0w.png)\n\n```python\n# --- Transformer模型类（Transformer Model Class）---\n\nclass Transformer(nn.Module):\n    \"\"\"\n    主Transformer模型。\n\n    该类将词元嵌入（token embeddings）和位置嵌入（position embeddings）与一系列Transformer块结合，\n    并添加一个用于语言建模的最终线性层。\n    \"\"\"\n    def __init__(self, n_head, n_embed, context_length, vocab_size, N_BLOCKS):\n        super().__init__()\n        self.context_length = context_length\n        self.N_BLOCKS = N_BLOCKS\n        self.token_embed = nn.Embedding(vocab_size, n_embed)\n        self.position_embed = nn.Embedding(context_length, n_embed)\n        self.attn_blocks = nn.ModuleList([Block(n_head, n_embed, context_length) for _ in range(N_BLOCKS)])\n        self.layer_norm = nn.LayerNorm(n_embed)\n        self.lm_head = nn.Linear(n_embed, vocab_size)\n        self.register_buffer('pos_idxs', torch.arange(context_length))\n\n    def _pre_attn_pass(self, idx):\n        \"\"\"\n        合并词元嵌入和位置嵌入。\n\n        参数:\n            idx (torch.Tensor): 输入词元索引。\n\n        返回:\n            torch.Tensor: 词元嵌入与位置嵌入之和。\n        \"\"\"\n        B, T = idx.shape\n        tok_embedding = self.token_embed(idx)\n        pos_embedding = self.position_embed(self.pos_idxs[:T])\n        return tok_embedding + pos_embedding\n\n    def forward(self, idx, targets=None):\n        \"\"\"\n        通过Transformer的前向传播。\n\n        参数:\n            idx (torch.Tensor): 输入词元索引。\n            targets (torch.Tensor, 可选): 用于损失计算的目标词元索引。默认为None。\n\n        返回:\n            tuple: 逻辑值（logits）和损失（若提供目标）。\n        \"\"\"\n        x = self._pre_attn_pass(idx)\n        for block in self.attn_blocks:\n            x = block(x)\n        x = self.layer_norm(x)\n        logits = self.lm_head(x)\n        loss = None\n        if targets is not None:\n            B, T, C = logits.shape\n            flat_logits = logits.view(B * T, C)\n            targets = targets.view(B * T).long()\n            loss = F.cross_entropy(flat_logits, targets)\n        return logits, loss\n\n    
def forward_embedding(self, idx):\n        \"\"\"\n        专注于嵌入和注意力块的前向传播。\n\n        参数:\n            idx (torch.Tensor): 输入词元索引。\n\n        返回:\n            tuple: 注意力块后的输出和残差。\n        \"\"\"\n        x = self._pre_attn_pass(idx)\n        residual = x\n        for block in self.attn_blocks:\n            x, residual = block.forward_embedding(x)\n        return x, residual\n\n    def generate(self, idx, max_new_tokens):\n        \"\"\"\n        根据起始序列生成新词元。\n\n        参数:\n            idx (torch.Tensor): 初始词元索引序列。\n            max_new_tokens (int): 需生成的词元数量。\n\n        返回:\n            torch.Tensor: 扩展后的词元序列。\n        \"\"\"\n        for _ in range(max_new_tokens):\n            idx_cond = idx[:, -self.context_length:]\n            logits, _ = self(idx_cond)\n            logits = logits[:, -1, :]\n            probs = F.softmax(logits, dim=-1)\n            idx_next = torch.multinomial(probs, num_samples=1)\n            idx = torch.cat((idx, idx_next), dim=1)\n        return idx\n```\n\n我们的Transformer类`__init__`方法初始化了词元嵌入层（token_embed）、位置嵌入层（position_embed）、一系列Block模块（attn_blocks）、最终层归一化层（layer_norm）以及用于语言建模的线性层（lm_head）。\n\n`_pre_attn_pass`方法合并词元嵌入和位置嵌入。`forward`方法通过嵌入层和Transformer块序列处理输入序列，应用最终层归一化并生成逻辑值；若提供目标，则计算损失。`forward_embedding`方法提供中间前向传播路径（至注意力块输出），`generate`方法实现词元生成逻辑。\n\n### 批量处理\n\n当我们使用大数据训练深度学习模型时，由于GPU资源限制，我们需要以批量（batch）方式处理数据。因此，让我们创建一个get_batch_iterator函数，该函数接收指向HDF5文件的data_path、所需的batch_size、每个序列的context_length（上下文长度），以及用于加载数据的device（设备）。\n\nbatch_size（批量大小）决定了训练过程中并行处理的序列数量，而context_length（上下文长度）指定了每个输入序列的长度。data_path（数据路径）指向训练数据的位置。\n\n```python\n# --- Data Loading Utility --- \n\ndef get_batch_iterator(data_path, batch_size, context_length, device=\"gpu\"):\n    \"\"\"\n    Creates an iterator for generating batches of data from an HDF5 file.\n\n    Args:\n        data_path (str): Path to the HDF5 file containing tokenized data.\n        batch_size (int): Number of sequences in each batch.\n        context_length (int): Length of each sequence.\n        device (str, optional): Device to load the data onto ('cpu' or 'cuda'). 
Defaults to \"cpu\".\n\n    Yields:\n        tuple: A tuple containing input sequences (xb) and target sequences (yb).\n    \"\"\"\n    # Open the HDF5 file in read mode\n    with h5py.File(data_path, 'r') as hdf5_file:\n        \n        # Extract the dataset of tokenized sequences\n        dataset = hdf5_file['tokens']\n        \n        # Get the total size of the dataset\n        dataset_size = dataset.shape[0]\n        \n        # Calculate the number of examples (sequences) that can be made from the data\n        n_examples = (dataset_size - 1) \u002F\u002F context_length\n        \n        # Create an array of indices for examples and shuffle them for randomness\n        example_idxs = np.arange(n_examples)\n        np.random.shuffle(example_idxs)\n        \n        # Initialize epoch counter and example counter\n        epochs = 0\n        counter = 0\n        \n        while True:\n            # Check if the current batch exceeds the number of available examples\n            if counter + batch_size > n_examples:\n                # Shuffle the indices again and reset the counter to 0\n                np.random.shuffle(example_idxs)\n                counter = 0\n                print(f\"Finished epoch {epochs}\")  # Print epoch number when an epoch finishes\n                epochs += 1  # Increment the epoch counter\n            \n            # Select a batch of random indices to generate sequences\n            random_indices = example_idxs[counter:counter+batch_size] * context_length\n            \n            # Retrieve sequences from the dataset based on the random indices\n            random_samples = torch.tensor(np.array([dataset[idx:idx+context_length+1] for idx in random_indices]))\n            \n            # Separate the input sequences (xb) and target sequences (yb)\n            xb = random_samples[:, :context_length].to(device)  # Input sequence (first half of the random sample)\n            yb = random_samples[:, 1:context_length+1].to(device)  # Target sequence (second half of the random sample)\n            \n            # Increment the counter to move to the next batch\n            counter += batch_size\n            \n            # Yield the input and target sequences as a tuple for the current batch\n            yield xb, yb\n```\n\n我们的get_batch_iterator函数负责加载和批处理训练数据。它接收data_path、batch_size、context_length和device作为输入。该函数打开HDF5文件，打乱数据，然后进入一个无限循环以生成批次。在每次迭代中，它选择数据的一个随机子集来形成输入序列（xb）及其对应的目标序列（yb）。\n\n### 训练参数\n\n现在我们已经编写了模型，需要定义训练参数，例如注意力头（heads）的数量、块（blocks）的数量等，以及数据路径。\n\n```python\n# --- Configuration ---\n\n# Define vocabulary size and transformer configuration\nVOCAB_SIZE = 50304          # Number of unique tokens in the vocabulary\nCONTEXT_LENGTH = 512        # Maximum sequence length for the model\nN_EMBED = 2048              # Dimension of the embedding space\nN_HEAD = 16                 # Number of attention heads in each transformer block\nN_BLOCKS = 64               # Number of transformer blocks in the model\n\n# Paths to training and development datasets\nTRAIN_PATH = \"data\u002Ftrain\u002Fpile_val.h5\"  # File path for the training dataset\nDEV_PATH = \"data\u002Fval\u002Fpile_val.h5\"      # File path for the validation dataset\n\n# Transformer training parameters\nT_BATCH_SIZE = 32          # Number of samples per training batch\nT_CONTEXT_LENGTH = 16      # Context length for training batches\nT_TRAIN_STEPS = 200000     # Total number of training steps\nT_EVAL_STEPS = 1000        # Frequency (in steps) to perform evaluation\nT_EVAL_ITERS = 250         
# Number of iterations to evaluate the model\nT_LR_DECAY_STEP = 50000    # Step at which to decay the learning rate\nT_LR = 5e-4                # Initial learning rate for training\nT_LR_DECAYED = 5e-5        # Learning rate after decay\nT_OUT_PATH = \"models\u002Ftransformer_B.pt\"  # Path to save the trained model\n\n# Device configuration\nDEVICE = 'cuda'\n\n# Store all configurations in a dictionary for easy access and modification\ndefault_config = {\n    'vocab_size': VOCAB_SIZE,\n    'context_length': CONTEXT_LENGTH,\n    'n_embed': N_EMBED,\n    'n_head': N_HEAD,\n    'n_blocks': N_BLOCKS,\n    'train_path': TRAIN_PATH,\n    'dev_path': DEV_PATH,\n    't_batch_size': T_BATCH_SIZE,\n    't_context_length': T_CONTEXT_LENGTH,\n    't_train_steps': T_TRAIN_STEPS,\n    't_eval_steps': T_EVAL_STEPS,\n    't_eval_iters': T_EVAL_ITERS,\n    't_lr_decay_step': T_LR_DECAY_STEP,\n    't_lr': T_LR,\n    't_lr_decayed': T_LR_DECAYED,\n    't_out_path': T_OUT_PATH,\n    'device': DEVICE,\n}\n```\n\n对于大多数参数，我使用了最常见的值，并将它们存储在字典中以便于访问。这里的参数适用于一个十亿参数（billion-parameter）模型。如果你想训练一个百万参数（million-parameter）模型，可以减少主要参数，包括CONTEXT_LENGTH、N_EMBED、N_HEAD和N_BLOCKS。不过，你也可以在我的GitHub仓库中运行百万参数模型的脚本。\n\n### 训练模型\n\n让我们初始化我们的Transformer模型并检查其总参数数量。\n```python\n# --- Initialize the Model and Print Parameters --- \n\nmodel = Transformer(\n    n_head=config['n_head'],\n    n_embed=config['n_embed'],\n    context_length=config['context_length'],\n    vocab_size=config['vocab_size'],\n    N_BLOCKS=config['n_blocks']\n).to(config['device'])\n\n\n# Print the total number of parameters\ntotal_params = sum(p.numel() for p in model.parameters())\nprint(f\"Total number of parameters in the model: {total_params:,}\")\n\n\n#### OUTPUT ####\n2,141,346,251\n```\n\n现在我们有了一个20亿参数的模型，需要定义Adam优化器（Adam optimizer）和损失跟踪函数（loss tracking function），这将帮助我们在整个训练过程中跟踪模型的进展。\n\n```python\n```\n\n# --- 优化器设置与损失跟踪 --- \n\n# 设置AdamW优化器（一种改进的Adam优化算法），使用指定学习率。\noptimizer = torch.optim.AdamW(model.parameters(), lr=config['t_lr'])\n\n# 用于跟踪训练过程中损失值的列表。\nlosses = []\n\n# 在训练循环中定义用于平均近期损失的窗口大小。\nAVG_WINDOW = 64\n\n# 辅助函数：估算训练和开发数据的平均损失。\n@torch.no_grad()\ndef estimate_loss(steps):\n    \"\"\"\n    在训练集和开发集上评估模型并计算平均损失。\n\n    Args:\n        steps (int): 评估的步数。\n\n    Returns:\n        dict: 包含'train'和'dev'数据集平均损失的字典。\n    \"\"\"\n    out = {}\n    model.eval()  # 将模型设置为评估模式。\n\n    for split in ['train', 'dev']:\n        # 为当前数据集选择适当的数据路径。\n        data_path = config['train_path'] if split == 'train' else config['dev_path']\n        \n        # 创建用于评估的批次迭代器。\n        batch_iterator_eval = get_batch_iterator(\n            data_path, config['t_batch_size'], config['t_context_length'], device=config['device']\n        )\n        \n        # 初始化张量以跟踪每次评估步骤的损失值。\n        losses_eval = torch.zeros(steps)\n        for k in range(steps):\n            try:\n                # 获取批次数据并计算损失。\n                xb, yb = next(batch_iterator_eval)\n                _, loss = model(xb, yb)\n                losses_eval[k] = loss.item()\n            except StopIteration:\n                # 处理数据迭代器提前结束的情况。\n                print(f\"警告：{split}的迭代器提前结束。\")\n                break\n        \n        # 计算当前数据集的平均损失。\n        out[split] = losses_eval[:k + 1].mean()\n    \n    model.train()  # 将模型恢复为训练模式。\n    return out\n```\n\n现在我们将初始化批次处理函数和训练循环，开始训练过程。\n\n```python\n# --- 训练循环 ---\n\n# 为训练数据创建批次迭代器。\nbatch_iterator = get_batch_iterator(\n  config['train_path'],\n  config['t_batch_size'],\n  config['t_context_length'],\n  device=config['device']\n)\n\n# 创建进度条以监控训练进度。\npbar = 
tqdm(range(config['t_train_steps']))\nfor step in pbar:\n  try:\n      # 获取输入和目标数据的批次。\n      xb, yb = next(batch_iterator)\n      \n      # 执行前向传播并计算损失。\n      _, loss = model(xb, yb)\n      \n      # 记录损失值用于跟踪。\n      losses.append(loss.item())\n      pbar.set_description(f\"训练损失：{np.mean(losses[-AVG_WINDOW:]):.4f}\")\n      \n      # 反向传播损失并更新模型参数。\n      optimizer.zero_grad(set_to_none=True)\n      loss.backward()\n      optimizer.step()\n\n      # 定期在训练集和开发集上评估模型。\n      if step % config['t_eval_steps'] == 0:\n          train_loss, dev_loss = estimate_loss(config['t_eval_iters']).values()\n          print(f\"步骤：{step}，训练损失：{train_loss:.4f}，开发损失：{dev_loss:.4f}\")\n\n      # 在指定步骤衰减学习率。\n      if step == config['t_lr_decay_step']:\n          print('衰减学习率')\n          for g in optimizer.param_groups:\n              g['lr'] = config['t_lr_decayed']\n  except StopIteration:\n      # 处理训练数据迭代器提前结束的情况。\n      print(\"训练数据迭代器提前结束。\")\n      break\n```\n### 保存训练好的模型\n\n由于我们的训练循环具备错误处理能力，若循环抛出任何错误，它将保存部分训练的模型以避免损失。训练完成后，我们可以保存训练好的模型用于后续推理。\n\n```python\n# --- 保存模型与最终评估 ---\n\n# 在训练集和开发集上对模型进行最终评估。\ntrain_loss, dev_loss = estimate_loss(200).values()\n\n# 确保模型保存路径唯一（若文件已存在）。\nmodified_model_out_path = config['t_out_path']\nsave_tries = 0\nwhile os.path.exists(modified_model_out_path):\n    save_tries += 1\n    model_out_name = os.path.splitext(config['t_out_path'])[0]\n    modified_model_out_path = model_out_name + f\"_{save_tries}\" + \".pt\"\n\n# 保存模型状态字典、优化器状态和训练元数据。\ntorch.save(\n    {\n        'model_state_dict': model.state_dict(),\n        'optimizer_state_dict': optimizer.state_dict(),\n        'losses': losses,\n        'train_loss': train_loss,\n        'dev_loss': dev_loss,\n        'steps': len(losses),\n    },\n    modified_model_out_path\n)\nprint(f\"模型已保存至 {modified_model_out_path}\")\nprint(f\"训练完成。训练损失：{train_loss:.4f}，开发损失：{dev_loss:.4f}\")\n```\n十亿参数模型的最终训练损失为0.2314，开发损失为0.643。\n\n### 训练损失分析\n\n当绘制百万参数和十亿参数模型的损失曲线时，它们呈现出显著差异。\n\n![Training Loss Comparison](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_ee28212aeba5.png)\n\n十亿参数模型初始损失较高，且初期波动剧烈。损失值起初快速下降，随后出现震荡，最终趋于平稳。这表明大型模型在训练初期更难找到合适的学习路径，可能需要更多数据和精细的参数设置。当学习率降低（红线处）时，损失值更平稳地下降，说明这有助于模型进行微调。\n\n百万参数模型的损失从开始就更平稳地下降，波动幅度远小于大型模型。学习率衰减对其曲线影响较小，可能是因为小型模型结构更简单，能更快找到优质解。这种显著差异揭示了训练超大规模模型的难度——它们需要不同的训练策略，可能还需要更长时间才能充分学习。\n\n现在我们已保存好模型，终于可以将其用于推理并观察文本生成效果了。 😓\n\n### 文本生成\n\n让我们创建一个函数，从保存的模型生成文本。该函数接收模型路径和编码器（encoder）作为输入，并返回生成的文本。\n\n```python\ndef generate_text(model_path, input_text, max_length=512, device=\"gpu\"):\n    \"\"\"\n    使用预训练模型基于给定输入文本生成文本。\n\n    参数:\n    - model_path (str): 模型检查点路径。\n    - device (torch.device): 模型加载设备（例如 'cpu' 或 'cuda'）。\n    - input_text (str): 用于启动生成的输入文本。\n    - max_length (int, 可选): 生成文本的最大长度。默认为 512。\n\n    返回:\n    - str: 生成的文本。\n    \"\"\"\n\n    # 加载模型检查点\n    checkpoint = torch.load(model_path)\n\n    # 初始化模型（需确保 Transformer 类已在其他地方定义）\n    model = Transformer().to(device)\n\n    # 加载模型的状态字典\n    model.load_state_dict(checkpoint['model_state_dict'])\n\n    # 加载 GPT 模型的分词器（对 GPT 模型使用 'r50k_base'）\n    enc = tiktoken.get_encoding('r50k_base')\n\n    # 对输入文本及文本结束符进行编码\n    input_ids = torch.tensor(\n        enc.encode(input_text, allowed_special={'\u003C|endoftext|>'}),\n        dtype=torch.long\n    )[None, :].to(device)  # 添加批次维度并移至指定设备\n\n    # 使用编码后的输入通过模型生成文本\n    with torch.no_grad():\n        # 生成最多 'max_length' 个 token 的文本\n        generated_output = model.generate(input_ids, max_length)\n\n        # 将生成的 token 解码回文本\n        
generated_text = enc.decode(generated_output[0].tolist())\n\n    return generated_text\n```\n\n此处需要调用先前定义的 Transformer（变换器）来加载架构，然后将保存的模型作为该架构的状态加载。\n\n首先观察百万参数和十亿参数模型在不提供任何输入时的生成结果，查看其随机生成内容。\n\n```python\n# 定义预训练模型的文件路径\nBillion_model_path = 'models\u002Ftransformer_B.pt'  # 十亿参数模型路径\nMillion_model_path = 'models\u002Ftransformer_M.pt'  # 百万参数模型路径\n\n# 使用 '\u003C|endoftext|>' 作为模型输入（作为允许模型自由生成文本的提示）\ninput_text = \"\u003C|endoftext|>\"\n\n# 调用函数使用十亿参数模型基于输入文本生成文本\nB_output = generate_text(Billion_model_path, input_text)\n\n# 调用函数使用百万参数模型基于输入文本生成文本\nM_output = generate_text(Million_model_path, input_text)\n\n# 打印两个模型生成的输出\nprint(B_output)  # 十亿参数模型输出\nprint(M_output)  # 百万参数模型输出\n```\n\n| **百万参数模型输出** | **十亿参数模型输出** |\n|------------------------------|------------------------------|\n| 1978 年，公园被归还给工厂，公众共享电子围栏的下部，该围栏源自车站所在城市。古代西方国家的运河局限于城市区域。村庄与中国城市直接相连，这些城市反抗美国预算，而 Odambinais 的情况不确定，农村地区已建立财富。 | 东海岸距 1037 年有两英里，7300 万难民（假设）与哈佛和克罗夫特的男性相同。在地中海东部和尼罗河西岸的喷气式飞机上。发现那里有许多政党、铁匠、音乐家和精品酒店业，激励着加拿大人已造成的压力，农村分支与联盟铁路持有者对抗 Abyssy。 |\n\n\n当上下文简短简单时，两个大语言模型（LLM）都能生成清晰准确的语句。例如，在百万参数模型输出中，**“村庄与中国城市直接相连”** 这一表述合理且传达了明确含义，易于理解且逻辑连贯。\n\n然而，当上下文变长变复杂时，清晰度开始下降。在十亿参数模型输出中，类似 **“东海岸距 1037 年有两英里，7300 万难民（假设）”** 和 **“铁匠、音乐家和精品酒店业激励着加拿大人已造成的压力”** 的句子变得难以理解。观点显得支离破碎，句式结构不自然。尽管用词可能正确，但整体含义变得混乱不清。\n\n积极的一面是，1300 万+ 参数的大语言模型也开始生成具有一定意义且拼写正确的内容。例如，当我使用主题输入文本时，它开始为我生成邮件。虽然显然更长的文本无法提供有意义的结果，但请看以下输出：\n\n```python\n# 输入文本\ninput_text = \"Subject: \"\n\n# 调用百万参数模型\nm_output = generate_text(Million_model_path, input_text)\n\nprint(m_output)  # 百万参数模型输出\n```\n| **百万参数大语言模型输出**                                                                 |\n|--------------------------------------------------------------------------------------------------|\n| Subject: ClickPaper-summary Study for Interview \u003Cbr>早上好，希望这条消息送达时您安好，阳光温柔地透过云层，... |\n\n我们的百万参数模型表明，我们可以拥有一个规模小于 10 亿、目标导向明确的窄域大语言模型；而我们的十亿参数训练模型则表明，架构必须经过深度编码并充分考虑。否则，与百万参数模型相比，它不会提升训练效果或性能，只会导致数据过拟合，除非为十亿级模型设计深度架构。\n\n# 后续步骤\n\n建议您先创建 1300 万+ 参数的模型，然后通过增加后续 100 个参数逐步扩展，提升其处理短上下文的能力。您可以根据特定任务需求决定训练更多参数的数量。接着，对于 10 亿以下的剩余参数，尝试在特定领域数据（如撰写邮件或论文）上微调模型，并观察其文本生成效果。\n\n\u003Chr>\n\n想交流？[我的 LinkedIn](https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Ffareed-khan-dev\u002F)\n\n## Star 历史\n\n[![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_readme_5e29c4daa969.png)](https:\u002F\u002Fstar-history.com\u002F#FareedKhan-dev\u002Ftrain-llm-from-scratch&Date)","# train-llm-from-scratch 快速上手指南\n\n## 环境准备\n\n- **系统要求**：\n  - Python 3.8+\n  - GPU（训练13M参数模型至少需要8GB显存，如RTX 3060\u002F3080；训练更大模型需参考README中的GPU对比表）\n  - 建议使用Linux系统（Windows\u002FMac也可运行，但可能需额外配置）\n\n- **前置知识**：\n  - 了解面向对象编程(OOP)基础\n  - 熟悉神经网络基本概念\n  - 掌握PyTorch框架使用\n\n- **推荐配置**（针对13M参数模型训练）：\n  - 显存：≥8GB\n  - 内存：≥16GB\n  - 硬盘空间：≥20GB（用于存储数据集和模型）\n\n## 安装步骤\n\n1. 克隆仓库：\n   ```bash\n   git clone https:\u002F\u002Fgithub.com\u002FFareedKhan-dev\u002Ftrain-llm-from-scratch.git\n   cd train-llm-from-scratch\n   ```\n\n2. 设置Python路径（避免导入问题）：\n   ```bash\n   export PYTHONPATH=\"$PYTHONPATH:.\"\n   ```\n\n3. 安装依赖（推荐使用国内镜像加速）：\n   ```bash\n   pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n   ```\n\n## 基本使用\n\n### 1. 下载训练数据\n```bash\npython scripts\u002Fdata_download.py --train_max 1\n```\n- `--train_max`：指定下载训练文件数量（每个约11GB，默认1，最大30）\n- 数据将保存在 `data\u002Ftrain\u002F` 目录\n\n### 2. 预处理数据\n```bash\npython scripts\u002Fdata_preprocess.py --max_data 1000\n```\n- `--max_data`：指定处理的数据量（默认1000条）\n- 处理后的HDF5文件将保存在 `data\u002Ftrain\u002Fpile_train.h5`\n\n### 3. 
配置模型参数\n编辑 `config\u002Fconfig.py`，设置13M参数模型配置：\n```python\nVOCAB_SIZE = 50304      # 词汇表大小\nCONTEXT_LENGTH = 128    # 序列长度\nN_EMBED = 128           # 嵌入维度\nN_HEAD = 8              # 注意力头数\nN_BLOCKS = 1            # Transformer块数\n```\n\n### 4. 开始训练\n```bash\npython scripts\u002Ftrain_transformer.py\n```\n\n### 5. 生成文本（训练完成后）\n```bash\npython scripts\u002Fgenerate_text.py --model_path models\u002Ftransformer_model.pth\n```\n\n> **提示**：首次使用建议先用小数据集测试（`--max_data 100`），确认环境配置正确后再进行完整训练。训练13M参数模型在RTX 3060上约需数小时。","某科技初创公司的数据科学家小李，正尝试为内部知识库训练一个1300万参数的语言模型，用于自动生成技术文档摘要，但缺乏高效训练工具。\n\n### 没有 train-llm-from-scratch 时\n- 手动下载Pile数据集需处理825GB原始数据，清洗JSON格式不一致的文本耗时3天，常因元数据错误中断流程。\n- 从头编写Transformer代码时频繁出现维度不匹配和注意力机制bug，仅调试单头注意力模块就花费2周。\n- 在Colab免费版GPU上训练总触发内存溢出，被迫反复缩减批次大小，导致训练进度停滞。\n- 损失曲线剧烈波动难以解读，需手动添加监控代码排查问题，模型收敛时间延长50%。\n- 生成的摘要逻辑混乱，如输出\"1978年公园返回工厂板\"等无意义片段，无法用于实际产品。\n\n### 使用 train-llm-from-scratch 后\n- 内置数据脚本自动下载Pile子集并标准化预处理，10分钟内完成数据准备，直接跳过清洗陷阱。\n- 提供论文级验证的Transformer实现，多头注意力和MLP模块开箱即用，首日即跑通训练流程。\n- 优化内存管理仅用Kaggle T4 GPU稳定训练，无需升级硬件，训练速度提升40%。\n- 集成实时损失可视化工具，快速定位收敛问题，超参数调优时间缩短至2天。\n- 生成的摘要连贯实用，例如\"Transformer通过自注意力机制高效处理长文本\"，可直接集成到文档系统。\n\ntrain-llm-from-scratch 让开发者用单个消费级GPU在一周内完成从数据到生成的全流程训练，大幅降低LLM定制门槛。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FFareedKhan-dev_train-llm-from-scratch_8f1db2d5.png","FareedKhan-dev","Fareed Khan","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FFareedKhan-dev_ce92a3fd.jpg","I do research on AI",null,"Karachi, Pakistan","whereismymind557@gmail.com","https:\u002F\u002Fgithub.com\u002FFareedKhan-dev",[84,88],{"name":85,"color":86,"percentage":87},"Jupyter Notebook","#DA5B0B",80,{"name":89,"color":90,"percentage":91},"Python","#3572A5",20,547,110,"2026-04-04T04:27:31","MIT","未说明","必须使用 NVIDIA GPU，训练 13M 参数模型需 8GB+ 显存（如 RTX 4060 Ti），训练 2B 参数模型需 24GB+ 显存（如 RTX 3090\u002F4090）",{"notes":99,"python":100,"dependencies":101},"训练必须使用 NVIDIA GPU；数据集单个文件约 11GB，建议预留 50GB+ 存储空间；需预先配置 CUDA 环境；训练 13M 模型需基础 PyTorch 知识","3.8+",[102,103,104],"torch","h5py","numpy",[13,26],[107,108,109,110,111,112],"gemini","large-language-models","llm","openai","training","transformers","2026-03-27T02:49:30.150509","2026-04-06T07:11:55.116423",[],[]]