[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-AlibabaResearch--AdvancedLiterateMachinery":3,"tool-AlibabaResearch--AdvancedLiterateMachinery":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",143909,2,"2026-04-07T11:33:18",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 
助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":76,"owner_twitter":76,"owner_website":77,"owner_url":78,"languages":79,"stars":108,"forks":109,"last_commit_at":110,"license":111,"difficulty_score":112,"env_os":75,"env_gpu":113,"env_ram":113,"env_deps":114,"category_tags":117,"github_topics":119,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":140,"updated_at":141,"faqs":142,"releases":173},5119,"AlibabaResearch\u002FAdvancedLiterateMachinery","AdvancedLiterateMachinery","A collection of original, innovative ideas and algorithms towards Advanced Literate Machinery. 
This project is maintained by the OCR Team in the Language Technology Lab, Tongyi Lab, Alibaba Group.","AdvancedLiterateMachinery 是阿里巴巴通义实验室“读光”团队开源的一系列前沿算法集合，旨在构建具备“阅读、思考与创造”能力的高级智能系统。该项目当前核心聚焦于让机器从图像和文档中高效“阅读”，解决传统 OCR 技术在复杂场景、多语言环境及文档深度理解方面的瓶颈，为未来超越人类水平的通用人工智能奠定基础。\n\n其技术亮点显著：新发布的 CC-OCR 基准测试专为评估大模型的识字能力设计，涵盖真实应用场景；Platypus 模型采用统一架构，能高精度识别各种形态的文本；SceneVTG 则利用多模态大模型与扩散模型，实现在自然场景中生成高质量的文字图像；WebRPG 更是创新地实现了根据 HTML 代码自动生成网页视觉呈现。\n\n这套工具非常适合 AI 研究人员、开发者以及需要处理复杂文档分析的企业用户。无论是希望探索多模态模型前沿技术的学者，还是寻求提升文档自动化处理能力的工程师，都能从中获得强大的算法支持与丰富的数据集资源，共同推动机器认知能力的边界。","# Advanced Literate Machinery\n\n## Introduction\n\nThe ultimate goal of our research is to build a system that has high-level intelligence, i.e., possessing the abilities to ***read, think and create***, so advanced that it could even surpass human intelligence one day in the future. We name this kind of systems **Advanced Literate Machinery (ALM)**.\n\nTo start with, we currently focus on teaching machines to ***read*** from images and documents. 
In years to come, we will explore the possibilities of endowing machines with the intellectual capabilities of ***thinking and creating***, catching up with and surpassing [GPT-4](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4) and [GPT-4V](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4v-system-card).\n\nThis project is maintained by the **读光 OCR Team** (读光-Du Guang means “*Reading The Light*”) in the [Tongyi Lab, Alibaba Group](https:\u002F\u002Ftongyi.aliyun.com\u002F).\n\n![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FAlibabaResearch_AdvancedLiterateMachinery_readme_96e7641ffc43.png)\n\nVisit our [读光-Du Guang Portal](https:\u002F\u002Fduguang.aliyun.com\u002F) and [DocMaster](https:\u002F\u002Fwww.modelscope.cn\u002Fstudios\u002Fdamo\u002FDocMaster\u002Fsummary) to experience online demos for OCR and Document Understanding.\n\n## Recent Updates\n**2024.12 Release**\n  - [**CC-OCR**](.\u002FBenchmarks\u002FCC-OCR\u002F) (*CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy*. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.02210)): The CC-OCR benchmark is specifically designed for evaluating the OCR-centric capabilities of Large Multimodal Models. CC-OCR possesses a diverse range of scenarios, tasks, and challenges, which comprises four OCR-centric tracks: multi-scene text reading, multilingual text reading, document parsing, and key information extraction. It includes 39 subsets with 7,058 fully annotated images, of which 41% are sourced from real applications, being released for the first time.\n\n**2024.9 Release**\n  - [**Platypus**](.\u002FOCR\u002FPlatypus\u002F) (*Platypus: A Generalized Specialist Model for Reading Text in Various Forms,* ECCV 2024. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.14805)): Platypus introduces a novel approach to text reading from images, addressing limitations of both specialist and generalist models. 
Platypus leverages **a single unified architecture** to effectively recognize text in **various forms**, maintaining high accuracy and efficiency. We also introduce a **new dataset [Worms](https:\u002F\u002Fwww.modelscope.cn\u002Fdatasets\u002Fyuekun\u002FWorms)** which combines and partially re-labels previous datasets to support the model's development and evaluation. \n\n  - [**SceneVTG**](.\u002FAIGC\u002FSceneVTG\u002F) (*Visual Text Generation in the Wild,* ECCV 2024. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14138)): We propose a visual text generator (termed SceneVTG), which can produce **high-quality text images in the wild**. Following a **two-stage paradigm**, SceneVTG leverages a Multimodal Large Language Model to recommend reasonable text regions and contents across multiple scales and levels, which are used by a conditional diffusion model as conditions to generate text images. To train SceneVTG, we also contribute a **new dataset [SceneVTG-Erase](https:\u002F\u002Fwww.modelscope.cn\u002Fdatasets\u002FKpillow\u002FSceneVTG-Erase)** with detailed OCR annotations. \n\n  - [**WebRPG**](.\u002FDocumentUnderstanding\u002FWebRPG) (*WebRPG: Automatic Web Rendering Parameters Generation for Visual Presentation,* ECCV 2024. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.15502)): We introduce WebRPG, a novel task that focuses on **automating the generation of visual presentations** for web pages based on HTML code. In the absence of a benchmark, we created a new dataset via an **automated pipeline**. Our proposed models, built on **VAE architecture** and **custom HTML embeddings**, efficiently manage numerous web elements and rendering parameters. 
Comprehensive experiments, including customized quantitative evaluations, demonstrate the effectiveness of WebRPG model in generating web presentations.\n\n  - [**ProcTag**](.\u002FDocumentUnderstanding\u002FProcTag\u002F) (*ProcTag: Process Tagging for Assessing the Efficacy of Document Instruction Data,* arXiv 2024. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.12358)): An effective evaluation method for document instruction data is crucial in constructing instruction data with high efficacy, which, in turn, facilitates the training of LLMs and MLLMs for document understanding. We propose ProcTag, a data-oriented method that **tags the execution process of instructions** rather than the text itself, enabling more effective assessment and selective sampling of document instructions.\n\n**2024.4 Release**\n  - [**OmniParser**](.\u002FOCR\u002FOmniParser\u002F) (*OmniParser: A Unified Framework for Text Spotting, Key Information Extraction and Table Recognition,* CVPR 2024. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.19128)): We propose a universal model for parsing visually-situated text across diverse scenarios, called OmniParser, which can simultaneously handle three typical visually-situated text parsing tasks: text spotting, key information extraction, and table recognition. In OmniParser, all tasks share the **unified encoder-decoder architecture**, the unified objective: **point-conditioned text generation**, and the unified input & output representation: **prompt & structured sequences**.\n\n**2024.3 Release**\n  - [**GEM**](.\u002FDocumentUnderstanding\u002FGEM\u002F) (*GEM: Gestalt Enhanced Markup Language Model for Web Understanding via Render Tree,* EMNLP 2023. [paper](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.375.pdf)): Web pages serve as crucial carriers for humans to acquire and perceive information. 
Inspired by the Gestalt psychological theory, we propose an innovative Gestalt Enhanced Markup Language Model (GEM for short) for **hosting heterogeneous visual information from render trees of web pages**, leading to excellent performances on tasks such as web question answering and web information extraction.\n\n**2023.9 Release**\n  - [**DocXChain**](.\u002FApplications\u002FDocXChain\u002F) (*DocXChain: A Powerful Open-Source Toolchain for Document Parsing and Beyond,* arXiv 2023. [report](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12430)): To **promote the level of digitization and structurization for documents**, we develop and release an open-source toolchain, called DocXChain, for precise and detailed document parsing. Currently, basic capabilities, including text detection, text recognition, table structure recognition, and layout analysis, are provided. Also, typical pipelines, i.e., general text reading, table parsing, and document structurization, are built to support more complicated applications related to documents. Most of the algorithmic models are from [ModelScope](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fmodelscope). Formula recognition (using models from [RapidLatexOCR](https:\u002F\u002Fgithub.com\u002FRapidAI\u002FRapidLatexOCR)) and whole PDF conversion (PDF to JSON format) are now supported.\n  - [**LISTER**](.\u002FOCR\u002FLISTER\u002F) (*LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition,* ICCV 2023. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12774v1)): We propose a method called Length-Insensitive Scene TExt Recognizer (LISTER), which remedies the limitation regarding the **robustness to various text lengths**. Specifically, a Neighbor Decoder is proposed to obtain accurate character attention maps with the assistance of a novel neighbor matrix regardless of the text lengths. 
Besides, a Feature Enhancement Module is devised to model the long-range dependency with low computation cost, which is able to perform iterations with the neighbor decoder to enhance the feature map progressively.\n  - [**VGT**](.\u002FDocumentUnderstanding\u002FVGT\u002F) (*Vision Grid Transformer for Document Layout Analysis,* ICCV 2023. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.14978)): To **fully leverage multi-modal information and exploit pre-training techniques to learn better representation** for document layout analysis (DLA), we present VGT, a two-stream Vision Grid Transformer, in which Grid Transformer (GiT) is proposed and pre-trained for 2D token-level and segment-level semantic understanding. In addition, a new benchmark for assessing document layout analysis algorithms, called [D^4LA](https:\u002F\u002Fmodelscope.cn\u002Fdatasets\u002Fdamo\u002FD4LA\u002Fsummary), is curated and released.\n  - [**VLPT-STD**](.\u002FOCR\u002FVLPT-STD\u002F) (*Vision-Language Pre-Training for Boosting Scene Text Detectors,* CVPR 2022. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.13867)): We adapt **vision-language joint learning for scene text detection**, a task that intrinsically involves cross-modal interaction between the two modalities: vision and language. The pre-trained model is able to produce more informative representations with richer semantics, which could readily benefit existing scene text detectors (such as EAST and DB) in the down-stream text detection task.\n\n**2023.6 Release**\n  - [**LiteWeightOCR**](.\u002FOCR\u002FLiteWeightOCR\u002F) (*Building A Mobile Text Recognizer via Truncated SVD-based Knowledge Distillation-Guided NAS,* BMVC 2023. 
[paper](https:\u002F\u002Fpapers.bmvc2023.org\u002F0375.pdf)): To make OCR models **deployable on mobile devices while keeping high accuracy**, we propose a light-weight text recognizer that integrates Truncated Singular Value Decomposition (TSVD)-based Knowledge Distillation (KD) into the Neural Architecture Search (NAS) process.\n\n**2023.4 Release**\n  - [**GeoLayoutLM**](.\u002FDocumentUnderstanding\u002FGeoLayoutLM\u002F) (*GeoLayoutLM: Geometric Pre-training for Visual Information Extraction,* CVPR 2023. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10759)): We propose a multi-modal framework, named GeoLayoutLM, for Visual Information Extraction (VIE). In contrast to previous methods for document pre-training, which usually learn geometric representation in an implicit way, GeoLayoutLM **explicitly models the geometric relations of entities in documents**.\n\n**2023.2 Release**\n  - [**LORE-TSR**](.\u002FDocumentUnderstanding\u002FLORE-TSR\u002F) (*LORE: Logical Location Regression Network for Table Structure Recognition,* AAAI 2022. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03730)): We model Table Structure Recognition (TSR) as a logical location regression problem and propose a new algorithm called LORE, standing for LOgical location REgression network, which for the first time **combines logical location regression together with spatial location regression** of table cells.\n\n**2022.9 Release**\n  - [**MGP-STR**](.\u002FOCR\u002FMGP-STR\u002F) (*Multi-Granularity Prediction for Scene Text Recognition,* ECCV 2022. 
[paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03592)): Based on [ViT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929) and a tailored Adaptive Addressing and Aggregation module, we explore an implicit way for incorporating linguistic knowledge by introducing subword representations to facilitate **multi-granularity** prediction and fusion in scene text recognition.\n  - [**LevOCR**](.\u002FOCR\u002FLevOCR\u002F) (*Levenshtein OCR,* ECCV 2022. [paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03594)): Inspired by [Levenshtein Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.11006), we cast the problem of scene text recognition as an iterative sequence refinement process, which allows for **parallel decoding, dynamic length change and good interpretability**.\n\n## Awards\n\nAnnouncement of the Scientific and Technological Progress Award\n![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FAlibabaResearch_AdvancedLiterateMachinery_readme_f0f9fa60d1c8.jpg)\n","# 高级识字机器\n\n## 简介\n\n我们研究的最终目标是构建一个具备高级智能的系统，即拥有***阅读、思考和创作***的能力，甚至在未来某一天超越人类智能。我们称这类系统为**高级识字机器（ALM）**。\n\n首先，我们目前专注于教会机器从图像和文档中***阅读***。在未来几年里，我们将探索赋予机器***思考和创作***智力能力的可能性，力争赶上并超越[GPT-4](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4)和[GPT-4V](https:\u002F\u002Fopenai.com\u002Fresearch\u002Fgpt-4v-system-card)。\n\n本项目由阿里巴巴集团通义实验室的**读光OCR团队**（读光-Du Guang意为“*阅读之光*”）维护。\n\n![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FAlibabaResearch_AdvancedLiterateMachinery_readme_96e7641ffc43.png)\n\n欢迎访问我们的[读光-Du Guang门户](https:\u002F\u002Fduguang.aliyun.com\u002F)和[DocMaster](https:\u002F\u002Fwww.modelscope.cn\u002Fstudios\u002Fdamo\u002FDocMaster\u002Fsummary)，体验OCR和文档理解的在线演示。\n\n## 最新动态\n**2024年12月发布**\n  - [**CC-OCR**](.\u002FBenchmarks\u002FCC-OCR\u002F) 
(*CC-OCR：评估大型多模态模型识字能力的全面且具有挑战性的OCR基准测试*。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.02210))：CC-OCR基准测试专门用于评估大型多模态模型以OCR为核心的各项能力。该基准涵盖了多样化的场景、任务和挑战，包含四个以OCR为中心的赛道：多场景文本识别、多语言文本识别、文档解析以及关键信息提取。它包含39个子集，共7,058张完整标注的图像，其中41%来源于实际应用，此次为首次公开发布。\n\n**2024年9月发布**\n  - [**Platypus**](.\u002FOCR\u002FPlatypus\u002F) (*鸭嘴兽：一种用于读取各种形式文本的通用型专家模型,* ECCV 2024。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.14805))：鸭嘴兽提出了一种全新的图像文本读取方法，解决了专家模型和通用模型各自的局限性。鸭嘴兽利用**单一统一架构**，能够有效识别**各种形式的文本**，同时保持高精度和高效性。我们还引入了一个**新数据集[Worms](https:\u002F\u002Fwww.modelscope.cn\u002Fdatasets\u002Fyuekun\u002FWorms)**，该数据集整合并部分重新标注了以往的数据集，以支持模型的开发与评估。\n\n  - [**SceneVTG**](.\u002FAIGC\u002FSceneVTG\u002F) (*野外环境下的视觉文本生成,* ECCV 2024。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.14138))：我们提出了一种视觉文本生成器（称为SceneVTG），能够生成**高质量的野外文本图像**。SceneVTG采用**两阶段范式**，利用多模态大语言模型在多个尺度和层级上推荐合理的文本区域和内容，这些内容作为条件输入到条件扩散模型中，用于生成文本图像。为了训练SceneVTG，我们还贡献了一个带有详细OCR标注的**新数据集[SceneVTG-Erase](https:\u002F\u002Fwww.modelscope.cn\u002Fdatasets\u002FKpillow\u002FSceneVTG-Erase)**。\n\n  - [**WebRPG**](.\u002FDocumentUnderstanding\u002FWebRPG) (*WebRPG：用于视觉呈现的网页渲染参数自动生成,* ECCV 2024。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.15502))：我们介绍了一项新任务WebRPG，其核心在于根据HTML代码**自动为网页生成视觉呈现方案**。由于缺乏相关基准，我们通过**自动化流水线**创建了一个新的数据集。我们提出的模型基于**VAE架构**和**定制的HTML嵌入**，能够高效地处理众多网页元素及渲染参数。全面的实验，包括定制的定量评估，证明了WebRPG模型在生成网页呈现方案方面的有效性。\n\n  - [**ProcTag**](.\u002FDocumentUnderstanding\u002FProcTag\u002F) (*ProcTag：用于评估文档指令数据有效性的流程标记法,* arXiv 2024。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.12358))：对于构建高效能的文档指令数据而言，有效的评估方法至关重要，这将有助于训练用于文档理解的LLM和MLLM。我们提出了ProcTag，这是一种面向数据的方法，它**标记指令的执行过程**而非文本本身，从而实现对文档指令更有效的评估和选择性采样。\n\n**2024年4月发布**\n  - [**OmniParser**](.\u002FOCR\u002FOmniParser\u002F) (*OmniParser：用于文本定位、关键信息提取和表格识别的统一框架,* CVPR 
2024。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.19128))：我们提出了一种适用于多种场景下视觉情境文本解析的通用模型，名为OmniParser，它可以同时处理三种典型的视觉情境文本解析任务：文本定位、关键信息提取和表格识别。在OmniParser中，所有任务共享**统一的编码器-解码器架构**、统一的目标——**点条件驱动的文本生成**，以及统一的输入与输出表示——**提示与结构化序列**。\n\n**2024年3月发布**\n  - [**GEM**](.\u002FDocumentUnderstanding\u002FGEM\u002F) (*GEM：基于渲染树的格式塔增强标记语言模型，用于网页理解,* EMNLP 2023。[论文](https:\u002F\u002Faclanthology.org\u002F2023.emnlp-main.375.pdf))：网页是人类获取和感知信息的重要载体。受格式塔心理学理论启发，我们提出了一种创新的格式塔增强标记语言模型（简称GEM），用于**承载来自网页渲染树的异构视觉信息**，从而在网页问答和信息抽取等任务中表现出色。\n\n**2023年9月发布**\n  - [**DocXChain**](.\u002FApplications\u002FDocXChain\u002F) (*DocXChain：一款功能强大的开源文档解析工具链及其扩展,* arXiv 2023。[报告](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.12430))：为 **提升文档的数字化与结构化水平**，我们开发并发布了名为 DocXChain 的开源工具链，用于精确细致的文档解析。目前，该工具链已提供文本检测、文本识别、表格结构识别和版面分析等基础能力。同时，还构建了通用文本阅读、表格解析和文档结构化等典型流程，以支持更复杂的文档相关应用。其中大部分算法模型来自 [ModelScope](https:\u002F\u002Fgithub.com\u002Fmodelscope\u002Fmodelscope)。此外，现已支持公式识别（使用 [RapidLatexOCR](https:\u002F\u002Fgithub.com\u002FRapidAI\u002FRapidLatexOCR) 中的模型）和 PDF 全文转换（PDF 转 JSON 格式）。\n  - [**LISTER**](.\u002FOCR\u002FLISTER\u002F) (*LISTER：面向长度无关场景文本识别的邻域解码器,* ICCV 2023。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.12774v1))：我们提出了一种称为长度无关场景文本识别器（LISTER）的方法，以解决 **对不同文本长度鲁棒性不足** 的问题。具体而言，我们设计了一种邻域解码器，在新型邻域矩阵的帮助下，无论文本长度如何，都能获得准确的字符注意力图。此外，还引入了一个特征增强模块，以较低的计算成本建模长距离依赖关系，并可与邻域解码器迭代配合，逐步增强特征图。\n  - [**VGT**](.\u002FDocumentUnderstanding\u002FVGT\u002F) (*用于文档版面分析的视觉网格 Transformer,* ICCV 2023。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.14978))：为了 **充分利用多模态信息并借助预训练技术学习更好的表示**，用于文档版面分析（DLA），我们提出了双流视觉网格 Transformer（VGT），其中引入了网格 Transformer（GiT），并针对 2D 级别和片段级别的语义理解进行了预训练。此外，我们还整理并发布了用于评估文档版面分析算法的新基准数据集，名为 [D^4LA](https:\u002F\u002Fmodelscope.cn\u002Fdatasets\u002Fdamo\u002FD4LA\u002Fsummary)。\n  - [**VLPT-STD**](.\u002FOCR\u002FVLPT-STD\u002F) (*用于提升场景文本检测器性能的视觉—语言预训练,* CVPR 2022。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2204.13867))：我们将 
**视觉—语言联合学习应用于场景文本检测任务**，该任务本质上涉及视觉与语言两种模态之间的跨模态交互。预训练后的模型能够生成更具语义丰富性的信息表示，从而直接提升下游文本检测任务中现有场景文本检测器（如 EAST 和 DB）的性能。\n\n**2023年6月发布**\n  - [**LiteWeightOCR**](.\u002FOCR\u002FLiteWeightOCR\u002F) (*基于截断奇异值分解知识蒸馏引导的神经架构搜索构建轻量级文本识别器,* BMVC 2023。[论文](https:\u002F\u002Fpapers.bmvc2023.org\u002F0375.pdf))：为使 OCR 模型 **在保持高精度的同时能够在移动设备上部署**，我们提出了一种轻量级文本识别器，将基于截断奇异值分解的知识蒸馏（KD）融入到神经架构搜索（NAS）过程中。\n\n**2023年4月发布**\n  - [**GeoLayoutLM**](.\u002FDocumentUnderstanding\u002FGeoLayoutLM\u002F) (*GeoLayoutLM：面向视觉信息提取的几何预训练,* CVPR 2023。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.10759))：我们提出了一种名为 GeoLayoutLM 的多模态框架，用于视觉信息提取（VIE）。与以往通常以隐式方式学习几何表示的文档预训练方法不同，GeoLayoutLM **显式地建模文档中各实体之间的几何关系**。\n\n**2023年2月发布**\n  - [**LORE-TSR**](.\u002FDocumentUnderstanding\u002FLORE-TSR\u002F) (*LORE：用于表格结构识别的逻辑位置回归网络,* AAAI 2022。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.03730))：我们将表格结构识别（TSR）建模为一个逻辑位置回归问题，并提出了一种名为 LORE 的新算法，即逻辑位置回归网络。这是首次 **将表格单元格的逻辑位置回归与空间位置回归相结合**。\n\n**2022年9月发布**\n  - [**MGP-STR**](.\u002FOCR\u002FMGP-STR\u002F) (*面向场景文本识别的多粒度预测,* ECCV 2022。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03592))：基于 [ViT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2010.11929) 和定制的自适应寻址与聚合模块，我们通过引入子词表示来探索一种隐式整合语言知识的方式，从而促进场景文本识别中的 **多粒度** 预测与融合。\n  - [**LevOCR**](.\u002FOCR\u002FLevOCR\u002F) (*莱文施泰因 OCR,* ECCV 2022。[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2209.03594))：受 [莱文施泰因 Transformer](https:\u002F\u002Farxiv.org\u002Fabs\u002F1905.11006) 启发，我们将场景文本识别问题转化为一个迭代式的序列精炼过程，该过程具有 **并行解码、动态长度调整以及良好的可解释性** 等优点。\n\n\n\n## 奖项\n\n科学技术进步奖公告\n![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FAlibabaResearch_AdvancedLiterateMachinery_readme_f0f9fa60d1c8.jpg)","# AdvancedLiterateMachinery 快速上手指南\n\nAdvancedLiterateMachinery (ALM) 是由阿里巴巴通义实验室“读光”团队维护的开源项目，旨在构建具备“阅读、思考、创造”能力的高阶智能系统。本项目涵盖了从 OCR 文字识别、文档理解到视觉文本生成的一系列前沿模型（如 OmniParser, Platypus, DocXChain 等）。\n\n由于该项目是一个包含多个独立子模型的研究集合，不同模型的环境依赖略有差异。以下指南以核心工具链 **DocXChain**（文档解析全能工具）及通用模型部署为例，提供快速上手流程。\n\n## 
环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 18.04\u002F20.04\u002F22.04) 或 macOS。Windows 用户建议使用 WSL2 或 Docker。\n*   **Python 版本**: Python 3.8 - 3.10\n*   **硬件要求**: \n    *   推理：建议配备 NVIDIA GPU (显存 >= 4GB)，支持 CUDA 11.7+。\n    *   部分轻量级模型（如 LiteWeightOCR）可在 CPU 上运行。\n*   **前置依赖**:\n    *   Git\n    *   pip (包管理工具)\n    *   (可选) conda\u002Fmamba (用于管理虚拟环境)\n\n## 安装步骤\n\n推荐使用 `conda` 创建独立的虚拟环境以避免依赖冲突。\n\n### 1. 克隆项目代码\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery.git\ncd AdvancedLiterateMachinery\n```\n\n### 2. 创建并激活虚拟环境\n```bash\nconda create -n alm python=3.9\nconda activate alm\n```\n\n### 3. 安装基础依赖\n根据您想要使用的具体模块，安装对应的依赖。以下是安装核心文档解析工具链 **DocXChain** 及通用 OCR 依赖的命令：\n\n```bash\n# 安装 PyTorch (请以官方最新指令为准，此处为示例)\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fdownload.pytorch.org\u002Fwhl\u002Fcu118\n\n# 安装项目核心依赖\npip install -r requirements.txt\n\n# 如果主要使用 DocXChain 进行文档结构化解析\ncd Applications\u002FDocXChain\npip install -r requirements.txt\n```\n\n> **提示**：国内开发者若遇到下载速度慢的问题，可使用清华或阿里镜像源加速：\n> `pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n\n### 4. 
下载预训练模型\n大部分模型权重托管在 **ModelScope (魔搭社区)**，国内访问速度极快。以 DocXChain 为例，可通过 Python 脚本自动下载：\n\n```python\nfrom modelscope import snapshot_download\n\n# 下载 DocXChain 相关模型\nmodel_dir = snapshot_download('damo\u002Fcv_docxchain_document-structure-recognition', cache_dir='.\u002Fmodels')\nprint(f\"Models downloaded to: {model_dir}\")\n```\n\n## 基本使用\n\n以下展示如何使用 **DocXChain** 对一份 PDF 或图片文档进行完整的结构化解析（包含文字检测、识别、表格还原及版面分析）。\n\n### 示例：解析本地文档\n\n创建一个名为 `demo_parse.py` 的文件，写入以下代码：\n\n```python\nimport cv2\nfrom docxchain.pipeline import DocumentParser\n\n# 初始化解析器\n# config_path 和 checkpoint_path 请根据实际下载的模型路径调整\nparser = DocumentParser(\n    config_path='configs\u002Fdocxchain_config.py',\n    checkpoint_path='models\u002Fdamo\u002Fcv_docxchain_document-structure-recognition\u002Fpytorch_model.pt'\n)\n\n# 输入文件路径 (支持 jpg, png, pdf)\ninput_file = 'example_document.pdf'\n\n# 执行解析\nresult = parser.predict(input_file)\n\n# 输出结果为 JSON 格式，包含文本块、表格结构、坐标等信息\nimport json\nprint(json.dumps(result, indent=2, ensure_ascii=False))\n\n# (可选) 保存可视化结果\nparser.visualize(input_file, result, save_path='output_visualization.jpg')\n```\n\n运行脚本：\n```bash\npython demo_parse.py\n```\n\n### 其他模型快速调用\n如果您想尝试最新的 **Platypus** (通用文字识别) 或 **OmniParser** (关键信息提取)，请进入对应子目录（如 `.\u002FOCR\u002FPlatypus`），参照该目录下的 `README.md` 或 `demo.py` 进行调用。所有模型均遵循统一的接口设计风格，便于集成。\n\n---\n*注：更多高级功能（如网页渲染参数生成 WebRPG、视觉文本生成 SceneVTG）请参考各子模块目录下的详细文档。*","某跨境电商运营团队每天需处理数千张来自全球不同国家的商品海报与发票，从中提取多语言文本并重构为可编辑的网页展示形式。\n\n### 没有 AdvancedLiterateMachinery 时\n- 面对倾斜、模糊或艺术字体的多国文字，传统 OCR 识别率极低，人工校对耗时占全流程的 60%。\n- 无法自动区分文档中的关键信息（如价格、日期）与普通背景文本，导致数据结构化整理困难。\n- 将纸质单据转换为线上可视化页面时，需设计师手动调整排版参数，重复劳动且容易出错。\n- 缺乏针对复杂场景的评估基准，难以量化现有模型的不足，技术迭代方向模糊。\n\n### 使用 AdvancedLiterateMachinery 后\n- 借助 Platypus 统一架构，轻松识别各类变形及多语言文本，识别准确率大幅提升，人工复核时间减少 80%。\n- 利用 CC-OCR 基准训练出的模型，能精准定位并提取关键字段，直接输出结构化 JSON 数据供系统调用。\n- 通过 WebRPG 自动解析 HTML 并生成视觉渲染参数，瞬间将原始单据转化为高保真网页预览，无需人工干预。\n- 基于 SceneVTG 生成高质量合成数据用于模型微调，显著增强了系统在极端光照和复杂背景下的鲁棒性。\n\nAdvancedLiterateMachinery 
将原本碎片化、高人力的文档处理流程，升级为具备“阅读、思考、创造”能力的自动化智能闭环。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FAlibabaResearch_AdvancedLiterateMachinery_6b341bd3.png","AlibabaResearch","Alibaba Research","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FAlibabaResearch_34dd7cff.png","",null,"opensource.alibaba.com","https:\u002F\u002Fgithub.com\u002FAlibabaResearch",[80,84,88,92,96,100,104],{"name":81,"color":82,"percentage":83},"C++","#f34b7d",53.6,{"name":85,"color":86,"percentage":87},"Python","#3572A5",36.7,{"name":89,"color":90,"percentage":91},"C","#555555",6.7,{"name":93,"color":94,"percentage":95},"Cuda","#3A4E3A",2.4,{"name":97,"color":98,"percentage":99},"Objective-C","#438eff",0.3,{"name":101,"color":102,"percentage":103},"Shell","#89e051",0.2,{"name":105,"color":106,"percentage":107},"Objective-C++","#6866fb",0.1,1825,198,"2026-04-02T06:59:22","Apache-2.0",4,"未说明",{"notes":115,"python":113,"dependencies":116},"README 内容主要介绍了 Advanced Literate Machinery (ALM) 项目的研究目标、多个子模型（如 CC-OCR, Platypus, SceneVTG, DocXChain 等）的功能及论文链接，但未提供具体的安装指南、运行环境配置、依赖库列表或硬件资源需求。部分子项目提及使用了 ModelScope 模型或 RapidLatexOCR，具体环境需求需参考各子项目的独立文档。",[],[14,15,35,118],"其他",[120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139],"artificial-intelligence","documentai","multimodal","multimodal-deep-learning","ocr","computer-vision","vision-language-transformer","end-to-end-ocr","scene-text-detection","scene-text-detection-recognition","scene-text-recognition","text-detection","text-recognition","vision-language","document","document-analysis","document-recognition","document-understanding","document-intelligence","vision-language-model","2026-03-27T02:49:30.150509","2026-04-08T01:08:04.807545",[143,148,153,158,163,168],{"id":144,"question_zh":145,"answer_zh":146,"source_url":147},23249,"如何对未见过的图像（非预处理数据集）使用 VGT 进行推理？","VGT 原始代码似乎需要每个图像都有对应的 `.pkl` 网格文件，这限制了其在非预训练数据集（如 DocBank, DocLaynet, PubLayNet）上的使用。解决方案是自行生成网格文件或直接修改推理逻辑。推荐方法是使用 `pdfplumber` 库读取 PDF 
页面，提取文本、边界框（bbox）和子词信息，构建包含 `input_ids`, `bbox_subword_list`, `texts`, `bbox_texts_list` 的字典，并将其保存为 `.pkl` 文件供模型使用。社区已提交相关 PR 支持机器可读 PDF 的直接转换，若需支持 OCR 版本可关注后续更新。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F75",{"id":149,"question_zh":150,"answer_zh":151,"source_url":152},23250,"如何在没有块（block）边界框的情况下运行 GeoLayoutLM 推理？","GeoLayoutLM 通常需要块边界框作为输入，但如果难以获取，可以尝试将每个单词边界框（word box）视为一个只包含单个单词的块边界框。虽然这种方法在训练数据不足时可能影响性能（因为缺少先验信息），但在某些情况下是可行的。此外，由于 GeoLayoutLM 内置了块内单词的 BIO 标记机制，理论上可以通过一些调整使其在不依赖块信息的情况下进行零样本或少样本推理，但这可能需要对代码进行特定修改。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F36",{"id":154,"question_zh":155,"answer_zh":156,"source_url":157},23251,"在使用 GeoLayoutLM 处理 CORD 数据集时遇到标签维度不匹配错误，如何解决？","该问题通常是因为模型最终的分类标签数量与公式 `2 * [class_names.txt 中的标签数（排除\"O\"）] + 1` 不匹配导致的。解决方法是将 CORD 数据集的格式转换为类似 FUNSD 的格式。关键步骤是在 `\u002Futils\u002F__init__.py` 第 51 行设置 `cfg.model.n_classes = 2 * 22 + 1`（假设 CORD 有 22 个非'O'标签），确保标签类别数量符合 BIO 分类的维度要求。如果需要具体的数据集转换规则代码，可以向社区成员索取或参考相关的开源实现。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F50",{"id":159,"question_zh":160,"answer_zh":161,"source_url":162},23252,"WTW 数据集中 logic_axis 字段的含义及其与原始 XML 标注中 startcol\u002Fstartrow 不一致的原因是什么？","在 WTW 数据集中，`logic_axis` 的四个值分别代表 `col_start, col_end, row_start, row_end`；而在其他数据集中，顺序可能是 `row_start, row_end, col_start, col_end`。例如，一个占据第 1 行、第 1-2 列的单元格，其 `logic_axis` 应为 `[0, 0, 0, 1]`（基于 0 索引）。这种差异源于不同数据集的定义方式。如果在重训练（retrain），应根据所使用的数据集权重选择对应的格式；如果是微调（finetune），则可以分别按照上述样式组织数据。目前官方仓库中可能缺乏将 Pubtabnet 或 Fintabnet 等格式直接转换为带 `logic_axis` 的 COCO 格式的脚本。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F17",{"id":164,"question_zh":165,"answer_zh":166,"source_url":167},23253,"运行 demo_imgs.py 脚本报错，如何修复以成功运行 LevOCR 预训练模型？","旧版本的 `demo_imgs.py` 存在代码兼容性问题。修复方法包括：1. 
在第 141 行修改为 `model.module.load_state_dict(torch.load(opt.saved_model, map_location=device), strict=False)`；2. 在第 150 行实例化 `AlignCollate_demo = AlignCollateTest(imgH=opt.imgH, imgW=opt.imgW)`；3. 在第 171 行将编码函数调用修正为 `converter.encode_levt(...)`（注意旧代码中可能错误地调用了不存在的 `encode_levt_tgt`）。维护者已更新该脚本，如果遇到类似 `converter.encode_levt_tgt` 不存在的错误，请拉取最新代码或手动应用上述修改。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F6",{"id":169,"question_zh":170,"answer_zh":171,"source_url":172},23254,"GeoLayoutLM 几何预训练任务中具体使用了哪些特征？","GeoLayoutLM 的几何预训练任务输入为文本段特征 `{Bi}`，这些特征可以是五种类型之一：`{Hi}`（高度）、`{Mvi}`（垂直中点）、`{Fvi}`（垂直位置特征）、`{Mt,b(i)}`（块内相对中点）、`{Fi,b,(i)}`（块内相对位置特征）。预训练任务包括方向建模和距离建模，通过公式 `P = Softmax(Linear[Bi, Bj])` 计算段落间的关系概率。这些特征帮助模型学习文档布局中的几何结构信息，从而提升下游任务表现。","https:\u002F\u002Fgithub.com\u002FAlibabaResearch\u002FAdvancedLiterateMachinery\u002Fissues\u002F68",[174,179,184,189,194,199,204,209,214,219],{"id":175,"version":176,"summary_zh":177,"released_at":178},136940,"v1.7.0-gem-model-release","发布GEM模型权重","2024-04-02T09:55:19",{"id":180,"version":181,"summary_zh":182,"released_at":183},136941,"v1.6.0-LaTeX-OCR-models","LaTeX-OCR 模型（来自 https:\u002F\u002Fgithub.com\u002FRapidAI\u002FRapidLatexOCR）","2023-11-22T06:47:13",{"id":185,"version":186,"summary_zh":187,"released_at":188},136942,"v1.4.0-LISTER-release","发布 LISTER（ICCV 2023）的模型权重","2023-09-19T08:57:00",{"id":190,"version":191,"summary_zh":192,"released_at":193},136943,"v1.3.0-VGT-release","发布VGT（ICCV 2023）的模型权重","2023-09-19T06:12:54",{"id":195,"version":196,"summary_zh":197,"released_at":198},136944,"v1.2.0-docX-release","发布 DocX_layout 模型权重。\n------------------------------------\n更新日期：2023年10月18日：\n用 DocXLayout_231012.pth 替换模型权重。","2023-09-04T08:27:11",{"id":200,"version":201,"summary_zh":202,"released_at":203},136945,"v1.1.0-geolayoutlm-model","添加 geolayoutlm 模型","2023-04-19T11:54:50",{"id":205,"version":206,"summary_zh":207,"released_at":208},136946,"v1.0.3-LevOCR-model","LevOCR 
的最终模型；预训练模型及训练所需材料。","2022-12-21T14:30:54",{"id":210,"version":211,"summary_zh":212,"released_at":213},136947,"V1.0.2-LevOCR-model","LevOCR 模型","2022-11-11T04:43:02",{"id":215,"version":216,"summary_zh":217,"released_at":218},136948,"V1.0.1-ECCV2022-model","MGP-STR模型","2022-09-30T04:09:07",{"id":220,"version":221,"summary_zh":222,"released_at":223},136949,"V1.0.0-ECCV2022","ECCV2022 关于 MGP-STR 和 LevOCR 的相关资料","2022-09-29T06:43:23"]