[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-lakesoul-io--LakeSoul":3,"tool-lakesoul-io--LakeSoul":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",158594,2,"2026-04-16T23:34:05",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",108322,"2026-04-10T11:39:34",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 
协议完全开源，是提升终端工作效率的理想助手。",100752,"2026-04-10T01:20:03",[52,13,15,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":32,"last_commit_at":59,"category_tags":60,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":64,"owner_avatar_url":73,"owner_bio":74,"owner_company":75,"owner_location":75,"owner_email":76,"owner_twitter":75,"owner_website":77,"owner_url":78,"languages":79,"stars":119,"forks":120,"last_commit_at":121,"license":122,"difficulty_score":123,"env_os":124,"env_gpu":124,"env_ram":124,"env_deps":125,"category_tags":135,"github_topics":136,"view_count":32,"oss_zip_url":75,"oss_zip_packed_at":75,"status":17,"created_at":155,"updated_at":156,"faqs":157,"releases":188},8268,"lakesoul-io\u002FLakeSoul","LakeSoul","LakeSoul is an end-to-end, realtime and cloud native Lakehouse framework with fast data ingestion, concurrent update and incremental data analytics on cloud storages for both BI and AI applications.","LakeSoul 是一款云原生的湖仓一体框架，旨在为商业智能（BI）和人工智能（AI）应用提供端到端的实时数据处理能力。它有效解决了传统数据架构中批量与流式处理割裂、云端存储难以高效支持并发更新及增量分析等痛点，帮助用户轻松构建现代化的实时数据仓库。\n\n这款工具特别适合大数据工程师、数据架构师及 AI 研究人员使用，尤其是那些需要在 HDFS 或 S3 等云存储上整合 Spark、Flink、Presto 甚至 PyTorch 等多种计算引擎的团队。LakeSoul 的核心亮点在于其独特的技术架构：它利用 Rust 
重写原生元数据与 IO 层以提升性能，采用类 LSM-Tree 结构支持基于主键的高吞吐并发 Upsert 操作，并借助 PostgreSQL 实现可扩展的元数据管理与严格的 ACID 事务控制。此外，它还具备自动 Schema 演进、多工作空间权限隔离以及自动化数据维护等特性，让开发者能够专注于业务逻辑，以更低的运维成本实现流批一体的数据处理与模型训练。","\u003C!--\nSPDX-FileCopyrightText: 2023 LakeSoul Contributors\n\nSPDX-License-Identifier: Apache-2.0\n-->\n\n\u003Cimg src='https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002Fartwork\u002Fblob\u002Fmain\u002Fhorizontal\u002Fcolor\u002FLakeSoul_Horizontal_Color.svg' alt=\"LakeSoul\" height='200'>\n\n\u003Cimg src='https:\u002F\u002Fgithub.com\u002Flfai\u002Fartwork\u002Fblob\u002Fmain\u002Flfaidata-assets\u002Flfaidata-project-badge\u002Fsandbox\u002Fcolor\u002Flfaidata-project-badge-sandbox-color.svg' alt=\"LF AI & Data Sandbox Project\" height='180'>\n\n![OpenSSF Best Practices](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flakesoul-io_LakeSoul_readme_e7ac80e96562.png)\n\n![Maven Test](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fmaven-test.yml\u002Fbadge.svg)\n![Flink CDC Test](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fflink-cdc-test.yml\u002Fbadge.svg)\n![Build](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fnative-build.yml\u002Fbadge.svg)\n\n[中文介绍](README-CN.md)\n\n**2025.09: LakeSoul has released newest version 3.0.0, check out our [release note](https:\u002F\u002Flakesoul-io.github.io\u002Fblog\u002F2025\u002F09\u002F05\u002Flakesoul-3.0.0-release)**\n\nLakeSoul is a cloud-native Lakehouse framework that supports scalable metadata management, ACID transactions, efficient and flexible upsert operation, schema evolution, and unified streaming & batch processing.\n\nLakeSoul supports multiple computing engines to read and write lake warehouse table data, including Spark, Flink, Presto, and PyTorch, and supports multiple computing modes such as batch, stream, MPP, and AI. 
LakeSoul supports storage systems such as HDFS and S3.\n\n![LakeSoul Arch](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flakesoul-io_LakeSoul_readme_c7cde2fcb22d.png)\n\nLakeSoul was originally created by DMetaSoul company and was donated to Linux Foundation AI & Data as a sandbox project since May 2023.\n\nLakeSoul implements incremental upserts for both row and column and allows concurrent updates.\n\nLakeSoul uses LSM-Tree like structure to support updates on hash partitioning table with primary key, and achieves very high write throughput while providing optimized merge on read performance (refer to [Performance Benchmarks](https:\u002F\u002Flakesoul-io.github.io\u002Fblog\u002F2023\u002F04\u002F21\u002Flakesoul-2.2.0-release)). LakeSoul scales metadata management and achieves ACID control by using PostgreSQL.\n\nLakeSoul uses Rust to implement the native metadata layer and IO layer, and provides C\u002FJava\u002FPython interfaces to support the connecting of multiple computing frameworks such as big data and AI.\n\nLakeSoul supports concurrent batch or streaming read and write. Both read and write supports CDC semantics, and together with auto schema evolution and exacly-once guarantee, constructing realtime data warehouses is made easy.\n\nLakeSoul supports multi-workspace and RBAC. LakeSoul uses Postgres's RBAC and row-level security policies to implement permission isolation for metadata. Together with Hadoop users and groups, physical data isolation can be achieved. 
LakeSoul's permission isolation is effective for SQL\u002FJava\u002FPython jobs.\n\nLakeSoul supports automatic disaggregated size-tiered multi-level compaction, automatic table life cycle maintenance, automatic data asset statistics, and automatic redundant data cleaning, reducing operation costs and improving usability.\n\nMore detailed features please refer to our doc page: [Documentations](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002Fintro)\n\n# Quick Start\nFollow the [Quick Start](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FGetting%20Started\u002Fsetup-local-env) to quickly set up a test env.\n\n# Tutorials\nPlease find tutorials in doc site:\n\n* Checkout [Examples of Python Data Processing and AI Model Training on LakeSoul](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Ftree\u002Fmain\u002Fpython\u002Fexamples) on how LakeSoul connecting AI to Lakehouse to build a unified and modern data infrastructure.\n* Checkout [LakeSoul Flink CDC Whole Database Synchronization Tutorial](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fflink-cdc-sink) on how to sync an entire MySQL database into LakeSoul in realtime, with auto table creation, auto DDL sync and exactly once guarantee.\n* Checkout [Flink SQL Usage](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fflink-lakesoul-connector) on using Flink SQL to read or write LakeSoul in both batch and streaming mode, with the supports of Flink Changelog Stream semantics and row-level upsert and delete.\n* Checkout [Multi Stream Merge and Build Wide Table Tutorial](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fmutil-stream-merge) on how to merge multiple stream with same primary key (and different other columns) concurrently without join.\n* Checkout [Upsert Data and Merge UDF Tutorial](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fupsert-and-merge-udf) on how to upsert data and Merge UDF to 
customize merge logic.\n* Checkout [Snapshot API Usage](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fsnapshot-manage) on how to do snapshot read (time travel), snapshot rollback and cleanup.\n* Checkout [Incremental Query Tutorial](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fincremental-query) on how to do incremental query in Spark in batch or stream mode.\n\n# Usage Documentations\nPlease find usage documentations in doc site:\n[Usage Doc](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fsetup-meta-env)\n\n[快速开始](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FGetting%20Started\u002Fsetup-local-env)\n\n[教程](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FTutorials\u002Fflink-cdc-sink)\n\n[使用文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fsetup-meta-env)\n\n# Feature Roadmap\n* Data Science and AI\n  - [x] Native Python Reader (without PySpark)\n  - [x] PyTorch Dataset and distributed training\n* Meta Management ([#23](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F23))\n  - [x] Multiple Level Partitioning: Multiple range partition and at most one hash partition\n  - [x] Concurrent write with auto conflict resolution\n  - [x] MVCC with read isolation\n  - [x] Write transaction (two-stage commit) through Postgres Transaction\n  - [x] Schema Evolution: Column add\u002Fdelete supported\n* Table operations \n  - [x] LSM-Tree style upsert for hash partitioned table\n  - [x] Merge on read for hash partition with upsert delta file\n  - [x] Copy on write update for non hash partitioned table\n  - [x] Automatic Disaggregated Compaction Service\n* Data Warehousing\n  - [x] CDC stream ingestion with auto ddl sync\n  - [x] Incremental and Snapshot Query\n    - [x] Snapshot Query ([#103](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F103))\n    - [x] Incremental 
Query ([#103](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F103))\n    - [x] Incremental Streaming Source ([#130](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F130))\n    - [x] Flink Stream\u002FBatch Source\n  - [x] Multi Workspaces and RBAC\n* Spark Integration\n  - [x] Table\u002FDataframe API\n  - [x] SQL support with catalog except upsert\n  - [x] Query optimization\n    - [x] Shuffle\u002FJoin elimination for operations on primary key\n  - [x] Merge UDF (Merge operator)\n  - [x] Merge Into SQL support\n    - [x] Merge Into SQL with match on Primary Key (Merge on read)\n* Flink Integration and CDC Ingestion ([#57](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F57))\n  - [x] Table API\n    - [x] Batch\u002FStream Sink\n    - [x] Batch\u002FStream source\n    - [x] Stream Source\u002FSink for ChangeLog Stream Semantics\n    - [x] Exactly Once Source and Sink\n  - [x] Flink CDC\n    - [x] Auto Schema Change (DDL) Sync\n    - [x] Auto Table Creation (depends on #78)\n    - [x] Support sink multiple source tables with different schemas ([#84](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F84))\n* Hive Integration\n  - [x] Export to Hive partition after compaction\n  - [x] Apache Kyuubi (Hive JDBC) Integration\n* Realtime Data Warehousing\n  - [x] CDC ingestion\n  - [x] Time Travel (Snapshot read)\n  - [x] Snapshot rollback\n  - [x] Automatic global compaction service\n  - [x] MPP Engine Integration (depends on [#66](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F66))\n    - [x] Presto\n    - [x] Compatibility with Presto Native Execution(with Velox)\n    - [x] Apache Doris\n* Cloud and Native IO ([#66](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F66))\n  - [x] Object storage IO optimization\n  - [x] Native vectorized merge on read\n  - [x] Multi-layer storage classes support with 
local-disk data cache\n\n# Community guidelines\n[Community guidelines](community-guideline.md)\n\n# Feedback and Contribution\nPlease feel free to open an issue or dicussion if you have any questions.\n\nJoin our [Discord](https:\u002F\u002Fdiscord.gg\u002FWJrHKq4BPf) server for discussions.\n\n# Contact Us\nEmail us at [lakesoul-technical-discuss@lists.lfaidata.foundation](mailto:lakesoul-technical-discuss@lists.lfaidata.foundation).\n\n# Opensource License\nLakeSoul is opensourced under Apache License v2.0.\n","\u003C!--\nSPDX-FileCopyrightText: 2023 LakeSoul Contributors\n\nSPDX-License-Identifier: Apache-2.0\n-->\n\n\u003Cimg src='https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002Fartwork\u002Fblob\u002Fmain\u002Fhorizontal\u002Fcolor\u002FLakeSoul_Horizontal_Color.svg' alt=\"LakeSoul\" height='200'>\n\n\u003Cimg src='https:\u002F\u002Fgithub.com\u002Flfai\u002Fartwork\u002Fblob\u002Fmain\u002Flfaidata-assets\u002Flfaidata-project-badge\u002Fsandbox\u002Fcolor\u002Flfaidata-project-badge-sandbox-color.svg' alt=\"LF AI & Data Sandbox Project\" height='180'>\n\n![OpenSSF 最佳实践](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flakesoul-io_LakeSoul_readme_e7ac80e96562.png)\n\n![Maven 测试](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fmaven-test.yml\u002Fbadge.svg)\n![Flink CDC 
测试](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fflink-cdc-test.yml\u002Fbadge.svg)\n![构建](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Factions\u002Fworkflows\u002Fnative-build.yml\u002Fbadge.svg)\n\n[中文介绍](README-CN.md)\n\n**2025年9月：LakeSoul发布了最新版本3.0.0，请查看我们的[发布说明](https:\u002F\u002Flakesoul-io.github.io\u002Fblog\u002F2025\u002F09\u002F05\u002Flakesoul-3.0.0-release)**\n\nLakeSoul是一个云原生的湖仓框架，支持可扩展的元数据管理、ACID事务、高效灵活的upsert操作、模式演化以及流批一体化处理。\n\nLakeSoul支持多种计算引擎读写湖仓表数据，包括Spark、Flink、Presto和PyTorch，并且支持批处理、流处理、MPP和AI等多种计算模式。LakeSoul还兼容HDFS和S3等存储系统。\n\n![LakeSoul架构](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flakesoul-io_LakeSoul_readme_c7cde2fcb22d.png)\n\nLakeSoul最初由DMetaSoul公司创建，自2023年5月起作为沙盒项目捐赠给Linux基金会AI与数据组织。\n\nLakeSoul实现了行级和列级的增量upsert操作，并支持并发更新。\n\nLakeSoul采用类似LSM树的结构来支持带有主键的哈希分区表上的更新，在提供优化的读时合并性能的同时，实现了极高的写入吞吐量（详情请参阅[性能基准测试](https:\u002F\u002Flakesoul-io.github.io\u002Fblog\u002F2023\u002F04\u002F21\u002Flakesoul-2.2.0-release)）。LakeSoul通过使用PostgreSQL实现元数据管理的扩展性和ACID控制。\n\nLakeSoul使用Rust语言实现了原生的元数据层和IO层，并提供了C\u002FJava\u002FPython接口，以支持大数据和AI等多种计算框架的连接。\n\nLakeSoul支持并发的批处理或流式读写。无论是读取还是写入都支持CDC语义，结合自动模式演化和精确一次保证，可以轻松构建实时数据仓库。\n\nLakeSoul支持多工作空间和RBAC权限管理。它利用PostgreSQL的RBAC和行级安全策略来实现元数据的权限隔离。结合Hadoop的用户和组机制，可以实现物理数据的隔离。LakeSoul的权限隔离对SQL、Java和Python作业均有效。\n\nLakeSoul支持自动的分层多级压缩、自动的表生命周期维护、自动的数据资产统计以及自动的冗余数据清理功能，从而降低运维成本并提升易用性。\n\n更多详细功能请参考我们的文档页面：[文档](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002Fintro)\n\n# 快速入门\n按照[快速入门](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FGetting%20Started\u002Fsetup-local-env)指南，您可以快速搭建一个测试环境。\n\n# 教程\n请在文档网站中查找相关教程：\n\n* 查看[Python数据处理与AI模型训练在LakeSoul上的示例](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Ftree\u002Fmain\u002Fpython\u002Fexamples)，了解如何将AI与湖仓结合，构建统一的现代化数据基础设施。\n* 查看[LakeSoul Flink 
CDC整库同步教程](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fflink-cdc-sink)，学习如何将整个MySQL数据库实时同步到LakeSoul，实现自动建表、自动DDL同步以及精确一次保证。\n* 查看[Flink SQL使用指南](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fflink-lakesoul-connector)，了解如何在批处理和流处理模式下使用Flink SQL读写LakeSoul，同时支持Flink Changelog Stream语义以及行级的upsert和删除操作。\n* 查看[多流合并与宽表构建教程](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fmutil-stream-merge)，学习如何在不进行Join的情况下，并发地合并具有相同主键（但其他列不同的）的多个流。\n* 查看[Upsert数据与Merge UDF教程](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fupsert-and-merge-udf)，了解如何使用upsert数据和Merge UDF来自定义合并逻辑。\n* 查看[快照API使用指南](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fsnapshot-manage)，学习如何进行快照读取（时间旅行）、快照回滚和清理。\n* 查看[增量查询教程](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FTutorials\u002Fincremental-query)，了解如何在Spark中以批处理或流式方式执行增量查询。\n\n# 使用文档\n请在文档网站中查找使用文档：\n[使用文档](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fsetup-meta-env)\n\n[快速开始](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FGetting%20Started\u002Fsetup-local-env)\n\n[教程](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FTutorials\u002Fflink-cdc-sink)\n\n[使用文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fsetup-meta-env)\n\n# 功能路线图\n* 数据科学与人工智能\n  - [x] 原生 Python 读取器（无需 PySpark）\n  - [x] PyTorch 数据集及分布式训练\n* 元数据管理 ([#23](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F23))\n  - [x] 多级分区：多个范围分区及最多一个哈希分区\n  - [x] 并发写入与自动冲突解决\n  - [x] MVCC 及读隔离\n  - [x] 通过 Postgres 事务实现的写事务（两阶段提交）\n  - [x] Schema Evolution：支持列的添加与删除\n* 表操作\n  - [x] 哈希分区表的 LSM-Tree 风格 upsert\n  - [x] 带 upsert 增量文件的哈希分区 Merge on read\n  - [x] 非哈希分区表的 Copy on write 更新\n  - [x] 自动去重合并服务\n* 数据仓库\n  - [x] CDC 流式摄取与自动 DDL 同步\n  - [x] 增量查询与快照查询\n    - [x] 快照查询 
([#103](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F103))\n    - [x] 增量查询 ([#103](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F103))\n    - [x] 增量流式源 ([#130](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F130))\n    - [x] Flink 流\u002F批源\n  - [x] 多工作空间与 RBAC\n* Spark 集成\n  - [x] Table\u002FDataframe API\n  - [x] 支持 SQL，但不包括 upsert 操作\n  - [x] 查询优化\n    - [x] 对主键操作的 Shuffle\u002FJoin 消除\n  - [x] Merge UDF（合并算子）\n  - [x] Merge Into SQL 支持\n    - [x] 基于主键匹配的 Merge Into SQL（Merge on read）\n* Flink 集成与 CDC 摄取 ([#57](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F57))\n  - [x] Table API\n    - [x] 批\u002F流 Sink\n    - [x] 批\u002F流 Source\n    - [x] ChangeLog 流语义的流 Source\u002FSink\n    - [x] 精确一次 Source 和 Sink\n  - [x] Flink CDC\n    - [x] 自动模式变更（DDL）同步\n    - [x] 自动建表（依赖于 #78）\n    - [x] 支持将不同 schema 的多张源表写入同一目标表 ([#84](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F84))\n* Hive 集成\n  - [x] 合并后导出至 Hive 分区\n  - [x] Apache Kyuubi（Hive JDBC）集成\n* 实时数据仓库\n  - [x] CDC 摄取\n  - [x] 时间旅行（快照读取）\n  - [x] 快照回滚\n  - [x] 自动全局合并服务\n  - [x] MPP 引擎集成（依赖于 [#66](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F66)）\n    - [x] Presto\n    - [x] 与 Presto 原生执行（使用 Velox）兼容\n    - [x] Apache Doris\n* 云原生 IO ([#66](https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F66))\n  - [x] 对象存储 IO 优化\n  - [x] 原生向量化 Merge on read\n  - [x] 支持多层存储类别，并配备本地磁盘数据缓存\n\n# 社区准则\n[社区准则](community-guideline.md)\n\n# 反馈与贡献\n如果您有任何问题，请随时提出 issue 或发起讨论。\n\n欢迎加入我们的 [Discord](https:\u002F\u002Fdiscord.gg\u002FWJrHKq4BPf) 服务器进行交流。\n\n# 联系我们\n请发送邮件至 [lakesoul-technical-discuss@lists.lfaidata.foundation](mailto:lakesoul-technical-discuss@lists.lfaidata.foundation)。\n\n# 开源许可证\nLakeSoul 采用 Apache License v2.0 开源。","# LakeSoul 快速上手指南\n\nLakeSoul 是一个云原生湖仓一体框架，支持可扩展的元数据管理、ACID 事务、高效 Upsert 操作、Schema 演进以及统一的流批处理。它兼容 
Spark、Flink、Presto 等多种计算引擎，并支持 HDFS 和 S3 等存储系统。\n\n## 1. 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux 或 macOS (Windows 需通过 WSL2 运行)\n*   **Java**: JDK 8 或 JDK 11 (推荐 JDK 11)\n*   **Scala**: 2.12.x (通常随 Spark\u002FFlink 环境自带)\n*   **数据库**: PostgreSQL 9.4+ (用于存储元数据，必须预先安装并启动)\n*   **计算引擎** (任选其一或全部):\n    *   Apache Spark 3.x\n    *   Apache Flink 1.14+\n*   **构建工具**: Maven 3.6+ (如需从源码编译)\n\n> **注意**: LakeSoul 依赖 PostgreSQL 进行元数据管理和 ACID 控制，请务必提前配置好 PostgreSQL 服务并创建对应的数据库和用户。\n\n## 2. 安装步骤\n\n### 方式一：使用 Maven 依赖（推荐）\n\n如果您已有 Spark 或 Flink 项目，只需在 `pom.xml` 中添加 LakeSoul 连接器依赖。\n\n**Spark 连接器:**\n```xml\n\u003Cdependency>\n    \u003CgroupId>io.lakesoul\u003C\u002FgroupId>\n    \u003CartifactId>lakesoul-spark\u003C\u002FartifactId>\n    \u003Cversion>3.0.0\u003C\u002Fversion>\n\u003C\u002Fdependency>\n```\n\n**Flink 连接器:**\n```xml\n\u003Cdependency>\n    \u003CgroupId>io.lakesoul\u003C\u002FgroupId>\n    \u003CartifactId>lakesoul-flink\u003C\u002FartifactId>\n    \u003Cversion>3.0.0\u003C\u002Fversion>\n\u003C\u002Fdependency>\n```\n\n> **国内加速**: 如果下载依赖较慢，建议在 Maven `settings.xml` 中配置阿里云镜像：\n> ```xml\n> \u003Cmirror>\n>   \u003Cid>aliyunmaven\u003C\u002Fid>\n>   \u003CmirrorOf>*\u003C\u002FmirrorOf>\n>   \u003Cname>Aliyun Public\u003C\u002Fname>\n>   \u003Curl>https:\u002F\u002Fmaven.aliyun.com\u002Frepository\u002Fpublic\u003C\u002Furl>\n> \u003C\u002Fmirror>\n> ```\n\n### 方式二：本地测试环境搭建\n\n若要快速搭建本地测试环境，可参考官方脚本初始化元数据（需确保 PostgreSQL 已运行）：\n\n1.  克隆项目：\n    ```bash\n    git clone https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul.git\n    cd LakeSoul\n    ```\n\n2.  初始化元数据表结构（假设 PG 地址为 localhost:5432，用户\u002F库名为 lakesoul）：\n    ```bash\n    # 执行 SQL 初始化脚本 (路径可能在 tools 或 docs 中，具体参考官方 Setup 文档)\n    psql -h localhost -p 5432 -U lakesoul -d lakesoul -f scripts\u002Finit_meta.sql\n    ```\n\n3.  
设置环境变量（在提交任务时传递）：\n    ```bash\n    export LAKESOUL_META_DB_URL=\"jdbc:postgresql:\u002F\u002Flocalhost:5432\u002Flakesoul\"\n    export LAKESOUL_META_DB_USER=\"lakesoul\"\n    export LAKESOUL_META_DB_PASSWORD=\"your_password\"\n    ```\n\n## 3. 基本使用\n\n以下示例展示如何使用 **Spark** 和 **Flink** 进行最简单的数据写入与读取。\n\n### 场景 A：使用 Spark 进行 Upsert 写入与查询\n\nLakeSoul 在 Spark 中支持主键表的 Upsert 操作。\n\n```scala\nimport org.apache.spark.sql.SparkSession\nimport io.lakesoul.LakeSoulTable\n\nval spark = SparkSession.builder()\n  .appName(\"LakeSoulQuickStart\")\n  .master(\"local[*]\")\n  .config(\"spark.sql.extensions\", \"io.lakesoul.sql.LakeSoulSparkSessionExtension\")\n  .config(\"spark.sql.catalog.spark_catalog\", \"io.lakesoul.catalog.LakeSoulCatalog\")\n  \u002F\u002F 配置元数据连接信息\n  .config(\"lakesoul.meta.db.url\", \"jdbc:postgresql:\u002F\u002Flocalhost:5432\u002Flakesoul\")\n  .config(\"lakesoul.meta.db.user\", \"lakesoul\")\n  .config(\"lakesoul.meta.db.password\", \"your_password\")\n  .getOrCreate()\n\nimport spark.implicits._\n\n\u002F\u002F 1. 准备测试数据\nval data = Seq(\n  (1, \"Alice\", 25),\n  (2, \"Bob\", 30)\n).toDF(\"id\", \"name\", \"age\")\n\n\u002F\u002F 2. 写入数据 (自动建表，指定主键为 id)\ndata.write\n  .format(\"lakesoul\")\n  .option(\"rangePartitions\", \"id\") \u002F\u002F 可选：范围分区\n  .option(\"hashPartitions\", \"id\")  \u002F\u002F 可选：哈希分区 (用于高效 Upsert)\n  .option(\"primaryKeys\", \"id\")     \u002F\u002F 指定主键\n  .mode(\"overwrite\")\n  .save(\"\u002Ftmp\u002Flakesoul_table\")\n\n\u002F\u002F 3. 执行 Upsert 更新 (插入新数据或更新已存在的主键)\nval updateData = Seq(\n  (1, \"Alice Updated\", 26), \u002F\u002F 更新 id=1\n  (3, \"Charlie\", 35)        \u002F\u002F 新增 id=3\n).toDF(\"id\", \"name\", \"age\")\n\nupdateData.write\n  .format(\"lakesoul\")\n  .option(\"primaryKeys\", \"id\")\n  .mode(\"append\") \u002F\u002F LakeSoul 会根据主键自动合并\n  .save(\"\u002Ftmp\u002Flakesoul_table\")\n\n\u002F\u002F 4. 
读取数据\nval df = spark.read.format(\"lakesoul\").load(\"\u002Ftmp\u002Flakesoul_table\")\ndf.show()\n```\n\n### 场景 B：使用 Flink CDC 实时同步 MySQL 数据\n\nLakeSoul 结合 Flink CDC 可实现整库实时同步，支持自动建表和 Schema 演进。\n\n```java\nimport org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;\nimport org.apache.flink.table.api.EnvironmentSettings;\nimport org.apache.flink.table.api.TableEnvironment;\nimport org.apache.flink.table.api.bridge.java.StreamTableEnvironment;\n\npublic class FlinkCDCSync {\n    public static void main(String[] args) {\n        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();\n        TableEnvironment tEnv = TableEnvironment.create(settings);\n\n        \u002F\u002F 加载 LakeSoul 连接器\n        tEnv.executeSql(\"CREATE CATALOG lakesoul WITH ('type'='lakesoul')\");\n        tEnv.useCatalog(\"lakesoul\");\n\n        \u002F\u002F 配置元数据 (也可以通过 JVM 参数传递)\n        tEnv.executeSql(\"SET 'lakesoul.meta.db.url' = 'jdbc:postgresql:\u002F\u002Flocalhost:5432\u002Flakesoul'\");\n        tEnv.executeSql(\"SET 'lakesoul.meta.db.user' = 'lakesoul'\");\n        tEnv.executeSql(\"SET 'lakesoul.meta.db.password' = 'your_password'\");\n\n        \u002F\u002F 定义 MySQL CDC Source (示例)\n        String sourceDDL = \n            \"CREATE TABLE mysql_source (\" +\n            \"  id INT,\" +\n            \"  name STRING,\" +\n            \"  age INT,\" +\n            \"  PRIMARY KEY (id) NOT ENFORCED\" +\n            \") WITH (\" +\n            \"  'connector' = 'mysql-cdc',\" +\n            \"  'hostname' = 'localhost',\" +\n            \"  'port' = '3306',\" +\n            \"  'username' = 'root',\" +\n            \"  'password' = 'root',\" +\n            \"  'database-name' = 'test_db',\" +\n            \"  'table-name' = 'users'\" +\n            \")\";\n        tEnv.executeSql(sourceDDL);\n\n        \u002F\u002F 写入 LakeSoul (自动创建目标表，支持 Schema 同步)\n        String sinkDDL = \n            \"CREATE TABLE lakesoul_sink (\" +\n         
   \"  id INT,\" +\n            \"  name STRING,\" +\n            \"  age INT,\" +\n            \"  PRIMARY KEY (id) NOT ENFORCED\" +\n            \") WITH (\" +\n            \"  'connector' = 'lakesoul',\" +\n            \"  'table.path' = '\u002Ftmp\u002Flakesoul_flink_table'\" +\n            \")\";\n        tEnv.executeSql(sinkDDL);\n\n        \u002F\u002F 执行同步\n        tEnv.executeSql(\"INSERT INTO lakesoul_sink SELECT * FROM mysql_source\");\n    }\n}\n```\n\n### 关键特性提示\n*   **时间旅行 (Time Travel)**: 读取历史版本数据，例如 `spark.read.option(\"time_travel_version\", \"v1\").format(\"lakesoul\").load(...)`。\n*   **并发写入**: LakeSoul 原生支持多任务并发写入同一张表，无需外部锁。\n*   **自动 Compaction**: 系统会自动触发后台 Compaction 服务优化小文件，无需手动干预。\n\n更多详细教程（如 Python AI 训练集成、多流合并等）请访问 [LakeSoul 官方文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002Fintro)。","某大型电商平台的实时风控团队需要构建一个统一数据底座，以支持毫秒级交易反欺诈分析（BI）和动态用户行为预测模型训练（AI）。\n\n### 没有 LakeSoul 时\n- **数据更新延迟高**：传统数据湖难以高效处理高频并发的主键更新（Upsert），导致用户风险画像滞后，无法拦截实时欺诈交易。\n- **架构割裂维护难**：BI 报表依赖批处理链路，而 AI 训练需要流式数据，团队需维护两套独立存储系统，数据一致性难以保障。\n- **模式演进成本高**：业务字段频繁变更时，缺乏自动 Schema 演化机制，每次调整都需停机重构表结构或编写复杂的兼容代码。\n- **权限管理混乱**：缺乏细粒度的行列级安全控制，多租户环境下数据隔离只能靠物理拆分，资源利用率低且运维复杂。\n\n### 使用 LakeSoul 后\n- **实时并发更新**：利用 LSM-Tree 结构和主键哈希分区，LakeSoul 轻松支撑高吞吐并发 Upsert，确保风控规则在秒级内生效。\n- **流批一体统一**：同一份数据同时服务于 Flink 实时计算和 PyTorch 模型训练，消除了数据搬运，保证了分析与训练数据的高度一致。\n- **自动模式演化**：业务新增特征列时，LakeSoul 自动同步 DDL 并兼容历史数据，开发团队无需停机即可快速响应业务变化。\n- **精细化安全隔离**：基于 PostgreSQL 的 RBAC 和行级安全策略，实现了多团队在同一集群内的逻辑隔离，大幅降低运维成本。\n\nLakeSoul 通过云原生湖仓一体架构，将实时数据摄入、并发更新与 AI\u002FBI 分析无缝融合，让企业以最低成本构建起真正敏捷的数据智能闭环。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Flakesoul-io_LakeSoul_dfe9ba4e.png","lakesoul-io","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Flakesoul-io_d6070d2a.png","LakeSoul provides a data management platform for lakehouse architecture, allowing users to query, explore and visualize data in a unified and scalable 
way.",null,"info@lfaidata.foundation","https:\u002F\u002Flakesoul-io.github.io\u002F","https:\u002F\u002Fgithub.com\u002Flakesoul-io",[80,84,88,92,96,100,104,108,111,115],{"name":81,"color":82,"percentage":83},"Java","#b07219",38.8,{"name":85,"color":86,"percentage":87},"Scala","#c22d40",30.5,{"name":89,"color":90,"percentage":91},"Rust","#dea584",23.1,{"name":93,"color":94,"percentage":95},"Python","#3572A5",4.7,{"name":97,"color":98,"percentage":99},"MDX","#fcb32c",1.2,{"name":101,"color":102,"percentage":103},"Shell","#89e051",0.6,{"name":105,"color":106,"percentage":107},"Dockerfile","#384d54",0.3,{"name":109,"color":110,"percentage":107},"JavaScript","#f1e05a",{"name":112,"color":113,"percentage":114},"PLpgSQL","#336790",0.2,{"name":116,"color":117,"percentage":118},"CSS","#663399",0.1,3227,415,"2026-04-15T03:00:00","Apache-2.0",4,"未说明",{"notes":126,"python":124,"dependencies":127},"LakeSoul 是一个云原生湖仓框架，核心元数据管理依赖 PostgreSQL 数据库。它支持多种计算引擎（Spark, Flink, Presto, PyTorch）和存储系统（HDFS, S3）。虽然支持 PyTorch 进行 AI 训练，但 README 中未明确指定具体的 Python 版本、GPU 型号或内存最低要求。建议参考官方文档中的“快速开始”指南来设置本地测试环境。",[128,129,130,131,132,133,134],"PostgreSQL (元数据存储)","Apache Spark","Apache Flink","Presto","PyTorch","HDFS\u002FS3 (存储系统)","Rust (原生元数据与 IO 层)",[14,16,15],[137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154],"lakesoul","datalake","lakehouse","spark","flink","streaming","big-data","postgresql","rust","sql","huggingface","python","pytorch","arrow","datafusion","vectorized","velox","gluten","2026-03-27T02:49:30.150509","2026-04-17T09:55:48.386093",[158,163,168,173,178,183],{"id":159,"question_zh":160,"answer_zh":161,"source_url":162},37027,"LakeSoul 是否支持在不使用 S3 的情况下本地部署数据湖？","是的，完全支持。您可以直接使用本地绝对路径作为表的存储路径，数据将会保存在本地文件系统中，无需依赖 S3。例如：\n.option(\"path\", \"\u002Fhome\u002Fyourname\u002Fdata\u002Ftable_name\")\n这样即可在本地保存和加载 LakeSoul 
表。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F102",{"id":164,"question_zh":165,"answer_zh":166,"source_url":167},37028,"LakeSoul 是否支持 Flink Table API 和 Flink CDC？","是的，LakeSoul 已经支持 Flink。目前功能包括 Flink Catalog 管理和 Sink 写入，Flink Table API 和 Flink CDC 也在支持和开发路线图中。您可以查看项目中 FlinkSupport 分支或相关文档获取最新进展。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F11",{"id":169,"question_zh":170,"answer_zh":171,"source_url":172},37029,"升级到 Flink CDC 3.0 时遇到 jar 包冲突（如 hamcrest 版本问题）怎么办？","该问题是由于升级 Flink CDC 3.0 后依赖冲突导致的（例如使用了 hamcrest-core-1.3.jar 而不是 hamcrest-2.1.jar）。维护者已通过代码修复解决了此问题，建议拉取包含修复的最新代码或参考相关 PR（Upgrade_to_Flink_CDC_3.0）进行依赖调整。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F410",{"id":174,"question_zh":175,"answer_zh":176,"source_url":177},37024,"使用 LakeSoulTable.upsert() 或创建表时提示“表不存在”怎么办？","这通常是因为未指定正确的存储路径。使用 SQL 建表时，默认路径是 spark.sql.warehouse.dir 配置的目录（通常是本地 spark-warehouse）。解决方法有两种：\n1. 在使用 DataFrame API 写入时，直接指定 path：\n   Seq((1, \"a\")).toDF(\"id\", \"col\").write.format(\"lakesoul\").option(\"path\", \"\u002Fyour\u002Fpath\").save()\n2. 
在使用 SQL 建表时，显式指定 OPTIONS('path'='...') 或使用 LOCATION 语法：\n   CREATE TABLE lakesoul_test(a LONG, b String) USING lakesoul OPTIONS('path'='\u002Fyour\u002Fpath')\n   之后使用 LakeSoulTable.forPath(\"\u002Fyour\u002Fpath\") 即可正常访问。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F89",{"id":179,"question_zh":180,"answer_zh":181,"source_url":182},37025,"如何在本地环境（非 S3\u002F离线环境）运行 LakeSoul 示例，避免 AWS 证书错误？","在本地测试时，不需要配置 S3 或 AWS 凭证。只需将表路径设置为以 file:\u002F\u002F 开头的本地绝对路径即可。例如：\nval tablePath = \"file:\u002F\u002F\u002Fhome\u002Fuser\u002Fdata\u002Ftable_name\"\n或者直接使用本地绝对路径：\nval tablePath = \"\u002Fhome\u002Fuser\u002Fdata\u002Ftable_name\"\n如果使用 HDFS，将前缀改为 hdfs:\u002F\u002F 即可。此外，本地元数据存储可以通过 Docker 启动 PostgreSQL 来解决，具体参考官方快速开始文档。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F72",{"id":184,"question_zh":185,"answer_zh":186,"source_url":187},37026,"编译项目时报错 com.dmetasoul.lakesoul.meta.entity 不存在，如何解决？","这些实体类文件是由 Protobuf 自动生成的。如果在编译过程中缺失，可以手动执行以下 Maven 命令来生成：\nmvn protobuf:compile -pl lakesoul-common\n执行完成后重新编译项目即可解决该问题。","https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fissues\u002F474",[189,194,199,204,209,214,219,224,229,234,239,244,249,254,259,264,269,274,279,284],{"id":190,"version":191,"summary_zh":192,"released_at":193},297470,"py-v1.0.2","* 修复元数据查询结果序列化中的内存溢出问题","2025-09-26T00:43:35",{"id":195,"version":196,"summary_zh":197,"released_at":198},297471,"py-v1.0.1","* 在构建时修复 protoc 版本","2025-09-24T01:46:42",{"id":200,"version":201,"summary_zh":202,"released_at":203},297472,"v3.0.0","# LakeSoul 3.0.0 版本发布\n经过近 1 年的迭代优化，LakeSoul 3.0.0 版本正式发布。本次发布带来以下重要更新：\n\n1. LakeSoul 湖仓框架内核功能更新\n    1. LakeSoul NativeIO 性能再次大幅优化，包括调整写文件压缩和字典编码算法、优化 Merge on Read 关键代码路径等，实现读、写性能均提升一倍(对比 2.6 版本)。\n    2. LakeSoul NativeIO 新增本地热数据缓存功能。可以支持将远程对象存储文件缓存在本地磁盘，大幅提升 MPP 查询等性能。支持所有类型远程存储的本地缓存。\n    3. LakeSoul 查询分区过滤下推性能大幅优化，通过元数据索引查询方式，对等值分区过滤条件下推做了大幅度的性能优化。实测单表百万级分区，分区过滤仅需 50ms。\n    4. Flink 升级至 1.20 版本\n    5. 
LakeSoul 原生支持 Spark + Gluten 向量化引擎，实现批计算大","2025-09-05T10:41:50",{"id":205,"version":206,"summary_zh":207,"released_at":208},297473,"py-v1.0.0","将 LakeSoul Python 包 1.0.0 发布到 PyPI。","2025-09-05T09:44:05",{"id":210,"version":211,"summary_zh":212,"released_at":213},297474,"v2.6.2","**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fcompare\u002Fv2.6.1...v2.6.2","2024-08-07T07:32:38",{"id":215,"version":216,"summary_zh":217,"released_at":218},297475,"v2.6.1","**完整变更日志**: https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fcompare\u002Fv2.6.0...v2.6.1","2024-07-22T05:23:28",{"id":220,"version":221,"summary_zh":222,"released_at":223},297476,"v2.6.0","## 变更内容\n* [Rust] 应用 clippy 并修复拼写错误；由 @mag1c1an1 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F404 中完成\n* [文档] 添加 Spark 入门指南，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F403 中完成\n* [文档] 添加 Flink 入门指南，由 @moresun 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F405 中完成\n* [文档] 修改入门环境指南，由 @F-PHantam 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F406 中完成\n* [文档] 更新文档格式，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F408 中完成\n* [文档] 修复文档页面显示错误并更新 LakeSoul 版本，由 @F-PHantam 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F409 中完成\n* [文档] 对 Spark 指南中的使用案例进行细致检查，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F411 中完成\n* [网站] 修复网站简体中文首页的文档链接，由 @mag1c1an1 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F413 中完成\n* [文档] 在 Spark 指南中添加 PySpark 内容，由 @moresun 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F414 中完成\n* [Spark\u002FRust\u002F测试] 修复 MergeOperatorSuite 并禁用 3 个测试用例，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F417 中完成\n* [Spark] 实现合并操作的列式写入，由 
@xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F415 中完成\n* [Spark] 为合并测试添加调试打印信息，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F418 中完成\n* [文档] 将文档版本更新至 2.5.1，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F419 中完成\n* [Spark\u002FRust] 修复原生 IO 中的 Unicode 列名问题，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F420 中完成\n* 支持 SQL Server CDC，由 @ChenYunHey 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F421 中完成\n* [文档] 修复 Python 文档中的拼写错误，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F425 中完成\n* [Spark\u002FRust] 支持对嵌套列名的过滤操作，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F422 中完成\n* [文档] 添加文档和近期博客，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F423 中完成\n* [Flink\u002FRust] 调整滚动文件逻辑以减少写入时的内存占用，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F426 中完成\n* [Rust] 启用元数据最大重试次数，由 @Ceng23333 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F431 中完成\n* [Flink] 修复 CDC 入口数据库名称错误，由 @ChenYunHey 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F430 中完成\n* [Rust] 在 Datafusion 中仅保留下推规则，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F432 中完成\n* [Rust] Datafusion 目录支持，由 @mag1c1an1 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F429 中完成\n* [Flink] 修复非主键表的 Sink 并行度问题，由 @ChenYunHey 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F433 中完成\n* [Python] 修复 Python 主机构建问题，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F434 中完成\n* [Spark] 为合并测试添加 1 秒的睡眠时间，由 @xuchen-plus 在 
https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F435 中完成\n* [文档] 添加部署文档，由 @xuchen-plus 在 https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpul","2024-07-17T05:49:52",{"id":225,"version":226,"summary_zh":227,"released_at":228},297477,"v2.5.4","1. 修复 Lakesoul Common 中的类着色问题","2024-05-23T06:27:05",{"id":230,"version":231,"summary_zh":232,"released_at":233},297478,"v2.5.3","1. 为发布添加阴影包  \n2. 修复压缩可能写入错误分区的问题","2024-03-29T08:19:11",{"id":235,"version":236,"summary_zh":237,"released_at":238},297479,"v2.5.1","1. 修复非主键表的 Flink Sink 并行度问题；2. 修复原生 IO 过滤器对非 ASCII 名称和嵌套列的支持问题；3. 优化 Compaction 性能。","2024-01-29T10:45:08",{"id":240,"version":241,"summary_zh":242,"released_at":243},297480,"v2.5.0","# LakeSoul 2.5.0 Release Note\r\n\r\n## What's New\r\n1. [Python Reader](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fmachine-learning-support) supports PyTorch, PyArrow, Pandas, Ray, and distributed execution;\r\n2. Support Spark Gluten Vectorized Engine;\r\n3. Spark SQL supports Compaction, Rollback and other Call Procedures;\r\n4. [Flink CDC’s entire database synchronization](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fflink-cdc-sync) supports MySQL, PostgreSQL, PolarDB, and Oracle;\r\n5. Support [streaming and batch export](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fexport-to-databases) to MySQL, PostgreSQL, PolarDB, and Apache Doris;\r\n6. Optimized NativeIO performance.\r\n\r\n## 更新内容\r\n1. [Python Reader](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fmachine-learning-support) 支持 PyTorch、PyArrow、Pandas、Ray，支持分布式执行；\r\n2. 支持 Spark Gluten Vectorized Engine；\r\n3. Spark SQL 支持 Compaction、Rollback 等 Call Procedures；\r\n4. [Flink CDC 整库同步](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fflink-cdc-sync)支持 MySQL、PostgreSQL、PolarDB、Oracle；\r\n5. 
支持[流式、批式出湖](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fexport-to-databases)至 MySQL、PostgreSQL、PolarDB、Apache Doris；\r\n6. 优化 NativeIO 性能.\r\n\r\n## What's Changed\r\n* [Spark]rename MetaVersion at lakesoul-spark as SparkMetaVersion by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F353\r\n* [Metadata]Replace table_info.table_schema with arrow kind schema (Backward Compatibility) by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F354\r\n* [Python][Dataset] Add Ray reading support by @codingfun2022 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F355\r\n* [Spark]optimize incremental read and fix compact operation cause column disorder bug by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F352\r\n* [Rust] Create Rust CI by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F356\r\n* [Rust][Metadata]Create Rust  MetadataClient & add CI test cases by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F357\r\n* [Rust][NativeIO]Use stable rustc for lakesoul-io feature default by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F358\r\n* [Python][Rust][Metadata] Update python metadata interface && Full arrow types test by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F359\r\n* [Spark] Spark Sql Support 'drop partition' Operation by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F360\r\n* [Python]python deserialized schema from java by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F361\r\n* [Python] Fix wheel building; update version to 1.0.0b1 by @codingfun2022 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F362\r\n* 
[Rust][Metadata]Asynchronized rust metadata method by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F365\r\n* Add some rust test cases by @zhaishuangszszs in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F364\r\n* [Datafusion]Implement LakeSoul Catalog by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F366\r\n* [Rust] add upsert test cases by @zhaishuangszszs in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F367\r\n* [Flink] update fury version to 0.4 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F368\r\n* refine upsert test by @zhaishuangszszs in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F369\r\n* [Spark] support call sql syntax by @moresun in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F370\r\n* [Rust]DataFusion version upgraded to 33.0.0 by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F372\r\n* [Spark] Support Gluten Vectorized Engine by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F374\r\n* [Flink] Support oracle cdc source by @ChenYunHey in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F375\r\n* [NativeIO] Use rust block api in file read by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F377\r\n* [Flink] Add export to external dbs for LakeSoul's tables by @ChenYunHey in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F376\r\n* [Rust] Add LakeSoulHashTable Sink for DataFusion by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F382\r\n* [NativeIO] Enable parquet rowgroup prefetch. 
Support s3 host style access by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F384\r\n* [Rust]fix hash value to spark_murmur3 by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F385\r\n* [BugFix]Fails when create table with nullable hash colmun by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F387\r\n* [Flink] Add Jdbc cdc sources and sinks by @ChenYunHey in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F381\r\n* [Python] fix python meta config parse logic by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F388\r\n* [Project\u002FDoc] Bump version to 2.5.0 and update docs by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F389\r\n* Bump postcss from 8.4.23 to 8.4.33 in \u002Fwebsite by @dependabot in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F396\r\n* Bump @babel\u002Ftraverse from 7.21.5 to 7.23.7 in \u002Fwebsite by @dependabot ","2024-01-10T04:56:28",{"id":245,"version":246,"summary_zh":247,"released_at":248},297481,"v2.4.1","## What's Changed\r\n* [Flink] Flink can configure global warehouse dir by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F342\r\n* [NativeIO] Implement DataFusion TableProvider by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F341\r\n* [Spark]Spark parquet filter pushdown exactly by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F343\r\n* [Spark]Spark parquet filter pushdown evaluation + bugfix by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F344\r\n* [Meta] fix meta field compatibility in partition info table by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F345\r\n* 
[Common] Cleanup redundant DataOperation by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F346\r\n* [Docs] add kyuubi with lakesoul setup doc. by @Asakiny in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F348\r\n* [Native-Metadata] Adaptive jnr buffer size by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F347\r\n* [NativeIO][Bug] LakeSoulParquetProvider projection bugfix by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F349\r\n* [NativeIO] Enable parquet prefetch & use stable sort by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F350\r\n\r\n\r\n**Full Changelog**: https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fcompare\u002Fv2.4.0...v2.4.1","2023-10-12T07:14:50",{"id":250,"version":251,"summary_zh":252,"released_at":253},297482,"v2.4.0","## What's New In This Release\r\n1. RBAC support for all query engines. [doc](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fworkspace-and-rbac)\r\n2. Auto cleaning of old compaction data and partition TTL. [doc](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fclean-redundant-data)\r\n3. Upgrade Flink version to 1.17 and support row level update\u002Fdelete in batch sql.\r\n4. Optimize whole database Flink cdc sync throughput by 80%: #307 \r\n5. Presto Reader; [doc](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fsetup-presto)\r\n6. Python reader and integration with PyTorch and HuggingFace. [doc](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fmachine-learning-support)\r\n\r\n## 本次更新内容\r\n1. 支持 RBAC 角色权限控制，对所有引擎、所有语言API均有效；[文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fworkspace-and-rbac)\r\n2. 
自动清理旧的 compaction 数据，支持分区级生命周期（TTL）；[文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fclean-redundant-data)\r\n3. 升级 Flink 版本到 1.17，并支持批模式下行级别更新和删除；\r\n4. 优化整库同步 Flink 作业，吞吐提升 80%： #307 ；\r\n5. 支持 Presto 读取；[文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fsetup-presto)\r\n6. 支持原生 Python 读取，提供 PyTorch、HuggingFace 的集成。[文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fmachine-learning-support)\r\n\r\n## What's Changed\r\n* [NativeIO] Upgrade datafusion to 27 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F282\r\n* [Flink] implement filter pushdown and fix partition pushdown in flink by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F287\r\n* Upgrade Flink to 1.17 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F288\r\n* [Python][NativeIO] Add C interface definition by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F291\r\n* [NativeIO] update arrow version by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F290\r\n* Add Built-in RBAC support by @clouddea in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F292\r\n* fix apache license by @clouddea in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F293\r\n* [Native-Metadata] Rust implementation of DAO layer by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F294\r\n* [Flink] fix jackson-core package in flink by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F297\r\n* [Docs] update docs by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F298\r\n* [Flink] upgrade flink cdc connector to 2.4 by @xuchen-plus in 
https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F303\r\n* clean old compaction data and redundant data by @ChenYunHey in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F304\r\n* [Python][Native-Metadata] Python interface of lakesoul metadata by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F305\r\n* [Python] C callback with data by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F306\r\n* [Python][Dataset] PyArrow and PyTorch dataset api for LakeSoul by @codingfun2022 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F308\r\n* [Flink] rollback flink cdc to 2.3.0 and supplement tables check in benchmark by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F309\r\n* [Flink] Optimize CDC sink serde with Fury by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F307\r\n* [NativeIO] add hdfs feature in lakesoul-io-c by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F311\r\n* [Python] exclude partition column at get_arrow_schema_by_table_name by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F312\r\n* [Native-Metadata] Retry when native metadata client fail by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F313\r\n* [Flink] cdc supplement data delay check mechanism and fix logicallyDropColumn bug by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F315\r\n* Presto Connector Support by @clouddea in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F314\r\n* add scala in common to address build in idea intellij by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F316\r\n* [Flink] Ignore exception when hadoop 
env missing by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F317\r\n* [NativeIO] Merge native modules by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F318\r\n* bump version to 2.4.0 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F319\r\n* [RBAC] Set hdfs dir owner by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F321\r\n* [BugFix]support query metadata with null string by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F324\r\n* [Spark] list namespace should return empty array by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F323\r\n* [Python][Dataset] Update Python dataset api for LakeSoul by @codingfun2022 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F325\r\n* [Python] Examples using Python API for AI model training by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F327\r\n* update docs and readme for release 2.4 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F328\r\n* [Docs] Usage on auto table clean by @ChenYunHey in https:\u002F\u002Fgithub.com\u002Flake","2023-09-21T09:16:18",{"id":255,"version":256,"summary_zh":257,"released_at":258},297483,"v2.3.1","* Fix jackson-core packaging for Flink package\r\n* Fix commons-lang class missing\r\n* Fix snapshot rollback\u002Fcleanup with local timezone","2023-08-22T02:40:59",{"id":260,"version":261,"summary_zh":262,"released_at":263},297484,"v2.3.0","## v2.3.0 Release Notes\r\n\r\nThis is the first release after LakeSoul donated to Linux Foundation AI & Data. This release contains the following major new features:\r\n\r\n1. 
Flink Connector for Flink SQL\u002FTable API to read or write LakeSoul in both batch and streaming mode, with the supports of Flink Changelog Stream semantics and row-level upsert and delete. See docs [Flink Connector](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fflink-lakesoul-connector).\r\n2. Flink CDC Ingestion refactored to infer new tables and schema changes automatically from messages. This enables simpler CDC stream ingestion job development for any kinds of database or message queues.\r\n3. Global automatic compaction service. See docs [Auto Compaction Service](https:\u002F\u002Flakesoul-io.github.io\u002Fdocs\u002FUsage%20Docs\u002Fauto-compaction-task).\r\n\r\n### 更新日志\r\n\r\n这是 LakeSoul 捐赠给 Linux Foundation AI & Data 后的第一个发布版本。该版本包含以下重要更新：\r\n\r\n1. 全面支持 Flink SQL\u002FTable API. LakeSoul 支持 Flink 流、批读写。流式读写完整支持 Flink Changelog 语义，支持行级别流式增删改。[参考文档](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fflink-lakesoul-connector)\r\n2. Flink CDC 整库同步重构，支持从消息中自动推断新表和 schema 变更。能够更简单的开发 CDC 入湖作业并支持消费任意数据库 CDC 流或消息队列流。\r\n3. 
全局自动 Compaction 服务。参考文档：[LakeSoul 全局自动压缩服务使用方法](https:\u002F\u002Flakesoul-io.github.io\u002Fzh-Hans\u002Fdocs\u002FUsage%20Docs\u002Fauto-compaction-task)\r\n\r\n## What's Changed\r\n* [NativeIO] Native io misc improvements by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F190\r\n* optimize filesForScan by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F192\r\n* Add Definition Comments for com.dmetasoul.lakesoul.meta.entity by @YuChangHui in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F193\r\n* Implement Delta Join Interfaces for LakeSoulTable by @YuChangHui in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F184\r\n* [Flink] pack paranamer to flink release jar by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F196\r\n* [NativeIO] use tcmalloc as global allocator by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F204\r\n* [NativeIO] fix memory leak in native reader by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F209\r\n* [Flink] avoid cast global parameter to ParameterTool by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F207\r\n* migrate arrow-rs and datafusion deps to new org by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F211\r\n* Implement Global Automatic Disaggregated Compaction Service by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F212\r\n* Implement Flink ScanTableSource and LookupTableSource by @YuChangHui in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F213\r\n* fix data type timestamp with zone by @lypnaruto in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F215\r\n* [NativeIO]throw execption when 
LakeSoulArrowReader.hasNext by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F217\r\n* [NativeIO]add rust clippy workflow && fix clippy error\u002Fwarn by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F219\r\n* add flink sql submitter(#199) by @Hades-888 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F221\r\n* Update readme by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F222\r\n* bump version to 2.3.0 by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F223\r\n* update github links by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F224\r\n* fix bug: requested file schema no change in stream task by @F-PHantam in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F226\r\n* [Flink]LakeSoulCatalog::listTables:  list tableName instead of tablePath by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F227\r\n* [Flink]fix parse error of LogicalTypeRoot::Date by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F228\r\n* [NativeIO]panic when target datatype and source datatype mismatch by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F214\r\n* [Flink]support flink decimal by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F232\r\n* update LakeSoulTableSource.getChangelogMode by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F231\r\n* [NativeIO]fix clippy warning by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F230\r\n* Fix hash bucket num by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F233\r\n* [Flink]add batch in flink sql 
submitter by @Hades-888 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F234\r\n* disable tcmalloc by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F235\r\n* [Project] add lakesoul project website code by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F237\r\n* update load flink sql from hdfs in yarn application by @Hades-888 in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F238\r\n* [Flink]add Maven-test CI for lakesoul-flink by @lypnaruto in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F239\r\n* Add cross build for native io by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F241\r\n* [Project] disable git lfs by @xuchen-plus in https:\u002F\u002Fgithub.com\u002Flakesoul-io\u002FLakeSoul\u002Fpull\u002F243\r\n","2023-07-13T09:44:44",{"id":265,"version":266,"summary_zh":267,"released_at":268},297485,"v2.2.0","# LakeSoul Release v2.2.0\r\n\r\n## v2.2.0 Release Notes\r\n1. Native IO is by default enabled for Flink CDC Sink and Spark SQL. Native IO uses [arrow-rs](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow-rs) and [Datafusion](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow-datafusion) with special IO optimizations based on arrow-rs' object store. Benchmarks show 3x IO throughput improvement over parquet-mr and Hadoop filesystem. Native IO supports both HDFS and S3 object storage (including S3 protocol compatible storages). Native IO supports all data types in Spark and Flink and has passed both TPC-H and CHBenchmark correctness tests.\r\n2. [Snapshot read](https:\u002F\u002Fwww.dmetasoul.com\u002Fen\u002Fdocs\u002Flakesoul\u002FTutorials\u002Fsnapshot-manage\u002F) and [incremental read](https:\u002F\u002Fwww.dmetasoul.com\u002Fen\u002Fdocs\u002Flakesoul\u002FTutorials\u002Fincremental-query\u002F) support on Spark. 
LakeSoul's incremental read on spark supports both batch mode and microbatch streaming mode.\r\n3. Default supported Spark's version has been upgraded to Spark 3.3.\r\n\r\n## v2.2.0 发布日志\r\n1. Native IO 在 Flink 和 Spark 上默认启用。Native IO 使用 [arrow-rs](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow-rs) 和 [Datafusion](https:\u002F\u002Fgithub.com\u002Fapache\u002Farrow-datafusion) 实现，并在 arrow-rs object store 上做了专门的性能优化。在实际测试中比 parquet-mr+hadoop filesystem 快 3 倍以上。Native IO 可以支持 HDFS 和 S3 存储，以及与 S3 兼容的存储系统。Native IO 经过了详细的测试，能够支持 Flink、Spark 所有数据类型，并通过了 TPC-H 和 CHBenchmark 的正确性校验。\r\n2. 在 Spark 上支持了[快照读](https:\u002F\u002Fwww.dmetasoul.com\u002Fdocs\u002Flakesoul\u002FTutorials\u002Fsnapshot-manage\u002F)和[增量读](https:\u002F\u002Fwww.dmetasoul.com\u002Fdocs\u002Flakesoul\u002FTutorials\u002Fincremental-query\u002F)功能。增量读功能可以支持 batch 模式和 micro batch streaming 模式。\r\n3. 默认的 Spark 版本更新到 3.3.\r\n\r\n## What's Changed\r\n* [Feature] Timestamp based snapshot read, rollback and cleanup by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F104\r\n* [Flink] write timestamp to int64 instead of int96 in flink sink by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F106\r\n* Only one partition and compaction to parquet scan by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F109\r\n* Bump postgresql from 42.5.0 to 42.5.1 in \u002Flakesoul-common by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F111\r\n* Incremental query by @lypnaruto in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F110\r\n* Add Benchmarks by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F115\r\n* Flink serde optimization by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F117\r\n* Develop\u002Fnative io spark by @Ceng23333 in 
https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F118\r\n* Fix CI with Maven Test by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F121\r\n* Support Kafka multiple topics sync to LakeSoul by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F122\r\n* solve dependency problem of confluent jar by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F124\r\n* fix maven-test with native-io by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F125\r\n* [NativeIO] Native io parquet writer implementation by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F128\r\n* [Spark] Streaming Read by @lypnaruto in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F129\r\n* [Spark] Upgrade Spark version to 3.3 for main branch by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F132\r\n* use Arrow Schema instead of HashMap for lakesoul_reader filter by @YuChangHui in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F136\r\n* [NativeIO] Native writer c and jnr-ffi interface by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F137\r\n* [NativeIO] fix native reader memory leak and double free by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F138\r\n* [NativeIO] Native writer with primary keys sort support by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F141\r\n* [NativeIO] Use ffi to pass arrow schema by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F142\r\n* [NativeIO][Flink] Implement Flink native writer by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F143\r\n* [NativeIO] fix callback object 
reference by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F145\r\n* [NativeIO] upgrade arrow-rs to 31 and datafusion to 17 by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F148\r\n* [NativeIO][Spark] Package native lib in lakesoul-spark jar by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F149\r\n* [NativeIO] use maven profile for native packaging. default to local native build by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F150\r\n* [NativeIO][Spark] Integrate nativeIO writer in lakesoul-spark by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F151\r\n* [NativeIO] Implement Sorted Stream Merger by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F147\r\n* fix ParquetNativeFilterSuite by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F152\r\n* [NativeIO][Bug] Fix flink writer panic by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F154\r\n* [NativeIO] optimize with smallvec for native merge by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fp","2023-03-31T08:33:31",{"id":270,"version":271,"summary_zh":272,"released_at":273},297486,"v2.1.1","## What's Changed\r\n\r\nThis is a bug fix release for v2.1.0.\r\n\r\nFixed bugs:\r\n* Support geometry\u002Fpoint type in flink cdc by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F93\r\n* [BUG] fix pg password auth failed exception by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F95\r\n* Add checkpoint_mode to flink job entry by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F96\r\n\r\n\r\n**Full Changelog**: 
https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fcompare\u002F2.1.0...v2.1.1","2022-10-18T05:45:44",{"id":275,"version":276,"summary_zh":277,"released_at":278},297487,"2.1.0","# v2.1.0 Release Notes\r\nLakeSoul 2.1.0 brings new Flink CDC sink implementation which supports all tables (with different schemas) in one entire MySQL database sync in one Flink job, automatic schema sync and evolution, automatic new table creation and exactly once guarantee. The currently supported flink version is 1.14.\r\n\r\nIn 2.1.0 we also reimplement Spark catalog so that it could be used as a standalone catalog rather than a session catalog extension. This change is to avoid some inconsistencies in Spark's v2 table commands, e.g. `show tables` cannot support v2 tables until 3.3.\r\n\r\nPackages for Spark and Flink are separated into two maven submodules. The maven coordinates are `com.dmetasoul:lakesoul-spark:2.1.0-spark-3.1.2` and `com.dmeatsoul:lakesoul-flink:2.1.0-flink-1.14`. All the required transitive dependencies have already been shaded into the released jars.\r\n\r\n# v2.1.0 发布日志\r\nLakeSoul 2.1.0 增加了全新的 Flink CDC Sink 功能，支持 MySQL 数据库整库千表（支持不同 schema）同步，自动 Schema 变更同步，自动新表感知和严格一次（Exactly Once）语义保证。\r\n\r\nSpark 支持部分重写了 Catalog 的实现，使得 Catalog 可以作为非 Session Catalog 扩展使用，主要目的是规避 Spark 在 3.3 版本之前，一些 DDL Command 不支持 V2 表的问题。\r\n\r\nSpark 和 Flink 分别拆分成了两个 Maven 子模块。在工程中引用的 Maven 坐标分别是  `com.dmetasoul:lakesoul-spark:2.1.0-spark-3.1.2` and `com.dmeatsoul:lakesoul-flink:2.1.0-flink-1.14`。他们各自的依赖库已经通过 shade 的方式打包到了发布的 jar 包中。\r\n\r\n## Merged Pull Requests\r\n* CDC support v1: add table property to identify change kind column by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F1\r\n* Cdc support v2 by @moresun in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F3\r\n* support merge into sql when can be converted to upsert by @dmetasoul01 in 
https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F4\r\n* Optimize duplicate tests and code by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F6\r\n* support create hash partitioned table by sql by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F7\r\n* remove cdc filter from mergescan by @moresun in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F9\r\n* fix build error and some coding styles by @bakey in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F10\r\n* Update README.md by @moresun in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F13\r\n* add a cdc sink example by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F17\r\n* update all links in readme to relative by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F18\r\n* [Doc] add cdc cn doc by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F19\r\n* Bump fastjson from 1.2.75 to 1.2.83 by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F38\r\n* Catalog refactor by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F45\r\n* Bump mysql-connector-java from 8.0.19 to 8.0.28 by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F46\r\n* Bump postgresql from 42.2.14 to 42.3.3 by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F47\r\n* bump version to 2.0.0 by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F48\r\n* fix maven packaging by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F55\r\n* Feature flink order sink by @YangZH-v2 in 
https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F56\r\n* add parquet-column dependency fix localEnv unable run bug by @YangZH-v2 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F64\r\n* support exactly once semantics for flink write by @YangZH-v2 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F65\r\n* fix filter bug when cdc column is not used by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F68\r\n* Align hash bucket and sort logic in flink with spark #60 by @YangZH-v2 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F69\r\n* Split submodules for maven project by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F70\r\n* Bump postgresql from 42.3.3 to 42.4.1 in \u002Flakesoul-common by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F71\r\n* add MergeNonNullOp for merge operator by @moresun in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F73\r\n* add docker compose for local test. 
fix maven install gpg signing by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F76\r\n* clean up unused code by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F77\r\n* fix MultiPartitionMergeBucketScan bug by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F81\r\n* Fix flink cdc write event order by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F82\r\n* supports database(namespace) & support mysql cdc using flink by @Ceng23333 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F85\r\n* Bump snakeyaml from 1.30 to 1.31 in \u002Flakesoul-spark by @dependabot in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F88\r\n* Support multiple tables sink for Flink CDC by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F86\r\n* flink cdc task add argument serverTimeZone by @F-PHantam in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F90\r\n* Fix maven dependency by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F91\r\n\r\n## New Contributors\r\n* @dmetasoul01 made their first contribution in h","2022-10-12T09:45:56",{"id":280,"version":281,"summary_zh":282,"released_at":283},297488,"v2.0.1-spark-3.1.2","## What's Changed\r\n* fix maven packaging by @dmetasoul01 in https:\u002F\u002Fgithub.com\u002Fmeta-soul\u002FLakeSoul\u002Fpull\u002F55","2022-07-08T07:56:29",{"id":285,"version":286,"summary_zh":287,"released_at":288},297489,"v2.0.0-spark-3.1.2","# 1. Catalog refactoring \r\n\r\n  1. Replacing the Cassandra protocol with the Postgres protocol \r\n  2. 
metadata Use PG protocol to rewrite table operations, partition operations, and data operation related functions, and use transaction mechanism to achieve data submission collision detection to ensure ACID attributes\r\n  3. Interface with Spark and metadata, translate Spark-related metadata operations into the underlying interface, and realize the cross-border distribution between the upper computing platform and the underlying development storage layer \r\n \r\n# 2. DDL\r\n\r\n  1. Spark SQL related DDL statements (create alter, etc.) transformation\r\n  2. Spark DataFrame | DataSet related DDL statement (save, etc.) transformation\r\n  \r\n# 3. Data Writing \r\n  1. Transformation of SparkSQL-related DML statements (insert into, update, etc.) \r\n  2. Spark DataFrame | DataSet related DML statements (write function, etc.) \r\n  3. LakeSoulTable upsert function transformation \r\n  4. LakeSoulTable compaction function transformation, and support to mount to hive \r\n \r\n# 4. Data Reading \r\n  1. A variety of ParquetScan transformation, remove the write version sorting mechanism, adapt to the new metadata UUID file list format\r\n  2. LakeSoulTable adds a snapshot reading function to read the historical content according to the specified partition version \r\n  3. LakeSoulTable adds a history rollback function to roll back to a certain historical version of the specified partition \r\n  4. Added and modified the default MergeOprator function to make it easier for users to operate Merge results","2022-07-01T08:38:55"]