[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-Jack-bo1220--Awesome-Remote-Sensing-Foundation-Models":3,"tool-Jack-bo1220--Awesome-Remote-Sensing-Foundation-Models":64},[4,17,26,40,48,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":16},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,3,"2026-04-05T11:01:52",[13,14,15],"开发框架","图像","Agent","ready",{"id":18,"name":19,"github_repo":20,"description_zh":21,"stars":22,"difficulty_score":23,"last_commit_at":24,"category_tags":25,"status":16},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107662,2,"2026-04-03T11:11:01",[13,14,15],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":23,"last_commit_at":32,"category_tags":33,"status":16},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",84991,"2026-04-05T10:45:23",[14,34,35,36,15,37,38,13,39],"数据工具","视频","插件","其他","语言模型","音频",{"id":41,"name":42,"github_repo":43,"description_zh":44,"stars":45,"difficulty_score":10,"last_commit_at":46,"category_tags":47,"status":16},3128,"ragflow","infiniflow\u002Fragflow","RAGFlow 是一款领先的开源检索增强生成（RAG）引擎，旨在为大语言模型构建更精准、可靠的上下文层。它巧妙地将前沿的 RAG 技术与智能体（Agent）能力相结合，不仅支持从各类文档中高效提取知识，还能让模型基于这些知识进行逻辑推理和任务执行。\n\n在大模型应用中，幻觉问题和知识滞后是常见痛点。RAGFlow 通过深度解析复杂文档结构（如表格、图表及混合排版），显著提升了信息检索的准确度，从而有效减少模型“胡编乱造”的现象，确保回答既有据可依又具备时效性。其内置的智能体机制更进一步，使系统不仅能回答问题，还能自主规划步骤解决复杂问题。\n\n这款工具特别适合开发者、企业技术团队以及 AI 研究人员使用。无论是希望快速搭建私有知识库问答系统，还是致力于探索大模型在垂直领域落地的创新者，都能从中受益。RAGFlow 提供了可视化的工作流编排界面和灵活的 API 接口，既降低了非算法背景用户的上手门槛，也满足了专业开发者对系统深度定制的需求。作为基于 Apache 2.0 协议开源的项目，它正成为连接通用大模型与行业专有知识之间的重要桥梁。",77062,"2026-04-04T04:44:48",[15,14,13,38,37],{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":10,"last_commit_at":54,"category_tags":55,"status":16},519,"PaddleOCR","PaddlePaddle\u002FPaddleOCR","PaddleOCR 是一款基于百度飞桨框架开发的高性能开源光学字符识别工具包。它的核心能力是将图片、PDF 等文档中的文字提取出来，转换成计算机可读取的结构化数据，让机器真正“看懂”图文内容。\n\n面对海量纸质或电子文档，PaddleOCR 解决了人工录入效率低、数字化成本高的问题。尤其在人工智能领域，它扮演着连接图像与大型语言模型（LLM）的桥梁角色，能将视觉信息直接转化为文本输入，助力智能问答、文档分析等应用场景落地。\n\nPaddleOCR 适合开发者、算法研究人员以及有文档自动化需求的普通用户。其技术优势十分明显：不仅支持全球 100 多种语言的识别，还能在 Windows、Linux、macOS 
### tesseract (tesseract-ocr/tesseract)

Stars: 73,286 · Difficulty: 2 · Last commit: 2026-04-03 · Tags: dev framework, image

Tesseract is a long-lived and widely respected open-source optical character recognition (OCR) engine, originally developed at HP Labs, later maintained by Google, and now driven by a global community. Its core function is converting the text in images into editable, searchable text, solving the problem of extracting information from scans, photos, and PDF documents; it is a foundational tool for digital archiving and information automation.

Technically, Tesseract has proven very adaptable. Since version 4 it has shipped an LSTM-based neural-network OCR engine that markedly improves line-level recognition accuracy, while the legacy character-pattern engine remains available for older use cases. Tesseract natively supports UTF-8, recognizes more than 100 languages out of the box, and accepts common image formats such as PNG, JPEG, and TIFF. On the output side it supports plain text, hOCR, PDF, TSV, and more, which simplifies downstream processing.

Tesseract is aimed mainly at developers, researchers, and enterprises building document-processing pipelines. Because it is a command-line tool and library (libtesseract) with no graphical user interface, it is best suited to technically capable users who integrate it into automation scripts or applications, as sketched below.
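A minimal sketch of driving Tesseract from Python via the pytesseract wrapper (the `tesseract` binary must be installed and on PATH); a roughly equivalent CLI invocation is shown in a comment:

```python
from PIL import Image
import pytesseract

# Plain-text extraction; "eng" can be any installed traineddata language pack.
text = pytesseract.image_to_string(Image.open("scan.png"), lang="eng")
print(text)

# Roughly equivalent CLI, producing a searchable PDF instead of plain text:
#   tesseract scan.png out -l eng pdf
```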
## Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models

Awesome-Remote-Sensing-Foundation-Models is an open-source resource collection for the remote-sensing field that systematically organizes cutting-edge work related to foundation models. Remote sensing struggles with enormous data volumes, high annotation costs, and the limited generalization of conventional models; this collection addresses those pain points by gathering a large body of academic papers, pretrained weights, code, datasets, and evaluation benchmarks, helping practitioners quickly locate and reuse existing state-of-the-art approaches.

The collection is aimed at remote-sensing researchers, algorithm engineers, and developers. Whether you want to explore new self-supervised learning paradigms or need a high-quality initialization model for a specific task, you will find useful material here. Its core strengths are thorough categorization and timely updates: it covers the mainstream remote-sensing vision foundation models (such as SatMAE and SeCo) and also tracks frontier multimodal directions, including vision-language, generative, vision-location, and agent models. By offering a one-stop path from papers to runnable code, Awesome-Remote-Sensing-Foundation-Models lowers the technical barrier and speeds the journey of remote-sensing foundation models from research to real-world use.
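Most entries in the list below are used the same way in practice: download the released pretrained weights, load them into the matching backbone, and fine-tune on a downstream task. A generic sketch of that pattern in PyTorch, where the checkpoint filename, backbone choice, and class count are purely illustrative (each repository documents its own architecture and loading code):

```python
import torch
import timm  # generic ViT backbones; many repos ship their own model definitions

# Hypothetical checkpoint downloaded from one of the "Code & Weights" links below.
CKPT = "rsfm_vit_base_pretrained.pth"

# Fresh classification head for a downstream task, e.g. 10 scene classes.
model = timm.create_model("vit_base_patch16_224", num_classes=10)

state = torch.load(CKPT, map_location="cpu")
state = state.get("model", state)  # releases often nest weights under a "model" key
missing, unexpected = model.load_state_dict(state, strict=False)
print(f"missing: {len(missing)}, unexpected: {len(unexpected)}")

# From here, standard fine-tuning (dataloader omitted), typically with a small
# learning rate on the pretrained backbone.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
```

The repository's README follows.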
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models/graphs/commit-activity)
[![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)](https://github.com/Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models)
<img alt="GitHub watchers" src="https://img.shields.io/github/watchers/Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models?style=social"> <img alt="GitHub stars" src="https://img.shields.io/github/stars/Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models?style=social"> <img alt="GitHub forks" src="https://img.shields.io/github/forks/Jack-bo1220/Awesome-Remote-Sensing-Foundation-Models?style=social">

# <p align=center>`Awesome Remote Sensing Foundation Models`</p>

:star2:**A collection of papers, datasets, benchmarks, code, and pre-trained weights for Remote Sensing Foundation Models (RSFMs).**

## 📢 Latest Updates
:fire::fire::fire: Last Updated on 2026.03.13 :fire::fire::fire:

## Table of Contents
- **Models**
  - [Remote Sensing Vision Foundation Models](#remote-sensing-vision-foundation-models)
  - [Remote Sensing Vision-Language Foundation Models](#remote-sensing-vision-language-foundation-models)
  - [Remote Sensing Generative Foundation Models](#remote-sensing-generative-foundation-models)
  - [Remote Sensing Vision-Location Foundation Models](#remote-sensing-vision-location-foundation-models)
  - [Remote Sensing Vision-Audio Foundation Models](#remote-sensing-vision-audio-foundation-models)
  - [Remote Sensing Task-specific Foundation Models](#remote-sensing-task-specific-foundation-models)
  - [Remote Sensing Agents](#remote-sensing-agents)
- **Datasets & Benchmarks**
  - [Benchmarks for RSFMs](#benchmarks-for-rsfms)
  - [(Large-scale) Pre-training Datasets](#large-scale-pre-training-datasets)
  - [Embeddings data](#embeddings-data)
- **Others**
  - [Relevant Projects](#relevant-projects)
  - [Survey Papers](#survey-papers)

## Remote Sensing <ins>Vision</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**GeoKR**|**Geographical Knowledge-Driven Representation Learning for Remote Sensing Images**|TGRS2021|[GeoKR](https://ieeexplore.ieee.org/abstract/document/9559903)|[link](https://github.com/flyakon/Geographical-Knowledge-driven-Representaion-Learning)|
|**-**|**Self-Supervised Learning of Remote Sensing Scene Representations Using Contrastive Multiview Coding**|CVPRW2021|[Paper](https://openaccess.thecvf.com/content/CVPR2021W/EarthVision/html/Stojnic_Self-Supervised_Learning_of_Remote_Sensing_Scene_Representations_Using_Contrastive_Multiview_CVPRW_2021_paper.html)|[link](https://github.com/vladan-stojnic/CMC-RSSR)|
|**GASSL**|**Geography-Aware Self-Supervised Learning**|ICCV2021|[GASSL](https://openaccess.thecvf.com/content/ICCV2021/html/Ayush_Geography-Aware_Self-Supervised_Learning_ICCV_2021_paper.html)|[link](https://github.com/sustainlab-group/geography-aware-ssl)|
|**SeCo**|**Seasonal Contrast: Unsupervised Pre-Training From Uncurated Remote Sensing Data**|ICCV2021|[SeCo](https://openaccess.thecvf.com/content/ICCV2021/html/Manas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.html)|[link](https://github.com/ServiceNow/seasonal-contrast)|
|**DINO-MM**|**Self-Supervised Vision Transformers for Joint SAR-Optical Representation Learning**|IGARSS2022|[DINO-MM](https://doi.org/10.1109/igarss46834.2022.9883983)|[link](https://github.com/zhu-xlab/DINO-MM)|
|**SatMAE**|**SatMAE: Pre-training Transformers for Temporal and Multi-Spectral Satellite Imagery**|NeurIPS2022|[SatMAE](https://proceedings.neurips.cc/paper_files/paper/2022/hash/01c561df365429f33fcd7a7faa44c985-Abstract-Conference.html)|[link](https://github.com/sustainlab-group/SatMAE)|
|**RS-BYOL**|**Self-Supervised Learning for Invariant Representations From Multi-Spectral and SAR Images**|JSTARS2022|[RS-BYOL](https://ieeexplore.ieee.org/abstract/document/9880533)|null|
|**GeCo**|**Geographical Supervision Correction for Remote Sensing Representation Learning**|TGRS2022|[GeCo](https://ieeexplore.ieee.org/abstract/document/9869651)|null|
|**RingMo**|**RingMo: A remote sensing foundation model with masked image modeling**|TGRS2022|[RingMo](https://ieeexplore.ieee.org/abstract/document/9844015)|[Code](https://github.com/comeony/RingMo)|
|**RVSA**|**Advancing plain vision transformer toward remote sensing foundation model**|TGRS2022|[RVSA](https://ieeexplore.ieee.org/abstract/document/9956816)|[link](https://github.com/ViTAE-Transformer/Remote-Sensing-RVSA)|
|**RSP**|**An Empirical Study of Remote Sensing Pretraining**|TGRS2022|[RSP](https://ieeexplore.ieee.org/abstract/document/9782149)|[link](https://github.com/ViTAE-Transformer/Remote-Sensing-RVSA)|
|**MATTER**|**Self-Supervised Material and Texture Representation Learning for Remote Sensing Tasks**|CVPR2022|[MATTER](https://openaccess.thecvf.com/content/CVPR2022/html/Akiva_Self-Supervised_Material_and_Texture_Representation_Learning_for_Remote_Sensing_Tasks_CVPR_2022_paper.html)|null|
|**CSPT**|**Consecutive Pre-Training: A Knowledge Transfer Learning Strategy with Relevant Unlabeled Data for Remote Sensing Domain**|RS2022|[CSPT](https://www.mdpi.com/2072-4292/14/22/5675#)|[link](https://github.com/ZhAnGToNG1/transfer_learning_cspt)|
|**-**|**Self-supervised Vision Transformers for Land-cover Segmentation and Classification**|CVPRW2022|[Paper](https://openaccess.thecvf.com/content/CVPR2022W/EarthVision/html/Scheibenreif_Self-Supervised_Vision_Transformers_for_Land-Cover_Segmentation_and_Classification_CVPRW_2022_paper.html)|[link](https://github.com/HSG-AIML/SSLTransformerRS)|
|**TOV**|**TOV: The original vision model for optical remote sensing image understanding via self-supervised learning**|JSTARS2023|[TOV](https://ieeexplore.ieee.org/abstract/document/10110958)|[link](https://github.com/GeoX-Lab/G-RSIM/tree/main/TOV_v1)|
|**CMID**|**CMID: A Unified Self-Supervised Learning Framework for Remote Sensing Image Understanding**|TGRS2023|[CMID](https://ieeexplore.ieee.org/abstract/document/10105625)|[link](https://github.com/NJU-LHRS/official-CMID)|
|**RingMo-Sense**|**RingMo-Sense: Remote Sensing Foundation Model for Spatiotemporal Prediction via Spatiotemporal Evolution Disentangling**|TGRS2023|[RingMo-Sense](https://ieeexplore.ieee.org/abstract/document/10254320)|null|
|**AST**|**AST: Adaptive Self-supervised Transformer for Optical Remote Sensing Representation**|ISPRS JPRS2023|[AST](https://doi.org/10.1016/j.isprsjprs.2023.04.003)|null|
|**IaI-SimCLR**|**Multi-Modal Multi-Objective Contrastive Learning for Sentinel-1/2 Imagery**|CVPRW2023|[IaI-SimCLR](https://openaccess.thecvf.com/content/CVPR2023W/EarthVision/html/Prexl_Multi-Modal_Multi-Objective_Contrastive_Learning_for_Sentinel-12_Imagery_CVPRW_2023_paper.html)|null|
|**CACo**|**Change-Aware Sampling and Contrastive Learning for Satellite Images**|CVPR2023|[CACo](https://openaccess.thecvf.com/content/CVPR2023/html/Mall_Change-Aware_Sampling_and_Contrastive_Learning_for_Satellite_Images_CVPR_2023_paper.html)|[link](https://github.com/utkarshmall13/CACo)|
|**SatLas**|**SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image Understanding**|ICCV2023|[SatLas](https://doi.org/10.1109/iccv51070.2023.01538)|[link](https://github.com/allenai/satlas)|
|**GFM**|**Towards Geospatial Foundation Models via Continual Pretraining**|ICCV2023|[GFM](https://doi.org/10.1109/iccv51070.2023.01541)|[link](https://github.com/mmendiet/GFM)|
|**Scale-MAE**|**Scale-MAE: A Scale-Aware Masked Autoencoder for Multiscale Geospatial Representation Learning**|ICCV2023|[Scale-MAE](https://doi.org/10.1109/iccv51070.2023.00378)|[link](https://github.com/bair-climate-initiative/scale-mae)|
|**DINO-MC**|**DINO-MC: Self-supervised Contrastive Learning for Remote Sensing Imagery with Multi-sized Local Crops**|Arxiv2023|[DINO-MC](https://arxiv.org/abs/2303.06670)|[link](https://github.com/WennyXY/DINO-MC)|
|**CROMA**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|NeurIPS2023|[CROMA](https://arxiv.org/pdf/2311.00566.pdf)|[link](https://github.com/antofuller/CROMA)|
|**Cross-Scale MAE**|**Cross-Scale MAE: A Tale of Multiscale Exploitation in Remote Sensing**|NeurIPS2023|[Cross-Scale MAE](https://openreview.net/pdf?id=5oEVdOd6TV)|[link](https://github.com/aicip/Cross-Scale-MAE)|
|**Presto**|**Lightweight, Pre-trained Transformers for Remote Sensing Timeseries**|Arxiv2023|[Presto](https://arxiv.org/abs/2304.14065)|[link](https://github.com/nasaharvest/presto)|
|**Prithvi**|**Foundation Models for Generalist Geospatial Artificial Intelligence**|Arxiv2023|[Prithvi](https://arxiv.org/abs/2310.18660)|[link](https://huggingface.co/ibm-nasa-geospatial)|
|**-**|**A Self-Supervised Cross-Modal Remote Sensing Foundation Model with Multi-Domain Representation and Cross-Domain Fusion**|IGARSS2023|[Paper](https://ieeexplore.ieee.org/abstract/document/10282433)|null|
|**EarthPT**|**EarthPT: a time series foundation model for Earth Observation**|NeurIPS2023 CCAI workshop|[EarthPT](https://arxiv.org/abs/2309.07207)|[link](https://github.com/aspiaspace/EarthPT)|
|**USat**|**USat: A Unified Self-Supervised Encoder for Multi-Sensor Satellite Imagery**|Arxiv2023|[USat](https://arxiv.org/abs/2312.02199)|[link](https://github.com/stanfordmlgroup/USat)|
|**AIEarth**|**Analytical Insight of Earth: A Cloud-Platform of Intelligent Computing for Geospatial Big Data**|Arxiv2023|[AIEarth](https://arxiv.org/abs/2312.16385)|[link](https://engine-aiearth.aliyun.com/#/)|
|**Clay**|**Clay Foundation Model**|-|null|[link](https://clay-foundation.github.io/model/)|
|**Hydro**|**Hydro--A Foundation Model for Water in Satellite Imagery**|-|null|[link](https://github.com/isaaccorley/hydro-foundation-model)|
|**BFM**|**A Billion-scale Foundation Model for Remote Sensing Images**|IEEE JSTARS2024|[BFM](https://doi.org/10.1109/jstars.2024.3401772)|null|
|**U-BARN**|**Self-Supervised Spatio-Temporal Representation Learning of Satellite Image Time Series**|JSTARS2024|[Paper](https://ieeexplore.ieee.org/document/10414422)|[link](https://src.koda.cnrs.fr/iris.dumeur/ssl_ubarn)|
|**GeRSP**|**Generic Knowledge Boosted Pretraining for Remote Sensing Images**|TGRS2024|[GeRSP](https://doi.org/10.1109/tgrs.2024.3354031)|[GeRSP](https://github.com/floatingstarZ/GeRSP)|
|**SwiMDiff**|**SwiMDiff: Scene-Wide Matching Contrastive Learning With Diffusion Constraint for Remote Sensing Image**|TGRS2024|[SwiMDiff](https://doi.org/10.1109/tgrs.2024.3371481)|null|
|**OFA-Net**|**One for All: Toward Unified Foundation Models for Earth Vision**|IGARSS2024|[OFA-Net](https://doi.org/10.1109/igarss53475.2024.10641637)|null|
|**-**|**Lightweight and Efficient: A Family of Multimodal Earth Observation Foundation Models**|IGARSS2024|[Paper](https://doi.org/10.1109/igarss53475.2024.10641132)|null|
|**SMLFR**|**Generative ConvNet Foundation Model With Sparse Modeling and Low-Frequency Reconstruction for Remote Sensing Image Interpretation**|TGRS2024|[SMLFR](https://ieeexplore.ieee.org/abstract/document/10378718)|[link](https://github.com/HIT-SIRS/SMLFR)|
|**SpectralGPT**|**SpectralGPT: Spectral Remote Sensing Foundation Model**|TPAMI2024|[SpectralGPT](https://doi.org/10.1109/tpami.2024.3362475)|[link](https://github.com/danfenghong/IEEE_TPAMI_SpectralGPT)|
|**S2MAE**|**S2MAE: A Spatial-Spectral Pretraining Foundation Model for Spectral Remote Sensing Data**|CVPR2024|[S2MAE](https://openaccess.thecvf.com/content/CVPR2024/papers/Li_S2MAE_A_Spatial-Spectral_Pretraining_Foundation_Model_for_Spectral_Remote_Sensing_CVPR_2024_paper.pdf)|null|
|**SatMAE++**|**Rethinking Transformers Pre-training for Multi-Spectral Satellite Imagery**|CVPR2024|[SatMAE++](https://doi.org/10.1109/cvpr52733.2024.02627)|[link](https://github.com/techmn/satmae_pp)|
|**msGFM**|**Bridging Remote Sensors with Multisensor Geospatial Foundation Models**|CVPR2024|[msGFM](https://doi.org/10.1109/cvpr52733.2024.02631)|[link](https://github.com/boranhan/Geospatial_Foundation_Models)|
|**SkySense**|**SkySense: A Multi-Modal Remote Sensing Foundation Model Towards Universal Interpretation for Earth Observation Imagery**|CVPR2024|[SkySense](https://openaccess.thecvf.com/content/CVPR2024/html/Guo_SkySense_A_Multi-Modal_Remote_Sensing_Foundation_Model_Towards_Universal_Interpretation_CVPR_2024_paper.html)|[link](https://github.com/Jack-bo1220/SkySense)|
|**MTP**|**MTP: Advancing Remote Sensing Foundation Model via Multi-Task Pretraining**|IEEE JSTARS2024|[MTP](https://doi.org/10.1109/jstars.2024.3408154)|[link](https://github.com/ViTAE-Transformer/MTP)|
|**DOFA**|**Neural Plasticity-Inspired Multimodal Foundation Model for Earth Observation**|Arxiv2024|[DOFA](https://arxiv.org/abs/2403.15356)|[link](https://github.com/zhu-xlab/DOFA)|
|**DeCUR**|**DeCUR: decoupling common & unique representations for multimodal self-supervision**|ECCV2024|[DeCUR](https://doi.org/10.1007/978-3-031-73397-0_17)|[link](https://github.com/zhu-xlab/DeCUR)|
|**MMEarth**|**MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning**|ECCV2024|[MMEarth](https://doi.org/10.1007/978-3-031-73039-9_10)|[link](https://vishalned.github.io/mmearth/)|
|**LeMeViT**|**LeMeViT: Efficient Vision Transformer with Learnable Meta Tokens for Remote Sensing Image Interpretation**|IJCAI2024|[LeMeViT](https://arxiv.org/abs/2405.09789)|[link](https://github.com/ViTAE-Transformer/LeMeViT/tree/main?tab=readme-ov-file)|
|**SoftCon**|**Multi-Label Guided Soft Contrastive Learning for Efficient Earth Observation Pretraining**|TGRS2024|[SoftCon](https://ieeexplore.ieee.org/abstract/document/10726860)|[link](https://github.com/zhu-xlab/softcon?tab=readme-ov-file)|
|**RS-DFM**|**RS-DFM: A Remote Sensing Distributed Foundation Model for Diverse Downstream Tasks**|Arxiv2024|[RS-DFM](https://arxiv.org/abs/2406.07032)|null|
|**A2-MAE**|**A2-MAE: A spatial-temporal-spectral unified remote sensing pre-training method based on anchor-aware masked autoencoder**|Arxiv2024|[A2-MAE](https://arxiv.org/abs/2406.08079)|null|
|**OmniSat**|**OmniSat: Self-Supervised Modality Fusion for Earth Observation**|ECCV2024|[OmniSat](https://doi.org/10.1007/978-3-031-73390-1_24)|[link](https://github.com/gastruc/OmniSat?tab=readme-ov-file)|
|**MM-VSF**|**Towards Knowledge Guided Pretraining Approaches for Multimodal Foundation Models: Applications in Remote Sensing**|Arxiv2024|[MM-VSF](https://arxiv.org/abs/2407.19660)|null|
|**MA3E**|**Masked Angle-Aware Autoencoder for Remote Sensing Images**|ECCV2024|[MA3E](https://doi.org/10.1007/978-3-031-73242-3_15)|[link](https://github.com/benesakitam/MA3E)|
|**SAR-JEPA**|**Predicting Gradient is Better: Exploring Self-Supervised Learning for SAR ATR with a Joint-Embedding Predictive Architecture**|ISPRS JPRS2024|[SAR-JEPA](https://www.sciencedirect.com/science/article/pii/S0924271624003514)|[link](https://github.com/waterdisappear/SAR-JEPA)|
|**PIS**|**Pretrain a Remote Sensing Foundation Model by Promoting Intra-instance Similarity**|TGRS2024|[PIS](https://ieeexplore.ieee.org/abstract/document/10697182)|[link](https://github.com/ShawnAn-WHU/PIS)|
|**FG-MAE**|**Feature Guided Masked Autoencoder for Self-Supervised Learning in Remote Sensing**|IEEE JSTARS2024|[FG-MAE](https://doi.org/10.1109/jstars.2024.3493237)|[link](https://github.com/zhu-xlab/FGMAE)|
|**RingMo-lite**|**RingMo-Lite: A Remote Sensing Lightweight Network With CNN-Transformer Hybrid Framework**|IEEE TGRS2024|[RingMo-lite](https://doi.org/10.1109/tgrs.2024.3360447)|null|
|**-**|**A Multimodal Unified Representation Learning Framework With Masked Image Modeling for Remote Sensing Images**|IEEE TGRS2024|[Paper](https://doi.org/10.1109/tgrs.2024.3494244)|null|
|**-**|**Masked Feature Modeling for Generative Self-Supervised Representation Learning of High-Resolution Remote Sensing Images**|IEEE JSTARS2024|[Paper](https://doi.org/10.1109/jstars.2024.3385420)|null|
|**OReole-FM**|**OReole-FM: successes and challenges toward billion-parameter foundation models for high-resolution satellite imagery**|SIGSPATIAL2024|[OReole-FM](https://doi.org/10.1145/3678717.3691292)|null|
|**SatVision-TOA**|**SatVision-TOA: A Geospatial Foundation Model for Coarse-Resolution All-Sky Remote Sensing Imagery**|Arxiv2024|[SatVision-TOA](https://arxiv.org/abs/2411.17000)|[link](https://github.com/nasa-nccs-hpda/pytorch-caney)|
|**Prithvi-EO-2.0**|**Prithvi-EO-2.0: A Versatile Multi-Temporal Foundation Model for Earth Observation Applications**|Arxiv2024|[Prithvi-EO-2.0](https://arxiv.org/abs/2412.02732)|[link](https://github.com/NASA-IMPACT/Prithvi-EO-2.0)|
|**WildSAT**|**WildSAT: Learning Satellite Image Representations from Wildlife Observations**|Arxiv2024|[WildSAT](https://arxiv.org/abs/2412.14428)|[link](https://github.com/mdchuc/HRSFM)|
|**SpectralEarth**|**SpectralEarth: Training Hyperspectral Foundation Models at Scale**|IEEE JSTARS2025|[SpectralEarth](https://doi.org/10.1109/jstars.2025.3581451)|null|
|**SenPa-MAE**|**SenPa-MAE: Sensor Parameter Aware Masked Autoencoder for Multi-Satellite Self-Supervised Pretraining**|LNCS2025|[SenPa-MAE](https://doi.org/10.1007/978-3-031-85187-2_20)|[link](https://github.com/JonathanPrexl/SenPa-MAE)|
|**RingMo-Aerial**|**RingMo-Aerial: An Aerial Remote Sensing Foundation Model With Affine Transformation Contrastive Learning**|IEEE TPAMI2025|[RingMo-Aerial](https://doi.org/10.1109/tpami.2025.3602237)|null|
|**PIEViT**|**Pattern Integration and Enhancement Vision Transformer for Self-Supervised Learning in Remote Sensing**|IEEE TGRS2025|[PIEViT](https://doi.org/10.1109/tgrs.2025.3541390)|null|
|**SeaMo**|**SeaMo: A Multi-Seasonal and Multimodal Remote Sensing Foundation Model**|Information Fusion2025|[SeaMo](https://www.sciencedirect.com/science/article/pii/S1566253525004075)|null|
|**HyperSIGMA**|**HyperSIGMA: Hyperspectral Intelligence Comprehension Foundation Model**|IEEE TPAMI2025|[HyperSIGMA](https://arxiv.org/abs/2406.11519)|[link](https://github.com/WHU-Sigma/HyperSIGMA?tab=readme-ov-file)|
|**FoMo**|**FoMo: Multi-Modal, Multi-Scale and Multi-Task Remote Sensing Foundation Models for Forest Monitoring**|AAAI2025|[FoMo](https://doi.org/10.1609/aaai.v39i27.35002)|[link](https://github.com/RolnickLab/FoMo-Bench)|
|**RingMamba**|**RingMamba: Remote Sensing Multisensor Pretraining With Visual State Space Model**|IEEE TGRS2025|[RingMamba](https://doi.org/10.1109/tgrs.2025.3603998)|null|
|**CrossEarth**|**CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable Remote Sensing Semantic Segmentation**|IEEE TPAMI2025|[CrossEarth](https://doi.org/10.1109/tpami.2025.3649001)|[link](https://github.com/Cuzyoung/CrossEarth)|
|**CtxMIM**|**CtxMIM: Context-Enhanced Masked Image Modeling for Remote Sensing Image Understanding**|ACM TOMM2025|[CtxMIM](https://doi.org/10.1145/3769084)|null|
|**SatMamba**|**SatMamba: Development of Foundation Models for Remote Sensing Imagery Using State Space Models**|Arxiv2025|[SatMamba](https://arxiv.org/abs/2502.00435)|[link](https://github.com/mdchuc/HRSFM)|
|**Galileo**|**Galileo: Learning Global & Local Features of Many Remote Sensing Modalities**|ICML2025 TerraBytes Workshop|[Galileo](https://arxiv.org/abs/2502.09356)|[link](https://github.com/nasaharvest/galileo)|
|**SatDiFuser**|**Can Generative Geospatial Diffusion Models Excel as Discriminative Geospatial Foundation Models?**|Arxiv2025|[SatDiFuser](https://arxiv.org/abs/2503.07890)|null|
|**RoMA**|**RoMA: Scaling up Mamba-based Foundation Models for Remote Sensing**|Arxiv2025|[RoMA](https://arxiv.org/abs/2503.10392)|[link](https://github.com/MiliLab/RoMA)|
|**Panopticon**|**Panopticon: Advancing Any-Sensor Foundation Models for Earth Observation**|CVPR2025|[Panopticon](https://arxiv.org/abs/2503.10845)|[link](https://github.com/Panopticon-FM/panopticon)|
|**HyperFree**|**HyperFree: A Channel-adaptive and Tuning-free Foundation Model for Hyperspectral Remote Sensing Imagery**|CVPR2025|[HyperFree](https://rsidea.whu.edu.cn/HyperFree.pdf)|[link](https://github.com/Jingtao-Li-CVer/HyperFree)|
|**AnySat**|**AnySat: One Earth Observation Model for Many Resolutions, Scales, and Modalities**|CVPR2025|[AnySat](https://arxiv.org/abs/2412.14123)|[link](https://github.com/gastruc/AnySat)|
|**HyperSL**|**HyperSL: A Spectral Foundation Model for Hyperspectral Image Interpretation**|IEEE TGRS2025|[HyperSL](https://ieeexplore.ieee.org/abstract/document/10981753)|[link](https://github.com/kkweil/HyperSL)|
|**DynamicVis**|**DynamicVis: An Efficient and General Visual Foundation Model for Remote Sensing Image Understanding**|Arxiv2025|[DynamicVis](https://arxiv.org/abs/2503.16426)|[link](https://github.com/KyanChen/DynamicVis)|
|**DeepAndes**|**DeepAndes: A Self-Supervised Vision Foundation Model for Multispectral Remote Sensing Imagery of the Andes**|IEEE JSTARS2025|[DeepAndes](https://doi.org/10.1109/jstars.2025.3619423)|null|
|**TiMo**|**TiMo: Spatiotemporal Foundation Model for Satellite Image Time Series**|Arxiv2025|[TiMo](https://arxiv.org/abs/2505.08723)|[link](https://github.com/MiliLab/TiMo)|
|**TerraFM**|**TerraFM: A Scalable Foundation Model for Unified Multisensor Earth Observation**|Arxiv2025|[TerraFM](https://arxiv.org/abs/2506.06281)|[link](https://github.com/mbzuai-oryx/TerraFM)|
|**TESSERA**|**TESSERA: Temporal Embeddings of Surface Spectra for Earth Representation and Analysis**|Arxiv2025|[TESSERA](https://arxiv.org/abs/2506.20380)|[link](https://github.com/ucam-eo/tessera)|
|**CGEarthEye**|**CGEarthEye: A High-Resolution Remote Sensing Vision Foundation Model Based on the Jilin-1 Satellite Constellation**|Arxiv2025|[CGEarthEye](https://arxiv.org/abs/2507.00356)|null|
|**MoSAiC**|**MoSAiC: Multi-Modal Multi-Label Supervision-Aware Contrastive Learning for Remote Sensing**|Arxiv2025|[MoSAiC](https://arxiv.org/abs/2507.08683)|null|
|**AlphaEarth**|**AlphaEarth Foundations: An embedding field model for accurate and efficient global mapping from sparse label data**|Arxiv2025|[AlphaEarth](https://arxiv.org/abs/2507.22291)|null|
|**MAESTRO**|**MAESTRO: Masked AutoEncoders for Multimodal, Multitemporal, and Multispectral Earth Observation Data**|Arxiv2025|[MAESTRO](https://arxiv.org/abs/2508.10894)|[link](https://github.com/IGNF/MAESTRO)|
|**FedSense**|**Towards Privacy-preserved Pre-training of Remote Sensing Foundation Models with Federated Mutual-guidance Learning**|ICCV2025|[FedSense](https://arxiv.org/abs/2503.11051)|null|
|**RS-vHeat**|**RS-vHeat: Heat Conduction Guided Efficient Remote Sensing Foundation Model**|ICCV2025|[RS-vHeat](https://arxiv.org/abs/2411.17984)|null|
|**Copernicus-FM**|**Towards a Unified Copernicus Foundation Model for Earth Vision**|ICCV2025|[Copernicus-FM](https://arxiv.org/abs/2503.11849)|[link](https://github.com/zhu-xlab/Copernicus-FM)|
|**SelectiveMAE**|**Scaling Efficient Masked Autoencoder Learning on Large Remote Sensing Dataset**|ICCV2025|[SelectiveMAE](https://arxiv.org/abs/2406.11933)|[link](https://github.com/Fengxiang23/SelectiveMAE)|
|**SMARTIES**|**SMARTIES: Spectrum-Aware Multi-Sensor Auto-Encoder for Remote Sensing Images**|ICCV2025|[SMARTIES](https://arxiv.org/abs/2506.19585)|[link](https://gsumbul.github.io/SMARTIES/)|
|**TerraMind**|**TerraMind: Large-Scale Generative Multimodality for Earth Observation**|ICCV2025|[TerraMind](https://arxiv.org/abs/2504.11171)|[link](https://github.com/IBM/terramind)|
|**SkySense V2**|**SkySense V2: A Unified Foundation Model for Multi-modal Remote Sensing**|ICCV2025|[SkySense V2](https://arxiv.org/abs/2507.13812)|null|
|**SkySense++**|**A semantic-enhanced multi-modal remote sensing foundation model for Earth observation**|Nature Machine Intelligence 2025|[SkySense++](https://www.nature.com/articles/s42256-025-01078-8)|[link](https://github.com/kang-wu/SkySensePlusPlus?tab=readme-ov-file)|
|**FlexiMo**|**FlexiMo: A Flexible Remote Sensing Foundation Model**|IEEE TGRS2026|[FlexiMo](https://doi.org/10.1109/tgrs.2026.3656362)|null|
|**RingMoE**|**RingMoE: Mixture-of-Modality-Experts Multi-Modal Foundation Models for Universal Remote Sensing Image Interpretation**|IEEE TPAMI2026|[RingMoE](https://doi.org/10.1109/tpami.2025.3643453)|null|
|**-**|**A Complex-Valued SAR Foundation Model Based on Physically Inspired Representation Learning**|IEEE TIP2026|[Paper](https://doi.org/10.1109/tip.2026.3652417)|null|
|**MAPEX**|**MAPEX: Modality-Aware Pruning of Experts for Remote Sensing Foundation Models**|IEEE TGRS2026|[MAPEX](https://doi.org/10.1109/tgrs.2026.3652100)|[link](https://github.com/HSG-AIML/MAPEX)|
|**Alliance**|**Alliance: All-in-One Spectral-Spatial-Frequency Awareness Foundation Model**|IEEE TPAMI2026|[Alliance](https://doi.org/10.1109/tpami.2025.3639595)|null|
## Remote Sensing <ins>Vision-Language</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**RSGPT**|**RSGPT: A Remote Sensing Vision Language Model and Benchmark**|Arxiv2023|[RSGPT](https://arxiv.org/abs/2307.15266)|[link](https://github.com/Lavender105/RSGPT)|
|**RemoteCLIP**|**RemoteCLIP: A Vision Language Foundation Model for Remote Sensing**|IEEE TGRS2024|[RemoteCLIP](https://arxiv.org/abs/2306.11029)|[link](https://github.com/ChenDelong1999/RemoteCLIP)|
|**GeoRSCLIP**|**RS5M: A Large Scale Vision-Language Dataset for Remote Sensing Vision-Language Foundation Model**|IEEE TGRS2024|[GeoRSCLIP](https://arxiv.org/abs/2306.11300)|[link](https://github.com/om-ai-lab/RS5M?tab=readme-ov-file)|
|**GRAFT**|**Remote Sensing Vision-Language Foundation Models without Annotations via Ground Remote Alignment**|ICLR2024|[GRAFT](https://openreview.net/pdf?id=w9tc699w3Z)|null|
|**-**|**Charting New Territories: Exploring the Geographic and Geospatial Capabilities of Multimodal LLMs**|Arxiv2023|[Paper](https://arxiv.org/abs/2311.14656)|[link](https://github.com/jonathan-roberts1/charting-new-territories)|
|**-**|**Remote Sensing ChatGPT: Solving Remote Sensing Tasks with ChatGPT and Visual Models**|Arxiv2024|[Paper](https://arxiv.org/abs/2401.09083)|[link](https://github.com/HaonanGuo/Remote-Sensing-ChatGPT)|
|**EarthGPT**|**EarthGPT: A Universal Multimodal Large Language Model for Multisensor Image Comprehension in Remote Sensing Domain**|IEEE TGRS2024|[EarthGPT](https://doi.org/10.1109/tgrs.2024.3409624)|null|
|**SkyCLIP**|**SkyScript: A Large and Semantically Diverse Vision-Language Dataset for Remote Sensing**|AAAI2024|[SkyCLIP](https://arxiv.org/abs/2312.12856)|[link](https://github.com/wangzhecheng/SkyScript)|
|**GeoChat**|**GeoChat: Grounded Large Vision-Language Model for Remote Sensing**|CVPR2024|[GeoChat](https://arxiv.org/abs/2311.15826)|[link](https://github.com/mbzuai-oryx/GeoChat)|
|**LHRS-Bot**|**LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal Language Model**|ECCV2024|[LHRS-Bot](https://arxiv.org/abs/2402.02544)|[link](https://github.com/NJU-LHRS/LHRS-Bot)|
|**RS-LLaVA**|**RS-LLaVA: Large Vision Language Model for Joint Captioning and Question Answering in Remote Sensing Imagery**|RS2024|[RS-LLaVA](https://www.mdpi.com/2072-4292/16/9/1477)|[link](https://github.com/BigData-KSU/RS-LLaVA?tab=readme-ov-file)|
|**SkySenseGPT**|**SkySenseGPT: A Fine-Grained Instruction Tuning Dataset and Model for Remote Sensing Vision-Language Understanding**|Arxiv2024|[SkySenseGPT](https://arxiv.org/abs/2406.10100)|[link](https://github.com/Luo-Z13/SkySenseGPT)|
|**EarthMarker**|**EarthMarker: Visual Prompt Learning for Region-level and Point-level Remote Sensing Imagery Comprehension**|IEEE TGRS2024|[EarthMarker](https://arxiv.org/abs/2407.13596)|[link](https://github.com/wivizhang/EarthMarker)|
|**GeoText**|**Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with Spatial Relation Matching**|ECCV2024|[GeoText](https://arxiv.org/abs/2311.12751)|[link](https://multimodalgeo.github.io/GeoText/)|
|**Aquila**|**Aquila: A Hierarchically Aligned Visual-Language Model for Enhanced Remote Sensing Image Comprehension**|Arxiv2024|[Aquila](https://arxiv.org/pdf/2411.06074)|null|
|**LHRS-Bot-Nova**|**LHRS-Bot-Nova: Improved multimodal large language model for remote sensing vision-language interpretation**|ISPRS JPRS2025|[LHRS-Bot-Nova](https://doi.org/10.1016/j.isprsjprs.2025.06.003)|[link](https://github.com/NJU-LHRS/LHRS-Bot)|
|**RSCLIP**|**Pushing the Limits of Vision-Language Models in Remote Sensing without Human Annotations**|Arxiv2024|[RSCLIP](https://arxiv.org/pdf/2409.07048)|null|
|**GeoGround**|**GeoGround: A Unified Large Vision-Language Model for Remote Sensing Visual Grounding**|Arxiv2024|[GeoGround](https://arxiv.org/abs/2411.11904)|[link](https://github.com/zytx121/GeoGround)|
|**RingMoGPT**|**RingMoGPT: A Unified Remote Sensing Foundation Model for Vision, Language, and Grounded Tasks**|TGRS2024|[RingMoGPT](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10777289)|null|
|**RSUniVLM**|**RSUniVLM: A Unified Vision Language Model for Remote Sensing via Granularity-oriented Mixture of Experts**|Arxiv2024|[RSUniVLM](https://arxiv.org/abs/2412.05679)|[link](https://rsunivlm.github.io/)|
|**UniRS**|**UniRS: Unifying Multi-temporal Remote Sensing Tasks through Vision Language Models**|Arxiv2024|[UniRS](https://arxiv.org/abs/2412.20742v1)|null|
|**REO-VLM**|**REO-VLM: Transforming VLM to Meet Regression Challenges in Earth Observation**|Arxiv2024|[REO-VLM](https://arxiv.org/abs/2412.16583)|null|
|**SkyEyeGPT**|**SkyEyeGPT: Unifying Remote Sensing Vision-Language Tasks via Instruction Tuning with Large Language Model**|ISPRS JPRS2025|[SkyEyeGPT](https://doi.org/10.1016/j.isprsjprs.2025.01.020)|[link](https://github.com/ZhanYang-nwpu/SkyEyeGPT)|
|**VHM**|**VHM: Versatile and Honest Vision Language Model for Remote Sensing Image Analysis**|AAAI2025|[VHM](https://arxiv.org/abs/2403.20213v4)|[link](https://github.com/opendatalab/VHM)|
|**TEOChat**|**TEOChat: Large Language and Vision Assistant for Temporal Earth Observation Data**|ICLR2025|[TEOChat](https://arxiv.org/abs/2410.06234)|[link](https://github.com/ermongroup/TEOChat)|
|**EarthDial**|**EarthDial: Turning Multi-sensory Earth Observations to Interactive Dialogues**|CVPR2025|[EarthDial](https://arxiv.org/pdf/2412.15190#page=3.84)|[link](https://github.com/hiyamdebary/EarthDial)|
|**SkySense-O**|**SkySense-O: Towards Open-World Remote Sensing Interpretation with Vision-Centric Visual-Language Modeling**|CVPR2025|[SkySense-O](https://openaccess.thecvf.com/content/CVPR2025/papers/Zhu_SkySense-O_Towards_Open-World_Remote_Sensing_Interpretation_with_Vision-Centric_Visual-Language_Modeling_CVPR_2025_paper.pdf)|[link](https://github.com/zqcrafts/SkySense-O)|
|**XLRS-Bench**|**XLRS-Bench: Could Your Multimodal LLMs Understand Extremely Large Ultra-High-Resolution Remote Sensing Imagery?**|CVPR2025|[XLRS-Bench](https://arxiv.org/abs/2503.23771)|[link](https://xlrs-bench.github.io/)|
|**GeoPix**|**GeoPix: Multi-Modal Large Language Model for Pixel-level Image Understanding in Remote Sensing**|IEEE GRSM2025|[GeoPix](https://arxiv.org/abs/2501.06828)|[link](https://github.com/Norman-Ou/GeoPix)|
|**GeoPixel**|**GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing**|ICML2025|[GeoPixel](https://arxiv.org/abs/2501.13925)|[link](https://github.com/mbzuai-oryx/GeoPixel)|
|**Co-LLaVA**|**Co-LLaVA: Efficient Remote Sensing Visual Question Answering via Model Collaboration**|RS2025|[Co-LLaVA](https://doi.org/10.3390/rs17030466)|null|
|**GeoMag**|**GeoMag: A Vision-Language Model for Pixel-level Fine-Grained Remote Sensing Image Parsing**|ACMMM2025|[GeoMag](https://doi.org/10.1145/3746027.3754559)|null|
|**-**|**Quality-Driven Curation of Remote Sensing Vision-Language Data via Learned Scoring Models**|Arxiv2025|[Paper](https://arxiv.org/abs/2503.00743)|null|
|**DOFA-CLIP**|**DOFA-CLIP: Multimodal Vision–Language Foundation Models for Earth Observation**|Arxiv2025|[DOFA-CLIP](https://arxiv.org/abs/2503.06312)|[link](https://github.com/xiong-zhitong/GeoLB-SigLIP)|
|**DGTRS-CLIP**|**DGTRSD & DGTRS-CLIP: A Dual-Granularity Remote Sensing Image-Text Dataset and Vision Language Foundation Model for Alignment**|Arxiv2025|[DGTRS-CLIP](https://arxiv.org/abs/2503.19311)|[link](https://github.com/MitsuiChen14/DGTRS)|
|**Falcon**|**Falcon: A Remote Sensing Vision-Language Foundation Model**|Arxiv2025|[Falcon](https://arxiv.org/abs/2503.11070)|[link](https://github.com/TianHuiLab/Falcon)|
|**GeoRSMLLM**|**GeoRSMLLM: A Multimodal Large Language Model for Vision-Language Tasks in Geoscience and Remote Sensing**|Arxiv2025|[GeoRSMLLM](https://arxiv.org/abs/2503.12490)|null|
|**LRS-VQA**|**When Large Vision-Language Model Meets Large Remote Sensing Imagery: Coarse-to-Fine Text-Guided Token Pruning**|ICCV2025|[LRS-VQA](https://arxiv.org/abs/2503.07588)|[link](https://github.com/VisionXLab/LRS-VQA)|
|**UrbanLLaVA**|**UrbanLLaVA: A Multi-modal Large Language Model for Urban Intelligence with Spatial Reasoning and Understanding**|ICCV2025|[UrbanLLaVA](https://arxiv.org/abs/2506.23219)|[link](https://github.com/tsinghua-fib-lab/UrbanLLaVA)|
|**OmniGeo**|**OmniGeo: Towards a Multimodal Large Language Models for Geospatial Artificial Intelligence**|Arxiv2025|[OmniGeo](https://arxiv.org/abs/2503.16326)|null|
|**EagleVision**|**EagleVision: Object-level Attribute Multimodal LLM for Remote Sensing**|Arxiv2025|[EagleVision](https://arxiv.org/abs/2503.23330)|[link](https://github.com/XiangTodayEatsWhat/EagleVision)|
|**SegEarth-R1**|**SegEarth-R1: Geospatial Pixel Reasoning via Large Language Model**|Arxiv2025|[SegEarth-R1](https://arxiv.org/abs/2504.09644)|[link](https://github.com/earth-insights/SegEarth-R1)|
|**RemoteSAM**|**RemoteSAM: Towards Segment Anything for Earth Observation**|ACMMM2025|[RemoteSAM](https://arxiv.org/abs/2505.18022)|[link](https://github.com/1e12Leon/RemoteSAM)|
|**DynamicVL**|**DynamicVL: Benchmarking Multimodal Large Language Models for Dynamic City Understanding**|Arxiv2025|[DynamicVL](https://arxiv.org/abs/2505.21076)|null|
|**LISAt**|**LISAt: Language-Instructed Segmentation Assistant for Satellite Imagery**|Arxiv2025|[LISAt](https://arxiv.org/abs/2505.02829)|[link](https://lisat-bair.github.io/LISAt/)|
|**EarthMind**|**EarthMind: Towards Multi-Granular and Multi-Sensor Earth Observation with Large Multimodal Models**|Arxiv2025|[EarthMind](https://arxiv.org/abs/2506.01667)|[link](https://github.com/shuyansy/EarthMind)|
|**-**|**Remote Sensing Large Vision-Language Model: Semantic-augmented Multi-level Alignment and Semantic-aware Expert Modeling**|Arxiv2025|[Paper](https://arxiv.org/abs/2506.21863)|null|
|**RLita**|**RLita: A Region-Level Image-Text Alignment Method for Remote Sensing Foundation Model**|RS2025|[RLita](https://doi.org/10.3390/rs17101661)|null|
|**RingMo-Agent**|**RingMo-Agent: A Unified Remote Sensing Foundation Model for Multi-Platform and Multi-Modal Reasoning**|Arxiv2025|[RingMo-Agent](https://arxiv.org/abs/2507.20776)|null|
|**FUSE-RSVLM**|**FUSE-RSVLM: Feature Fusion Vision-Language Model for Remote Sensing**|Arxiv2025|[FUSE-RSVLM](https://arxiv.org/abs/2512.24022)|[link](https://github.com/Yunkaidang/RSVLM)|
|**GeoReason**|**GeoReason: Aligning Thinking And Answering In Remote Sensing Vision-Language Models Via Logical Consistency Reinforcement Learning**|Arxiv2026|[GeoReason](https://arxiv.org/abs/2601.04118)|[link](https://github.com/canlanqianyan/GeoReason)|
|**RSCoVLM**|**Co-Training Vision-Language Models for Remote Sensing Multi-Task Learning**|RS2026|[RSCoVLM](https://doi.org/10.3390/rs18020222)|[link](https://github.com/VisionXLab/RSCoVLM)|
|**GeoAlignCLIP**|**GeoAlignCLIP: Enhancing Fine-Grained Vision-Language Alignment in Remote Sensing via Multi-Granular Consistency Learning**|Arxiv2026|[GeoAlignCLIP](https://arxiv.org/abs/2603.09566)|null|


## Remote Sensing <ins>Generative</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**Seg2Sat**|**Seg2Sat - Segmentation to aerial view using pretrained diffuser models**|Github|null|[link](https://github.com/RubenGres/Seg2Sat)|
|**-**|**Generate Your Own Scotland: Satellite Image Generation Conditioned on Maps**|NeurIPSW2023|[Paper](https://arxiv.org/abs/2308.16648)|[link](https://github.com/toastyfrosty/map-sat)|
|**GeoRSSD**|**RS5M: A Large Scale Vision-Language Dataset for Remote Sensing Vision-Language Foundation Model**|Arxiv2023|[Paper](https://arxiv.org/abs/2306.11300)|[link](https://huggingface.co/Zilun/GeoRSSD)|
|**DiffusionSat**|**DiffusionSat: A Generative Foundation Model for Satellite Imagery**|ICLR2024|[DiffusionSat](https://arxiv.org/abs/2312.03606)|[link](https://github.com/samar-khanna/DiffusionSat)|
|**CRS-Diff**|**CRS-Diff: Controllable Generative Remote Sensing Foundation Model**|Arxiv2024|[Paper](https://arxiv.org/abs/2403.11614)|[link](https://github.com/Sonettoo/CRS-Diff?tab=readme-ov-file)|
|**MetaEarth**|**MetaEarth: A Generative Foundation Model for Global-Scale Remote Sensing Image Generation**|Arxiv2024|[Paper](https://arxiv.org/abs/2405.13570)|[link](https://jiupinjia.github.io/metaearth/)|
|**HSIGene**|**HSIGene: A Foundation Model For Hyperspectral Image Generation**|Arxiv2024|[Paper](https://arxiv.org/abs/2409.12470)|[link](https://github.com/LiPang/HSIGene)|
|**Text2Earth**|**Text2Earth: Unlocking Text-driven Remote Sensing Image Generation with a Global-Scale Dataset and a Foundation Model**|Arxiv2025|[Paper](https://arxiv.org/abs/2501.00895)|[link](https://chen-yang-liu.github.io/Text2Earth/)|

## Remote Sensing <ins>Vision-Location</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**CSP**|**CSP: Self-Supervised Contrastive Spatial Pre-Training for Geospatial-Visual Representations**|ICML2023|[CSP](https://arxiv.org/abs/2305.01118)|[link](https://gengchenmai.github.io/csp-website/)|
|**GeoCLIP**|**GeoCLIP: Clip-Inspired Alignment between Locations and Images for Effective Worldwide Geo-localization**|NeurIPS2023|[GeoCLIP](https://arxiv.org/abs/2309.16020)|[link](https://vicentevivan.github.io/GeoCLIP/)|
|**SatCLIP**|**SatCLIP: Global, General-Purpose Location Embeddings with Satellite Imagery**|Arxiv2023|[SatCLIP](https://arxiv.org/abs/2311.17179)|[link](https://github.com/microsoft/satclip)|
|**RANGE**|**RANGE: Retrieval Augmented Neural Fields for Multi-Resolution Geo-Embeddings**|CVPR2025|[RANGE](https://arxiv.org/pdf/2502.19781)|null|
|**GAIR**|**GAIR: Improving Multimodal Geo-Foundation Model with Geo-Aligned Implicit Representations**|Arxiv2025|[GAIR](https://arxiv.org/abs/2503.16683)|null|

## Remote Sensing <ins>Vision-Audio</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**-**|**Self-supervised audiovisual representation learning for remote sensing data**|JAG2022|[Paper](https://www.sciencedirect.com/science/article/pii/S1569843222003181)|[link](https://github.com/khdlr/SoundingEarth)|


## Remote Sensing <ins>Task-specific</ins> Foundation Models

|Abbreviation|Title|Publication|Paper|Code & Weights|Task|
|:---:|---|:---:|:---:|:---:|:---:|
|**SS-MAE**|**SS-MAE: Spatial-Spectral Masked Auto-Encoder for Multi-Source Remote Sensing Image Classification**|TGRS2023|[Paper](https://ieeexplore.ieee.org/document/10314566/)|[link](https://github.com/summitgao/SS-MAE?tab=readme-ov-file)|Image Classification|
|**-**|**A Decoupling Paradigm With Prompt Learning for Remote Sensing Image Change Captioning**|TGRS2023|[Paper](https://ieeexplore.ieee.org/document/10271701)|[link](https://github.com/Chen-Yang-Liu/PromptCC)|Remote Sensing Image Change Captioning|
|**TTP**|**Time Travelling Pixels: Bitemporal Features Integration with Foundation Model for Remote Sensing Image Change Detection**|Arxiv2023|[Paper](https://arxiv.org/abs/2312.16202)|[link](https://github.com/KyanChen/TTP)|Change Detection|
|**CSMAE**|**Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in Remote Sensing**|Arxiv2024|[Paper](https://arxiv.org/abs/2401.07782)|[link](https://github.com/jakhac/CSMAE)|Image Retrieval|
|**RSPrompter**|**RSPrompter: Learning to Prompt for Remote Sensing Instance Segmentation based on Visual Foundation Model**|TGRS2024|[Paper](https://arxiv.org/abs/2306.16269)|[link](https://github.com/KyanChen/RSPrompter)|Instance Segmentation|
|**BAN**|**A New Learning Paradigm for Foundation Model-based Remote Sensing Change Detection**|TGRS2024|[Paper](https://arxiv.org/abs/2312.01163)|[link](https://github.com/likyoo/BAN)|Change Detection|
|**-**|**Change Detection Between Optical Remote Sensing Imagery and Map Data via Segment Anything Model (SAM)**|Arxiv2024|[Paper](https://arxiv.org/abs/2401.09019)|null|Change Detection (Optical & OSM data)|
|**AnyChange**|**Segment Any Change**|Arxiv2024|[Paper](https://arxiv.org/abs/2402.01188)|null|Zero-shot Change Detection|
|**RS-CapRet**|**Large Language Models for Captioning and Retrieving Remote Sensing Images**|Arxiv2024|[Paper](https://arxiv.org/abs/2402.06475)|null|Image Caption & Text-image Retrieval|
|**-**|**Task Specific Pretraining with Noisy Labels for Remote sensing Image Segmentation**|Arxiv2024|[Paper](https://arxiv.org/abs/2402.16164)|null|Image Segmentation (Noisy labels)|
|**RSBuilding**|**RSBuilding: Towards General Remote Sensing Image Building Extraction and Change Detection with Foundation Model**|Arxiv2024|[Paper](https://arxiv.org/abs/2403.07564)|[link](https://github.com/Meize0729/RSBuilding)|Building Extraction and Change Detection|
|**SAM-Road**|**Segment Anything Model for Road Network Graph Extraction**|Arxiv2024|[Paper](https://arxiv.org/abs/2403.16051)|[link](https://github.com/htcr/sam_road)|Road Extraction|
|**CrossEarth**|**CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable Remote Sensing Semantic Segmentation**|Arxiv2024|[Paper](https://arxiv.org/abs/2410.22629)|[link](https://github.com/Cuzyoung/CrossEarth)|Domain Generalizable Remote Sensing Semantic Segmentation|
|**GeoGround**|**GeoGround: A Unified Large Vision-Language Model for Remote Sensing Visual Grounding**|Arxiv2024|[Paper](https://arxiv.org/pdf/2411.11904)|[link](https://github.com/zytx121/GeoGround)|Remote Sensing Visual Grounding|
|**TPOV-Seg**|**TPOV-Seg: Textually Enhanced Prompt Tuning of Vision-Language Models for Open-Vocabulary Remote Sensing Semantic Segmentation**|IEEE TGRS2025|[Paper](https://doi.org/10.1109/tgrs.2025.3624767)|null|Open-Vocabulary Semantic Segmentation|
|**SegEarth-OV**|**SegEarth-OV: Towards Training-Free Open-Vocabulary Segmentation for Remote Sensing Images**|CVPR2025|[Paper](https://cvpr.thecvf.com/virtual/2025/poster/33431)|[link](https://github.com/likyoo/SegEarth-OV)|Open-Vocabulary Segmentation|
|**RSRefSeg 2**|**RSRefSeg 2: Decoupling Referring Remote Sensing Image Segmentation With Foundation Models**|IEEE TGRS2026|[Paper](https://doi.org/10.1109/tgrs.2025.3647535)|null|Referring Image Segmentation|
|**AgriFM**|**AgriFM: A multi-source temporal remote sensing foundation model for Agriculture mapping**|RSE2026|[Paper](https://doi.org/10.1016/j.rse.2026.115234)|[link](https://github.com/flyakon/AgriFM)|Crop Mapping / Agricultural Mapping|
|**SARATR-X**|**SARATR-X: Toward Building a Foundation Model for SAR Target Recognition**|IEEE TIP2025|[SARATR-X](https://ieeexplore.ieee.org/abstract/document/10856784)|[link](https://github.com/waterdisappear/SARATR-X)|SAR Target Recognition|

## Remote Sensing Agents
|Abbreviation|Title|Publication|Paper|Code & Weights|
|:---:|---|:---:|:---:|:---:|
|**GeoLLM-QA**|**Evaluating Tool-Augmented Agents in Remote Sensing Platforms**|ICLR 2024 ML4RS Workshop|[Paper](https://arxiv.org/abs/2405.00709)|null|
|**RS-Agent**|**RS-Agent: Automating Remote Sensing Tasks through Intelligent Agents**|Arxiv2024|[Paper](https://arxiv.org/abs/2406.07089)|null|
|**Change-Agent**|**Change-Agent: Toward Interactive Comprehensive Remote Sensing Change Interpretation and Analysis**|TGRS2024|[Paper](https://ieeexplore.ieee.org/abstract/document/10591792)|[link](https://github.com/Chen-Yang-Liu/Change-Agent)|
|**GeoLLM-Engine**|**GeoLLM-Engine: A Realistic Environment for Building Geospatial Copilots**|CVPRW2024|[Paper](https://openaccess.thecvf.com/content/CVPR2024W/EarthVision/html/Singh_GeoLLM-Engine_A_Realistic_Environment_for_Building_Geospatial_Copilots_CVPRW_2024_paper.html)|null|
|**PEACE**|**PEACE: Empowering Geologic Map Holistic Understanding with MLLMs**|CVPR2025|[Paper](https://arxiv.org/abs/2501.06184)|[link](https://github.com/microsoft/PEACE?tab=readme-ov-file)|
|**-**|**Towards LLM Agents for Earth Observation: The UnivEARTH Dataset**|Arxiv2025|[Paper](https://arxiv.org/abs/2504.12110)|null|
|**Geo-OLM**|**Geo-OLM: Enabling Sustainable Earth Observation Studies with Cost-Efficient Open Language Models & State-Driven Workflows**|COMPASS'2025|[Paper](https://arxiv.org/abs/2504.04319)|[link](https://github.com/dstamoulis/geo-olms)|
|**ThinkGeo**|**ThinkGeo: Evaluating Tool-Augmented Agents for Remote Sensing Tasks**|Arxiv2025|[Paper](https://arxiv.org/abs/2505.23752)|[link](https://github.com/mbzuai-oryx/ThinkGeo)|
|**AirSpatialBot**|**AirSpatialBot: A Spatially Aware Aerial Agent for Fine-Grained Vehicle Attribute Recognition and Retrieval**|IEEE TGRS2025|[Paper](https://doi.org/10.1109/tgrs.2025.3570895)|[link](https://github.com/VisionXLab/AirSpatialBot)|
|**OpenEarthAgent**|**OpenEarthAgent: A Unified Framework for Tool-Augmented Geospatial Agents**|Arxiv2026|[Paper](https://arxiv.org/abs/2602.17665)|[link](https://github.com/mbzuai-oryx/OpenEarthAgent)|
|**GeoEyes**|**GeoEyes: Geospatial Context-Zoom Agent for Long-Range Vision-Language Understanding on Ultra-High-Resolution Remote Sensing Images**|Arxiv2026|[Paper](https://arxiv.org/abs/2602.14201)|[link](https://github.com/nanocm/GeoEyes)|
|**Earth-Agent**|**Earth-Agent: Unlocking the Full Landscape of Earth Observation with Agents**|ICLR2026|[Paper](https://arxiv.org/abs/2509.23141)|[link](https://github.com/opendatalab/Earth-Agent)|
|Abbreviation|Title|Publication|Paper|Link|Downstream Tasks|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**-**|**Revisiting pre-trained remote sensing model benchmarks: resizing and normalization matters**|Arxiv2023|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13456)|[link](https:\u002F\u002Fgithub.com\u002Fisaaccorley\u002Fresize-is-all-you-need)|Classification|\n|**GEO-Bench**|**GEO-Bench: Toward Foundation Models for Earth Monitoring**|Arxiv2023|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03831)|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fgeo-bench)|Classification & Segmentation|\n|**FoMo-Bench**|**FoMo-Bench: a multi-modal, multi-scale and multi-task Forest Monitoring Benchmark for remote sensing foundation models**|Arxiv2023|[FoMo-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.10114)|Coming soon|Classification & Segmentation & Detection for forest monitoring|\n|**PhilEO**|**PhilEO Bench: Evaluating Geo-Spatial Foundation Models**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04464)|[link](https:\u002F\u002Fgithub.com\u002F91097luke\u002Fphileo-bench)|Segmentation & Regression estimation|\n|**SkySense**|**SkySense: A Multi-Modal Remote Sensing Foundation Model Towards Universal Interpretation for Earth Observation Imagery**|CVPR2024|[SkySense](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.10115)|Targeted open-source|Classification & Segmentation & Detection & Change detection & Multi-Modal Segmentation: Time-insensitive LandCover Mapping & Multi-Modal Segmentation: Time-sensitive Crop Mapping & Multi-Modal Scene Classification|\n|**VLEO-Bench**|**Good at captioning, bad at counting: Benchmarking GPT-4V on Earth observation data**|Arxiv2024|[VLEO-bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17600)|[link](https:\u002F\u002Fvleo.danielz.ch\u002F)|Location Recognition & Captioning & Scene Classification & Counting & Detection & Change detection|\n|**VRSBench**|**VRSBench: A Versatile Vision-Language Benchmark Dataset for Remote Sensing Image Understanding**|NeurIPS2024|[VRSBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12384)|[link](https:\u002F\u002Fvrsbench.github.io\u002F)|Image Captioning & Object Referring & Visual Question Answering|\n|**UrBench**|**UrBench: A Comprehensive Benchmark for Evaluating Large Multimodal Models in Multi-View Urban Scenarios**|AAAI2025|[UrBench](https:\u002F\u002Fdoi.org\u002F10.1609\u002Faaai.v39i10.33163)|[link](https:\u002F\u002Fopendatalab.github.io\u002FUrBench\u002F)|Object Referring & Visual Question Answering & Counting & Scene Classification & Location Recognition & Geolocalization|\n|**PANGAEA**|**PANGAEA: A Global and Inclusive Benchmark for Geospatial Foundation Models**|Arxiv2024|[PANGAEA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.04204)|[link](https:\u002F\u002Fgithub.com\u002Fyurujaja\u002Fpangaea-bench)|Segmentation & Change detection & Regression|\n|**CHOICE**|**CHOICE: Evaluating and Understanding Vision-Language Model Choices in Remote Sensing**|NeurIPS2025|[CHOICE](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2025\u002Fposter\u002F121749)|[link](https:\u002F\u002Fgithub.com\u002FShawnAn-WHU\u002FCHOICE)|Perception & Reasoning|\n|**GEO-Bench-VLM**|**GEO-Bench-VLM: Benchmarking Vision-Language Models for Geospatial Tasks**|ICCV2025|[GEO-Bench-VLM](https:\u002F\u002Ficcv.thecvf.com\u002Fvirtual\u002F2025\u002Fposter\u002F2247)|[link](https:\u002F\u002Fgithub.com\u002FThe-AI-Alliance\u002FGEO-Bench-VLM)|Scene Understanding & Counting & Object 
Classification & Event Detection & Spatial Relations|\n|**Copernicus-Bench**|**Towards a Unified Copernicus Foundation Model for Earth Vision**|Arxiv2025|[Copernicus-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11849)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FCopernicus-FM)|Segmentation & Classification & Change detection & Regression|\n|**REOBench**|**REOBench: Benchmarking Robustness of Earth Observation Foundation Models**|Arxiv2025|[REOBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16793)|[link](https:\u002F\u002Fgithub.com\u002Flx709\u002FREOBench)|Robustness across 6 Earth observation tasks|\n|**Plantation Bench**|**Plantation Bench: A Multiscale, Multimodal Remote Sensing Benchmark for Plantation Mapping Under Distribution Shift**|ICCVW2025|[Plantation Bench](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ficcvw69036.2025.00310)|null|Plantation Mapping under Distribution Shift|\n|**ChatEarthBench**|**ChatEarthBench: Benchmarking multimodal large language models for Earth observation**|IEEE GRSM2026|[ChatEarthBench](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2026.3650840)|null|Benchmarking EO multimodal large language models|\n|**GeoReason-Bench**|**GeoReason: Aligning Thinking And Answering In Remote Sensing Vision-Language Models Via Logical Consistency Reinforcement Learning**|Arxiv2026|[GeoReason-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.04118)|[link](https:\u002F\u002Fgithub.com\u002Fcanlanqianyan\u002FGeoReason)|Logical consistency & multi-step reasoning|\n|**Earth-Bench**|**Earth-Agent: Unlocking the Full Landscape of Earth Observation with Agents**|ICLR2026|[Earth-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.23141)|[link](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSssunset\u002FEarth-Bench)|Tool-augmented EO reasoning & multi-step planning & quantitative spatiotemporal analysis|\n|**OmniEarth**|**OmniEarth: A Benchmark for Evaluating Vision-Language Models in Geospatial Tasks**|Arxiv2026|[OmniEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.09471)|[link](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsjeeudd\u002FOmniEarth)|Perception & Reasoning & Robustness across geospatial tasks|\n\n\n## (Large-scale) Pre-training Datasets\n\n|Abbreviation|Title|Publication|Paper|Attribute|Link|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**fMoW**|**Functional Map of the World**|CVPR2018|[fMoW](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fhtml\u002FChristie_Functional_Map_of_CVPR_2018_paper.html)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002FfMoW)|\n|**SEN12MS**|**SEN12MS -- A Curated Dataset of Georeferenced Multi-Spectral Sentinel-1\u002F2 Imagery for Deep Learning and Data Fusion**|-|[SEN12MS](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07789)|**Vision**|[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07789)|\n|**BEN-MM**|**BigEarthNet-MM: A Large Scale Multi-Modal Multi-Label Benchmark Archive for Remote Sensing Image Classification and Retrieval**|GRSM2021|[BEN-MM](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9552024)|**Vision**|[link](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9552024)|\n|**MillionAID**|**On Creating Benchmark Dataset for Aerial Image Interpretation: Reviews, Guidances, and Million-AID**|JSTARS2021|[MillionAID](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9393553)|**Vision**|[link](https:\u002F\u002Fcaptain-whu.github.io\u002FDiRS\u002F)|\n|**SeCo**|**Seasonal Contrast: Unsupervised 
Pre-Training From Uncurated Remote Sensing Data**|ICCV2021|[SeCo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FManas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.html)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fseasonal-contrast)|\n|**fMoW-S2**|**SatMAE: Pre-training Transformers for Temporal and Multi-Spectral Satellite Imagery**|NeurIPS2022|[fMoW-S2](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Fhash\u002F01c561df365429f33fcd7a7faa44c985-Abstract-Conference.html)|**Vision**|[link](https:\u002F\u002Fpurl.stanford.edu\u002Fvg497cb6002)|\n|**TOV-RS-Balanced**|**TOV: The original vision model for optical remote sensing image understanding via self-supervised learning**|JSTARS2023|[TOV](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10110958)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002FGeoX-Lab\u002FG-RSIM\u002Ftree\u002Fmain\u002FTOV_v1)|\n|**SSL4EO-S12**|**SSL4EO-S12: A Large-Scale Multi-Modal, Multi-Temporal Dataset for Self-Supervised Learning in Earth Observation**|GRSM2023|[SSL4EO-S12](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.07044)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FSSL4EO-S12)|\n|**SSL4EO-L**|**SSL4EO-L: Datasets and Foundation Models for Landsat Imagery**|Arxiv2023|[SSL4EO-L](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09424)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Ftorchgeo)|\n|**SatlasPretrain**|**SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image Understanding**|ICCV2023|[SatlasPretrain](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15660)|**Vision (Supervised)**|[link](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fsatlas\u002Fblob\u002Fmain\u002FSatlasPretrain.md)|\n|**CACo**|**Change-Aware Sampling and Contrastive Learning for Satellite Images**|CVPR2023|[CACo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fhtml\u002FMall_Change-Aware_Sampling_and_Contrastive_Learning_for_Satellite_Images_CVPR_2023_paper.html)|**Vision**|[Coming soon](https:\u002F\u002Fgithub.com\u002Futkarshmall13\u002FCACo)|\n|**SAMRS**|**SAMRS: Scaling-up Remote Sensing Segmentation Dataset with Segment Anything Model**|NeurIPS2023|[SAMRS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02034)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMRS)|\n|**RSVG**|**RSVG: Exploring Data and Models for Visual Grounding on Remote Sensing Data**|TGRS2023|[RSVG](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10056343)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FZhanYang-nwpu\u002FRSVG-pytorch)|\n|**RS5M**|**RS5M: A Large Scale Vision-Language Dataset for Remote Sensing Vision-Language Foundation Model**|Arxiv2023|[RS5M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11300)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fom-ai-lab\u002FRS5M)|\n|**GEO-Bench**|**GEO-Bench: Toward Foundation Models for Earth Monitoring**|Arxiv2023|[GEO-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03831)|**Vision (Evaluation)**|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fgeo-bench)|\n|**RSICap & RSIEval**|**RSGPT: A Remote Sensing Vision Language Model and Benchmark**|Arxiv2023|[RSGPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15266)|**Vision-Language**|[Coming soon](https:\u002F\u002Fgithub.com\u002FLavender105\u002FRSGPT)|\n|**Clay**|**Clay Foundation 
Model**|-|null|**Vision**|[link](https:\u002F\u002Fclay-foundation.github.io\u002Fmodel\u002F)|\n|**SATIN**|**SATIN: A Multi-Task Metadataset for Classifying Satellite Imagery using Vision-Language Models**|ICCVW2023|[SATIN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11619)|**Vision-Language**|[link](https:\u002F\u002Fsatinbenchmark.github.io\u002F)|\n|**SkyScript**|**SkyScript: A Large and Semantically Diverse Vision-Language Dataset for Remote Sensing**|AAAI2024|[SkyScript](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.12856)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fwangzhecheng\u002FSkyScript)|\n|**ChatEarthNet**|**ChatEarthNet: a global-scale image-text dataset empowering vision-language geo-foundation models**|ESSD2025|[ChatEarthNet](https:\u002F\u002Fdoi.org\u002F10.5194\u002Fessd-17-1245-2025)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FChatEarthNet)|\n|**LuoJiaHOG**|**LuoJiaHOG: A hierarchy oriented geo-aware image caption dataset for remote sensing image-text retrieval**|ISPRS JPRS2025|[LuoJiaHOG](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.isprsjprs.2025.02.009)|**Vision-Language**|null|\n|**MMEarth**|**MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning**|Arxiv2024|[MMEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.02771)|**Vision**|[link](https:\u002F\u002Fvishalned.github.io\u002Fmmearth\u002F)|\n|**SeeFar**|**SeeFar: Satellite Agnostic Multi-Resolution Dataset for Geospatial Foundation Models**|Arxiv2024|[SeeFar](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.06776)|**Vision**|[link](https:\u002F\u002Fcoastalcarbon.ai\u002Fseefar)|\n|**FIT-RS**|**SkySenseGPT: A Fine-Grained Instruction Tuning Dataset and Model for Remote Sensing Vision-Language Understanding**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10100)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FLuo-Z13\u002FSkySenseGPT)|\n|**RS-GPT4V**|**RS-GPT4V: A Unified Multimodal Instruction-Following Dataset for Remote Sensing Image Understanding**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12479)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FGeoX-Lab\u002FRS-GPT4V\u002Ftree\u002Fmain)|\n|**RS-4M**|**Scaling Efficient Masked Autoencoder Learning on Large Remote Sensing Dataset**|Arxiv2024|[RS-4M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11933)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002FFengxiang23\u002FSelectiveMAE)|\n|**Major TOM**|**Major TOM: Expandable Datasets for Earth Observation**|Arxiv2024|[Major TOM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12095)|**Vision**|[link](https:\u002F\u002Fhuggingface.co\u002FMajor-TOM)|\n|**VRSBench**|**VRSBench: A Versatile Vision-Language Benchmark Dataset for Remote Sensing Image Understanding**|Arxiv2024|[VRSBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12384)|**Vision-Language**|[link](https:\u002F\u002Fvrsbench.github.io\u002F)|\n|**MMM-RS**|**MMM-RS: A Multi-modal, Multi-GSD, Multi-scene Remote Sensing Dataset and Benchmark for Text-to-Image Generation**|Arxiv2024|[MMM-RS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22362)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fljl5261\u002FMMM-RS)|\n|**DDFAV**|**DDFAV: Remote Sensing Large Vision Language Models Dataset and Evaluation Benchmark**|RS2025|[DDFAV](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17040719)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FHaodongLi2024\u002Frspope)|\n|**M3LEO**|**A 
Multi-Modal, Multi-Label Earth Observation Dataset Integrating Interferometric SAR and Multispectral Data**|NeurIPS2024|[M3LEO](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002Fbd194b579f60879e04ca9ce8a4ea5da1-Paper-Datasets_and_Benchmarks_Track.pdf)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002Fspaceml-org\u002FM3LEO)|\n|**Copernicus-Pretrain**|**Towards a Unified Copernicus Foundation Model for Earth Vision**|Arxiv2025|[Copernicus-Pretrain](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11849)|**Vision**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FCopernicus-FM)|\n|**DGTRSD**|**DGTRSD & DGTRS-CLIP: A Dual-Granularity Remote Sensing Image-Text Dataset and Vision Language Foundation Model for Alignment**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.19311)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FMitsuiChen14\u002FDGTRS)|\n|**EarthDial-Instruct**|**EarthDial: Turning Multi-sensory Earth Observations to Interactive Dialogues**|CVPR2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.15190)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fhiyamdebary\u002FEarthDial)|\n|**GeoPixelD**|**GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing**|ICML2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.13925)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FGeoPixel)|\n|**GeoPixInstruct**|**GeoPix: Multi-Modal Large Language Model for Pixel-level Image Understanding in Remote Sensing**|IEEE GRSM2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06828)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FNorman-Ou\u002FGeoPix)|\n|**GeoLangBind-2M**|**Rethinking Remote Sensing CLIP: Leveraging Multimodal Large Language Models for High-Quality Vision-Language Dataset**|ICONIP2024|[Paper](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-981-96-6972-1_29)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fxiong-zhitong\u002FGeoLB-SigLIP)|\n|**Falcon_SFT**|**Falcon: A Remote Sensing Vision-Language Foundation Model**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11070)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002FTianHuiLab\u002FFalcon)|\n|**UnivEARTH**|**Towards LLM Agents for Earth Observation: The UnivEARTH Dataset**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.12110)|**Vision-Language & Agents**|null|\n|**RemoteSAM-270K**|**RemoteSAM: Towards Segment Anything for Earth Observation**|ACMMM2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.18022)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002F1e12Leon\u002FRemoteSAM)|\n|**OpenEarthAgent Dataset**|**OpenEarthAgent: A Unified Framework for Tool-Augmented Geospatial Agents**|Arxiv2026|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.17665)|**Vision-Language & Agents**|[link](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FOpenEarthAgent)|\n|**UHR-CoZ**|**GeoEyes: Geospatial Context-Zoom Agent for Long-Range Vision-Language Understanding on Ultra-High-Resolution Remote Sensing Images**|Arxiv2026|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.14201)|**Vision-Language**|[link](https:\u002F\u002Fgithub.com\u002Fnanocm\u002FGeoEyes)|\n\n## Embeddings data\n\n|Abbreviation|Title|Publication|Paper|Code|Dataset \u002F Product|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**CLAY Embeddings**|**Clay Model v0 Embeddings**|Source 
Cooperative2024|null|[link](https:\u002F\u002Fgithub.com\u002FClay-foundation)|[link](https:\u002F\u002Fsource.coop\u002Fclay\u002Fclay-model-v0-embeddings)|\n|**Major TOM Embeddings**|**Global and Dense Embeddings of Earth: Major TOM Floating in the Latent Space**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05600)|[link](https:\u002F\u002Fgithub.com\u002FESA-PhiLab\u002FMajor-TOM)|[link](https:\u002F\u002Fhuggingface.co\u002FMajor-TOM)|\n|**Earth Genome Embeddings**|**Embeddings for all**|Medium2025|[Paper](https:\u002F\u002Fmedium.com\u002Fearthrisemedia\u002Fembeddings-for-all-0e0a29415b26)|null|[link](https:\u002F\u002Fsource.coop\u002Fearthgenome\u002Fearthindexembeddings)|\n|**TESSERA**|**TESSERA: Precomputed FAIR Global Pixel Embeddings for Earth Representation and Analysis**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.20380)|[link](https:\u002F\u002Fgithub.com\u002Fucam-eo\u002Ftessera)|[link](https:\u002F\u002Fgithub.com\u002Fucam-eo\u002Fgeotessera)|\n|**AlphaEarth**|**AlphaEarth Foundations: An embedding field model for accurate and efficient global mapping from sparse label data**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.22291)|null|[link](https:\u002F\u002Fdevelopers.google.com\u002Fearth-engine\u002Fdatasets\u002Fcatalog\u002FGOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL)|\n|**ESD**|**Democratizing planetary-scale analysis: An ultra-lightweight Earth embedding database for accurate and flexible global land monitoring**|Arxiv2026|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.11183)|[link](https:\u002F\u002Fgithub.com\u002Fshuangchencc\u002FESD)|[link](https:\u002F\u002Fdata-starcloud.pcl.ac.cn\u002Fiearthdata\u002F64)|\n
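\nUnlike the pre-training corpora above, these products ship model outputs directly: precomputed embedding vectors per tile or pixel, so analysis happens in vector space with no GPU inference. A typical first use is similarity search; below is a minimal sketch of that pattern, in which the file name and column names are hypothetical placeholders (each provider documents its own schema and format):\n\n```python\n# Nearest-neighbour search over precomputed tile embeddings.\n# Assumes a Parquet export with an \"embedding\" column of float lists plus\n# per-tile metadata; the actual schema varies by product.\nimport numpy as np\nimport pandas as pd\n\ndf = pd.read_parquet(\"tile_embeddings.parquet\")   # hypothetical local file\nemb = np.stack(df[\"embedding\"].to_numpy())        # shape [N, D]\nemb = emb \u002F np.linalg.norm(emb, axis=1, keepdims=True)  # unit-normalise rows\n\nquery = emb[0]                  # any tile can serve as the query\nscores = emb @ query            # cosine similarity against all tiles\ntop5 = np.argsort(-scores)[:5]  # indices of the most similar tiles\nprint(df.iloc[top5])            # inspect their metadata\n```\n\nThe same vectors can also feed clustering or lightweight classifiers trained from sparse labels, which is what makes these databases attractive for large-area monitoring on modest hardware.\n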
\n## Relevant Projects\n|Title|Link|Brief Introduction|\n|---|:---:|:---:|\n|**RSFMs (Remote Sensing Foundation Models) Playground**|[link](https:\u002F\u002Fgithub.com\u002Fsynativ\u002FRSFMs)|An open-source playground to streamline the evaluation and fine-tuning of RSFMs on various datasets.|\n|**PANGAEA**|[link](https:\u002F\u002Fgithub.com\u002Fyurujaja\u002Fpangaea-bench)|A Global and Inclusive Benchmark for Geospatial Foundation Models.|\n|**GeoFM**|[link](https:\u002F\u002Fgithub.com\u002Fxiong-zhitong\u002FGeoFM)|Evaluation of Foundation Models for Earth Observation.|\n|**rs-embed**|[link](https:\u002F\u002Fgithub.com\u002Fcybergis\u002Frs-embed)|One line code to get Any Remote Sensing Foundation Model (RSFM) embeddings for Any Place and Any Time.|\n\n## Survey\u002FCommentary Papers\n|Title|Publication|Paper|Attribute|\n|---|:---:|:---:|:---:|\n|**Self-Supervised Remote Sensing Feature Learning: Learning Paradigms, Challenges, and Future Works**|TGRS2023|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10126079)|**Vision & Vision-Language**|\n|**The Potential of Visual ChatGPT For Remote Sensing**|Arxiv2023|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13009)|**Vision-Language**|\n|**遥感大模型：进展与前瞻**|武汉大学学报 (信息科学版) 2023|[Paper](http:\u002F\u002Fch.whu.edu.cn\u002Fcn\u002Farticle\u002Fdoi\u002F10.13203\u002Fj.whugis20230341?viewType=HTML)|**Vision & Vision-Language**|\n|**地理人工智能样本：模型、质量与服务**|武汉大学学报 (信息科学版) 2023|[Paper](http:\u002F\u002Fch.whu.edu.cn\u002Farticle\u002Fid\u002F5e67ed6a-aae5-4ec0-ad1b-f2aba89f4617)|**-**|\n|**Brain-Inspired Remote Sensing Foundation Models and Open Problems: A Comprehensive Survey**|JSTARS2023|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10254282)|**Vision & Vision-Language**|\n|**Revisiting pre-trained remote sensing model benchmarks: resizing and normalization matters**|Arxiv2023|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13456)|**Vision**|\n|**An Agenda for Multimodal Foundation Models for Earth Observation**|IGARSS2023|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10282966)|**Vision**|\n|**Transfer learning in environmental remote sensing**|RSE2024|[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0034425723004765)|**Transfer learning**|\n|**遥感基础模型发展综述与未来设想**|遥感学报2023|[Paper](https:\u002F\u002Fwww.ygxb.ac.cn\u002Fzh\u002Farticle\u002Fdoi\u002F10.11834\u002Fjrs.20233313\u002F)|**-**|\n|**On the Promises and Challenges of Multimodal Foundation Models for Geographical, Environmental, Agricultural, and Urban Planning Applications**|Arxiv2023|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17016)|**Vision-Language**|\n|**Vision-Language Models in Remote Sensing: Current Progress and Future Trends**|IEEE GRSM2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05726)|**Vision-Language**|\n|**On the Foundations of Earth and Climate Foundation Models**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04285)|**Vision & Vision-Language**|\n|**Towards Vision-Language Geo-Foundation Model: A Survey**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09385)|**Vision-Language**|\n|**AI Foundation Models in Remote Sensing: A Survey**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.03464)|**Vision**|\n|**Foundation model for generalist remote sensing intelligence: Potentials and prospects**|Science Bulletin2024|[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS2095927324006510?via%3Dihub)|**-**|\n|**Advancements in Visual Language Models for Remote Sensing: Datasets, Capabilities, and Enhancement Techniques**|Arxiv2024|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17283)|**Vision-Language**|\n|**Foundation Models for Remote Sensing and Earth Observation: A survey**|IEEE GRSM2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2025.3576766)|**Vision & Vision-Language**|\n|**When Remote Sensing Meets Foundation Model: A Survey and Beyond**|RS2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17020179)|**Vision & Vision-Language & Generative & Agents**|\n|**Vision-Language Modeling Meets Remote Sensing: Models, datasets, and perspectives**|IEEE GRSM2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2025.3572702)|**Vision-Language**|\n|**Advances on Multimodal Remote Sensing Foundation Models for Earth Observation Downstream Tasks: A Survey**|RS2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17213532)|**Vision & Vision-Language**|\n|**Remote Sensing Tuning: A Survey**|CVM2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.26599\u002Fcvm.2025.9450490)|**Vision & Vision-Language**|\n|**多模态遥感基础大模型：研究现状与未来展望**|测绘学报2024|[Paper](http:\u002F\u002Fxb.chinasmp.com\u002FCN\u002F10.11947\u002Fj.AGCS.2024.20240019.)|**Vision & Vision-Language & Generative & Vision-Location**|\n|**When Geoscience Meets Foundation Models: Toward a general geoscience artificial intelligence system**|IEEE GRSM2024|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10770814)|**Vision & Vision-Language**|\n|**Towards the next generation of Geospatial Artificial 
Intelligence**|JAG2025|[Paper](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1569843225000159)|**-**|\n|**Vision Foundation Models in Remote Sensing: A survey**|IEEE GRSM2025|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10916803)|**Vision**|\n|**MIMRS: A Survey on Masked Image Modeling in Remote Sensing**|IGARSS2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss55030.2025.11243448)|**Vision**|\n|**A Review of Challenges and Applications in Remote Sensing Foundation Models**|IGARSS2025|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss55030.2025.11242732)|**Vision & Vision-Language**|\n|**Unleashing the potential of remote sensing foundation models via bridging data and computility islands**|The Innovation2025|[Paper](https:\u002F\u002Fwww.cell.com\u002Fthe-innovation\u002Ffulltext\u002FS2666-6758(25)00044-X)|**-**|\n|**A Survey on Remote Sensing Foundation Models: From Vision to Multimodality**|Arxiv2025|[Paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22081)|**-**|\n|**A Genealogy of Foundation Models in Remote Sensing**|ACM TSAS2026|[Paper](https:\u002F\u002Fdoi.org\u002F10.1145\u002F3789505)|**Vision & Vision-Language**|\n|**Onboard Deployment of Remote Sensing Foundation Models: A Comprehensive Review of Architecture, Optimization, and Hardware**|RS2026|[Paper](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs18020298)|**Vision & Vision-Language**|\n|**On the foundations of Earth foundation models**|Communications Earth & Environment 2026|[Paper](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs43247-025-03127-x)|**Vision & Vision-Language**|\n\n\n## Citation\n\nIf you find this repository useful, please consider giving a star :star: and citation:\n\n```\n@inproceedings{guo2024skysense,\n  title={Skysense: A multi-modal remote sensing foundation model towards universal interpretation for earth observation imagery},\n  author={Guo, Xin and Lao, Jiangwei and Dang, Bo and Zhang, Yingying and Yu, Lei and Ru, Lixiang and Zhong, Liheng and Huang, Ziyuan and Wu, Kang and Hu, Dingxiang and others},\n  booktitle={Proceedings of the IEEE\u002FCVF Conference on Computer Vision and Pattern Recognition},\n  pages={27672--27683},\n  year={2024}\n}\n\n@article{li2025unleashing,\n  title={Unleashing the potential of remote sensing foundation models via bridging data and computility islands},\n  author={Li, Yansheng and Tan, Jieyi and Dang, Bo and Ye, Mang and Bartalev, Sergey A and Shinkarenko, Stanislav and Wang, Linlin and Zhang, Yingying and Ru, Lixiang and Guo, Xin and others},\n  journal={The Innovation},\n  year={2025},\n  publisher={Elsevier}\n}\n\n@article{wu2025semantic,\n  author = {Wu, Kang and Zhang, Yingying and Ru, Lixiang and Dang, Bo and Lao, Jiangwei and Yu, Lei and Luo, Junwei and Zhu, Zifan and Sun, Yue and Zhang, Jiahao and Zhu, Qi and Wang, Jian and Yang, Ming and Chen, Jingdong and Zhang, Yongjun and Li, Yansheng},\n  title= {A semantic‑enhanced multi‑modal remote sensing foundation model for Earth observation},\n  journal= {Nature Machine Intelligence},\n  year= {2025},\n  doi= {10.1038\u002Fs42256-025-01078-8},\n  url= {https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs42256-025-01078-8}\n}\n\n@inproceedings{zhu2025skysense,\n  title={Skysense-o: Towards open-world remote sensing interpretation with vision-centric visual-language modeling},\n  author={Zhu, Qi and Lao, Jiangwei and Ji, Deyi and Luo, Junwei and Wu, Kang and Zhang, Yingying and Ru, Lixiang and Wang, Jian and Chen, Jingdong and Yang, 
Ming and others},\n  booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},\n  pages={14733--14744},\n  year={2025}\n}\n\n@article{luo2024skysensegpt,\n  title={Skysensegpt: A fine-grained instruction tuning dataset and model for remote sensing vision-language understanding},\n  author={Luo, Junwei and Pang, Zhen and Zhang, Yongjun and Wang, Tingzhu and Wang, Linlin and Dang, Bo and Lao, Jiangwei and Wang, Jian and Chen, Jingdong and Tan, Yihua and others},\n  journal={arXiv preprint arXiv:2406.10100},\n  year={2024}\n}\n```\n","[![维护中](https:\u002F\u002Fimg.shields.io\u002Fbadge\u002FMaintained%3F-yes-green.svg)](https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fgraphs\u002Fcommit-activity)\n[![Awesome](https:\u002F\u002Fcdn.rawgit.com\u002Fsindresorhus\u002Fawesome\u002Fd7305f38d29fed78fa85652e3a63e154dd8e8829\u002Fmedia\u002Fbadge.svg)](https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models)\n\u003Cimg alt=\"GitHub 监视者数\" src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fwatchers\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models?style=social\"> \u003Cimg alt=\"GitHub 星标数\" src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fstars\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models?style=social\"> \u003Cimg alt=\"GitHub 复刻数\" src=\"https:\u002F\u002Fimg.shields.io\u002Fgithub\u002Fforks\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models?style=social\">\n\n# \u003Cp align=center>`Awesome 遥感基础模型`\u003C\u002Fp>\n\n:star2:**遥感基础模型（RSFM）相关的论文、数据集、基准测试、代码及预训练权重的集合。**\n\n## 📢 最新动态\n:fire::fire::fire: 最后更新于 2026.03.13 :fire::fire::fire:\n\n## 目录\n- **模型**\n  - [遥感视觉基础模型](#remote-sensing-vision-foundation-models)\n  - [遥感视觉-语言基础模型](#remote-sensing-vision-language-foundation-models)\n  - [遥感生成式基础模型](#remote-sensing-generative-foundation-models)\n  - [遥感视觉-位置基础模型](#remote-sensing-vision-location-foundation-models)\n  - [遥感视觉-音频基础模型](#remote-sensing-vision-audio-foundation-models)\n  - [遥感任务特定基础模型](#remote-sensing-task-specific-foundation-models)\n  - [遥感智能体](#remote-sensing-agents)\n- **数据集与基准测试**\n  - [RSFM 的基准测试](#benchmarks-for-rsfms)\n  - [(大规模) 预训练数据集](#large-scale-pre-training-datasets)\n  - [嵌入数据](#embeddings-data)\n- **其他**\n  - [相关项目](#relevant-projects)\n  - [综述论文](#surveycommentary-papers)\n  \n## 遥感\u003Cins>视觉\u003C\u002Fins>基础模型\n\n|Abbreviation|Title|Publication|Paper|Code & Weights|\n|:---:|---|:---:|:---:|:---:|\n|**GeoKR**|**Geographical Knowledge-Driven Representation Learning for Remote Sensing Images**|TGRS2021|[GeoKR](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9559903)|[link](https:\u002F\u002Fgithub.com\u002Fflyakon\u002FGeographical-Knowledge-driven-Representaion-Learning)|\n|**-**|**Self-Supervised Learning of Remote Sensing Scene Representations Using Contrastive Multiview Coding**|CVPRW2021|[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2021W\u002FEarthVision\u002Fhtml\u002FStojnic_Self-Supervised_Learning_of_Remote_Sensing_Scene_Representations_Using_Contrastive_Multiview_CVPRW_2021_paper.html)|[link](https:\u002F\u002Fgithub.com\u002Fvladan-stojnic\u002FCMC-RSSR)|\n|**GASSL**|**Geography-Aware Self-Supervised 
Learning**|ICCV2021|[GASSL](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FAyush_Geography-Aware_Self-Supervised_Learning_ICCV_2021_paper.html)|[link](https:\u002F\u002Fgithub.com\u002Fsustainlab-group\u002Fgeography-aware-ssl)|\n|**SeCo**|**Seasonal Contrast: Unsupervised Pre-Training From Uncurated Remote Sensing Data**|ICCV2021|[SeCo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FManas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.html)|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fseasonal-contrast)|\n|**DINO-MM**|**Self-Supervised Vision Transformers for Joint SAR-Optical Representation Learning**|IGARSS2022|[DINO-MM](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss46834.2022.9883983)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FDINO-MM)|\n|**SatMAE**|**SatMAE: Pre-training Transformers for Temporal and Multi-Spectral Satellite Imagery**|NeurIPS2022|[SatMAE](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Fhash\u002F01c561df365429f33fcd7a7faa44c985-Abstract-Conference.html)|[link](https:\u002F\u002Fgithub.com\u002Fsustainlab-group\u002FSatMAE)|\n|**RS-BYOL**|**Self-Supervised Learning for Invariant Representations From Multi-Spectral and SAR Images**|JSTARS2022|[RS-BYOL](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9880533)|null|\n|**GeCo**|**Geographical Supervision Correction for Remote Sensing Representation Learning**|TGRS2022|[GeCo](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9869651)|null|\n|**RingMo**|**RingMo: A remote sensing foundation model with masked image modeling**|TGRS2022|[RingMo](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9844015)|[Code](https:\u002F\u002Fgithub.com\u002Fcomeony\u002FRingMo)|\n|**RVSA**|**Advancing plain vision transformer toward remote sensing foundation model**|TGRS2022|[RVSA](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9956816)|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FRemote-Sensing-RVSA)|\n|**RSP**|**An Empirical Study of Remote Sensing Pretraining**|TGRS2022|[RSP](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9782149)|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FRemote-Sensing-RVSA)|\n|**MATTER**|**Self-Supervised Material and Texture Representation Learning for Remote Sensing Tasks**|CVPR2022|[MATTER](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022\u002Fhtml\u002FAkiva_Self-Supervised_Material_and_Texture_Representation_Learning_for_Remote_Sensing_Tasks_CVPR_2022_paper.html)|null|\n|**CSPT**|**Consecutive Pre-Training: A Knowledge Transfer Learning Strategy with Relevant Unlabeled Data for Remote Sensing Domain**|RS2022|[CSPT](https:\u002F\u002Fwww.mdpi.com\u002F2072-4292\u002F14\u002F22\u002F5675#)|[link](https:\u002F\u002Fgithub.com\u002FZhAnGToNG1\u002Ftransfer_learning_cspt)|\n|**-**|**Self-supervised Vision Transformers for Land-cover Segmentation and Classification**|CVPRW2022|[Paper](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2022W\u002FEarthVision\u002Fhtml\u002FScheibenreif_Self-Supervised_Vision_Transformers_for_Land-Cover_Segmentation_and_Classification_CVPRW_2022_paper.html)|[link](https:\u002F\u002Fgithub.com\u002FHSG-AIML\u002FSSLTransformerRS)|\n|**TOV**|**TOV: The original vision model for optical remote sensing image understanding via 
self-supervised learning**|JSTARS2023|[TOV](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10110958)|[link](https:\u002F\u002Fgithub.com\u002FGeoX-Lab\u002FG-RSIM\u002Ftree\u002Fmain\u002FTOV_v1)|\n|**CMID**|**CMID: A Unified Self-Supervised Learning Framework for Remote Sensing Image Understanding**|TGRS2023|[CMID](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10105625)|[link](https:\u002F\u002Fgithub.com\u002FNJU-LHRS\u002Fofficial-CMID)|\n|**RingMo-Sense**|**RingMo-Sense: Remote Sensing Foundation Model for Spatiotemporal Prediction via Spatiotemporal Evolution Disentangling**|TGRS2023|[RingMo-Sense](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10254320)|null|\n|**AST**|**AST: Adaptive Self-supervised Transformer for Optical Remote Sensing Representation**|ISPRS JPRS2023|[AST](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.isprsjprs.2023.04.003)|null|\n|**IaI-SimCLR**|**Multi-Modal Multi-Objective Contrastive Learning for Sentinel-1\u002F2 Imagery**|CVPRW2023|[IaI-SimCLR](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023W\u002FEarthVision\u002Fhtml\u002FPrexl_Multi-Modal_Multi-Objective_Contrastive_Learning_for_Sentinel-12_Imagery_CVPRW_2023_paper.html)|null|\n|**CACo**|**Change-Aware Sampling and Contrastive Learning for Satellite Images**|CVPR2023|[CACo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fhtml\u002FMall_Change-Aware_Sampling_and_Contrastive_Learning_for_Satellite_Images_CVPR_2023_paper.html)|[link](https:\u002F\u002Fgithub.com\u002Futkarshmall13\u002FCACo)|\n|**SatLas**|**SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image Understanding**|ICCV2023|[SatLas](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ficcv51070.2023.01538)|[link](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fsatlas)|\n|**GFM**|**Towards Geospatial Foundation Models via Continual Pretraining**|ICCV2023|[GFM](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ficcv51070.2023.01541)|[link](https:\u002F\u002Fgithub.com\u002Fmmendiet\u002FGFM)|\n|**Scale-MAE**|**Scale-MAE: A Scale-Aware Masked Autoencoder for Multiscale Geospatial Representation Learning**|ICCV2023|[Scale-MAE](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ficcv51070.2023.00378)|[link](https:\u002F\u002Fgithub.com\u002Fbair-climate-initiative\u002Fscale-mae)|\n|**DINO-MC**|**DINO-MC: Self-supervised Contrastive Learning for Remote Sensing Imagery with Multi-sized Local Crops**|Arxiv2023|[DINO-MC](https:\u002F\u002Farxiv.org\u002Fabs\u002F2303.06670)|[link](https:\u002F\u002Fgithub.com\u002FWennyXY\u002FDINO-MC)|\n|**CROMA**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|NeurIPS2023|[CROMA](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2311.00566.pdf)|[link](https:\u002F\u002Fgithub.com\u002Fantofuller\u002FCROMA)|\n|**Cross-Scale MAE**|**Cross-Scale MAE: A Tale of Multiscale Exploitation in Remote Sensing**|NeurIPS2023|[Cross-Scale MAE](https:\u002F\u002Fopenreview.net\u002Fpdf?id=5oEVdOd6TV)|[link](https:\u002F\u002Fgithub.com\u002Faicip\u002FCross-Scale-MAE)|\n|**Presto**|**Lightweight, Pre-trained Transformers for Remote Sensing Timeseries**|Arxiv2023|[Presto](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.14065)|[link](https:\u002F\u002Fgithub.com\u002Fnasaharvest\u002Fpresto)|\n|**Prithvi**|**Foundation Models for Generalist Geospatial Artificial 
Intelligence**|Arxiv2023|[Prithvi](https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.18660)|[link](https:\u002F\u002Fhuggingface.co\u002Fibm-nasa-geospatial)|\n|**-**|**A Self-Supervised Cross-Modal Remote Sensing Foundation Model with Multi-Domain Representation and Cross-Domain Fusion**|IGARSS2023|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10282433)|null|\n|**EarthPT**|**EarthPT: a time series foundation model for Earth Observation**|NeurIPS2023 CCAI workshop|[EarthPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.07207)|[link](https:\u002F\u002Fgithub.com\u002Faspiaspace\u002FEarthPT)|\n|**USat**|**USat: A Unified Self-Supervised Encoder for Multi-Sensor Satellite Imagery**|Arxiv2023|[USat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.02199)|[link](https:\u002F\u002Fgithub.com\u002Fstanfordmlgroup\u002FUSat)|\n|**AIEarth**|**Analytical Insight of Earth: A Cloud-Platform of Intelligent Computing for Geospatial Big Data**|Arxiv2023|[AIEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16385)|[link](https:\u002F\u002Fengine-aiearth.aliyun.com\u002F#\u002F)|\n|**Clay**|**Clay Foundation Model**|-|null|[link](https:\u002F\u002Fclay-foundation.github.io\u002Fmodel\u002F)|\n|**Hydro**|**Hydro--A Foundation Model for Water in Satellite Imagery**|-|null|[link](https:\u002F\u002Fgithub.com\u002Fisaaccorley\u002Fhydro-foundation-model)|\n|**BFM**|**A Billion-scale Foundation Model for Remote Sensing Images**|IEEE JSTARS2024|[BFM](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2024.3401772)|null|\n|**U-BARN**|**Self-Supervised Spatio-Temporal Representation Learning of Satellite Image Time Series**|JSTARS2024|[Paper](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10414422)|[link](https:\u002F\u002Fsrc.koda.cnrs.fr\u002Firis.dumeur\u002Fssl_ubarn)|\n|**GeRSP**|**Generic Knowledge Boosted Pretraining for Remote Sensing Images**|TGRS2024|[GeRSP](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2024.3354031)|[GeRSP](https:\u002F\u002Fgithub.com\u002FfloatingstarZ\u002FGeRSP)|\n|**SwiMDiff**|**SwiMDiff: Scene-Wide Matching Contrastive Learning With Diffusion Constraint for Remote Sensing Image**|TGRS2024|[SwiMDiff](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2024.3371481)|null|\n|**OFA-Net**|**One for All: Toward Unified Foundation Models for Earth Vision**|IGARSS2024|[OFA-Net](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss53475.2024.10641637)|null|\n|**-**|**Lightweight and Efficient: A Family of Multimodal Earth Observation Foundation Models**|IGARSS2024|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss53475.2024.10641132)|null|\n|**SMLFR**|**Generative ConvNet Foundation Model With Sparse Modeling and Low-Frequency Reconstruction for Remote Sensing Image Interpretation**|TGRS2024|[SMLFR](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10378718)|[link](https:\u002F\u002Fgithub.com\u002FHIT-SIRS\u002FSMLFR)|\n|**SpectralGPT**|**SpectralGPT: Spectral Remote Sensing Foundation Model**|TPAMI2024|[SpectralGPT](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftpami.2024.3362475)|[link](https:\u002F\u002Fgithub.com\u002Fdanfenghong\u002FIEEE_TPAMI_SpectralGPT)|\n|**S2MAE**|**S2MAE: A Spatial-Spectral Pretraining Foundation Model for Spectral Remote Sensing Data**|CVPR2024|[S2MAE](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fpapers\u002FLi_S2MAE_A_Spatial-Spectral_Pretraining_Foundation_Model_for_Spectral_Remote_Sensing_CVPR_2024_paper.pdf)|null|\n|**SatMAE++**|**Rethinking Transformers 
Pre-training for Multi-Spectral Satellite Imagery**|CVPR2024|[SatMAE++](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fcvpr52733.2024.02627)|[link](https:\u002F\u002Fgithub.com\u002Ftechmn\u002Fsatmae_pp)|\n|**msGFM**|**Bridging Remote Sensors with Multisensor Geospatial Foundation Models**|CVPR2024|[msGFM](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fcvpr52733.2024.02631)|[link](https:\u002F\u002Fgithub.com\u002Fboranhan\u002FGeospatial_Foundation_Models)|\n|**SkySense**|**SkySense: A Multi-Modal Remote Sensing Foundation Model Towards Universal Interpretation for Earth Observation Imagery**|CVPR2024|[SkySense](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024\u002Fhtml\u002FGuo_SkySense_A_Multi-Modal_Remote_Sensing_Foundation_Model_Towards_Universal_Interpretation_CVPR_2024_paper.html)|[link](https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FSkySense)|\n|**MTP**|**MTP: Advancing Remote Sensing Foundation Model via Multi-Task Pretraining**|IEEE JSTARS2024|[MTP](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2024.3408154)|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FMTP)|\n|**DOFA**|**Neural Plasticity-Inspired Multimodal Foundation Model for Earth Observation**|Arxiv2024|[DOFA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.15356)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FDOFA)|\n|**DeCUR**|**DeCUR: decoupling common & unique representations for multimodal self-supervision**|ECCV2024|[DeCUR](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-031-73397-0_17)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FDeCUR)|\n|**MMEarth**|**MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning**|ECCV2024|[MMEarth](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-031-73039-9_10)|[link](https:\u002F\u002Fvishalned.github.io\u002Fmmearth\u002F)|\n|**LeMeViT**|**LeMeViT: Efficient Vision Transformer with Learnable Meta Tokens for Remote Sensing Image Interpretation**|IJCAI2024|[LeMeViT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.09789)|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FLeMeViT\u002Ftree\u002Fmain?tab=readme-ov-file)|\n|**SoftCon**|**Multi-Label Guided Soft Contrastive Learning for Efficient Earth Observation Pretraining**|TGRS2024|[SoftCon](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10726860)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002Fsoftcon?tab=readme-ov-file)|\n|**RS-DFM**|**RS-DFM: A Remote Sensing Distributed Foundation Model for Diverse Downstream Tasks**|Arxiv2024|[RS-DFM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07032)|null|\n|**A2-MAE**|**A2-MAE: A spatial-temporal-spectral unified remote sensing pre-training method based on anchor-aware masked autoencoder**|Arxiv2024|[A2-MAE](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.08079)|null|\n|**OmniSat**|**OmniSat: Self-Supervised Modality Fusion for Earth Observation**|ECCV2024|[OmniSat](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-031-73390-1_24)|[link](https:\u002F\u002Fgithub.com\u002Fgastruc\u002FOmniSat?tab=readme-ov-file)|\n|**MM-VSF**|**Towards Knowledge Guided Pretraining Approaches for Multimodal Foundation Models: Applications in Remote Sensing**|Arxiv2024|[MM-VSF](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.19660)|null|\n|**MA3E**|**Masked Angle-Aware Autoencoder for Remote Sensing Images**|ECCV2024|[MA3E](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-031-73242-3_15)|[link](https:\u002F\u002Fgithub.com\u002Fbenesakitam\u002FMA3E)|\n|**SAR-JEPA**|**Predicting 
Gradient is Better: Exploring Self-Supervised Learning for SAR ATR with a Joint-Embedding Predictive Architecture**|ISPRS JPRS2024|[SAR-JEPA](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0924271624003514)|[link](https:\u002F\u002Fgithub.com\u002Fwaterdisappear\u002FSAR-JEPA)|\n|**PIS**|**Pretrain a Remote Sensing Foundation Model by Promoting Intra-instance Similarity**|TGRS2024|[PIS](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10697182)|[link](https:\u002F\u002Fgithub.com\u002FShawnAn-WHU\u002FPIS)|\n|**FG-MAE**|**Feature Guided Masked Autoencoder for Self-Supervised Learning in Remote Sensing**|IEEE JSTARS2024|[FG-MAE](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2024.3493237)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FFGMAE)|\n|**RingMo-lite**|**RingMo-Lite: A Remote Sensing Lightweight Network With CNN-Transformer Hybrid Framework**|IEEE TGRS2024|[RingMo-lite](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2024.3360447)|null|\n|**-**|**A Multimodal Unified Representation Learning Framework With Masked Image Modeling for Remote Sensing Images**|IEEE TGRS2024|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2024.3494244)|null|\n|**-**|**Masked Feature Modeling for Generative Self-Supervised Representation Learning of High-Resolution Remote Sensing Images**|IEEE JSTARS2024|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2024.3385420)|null|\n|**OReole-FM**|**OReole-FM: successes and challenges toward billion-parameter foundation models for high-resolution satellite imagery**|SIGSPATIAL2024|[OReole-FM](https:\u002F\u002Fdoi.org\u002F10.1145\u002F3678717.3691292)|null|\n|**SatVision-TOA**|**SatVision-TOA: A Geospatial Foundation Model for Coarse-Resolution All-Sky Remote Sensing Imagery**|Arxiv2024|[SatVision-TOA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.17000)|[link](https:\u002F\u002Fgithub.com\u002Fnasa-nccs-hpda\u002Fpytorch-caney)|\n|**Prithvi-EO-2.0**|**Prithvi-EO-2.0: A Versatile Multi-Temporal Foundation Model for Earth Observation Applications**|Arxiv2024|[Prithvi-EO-2.0](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.02732)|[link](https:\u002F\u002Fgithub.com\u002FNASA-IMPACT\u002FPrithvi-EO-2.0)|\n|**WildSAT**|**WildSAT: Learning Satellite Image Representations from Wildlife Observations**|Arxiv2024|[WildSAT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14428)|[link](https:\u002F\u002Fgithub.com\u002Fmdchuc\u002FHRSFM)|\n|**SpectralEarth**|**SpectralEarth: Training Hyperspectral Foundation Models at Scale**|IEEE JSTARS2025|[SpectralEarth](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2025.3581451)|null|\n|**SenPa-MAE**|**SenPa-MAE: Sensor Parameter Aware Masked Autoencoder for Multi-Satellite Self-Supervised Pretraining**|LNCS2025|[SenPa-MAE](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-3-031-85187-2_20)|[link](https:\u002F\u002Fgithub.com\u002FJonathanPrexl\u002FSenPa-MAE)|\n|**RingMo-Aerial**|**RingMo-Aerial: An Aerial Remote Sensing Foundation Model With Affine Transformation Contrastive Learning**|IEEE TPAMI2025|[RingMo-Aerial](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftpami.2025.3602237)|null|\n|**PIEViT**|**Pattern Integration and Enhancement Vision Transformer for Self-Supervised Learning in Remote Sensing**|IEEE TGRS2025|[PIEViT](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2025.3541390)|null|\n|**SeaMo**|**SeaMo: A Multi-Seasonal and Multimodal Remote Sensing Foundation Model**|Information 
Fusion2025|[SeaMo](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1566253525004075)|null|\n|**HyperSIGMA**|**HyperSIGMA: Hyperspectral Intelligence Comprehension Foundation Model**|IEEE TPAMI2025|[HyperSIGMA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11519)|[link](https:\u002F\u002Fgithub.com\u002FWHU-Sigma\u002FHyperSIGMA?tab=readme-ov-file)|\n|**FoMo**|**FoMo: Multi-Modal, Multi-Scale and Multi-Task Remote Sensing Foundation Models for Forest Monitoring**|AAAI2025|[FoMo](https:\u002F\u002Fdoi.org\u002F10.1609\u002Faaai.v39i27.35002)|[link](https:\u002F\u002Fgithub.com\u002FRolnickLab\u002FFoMo-Bench)|\n|**RingMamba**|**RingMamba: Remote Sensing Multisensor Pretraining With Visual State Space Model**|IEEE TGRS2025|[RingMamba](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2025.3603998)|null|\n|**CrossEarth**|**CrossEarth: Geospatial Vision Foundation Model for Domain Generalizable Remote Sensing Semantic Segmentation**|IEEE TPAMI2025|[CrossEarth](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftpami.2025.3649001)|[link](https:\u002F\u002Fgithub.com\u002FCuzyoung\u002FCrossEarth)|\n|**CtxMIM**|**CtxMIM: Context-Enhanced Masked Image Modeling for Remote Sensing Image Understanding**|ACM TOMM2025|[CtxMIM](https:\u002F\u002Fdoi.org\u002F10.1145\u002F3769084)|null|\n|**SatMamba**|**SatMamba: Development of Foundation Models for Remote Sensing Imagery Using State Space Models**|Arxiv2025|[SatMamba](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.00435)|[link](https:\u002F\u002Fgithub.com\u002Fmdchuc\u002FHRSFM)|\n|**Galileo**|**Galileo: Learning Global & Local Features of Many Remote Sensing Modalities**|ICML2025 TerraBytes Workshop|[Galileo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.09356)|[link](https:\u002F\u002Fgithub.com\u002Fnasaharvest\u002Fgalileo)|\n|**SatDiFuser**|**Can Generative Geospatial Diffusion Models Excel as Discriminative Geospatial Foundation Models?**|Arxiv2025|[SatDiFuser](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.07890)|null|\n|**RoMA**|**RoMA: Scaling up Mamba-based Foundation Models for Remote Sensing**|Arxiv2025|[RoMA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.10392)|[link](https:\u002F\u002Fgithub.com\u002FMiliLab\u002FRoMA)|\n|**Panopticon**|**Panopticon: Advancing Any-Sensor Foundation Models for Earth Observation**|CVPR2025|[Panopticon](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.10845)|[link](https:\u002F\u002Fgithub.com\u002FPanopticon-FM\u002Fpanopticon)|\n|**HyperFree**|**HyperFree: A Channel-adaptive and Tuning-free Foundation Model for Hyperspectral Remote Sensing Imagery**|CVPR2025|[HyperFree](https:\u002F\u002Frsidea.whu.edu.cn\u002FHyperFree.pdf)|[link](https:\u002F\u002Fgithub.com\u002FJingtao-Li-CVer\u002FHyperFree)|\n|**AnySat**|**AnySat: One Earth Observation Model for Many Resolutions, Scales, and Modalities**|CVPR2025|[AnySat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.14123)|[link](https:\u002F\u002Fgithub.com\u002Fgastruc\u002FAnySat)|\n|**HyperSL**|**HyperSL: A Spectral Foundation Model for Hyperspectral Image Interpretation**|IEEE TGRS2025|[HyperSL](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10981753)|[link](https:\u002F\u002Fgithub.com\u002Fkkweil\u002FHyperSL)|\n|**DynamicVis**|**DynamicVis: An Efficient and General Visual Foundation Model for Remote Sensing Image Understanding**|Arxiv2025|[DynamicVis](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.16426)|[link](https:\u002F\u002Fgithub.com\u002FKyanChen\u002FDynamicVis)|\n|**DeepAndes**|**DeepAndes: A 
Self-Supervised Vision Foundation Model for Multispectral Remote Sensing Imagery of the Andes**|IEEE JSTARS2025|[DeepAndes](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fjstars.2025.3619423)|null|\n|**TiMo**|**TiMo: Spatiotemporal Foundation Model for Satellite Image Time Series**|Arxiv2025|[TiMo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.08723)|[link](https:\u002F\u002Fgithub.com\u002FMiliLab\u002FTiMo)|\n|**TerraFM**|**TerraFM: A Scalable Foundation Model for Unified Multisensor Earth Observation**|Arxiv2025|[TerraFM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.06281)|[link](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FTerraFM)|\n|**TESSERA**|**TESSERA: Temporal Embeddings of Surface Spectra for Earth Representation and Analysis**|Arxiv2025|[TESSERA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.20380)|[link](https:\u002F\u002Fgithub.com\u002Fucam-eo\u002Ftessera)|\n|**CGEarthEye**|**CGEarthEye: A High-Resolution Remote Sensing Vision Foundation Model Based on the Jilin-1 Satellite Constellation**|Arxiv2025|[CGEarthEye](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.00356)|null|\n|**MoSAiC**|**MoSAiC: Multi-Modal Multi-Label Supervision-Aware Contrastive Learning for Remote Sensing**|Arxiv2025|[MoSAiC](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.08683)|null|\n|**AlphaEarth**|**AlphaEarth Foundations: An embedding field model for accurate and efficient global mapping from sparse label data**|Arxiv2025|[AlphaEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.22291)|null|\n|**MAESTRO**|**MAESTRO: Masked AutoEncoders for Multimodal, Multitemporal, and Multispectral Earth Observation Data**|Arxiv2025|[MAESTRO](https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.10894)|[link](https:\u002F\u002Fgithub.com\u002FIGNF\u002FMAESTRO)|\n|**FedSense**|**Towards Privacy-preserved Pre-training of Remote Sensing Foundation Models with Federated Mutual-guidance Learning**|ICCV2025|[FedSense](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11051)|null|\n|**RS-vHeat**|**RS-vHeat: Heat Conduction Guided Efficient Remote Sensing Foundation Model**|ICCV2025|[RS-vHeat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.17984)|null|\n|**Copernicus-FM**|**Towards a Unified Copernicus Foundation Model for Earth Vision**|ICCV2025|[Copernicus-FM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11849)|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FCopernicus-FM)|\n|**SelectiveMAE**|**Scaling Efficient Masked Autoencoder Learning on Large Remote Sensing Dataset**|ICCV2025|[SelectiveMAE](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11933)|[link](https:\u002F\u002Fgithub.com\u002FFengxiang23\u002FSelectiveMAE)|\n|**SMARTIES**|**SMARTIES: Spectrum-Aware Multi-Sensor Auto-Encoder for Remote Sensing Images**|ICCV2025|[SMARTIES](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.19585)|[link](https:\u002F\u002Fgsumbul.github.io\u002FSMARTIES\u002F)|\n|**TerraMind**|**TerraMind: Large-Scale Generative Multimodality for Earth Observation**|ICCV2025|[TerraMind](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.11171)|[link](https:\u002F\u002Fgithub.com\u002FIBM\u002Fterramind)|\n|**SkySense V2**|**SkySense V2: A Unified Foundation Model for Multi-modal Remote Sensing**|ICCV2025|[SkySense V2](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.13812)|null|\n|**SkySense++**|**A semantic-enhanced multi-modal remote sensing foundation model for Earth observation**|Nature Machine Intelligence 
2025|[SkySense++](https:\u002F\u002Fwww.nature.com\u002Farticles\u002Fs42256-025-01078-8)|[link](https:\u002F\u002Fgithub.com\u002Fkang-wu\u002FSkySensePlusPlus?tab=readme-ov-file)|\n|**FlexiMo**|**FlexiMo: A Flexible Remote Sensing Foundation Model**|IEEE TGRS2026|[FlexiMo](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2026.3656362)|null|\n|**RingMoE**|**RingMoE: Mixture-of-Modality-Experts Multi-Modal Foundation Models for Universal Remote Sensing Image Interpretation**|IEEE TPAMI2026|[RingMoE](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftpami.2025.3643453)|null|\n|**-**|**A Complex-Valued SAR Foundation Model Based on Physically Inspired Representation Learning**|IEEE TIP2026|[Paper](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftip.2026.3652417)|null|\n|**MAPEX**|**MAPEX: Modality-Aware Pruning of Experts for Remote Sensing Foundation Models**|IEEE TGRS2026|[MAPEX](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2026.3652100)|[link](https:\u002F\u002Fgithub.com\u002FHSG-AIML\u002FMAPEX)|\n|**Alliance**|**Alliance: All-in-One Spectral-Spatial-Frequency Awareness Foundation Model**|IEEE TPAMI2026|[Alliance](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftpami.2025.3639595)|null|\n\n## 遥感 \u003Cins>视觉-语言\u003C\u002Fins> 基础模型\n\n|缩写|标题|发表刊物|论文|代码与权重|\n|:---:|---|:---:|:---:|:---:|\n|**RSGPT**|**RSGPT：一种遥感视觉语言模型及基准测试**|Arxiv2023|[RSGPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15266)|[链接](https:\u002F\u002Fgithub.com\u002FLavender105\u002FRSGPT)|\n|**RemoteCLIP**|**RemoteCLIP：面向遥感的视觉语言基础模型**|IEEE TGRS2024|[RemoteCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11029)|[链接](https:\u002F\u002Fgithub.com\u002FChenDelong1999\u002FRemoteCLIP)|\n|**GeoRSCLIP**|**RS5M：用于遥感视觉语言基础模型的大规模视觉-语言数据集**|IEEE TGRS2024|[GeoRSCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11300)|[链接](https:\u002F\u002Fgithub.com\u002Fom-ai-lab\u002FRS5M?tab=readme-ov-file)|\n|**GRAFT**|**通过地面遥感对齐实现无需标注的遥感视觉语言基础模型**|ICLR2024|[GRAFT](https:\u002F\u002Fopenreview.net\u002Fpdf?id=w9tc699w3Z)|无|\n|**-**|**开拓新领域：探索多模态大语言模型的地缘与地理空间能力**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.14656)|[链接](https:\u002F\u002Fgithub.com\u002Fjonathan-roberts1\u002Fcharting-new-territories)|\n|**-**|**遥感版ChatGPT：利用ChatGPT和视觉模型解决遥感任务**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09083)|[链接](https:\u002F\u002Fgithub.com\u002FHaonanGuo\u002FRemote-Sensing-ChatGPT)|\n|**EarthGPT**|**EarthGPT：面向遥感领域多传感器图像理解的通用多模态大语言模型**|IEEE 
TGRS2024|[EarthGPT](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2024.3409624)|无|\n|**SkyCLIP**|**SkyScript：面向遥感的大规模且语义多样化的视觉-语言数据集**|AAAI2024|[SkyCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.12856)|[链接](https:\u002F\u002Fgithub.com\u002Fwangzhecheng\u002FSkyScript)|\n|**GeoChat**|**GeoChat：面向遥感的接地型大型视觉-语言模型**|CVPR2024|[GeoChat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.15826)|[链接](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FGeoChat)|\n|**LHRS-Bot**|**LHRS-Bot：借助VGI增强的大型多模态语言模型赋能遥感**|ECCV2024|[LHRS-Bot](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.02544)|[链接](https:\u002F\u002Fgithub.com\u002FNJU-LHRS\u002FLHRS-Bot)|\n|**RS-LLaVA**|**RS-LLaVA：用于遥感影像联合字幕生成与问答的大型视觉语言模型**|RS2024|[RS-LLaVA](https:\u002F\u002Fwww.mdpi.com\u002F2072-4292\u002F16\u002F9\u002F1477)|[链接](https:\u002F\u002Fgithub.com\u002FBigData-KSU\u002FRS-LLaVA?tab=readme-ov-file)|\n|**SkySenseGPT**|**SkySenseGPT：用于遥感视觉语言理解的细粒度指令微调数据集及模型**|Arxiv2024|[SkySenseGPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10100)|[链接](https:\u002F\u002Fgithub.com\u002FLuo-Z13\u002FSkySenseGPT)|\n|**EarthMarker**|**EarthMarker：用于区域级和点级遥感影像理解的视觉提示学习**|IEEE TGRS2024|[EarthMarker](https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.13596)|[链接](https:\u002F\u002Fgithub.com\u002Fwivizhang\u002FEarthMarker)|\n|**GeoText**|**迈向自然语言引导的无人机：包含空间关系匹配的GeoText-1652基准测试**|ECCV2024|[GeoText](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.12751)|[链接](https:\u002F\u002Fmultimodalgeo.github.io\u002FGeoText\u002F)|\n|**Aquila**|**Aquila：一种分层对齐的视觉-语言模型，用于增强遥感图像理解**|Arxiv2024|[Aquila](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.06074)|无|\n|**LHRS-Bot-Nova**|**LHRS-Bot-Nova：改进的多模态大语言模型，用于遥感视觉-语言解读**|ISPRS JPRS2025|[LHRS-Bot-Nova](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.isprsjprs.2025.06.003)|[链接](https:\u002F\u002Fgithub.com\u002FNJU-LHRS\u002FLHRS-Bot)|\n|**RSCLIP**|**在无人工标注的情况下推动遥感领域视觉-语言模型的极限**|Arxiv2024|[RSCLIP](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.07048)|无|\n|**GeoGround**|**GeoGround：用于遥感视觉定位的统一大型视觉-语言模型**|Arxiv2024|[GeoGround](https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.11904)|[链接](https:\u002F\u002Fgithub.com\u002Fzytx121\u002FGeoGround)|\n|**RingMoGPT**|**RingMoGPT：用于视觉、语言及接地任务的统一遥感基础模型**|TGRS2024|[RingMoGPT](https:\u002F\u002Fieeexplore.ieee.org\u002Fstamp\u002Fstamp.jsp?tp=&arnumber=10777289)|无|\n|**RSUniVLM**|**RSUniVLM：基于面向粒度的专家混合机制的遥感统一视觉语言模型**|Arxiv2024|[RSUniVLM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05679)|[链接](https:\u002F\u002Frsunivlm.github.io\u002F)|\n|**UniRS**|**UniRS：通过视觉语言模型统一多时相遥感任务**|Arxiv2024|[UniRS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.20742v1)|无|\n|**REO-VLM**|**REO-VLM：改造视觉语言模型以应对地球观测中的回归挑战**|Arxiv2024|[REO-VLM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.16583)|无|\n|**SkyEyeGPT**|**SkyEyeGPT：通过大型语言模型的指令微调统一遥感视觉-语言任务**|ISPRS 
JPRS2025|[SkyEyeGPT](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.isprsjprs.2025.01.020)|[链接](https:\u002F\u002Fgithub.com\u002FZhanYang-nwpu\u002FSkyEyeGPT)|\n|**VHM**|**VHM：用于遥感图像分析的多功能且诚实的视觉语言模型**|AAAI2025|[VHM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.20213v4)|[链接](https:\u002F\u002Fgithub.com\u002Fopendatalab\u002FVHM)|\n|**TEOChat**|**TEOChat：用于时间序列地球观测数据的大语言和视觉助手**|ICLR2025|[TEOChat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.06234)|[链接](https:\u002F\u002Fgithub.com\u002Fermongroup\u002FTEOChat)|\n|**EarthDial**|**EarthDial：将多感官地球观测转化为交互式对话**|CVPR2025|[EarthDial](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.15190#page=3.84)|[链接](https:\u002F\u002Fgithub.com\u002Fhiyamdebary\u002FEarthDial)|\n|**SkySense-O**|**SkySense-O：以视觉为中心的视觉-语言建模，迈向开放世界遥感解读**|CVPR2025|[SkySense-O](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2025\u002Fpapers\u002FZhu_SkySense-O_Towards_Open-World_Remote_Sensing_Interpretation_with_Vision-Centric_Visual-Language_Modeling_CVPR_2025_paper.pdf)|[链接](https:\u002F\u002Fgithub.com\u002Fzqcrafts\u002FSkySense-O)|\n|**XLRS-Bench**|**XLRS-Bench：您的多模态大语言模型能否理解超大规模超高分辨率遥感影像？**|CVPR2025|[XLRS-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.23771)|[链接](https:\u002F\u002Fxlrs-bench.github.io\u002F)|\n|**GeoPix**|**GeoPix：用于遥感中像素级图像理解的多模态大语言模型**|IEEE GRSM2025|[GeoPix](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06828)|[链接](https:\u002F\u002Fgithub.com\u002FNorman-Ou\u002FGeoPix)|\n|**GeoPixel**|**GeoPixel：遥感中的像素接地型大型多模态模型**|ICML2025|[GeoPixel](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.13925)|[链接](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FGeoPixel)|\n|**Co-LLaVA**|**Co-LLaVA：通过模型协作实现高效的遥感视觉问答**|RS2025|[Co-LLaVA](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17030466)|无|\n|**GeoMag**|**GeoMag：用于像素级精细遥感图像解析的视觉-语言模型**|ACMMM2025|[GeoMag](https:\u002F\u002Fdoi.org\u002F10.1145\u002F3746027.3754559)|无|\n|**-**|**基于学习评分模型的质量驱动型遥感视觉-语言数据整理**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.00743)|无|\n|**DOFA-CLIP**|**DOFA-CLIP：用于地球观测的多模态视觉-语言基础模型**|Arxiv2025|[DOFA-CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.06312)|[链接](https:\u002F\u002Fgithub.com\u002Fxiong-zhitong\u002FGeoLB-SigLIP)|\n|**DGTRS-CLIP**|**DGTRSD & 
DGTRS-CLIP：用于对齐的双粒度遥感图像-文本数据集及视觉语言基础模型**|Arxiv2025|[DGTRS-CLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.19311)|[链接](https:\u002F\u002Fgithub.com\u002FMitsuiChen14\u002FDGTRS)|\n|**Falcon**|**Falcon：一种遥感视觉-语言基础模型**|Arxiv2025|[Falcon](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11070)|[链接](https:\u002F\u002Fgithub.com\u002FTianHuiLab\u002FFalcon)|\n|**GeoRSMLLM**|**GeoRSMLLM：用于地球科学和遥感领域视觉-语言任务的多模态大语言模型**|Arxiv2025|[GeoRSMLLM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.12490)|无|\n|**LRS-VQA**|**当大型视觉-语言模型遇到大型遥感影像时：粗粒度到细粒度的文本引导标记剪枝**|ICCV2025|[LRS-VQA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.07588)|[链接](https:\u002F\u002Fgithub.com\u002FVisionXLab\u002FLRS-VQA)|\n|**UrbanLLaVA**|**UrbanLLaVA：具有空间推理与理解能力的城市智能多模态大语言模型**|ICCV2025|[UrbanLLaVA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.23219)|[链接](https:\u002F\u002Fgithub.com\u002Ftsinghua-fib-lab\u002FUrbanLLaVA)|\n|**OmniGeo**|**OmniGeo：迈向用于地理空间人工智能的多模态大语言模型**|Arxiv2025|[OmniGeo](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.16326)|无|\n|**EagleVision**|**EagleVision：面向遥感的对象级属性多模态大语言模型**|Arxiv2025|[EagleVision](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.23330)|[链接](https:\u002F\u002Fgithub.com\u002FXiangTodayEatsWhat\u002FEagleVision)|\n|**SegEarth-R1**|**SegEarth-R1：通过大语言模型进行地理空间像素推理**|Arxiv2025|[SegEarth-R1](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.09644)|[链接](https:\u002F\u002Fgithub.com\u002Fearth-insights\u002FSegEarth-R1)|\n|**RemoteSAM**|**RemoteSAM：迈向地球观测的“任何东西都能分割”**|ACMMM2025|[RemoteSAM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.18022)|[链接](https:\u002F\u002Fgithub.com\u002F1e12Leon\u002FRemoteSAM)|\n|**DynamicVL**|**DynamicVL：针对动态城市理解的多模态大语言模型基准测试**|Arxiv2025|[DynamicVL](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.21076)|无|\n|**LISAt**|**LISAt：卫星影像的语言指令分割助手**|Arxiv2025|[LISAt](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.02829)|[链接](https:\u002F\u002Flisat-bair.github.io\u002FLISAt\u002F)|\n|**EarthMind**|**EarthMind：利用大型多模态模型迈向多粒度、多传感器的地球观测**|Arxiv2025|[EarthMind](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.01667)|[链接](https:\u002F\u002Fgithub.com\u002Fshuyansy\u002FEarthMind)|\n|**-**|**遥感大型视觉-语言模型：语义增强的多层级对齐与语义感知专家建模**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.21863)|无|\n|**RLita**|**RLita：用于遥感基础模型的区域级图像-文本对齐方法**|RS2025|[RLita](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17101661)|无|\n|**RingMo-Agent**|**RingMo-Agent：用于多平台和多模态推理的统一遥感基础模型**|Arxiv2025|[RingMo-Agent](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.20776)|无|\n|**FUSE-RSVLM**|**FUSE-RSVLM：用于遥感的特征融合视觉-语言模型**|Arxiv2025|[FUSE-RSVLM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2512.24022)|[链接](https:\u002F\u002Fgithub.com\u002FYunkaidang\u002FRSVLM)|\n|**GeoReason**|**GeoReason：通过逻辑一致性强化学习，在遥感视觉-语言模型中对思维与回答进行对齐**|Arxiv2026|[GeoReason](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.04118)|[链接](https:\u002F\u002Fgithub.com\u002Fcanlanqianyan\u002FGeoReason)|\n|**RSCoVLM**|**用于遥感多任务学习的视觉-语言模型协同训练**|RS2026|[RSCoVLM](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs18020222)|[链接](https:\u002F\u002Fgithub.com\u002FVisionXLab\u002FRSCoVLM)|\n|**GeoAlignCLIP**|**GeoAlignCLIP：通过多粒度一致性学习提升遥感领域的细粒度视觉-语言对齐**|Arxiv2026|[GeoAlignCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.09566)|无|\n\n## 遥感\u003Cins>生成式\u003C\u002Fins>基础模型\n\n|缩写|标题|发表|论文|代码与权重|\n|:---:|---|:---:|:---:|:---:|\n|**Seg2Sat**|**Seg2Sat - 使用预训练扩散模型从分割图生成航拍视图**|Github|无|[链接](https:\u002F\u002Fgithub.com\u002FRubenGres\u002FSeg2Sat)|\n|**-**|**生成属于你的苏格兰：基于地图条件的卫星图像生成**|NeurIPS 
2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.16648)|[链接](https:\u002F\u002Fgithub.com\u002Ftoastyfrosty\u002Fmap-sat)|\n|**GeoRSSD**|**RS5M：用于遥感视觉-语言基础模型的大规模视觉-语言数据集**|ArXiv 2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11300)|[链接](https:\u002F\u002Fhuggingface.co\u002FZilun\u002FGeoRSSD)|\n|**DiffusionSat**|**DiffusionSat：面向卫星影像的生成式基础模型**|ICLR 2024|[DiffusionSat](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.03606)|[链接](https:\u002F\u002Fgithub.com\u002Fsamar-khanna\u002FDiffusionSat)|\n|**CRS-Diff**|**CRS-Diff：可控生成式遥感基础模型**|ArXiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.11614)|[链接](https:\u002F\u002Fgithub.com\u002FSonettoo\u002FCRS-Diff?tab=readme-ov-file)|\n|**MetaEarth**|**MetaEarth：全球尺度遥感图像生成的生成式基础模型**|ArXiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.13570)|[链接](https:\u002F\u002Fjiupinjia.github.io\u002Fmetaearth\u002F)|\n|**HSIGene**|**HSIGene：高光谱图像生成的基础模型**|ArXiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2409.12470)|[链接](https:\u002F\u002Fgithub.com\u002FLiPang\u002FHSIGene)|\n|**Text2Earth**|**Text2Earth：借助全球规模数据集和基础模型解锁文本驱动的遥感图像生成**|ArXiv 2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.00895)|[链接](https:\u002F\u002Fchen-yang-liu.github.io\u002FText2Earth\u002F)|\n\n## 遥感\u003Cins>视觉-位置\u003C\u002Fins>基础模型\n\n|缩写|标题|发表|论文|代码与权重|\n|:---:|---|:---:|:---:|:---:|\n|**CSP**|**CSP：用于地理空间视觉表征的自监督对比空间预训练**|ICML 2023|[CSP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.01118)|[链接](https:\u002F\u002Fgengchenmai.github.io\u002Fcsp-website\u002F)|\n|**GeoCLIP**|**GeoCLIP：受Clip启发的位置与图像对齐，实现高效的全球地理定位**|NeurIPS 2023|[GeoCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2309.16020)|[链接](https:\u002F\u002Fvicentevivan.github.io\u002FGeoCLIP\u002F)|\n|**SatCLIP**|**SatCLIP：利用卫星影像构建的全球通用位置嵌入**|ArXiv 2023|[SatCLIP](https:\u002F\u002Farxiv.org\u002Fabs\u002F2311.17179)|[链接](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Fsatclip)|\n|**RANGE**|**RANGE：用于多分辨率地理嵌入的检索增强神经场**|CVPR 2025|[RANGE](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2502.19781)|无|\n|**GAIR**|**GAIR：通过地理对齐的隐式表征改进多模态地理基础模型**|ArXiv 2025|[GAIR](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.16683)|无|\n\n## 遥感\u003Cins>视觉-音频\u003C\u002Fins>基础模型\n\n|缩写|标题|发表|论文|代码与权重|\n|:---:|---|:---:|:---:|:---:|\n|**-**|**遥感数据的自监督视听表征学习**|JAG 2022|[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1569843222003181)|[链接](https:\u002F\u002Fgithub.com\u002Fkhdlr\u002FSoundingEarth)|\n\n## 遥感 \u003Cins>任务特定\u003C\u002Fins> 基础模型\n\n|缩写|标题|发表期刊|论文|代码与权重|任务|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**SS-MAE**|**SS-MAE：用于多源遥感图像分类的空间-光谱掩码自编码器**|TGRS 2023|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10314566\u002F)|[链接](https:\u002F\u002Fgithub.com\u002Fsummitgao\u002FSS-MAE?tab=readme-ov-file)|图像分类|\n|**-**|**一种结合提示学习的解耦范式用于遥感图像变化描述**|TGRS 2023|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10271701)|[链接](https:\u002F\u002Fgithub.com\u002FChen-Yang-Liu\u002FPromptCC)|遥感图像变化描述|\n|**TTP**|**时光旅行像素：基于基础模型的双时相特征融合用于遥感图像变化检测**|Arxiv 2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.16202)|[链接](https:\u002F\u002Fgithub.com\u002FKyanChen\u002FTTP)|变化检测|\n|**CSMAE**|**探索掩码自编码器在遥感传感器无关图像检索中的应用**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.07782)|[链接](https:\u002F\u002Fgithub.com\u002Fjakhac\u002FCSMAE)|图像检索|\n|**RSPrompter**|**RSPrompter：基于视觉基础模型的遥感实例分割提示学习**|TGRS 
2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.16269)|[链接](https:\u002F\u002Fgithub.com\u002FKyanChen\u002FRSPrompter)|实例分割|\n|**BAN**|**一种基于基础模型的遥感变化检测新学习范式**|TGRS 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.01163)|[链接](https:\u002F\u002Fgithub.com\u002Flikyoo\u002FBAN)|变化检测|\n|**-**|**通过 Segment Anything Model (SAM) 进行光学遥感影像与地图数据之间的变化检测**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.09019)|无|变化检测（光学与 OSM 数据）|\n|**AnyChange**|**分割任何变化**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.01188)|无|零样本变化检测|\n|**RS-CapRet**|**用于遥感图像描述和检索的大语言模型**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.06475)|无|图像描述与文本-图像检索|\n|**-**|**带有噪声标签的遥感图像分割任务特定预训练**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.16164)|无|图像分割（噪声标签）|\n|**RSBuilding**|**RSBuilding：基于基础模型的通用遥感图像建筑物提取与变化检测**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.07564)|[链接](https:\u002F\u002Fgithub.com\u002FMeize0729\u002FRSBuilding)|建筑物提取与变化检测|\n|**SAM-Road**|**用于道路网络图提取的 Segment Anything Model**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.16051)|[链接](https:\u002F\u002Fgithub.com\u002Fhtcr\u002Fsam_road)|道路提取|\n|**CrossEarth**|**CrossEarth：面向领域泛化的遥感语义分割地理空间视觉基础模型**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22629)|[链接](https:\u002F\u002Fgithub.com\u002FCuzyoung\u002FCrossEarth)|领域泛化遥感语义分割|\n|**GeoGround**|**GeoGround：用于遥感视觉定位的统一大型视觉-语言模型**|Arxiv 2024|[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2411.11904)|[链接](https:\u002F\u002Fgithub.com\u002Fzytx121\u002FGeoGround)|遥感视觉定位|\n|**TPOV-Seg**|**TPOV-Seg：用于开放词汇遥感语义分割的文本增强型视觉-语言模型提示调优**|IEEE TGRS 2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2025.3624767)|无|开放词汇语义分割|\n|**SegEarth-OV**|**SegEarth-OV：迈向无需训练的遥感图像开放词汇分割**|CVPR 2025|[论文](https:\u002F\u002Fcvpr.thecvf.com\u002Fvirtual\u002F2025\u002Fposter\u002F33431)|[链接](https:\u002F\u002Fgithub.com\u002Flikyoo\u002FSegEarth-OV)|开放词汇分割|\n|**RSRefSeg 2**|**RSRefSeg 2：利用基础模型解耦引用式遥感图像分割**|IEEE TGRS 2026|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2025.3647535)|无|引用式图像分割|\n|**AgriFM**|**AgriFM：用于农业制图的多源时序遥感基础模型**|RSE 2026|[论文](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.rse.2026.115234)|[链接](https:\u002F\u002Fgithub.com\u002Fflyakon\u002FAgriFM)|作物制图 \u002F 农业制图|\n|**SARATR-X**|**SARATR-X：构建 SAR 目标识别的基础模型**|IEEE TIP 2025|[SARATR-X](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10856784)|[链接](https:\u002F\u002Fgithub.com\u002Fwaterdisappear\u002FSARATR-X)|SAR 目标识别|\n\n## 遥感智能体\n|缩写|标题|发表会议\u002F期刊|论文|代码与权重|\n|:---:|---|:---:|:---:|:---:|\n|**GeoLLM-QA**|**遥感平台中工具增强型智能体的评估**|ICLR 2024 ML4RS 
Workshop|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.00709)|无|\n|**RS-Agent**|**RS-Agent：通过智能体自动化遥感任务**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.07089)|无|\n|**Change-Agent**|**Change-Agent：迈向交互式、全面的遥感变化解读与分析**|TGRS2024|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10591792)|[链接](https:\u002F\u002Fgithub.com\u002FChen-Yang-Liu\u002FChange-Agent)|\n|**GeoLLM-Engine**|**GeoLLM-Engine：构建地理空间协作助手的真实环境**|CVPRW2024|[论文](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2024W\u002FEarthVision\u002Fhtml\u002FSingh_GeoLLM-Engine_A_Realistic_Environment_for_Building_Geospatial_Copilots_CVPRW_2024_paper.html)|无|\n|**PEACE**|**PEACE：利用多模态大语言模型赋能地质图的整体理解**|CVPR2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06184)|[链接](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FPEACE?tab=readme-ov-file)|\n|**-**|**面向地球观测的大语言模型智能体：UnivEARTH 数据集**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.12110)|无|\n|**Geo-OLM**|**Geo-OLM：借助低成本开源语言模型与状态驱动的工作流，推动可持续的地球观测研究**|COMPASS'2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.04319)|[链接](https:\u002F\u002Fgithub.com\u002Fdstamoulis\u002Fgeo-olms)|\n|**ThinkGeo**|**ThinkGeo：评估用于遥感任务的工具增强型智能体**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.23752)|[链接](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FThinkGeo)|\n|**AirSpatialBot**|**AirSpatialBot：一种具备空间感知能力的空中智能体，用于细粒度车辆属性识别与检索**|IEEE TGRS2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ftgrs.2025.3570895)|[链接](https:\u002F\u002Fgithub.com\u002FVisionXLab\u002FAirSpatialBot)|\n|**OpenEarthAgent**|**OpenEarthAgent：工具增强型地理空间智能体的统一框架**|Arxiv2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.17665)|[链接](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FOpenEarthAgent)|\n|**GeoEyes**|**GeoEyes：针对超高分辨率遥感影像的长距离视觉-语言理解而设计的地理空间上下文缩放型智能体**|Arxiv2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.14201)|[链接](https:\u002F\u002Fgithub.com\u002Fnanocm\u002FGeoEyes)|\n|**Earth-Agent**|**Earth-Agent：用智能体解锁地球观测的全貌**|ICLR2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.23141)|[链接](https:\u002F\u002Fgithub.com\u002Fopendatalab\u002FEarth-Agent)|\n\n## RSFM 基准测试\n|缩写|标题|发表|论文|链接|下游任务|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**-**|**重新审视预训练遥感模型基准：调整大小和归一化很重要**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13456)|[链接](https:\u002F\u002Fgithub.com\u002Fisaaccorley\u002Fresize-is-all-you-need)|分类|\n|**GEO-Bench**|**GEO-Bench：迈向地球监测的基础模型**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03831)|[链接](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fgeo-bench)|分类与分割|\n|**FoMo-Bench**|**FoMo-Bench：用于遥感基础模型的多模态、多尺度和多任务森林监测基准**|Arxiv2023|[FoMo-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.10114)|即将发布|针对森林监测的分类、分割和检测|\n|**PhilEO**|**PhilEO 基准：评估地理空间基础模型**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.04464)|[链接](https:\u002F\u002Fgithub.com\u002F91097luke\u002Fphileo-bench)|分割与回归估计|\n|**SkySense**|**SkySense：面向地球观测影像通用解读的多模态遥感基础模型**|CVPR2024|[SkySense](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.10115)|目标开源|分类、分割、检测、变化检测以及多模态分割：时间无关的土地覆盖制图、时间相关的作物制图和多模态场景分类|\n|**VLEO-Bench**|**擅长描述，不擅长计数：基于地球观测数据对 GPT-4V 
的基准测试**|Arxiv2024|[VLEO-bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.17600)|[链接](https:\u002F\u002Fvleo.danielz.ch\u002F)|位置识别、描述、场景分类、计数、检测和变化检测|\n|**VRSBench**|**VRSBench：用于遥感图像理解的多功能视觉-语言基准数据集**|NeurIPS2024|[VRSBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12384)|[链接](https:\u002F\u002Fvrsbench.github.io\u002F)|图像描述、对象指代和视觉问答|\n|**UrBench**|**UrBench：在多视角城市场景中评估大型多模态模型的综合基准**|AAAI2025|[UrBench](https:\u002F\u002Fdoi.org\u002F10.1609\u002Faaai.v39i10.33163)|[链接](https:\u002F\u002Fopendatalab.github.io\u002FUrBench\u002F)|对象指代、视觉问答、计数、场景分类、位置识别和地理定位|\n|**PANGAEA**|**PANGAEA：面向地理空间基础模型的全球性和包容性基准**|Arxiv2024|[PANGAEA](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.04204)|[链接](https:\u002F\u002Fgithub.com\u002Fyurujaja\u002Fpangaea-bench)|分割、变化检测和回归|\n|**CHOICE**|**CHOICE：评估和理解遥感领域视觉-语言模型的选择**|NeurIPS2025|[CHOICE](https:\u002F\u002Fneurips.cc\u002Fvirtual\u002F2025\u002Fposter\u002F121749)|[链接](https:\u002F\u002Fgithub.com\u002FShawnAn-WHU\u002FCHOICE)|感知与推理|\n|**GEO-Bench-VLM**|**GEO-Bench-VLM：针对地理空间任务的视觉-语言模型基准测试**|ICCV2025|[GEO-Bench-VLM](https:\u002F\u002Ficcv.thecvf.com\u002Fvirtual\u002F2025\u002Fposter\u002F2247)|[链接](https:\u002F\u002Fgithub.com\u002FThe-AI-Alliance\u002FGEO-Bench-VLM)|场景理解、计数、物体分类、事件检测和空间关系|\n|**Copernicus-Bench**|**迈向统一的哥白尼地球视觉基础模型**|Arxiv2025|[Copernicus-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11849)|[链接](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FCopernicus-FM)|分割、分类、变化检测和回归|\n|**REOBench**|**REOBench：评估地球观测基础模型的鲁棒性**|Arxiv2025|[REOBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.16793)|[链接](https:\u002F\u002Fgithub.com\u002Flx709\u002FREOBench)|涵盖6项地球观测任务的鲁棒性|\n|**Plantation Bench**|**Plantation Bench：用于分布偏移下种植园测绘的多尺度、多模态遥感基准**|ICCVW2025|[Plantation Bench](https:\u002F\u002Fdoi.org\u002F10.1109\u002Ficcvw69036.2025.00310)|无|分布偏移下的种植园测绘|\n|**ChatEarthBench**|**ChatEarthBench：针对地球观测的多模态大型语言模型基准测试**|IEEE GRSM2026|[ChatEarthBench](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2026.3650840)|无|EO多模态大型语言模型的基准测试|\n|**GeoReason-Bench**|**GeoReason：通过逻辑一致性强化学习，在遥感视觉-语言模型中实现思维与回答的一致性**|Arxiv2026|[GeoReason-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.04118)|[链接](https:\u002F\u002Fgithub.com\u002Fcanlanqianyan\u002FGeoReason)|逻辑一致性与多步推理|\n|**Earth-Bench**|**Earth-Agent：借助智能体解锁地球观测的全貌**|ICLR2026|[Earth-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.23141)|[链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002FSssunset\u002FEarth-Bench)|工具增强的 EO 推理、多步规划和定量时空分析|\n|**OmniEarth**|**OmniEarth：评估视觉-语言模型在地理空间任务中表现的基准**|Arxiv2026|[OmniEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.09471)|[链接](https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fsjeeudd\u002FOmniEarth)|感知、推理和在地理空间任务中的鲁棒性|\n\n\n## （大规模）预训练数据集\n\n|缩写|标题|出版物|论文|属性|链接|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**fMoW**|**世界功能地图**|CVPR2018|[fMoW](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent_cvpr_2018\u002Fhtml\u002FChristie_Functional_Map_of_CVPR_2018_paper.html)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002FfMoW)|\n|**SEN12MS**|**SEN12MS -- 
用于深度学习和数据融合的地理参考多光谱哨兵1\u002F2影像精选数据集**|-|[SEN12MS](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07789)|**视觉**|[link](https:\u002F\u002Farxiv.org\u002Fabs\u002F1906.07789)|\n|**BEN-MM**|**BigEarthNet-MM：遥感图像分类与检索的大规模多模态多标签基准数据集**|GRSM2021|[BEN-MM](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9552024)|**视觉**|[link](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9552024)|\n|**MillionAID**|**关于创建航空图像理解基准数据集：综述、指南及Million-AID**|JSTARS2021|[MillionAID](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F9393553)|**视觉**|[link](https:\u002F\u002Fcaptain-whu.github.io\u002FDiRS\u002F)|\n|**SeCo**|**季节对比：来自未标注遥感数据的无监督预训练**|ICCV2021|[SeCo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FICCV2021\u002Fhtml\u002FManas_Seasonal_Contrast_Unsupervised_Pre-Training_From_Uncurated_Remote_Sensing_Data_ICCV_2021_paper.html)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fseasonal-contrast)|\n|**fMoW-S2**|**SatMAE：用于时序和多光谱卫星影像的Transformer预训练**|NeurIPS2022|[fMoW-S2](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2022\u002Fhash\u002F01c561df365429f33fcd7a7faa44c985-Abstract-Conference.html)|**视觉**|[link](https:\u002F\u002Fpurl.stanford.edu\u002Fvg497cb6002)|\n|**TOV-RS-Balanced**|**TOV：基于自监督学习的光学遥感图像理解原始视觉模型**|JSTARS2023|[TOV](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10110958)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002FGeoX-Lab\u002FG-RSIM\u002Ftree\u002Fmain\u002FTOV_v1)|\n|**SSL4EO-S12**|**SSL4EO-S12：地球观测中大规模多模态、多时相自监督学习数据集**|GRSM2023|[SSL4EO-S12](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.07044)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FSSL4EO-S12)|\n|**SSL4EO-L**|**SSL4EO-L：用于Landsat影像的数据集和基础模型**|Arxiv2023|[SSL4EO-L](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.09424)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002Ftorchgeo)|\n|**SatlasPretrain**|**SatlasPretrain：用于遥感图像理解的大规模数据集**|ICCV2023|[SatlasPretrain](https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.15660)|**视觉（监督）**|[link](https:\u002F\u002Fgithub.com\u002Fallenai\u002Fsatlas\u002Fblob\u002Fmain\u002FSatlasPretrain.md)|\n|**CACo**|**面向卫星图像的变化感知采样与对比学习**|CVPR2023|[CACo](https:\u002F\u002Fopenaccess.thecvf.com\u002Fcontent\u002FCVPR2023\u002Fhtml\u002FMall_Change-Aware_Sampling_and_Contrastive_Learning_for_Satellite_Images_CVPR_2023_paper.html)|**视觉**|[即将发布](https:\u002F\u002Fgithub.com\u002Futkarshmall13\u002FCACo)|\n|**SAMRS**|**SAMRS：利用Segment Anything Model扩展遥感分割数据集**|NeurIPS2023|[SAMRS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.02034)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FSAMRS)|\n|**RSVG**|**RSVG：探索遥感数据上的视觉定位数据与模型**|TGRS2023|[RSVG](https:\u002F\u002Fieeexplore.ieee.org\u002Fdocument\u002F10056343)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FZhanYang-nwpu\u002FRSVG-pytorch)|\n|**RS5M**|**RS5M：用于遥感视觉-语言基础模型的大规模视觉-语言数据集**|Arxiv2023|[RS5M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11300)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fom-ai-lab\u002FRS5M)|\n|**GEO-Bench**|**GEO-Bench：迈向地球监测的基础模型**|Arxiv2023|[GEO-Bench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.03831)|**视觉（评估）**|[link](https:\u002F\u002Fgithub.com\u002FServiceNow\u002Fgeo-bench)|\n|**RSICap & 
RSIEval**|**RSGPT：遥感视觉语言模型及基准测试**|Arxiv2023|[RSGPT](https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.15266)|**视觉-语言**|[即将发布](https:\u002F\u002Fgithub.com\u002FLavender105\u002FRSGPT)|\n|**Clay**|**Clay基础模型**|-|无|**视觉**|[link](https:\u002F\u002Fclay-foundation.github.io\u002Fmodel\u002F)|\n|**SATIN**|**SATIN：使用视觉-语言模型对卫星影像进行分类的多任务元数据集**|ICCVW2023|[SATIN](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.11619)|**视觉-语言**|[link](https:\u002F\u002Fsatinbenchmark.github.io\u002F)|\n|**SkyScript**|**SkyScript：用于遥感的大规模且语义多样化的视觉-语言数据集**|AAAI2024|[SkyScript](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.12856)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fwangzhecheng\u002FSkyScript)|\n|**ChatEarthNet**|**ChatEarthNet：赋能视觉-语言地理基础模型的全球规模图像-文本数据集**|ESSD2025|[ChatEarthNet](https:\u002F\u002Fdoi.org\u002F10.5194\u002Fessd-17-1245-2025)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FChatEarthNet)|\n|**LuoJiaHOG**|**LuoJiaHOG：面向遥感图像-文本检索的层次化地理感知图像描述数据集**|ISPRS JPRS2025|[LuoJiaHOG](https:\u002F\u002Fdoi.org\u002F10.1016\u002Fj.isprsjprs.2025.02.009)|**视觉-语言**|无|\n|**MMEarth**|**MMEarth：探索用于地理空间表征学习的多模态前置任务**|Arxiv2024|[MMEarth](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.02771)|**视觉**|[link](https:\u002F\u002Fvishalned.github.io\u002Fmmearth\u002F)|\n|**SeeFar**|**SeeFar：与卫星无关的多分辨率地理基础模型数据集**|Arxiv2024|[SeeFar](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.06776)|**视觉**|[link](https:\u002F\u002Fcoastalcarbon.ai\u002Fseefar)|\n|**FIT-RS**|**SkySenseGPT：用于遥感视觉-语言理解的细粒度指令微调数据集和模型**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.10100)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FLuo-Z13\u002FSkySenseGPT)|\n|**RS-GPT4V**|**RS-GPT4V：用于遥感图像理解的统一多模态指令遵循数据集**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12479)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FGeoX-Lab\u002FRS-GPT4V\u002Ftree\u002Fmain)|\n|**RS-4M**|**在大型遥感数据集上高效扩展掩码自编码器学习**|Arxiv2024|[RS-4M](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.11933)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002FFengxiang23\u002FSelectiveMAE)|\n|**Major TOM**|**Major TOM：可扩展的地球观测数据集**|Arxiv2024|[Major TOM](https:\u002F\u002Farxiv.org\u002Fabs\u002F2402.12095)|**视觉**|[link](https:\u002F\u002Fhuggingface.co\u002FMajor-TOM)|\n|**VRSBench**|**VRSBench：用于遥感图像理解的多功能视觉-语言基准数据集**|Arxiv2024|[VRSBench](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.12384)|**视觉-语言**|[link](https:\u002F\u002Fvrsbench.github.io\u002F)|\n|**MMM-RS**|**MMM-RS：用于文本到图像生成的多模态、多GSD、多场景遥感数据集及基准测试**|Arxiv2024|[MMM-RS](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.22362)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fljl5261\u002FMMM-RS)|\n|**DDFAV**|**DDFAV：遥感大型视觉语言模型数据集及评估基准**|RS2025|[DDFAV](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17040719)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FHaodongLi2024\u002Frspope)|\n|**M3LEO**|**一种整合干涉合成孔径雷达与多光谱数据的多模态、多标签地球观测数据集**|NeurIPS2024|[M3LEO](https:\u002F\u002Fproceedings.neurips.cc\u002Fpaper_files\u002Fpaper\u002F2024\u002Ffile\u002Fbd194b579f60879e04ca9ce8a4ea5da1-Paper-Datasets_and_Benchmarks_Track.pdf)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002Fspaceml-org\u002FM3LEO)|\n|**Copernicus-Pretrain**|**迈向统一的哥白尼地球视觉基础模型**|Arxiv2025|[Copernicus-Pretrain](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11849)|**视觉**|[link](https:\u002F\u002Fgithub.com\u002Fzhu-xlab\u002FCopernicus-FM)|\n|**DGTRSD**|**DGTRSD & 
DGTRS-CLIP：用于对齐的双粒度遥感图像-文本数据集及视觉语言基础模型**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.19311)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FMitsuiChen14\u002FDGTRS)|\n|**EarthDial-Instruct**|**EarthDial：将多感官地球观测转化为互动对话**|CVPR2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.15190)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fhiyamdebary\u002FEarthDial)|\n|**GeoPixelD**|**GeoPixel：在遥感中实现像素级定位的大型多模态模型**|ICML2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.13925)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FGeoPixel)|\n|**GeoPixInstruct**|**GeoPix：用于遥感中像素级图像理解的多模态大型语言模型**|IEEE GRSM2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.06828)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FNorman-Ou\u002FGeoPix)|\n|**GeoLangBind-2M**|**重新思考遥感CLIP：利用多模态大型语言模型构建高质量视觉-语言数据集**|ICONIP2024|[论文](https:\u002F\u002Fdoi.org\u002F10.1007\u002F978-981-96-6972-1_29)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fxiong-zhitong\u002FGeoLB-SigLIP)|\n|**Falcon_SFT**|**Falcon：一种遥感视觉-语言基础模型**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.11070)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002FTianHuiLab\u002FFalcon)|\n|**UnivEARTH**|**迈向地球观测的LLM智能体：UnivEARTH数据集**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.12110)|**视觉-语言与智能体**|无|\n|**RemoteSAM-270K**|**RemoteSAM：迈向地球观测的Segment Anything**|ACMMM2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.18022)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002F1e12Leon\u002FRemoteSAM)|\n|**OpenEarthAgent Dataset**|**OpenEarthAgent：工具增强型地理空间智能体的统一框架**|Arxiv2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.17665)|**视觉-语言与智能体**|[link](https:\u002F\u002Fgithub.com\u002Fmbzuai-oryx\u002FOpenEarthAgent)|\n|**UHR-CoZ**|**GeoEyes：超高分辨率遥感图像上长距离视觉-语言理解的地理上下文缩放智能体**|Arxiv2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.14201)|**视觉-语言**|[link](https:\u002F\u002Fgithub.com\u002Fnanocm\u002FGeoEyes)|\n\n## 嵌入数据\n\n|缩写|标题|出版物|论文|代码|数据集\u002F产品|\n|:---:|---|:---:|:---:|:---:|:---:|\n|**CLAY Embeddings**|**Clay Model v0 嵌入**|Source Cooperative2024|无|[链接](https:\u002F\u002Fgithub.com\u002FClay-foundation)|[链接](https:\u002F\u002Fsource.coop\u002Fclay\u002Fclay-model-v0-embeddings)|\n|**Major TOM Embeddings**|**地球的全局稠密嵌入：Major TOM 漂浮在潜在空间中**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.05600)|[链接](https:\u002F\u002Fgithub.com\u002FESA-PhiLab\u002FMajor-TOM)|[链接](https:\u002F\u002Fhuggingface.co\u002FMajor-TOM)|\n|**Earth Genome Embeddings**|**面向所有人的嵌入**|Medium2025|[论文](https:\u002F\u002Fmedium.com\u002Fearthrisemedia\u002Fembeddings-for-all-0e0a29415b26)|无|[链接](https:\u002F\u002Fsource.coop\u002Fearthgenome\u002Fearthindexembeddings)|\n|**TESSERA**|**TESSERA：用于地球表征与分析的预计算 FAIR 全球像素嵌入**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.20380)|[链接](https:\u002F\u002Fgithub.com\u002Fucam-eo\u002Ftessera)|[链接](https:\u002F\u002Fgithub.com\u002Fucam-eo\u002Fgeotessera)|\n|**AlphaEarth**|**AlphaEarth 基础：一种基于稀疏标签数据的准确高效全球制图嵌入场模型**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.22291)|无|[链接](https:\u002F\u002Fdevelopers.google.com\u002Fearth-engine\u002Fdatasets\u002Fcatalog\u002FGOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL)|\n|**ESD**|**普及行星尺度分析：一个超轻量级地球嵌入数据库，用于准确灵活的全球土地监测**|Arxiv2026|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.11183)|[链接](https:\u002F\u002Fgithub.com\u002Fshuangchencc\u002FESD)|[链接](https:\u002F\u002Fdata-starcloud.pcl.ac.cn\u002Fiearthdata\u002F64)|\n\n## 
相关项目\n|标题|链接|简要介绍|\n|---|:---:|:---:|\n|**RSFMs（遥感基础模型）游乐场**|[链接](https:\u002F\u002Fgithub.com\u002Fsynativ\u002FRSFMs)|一个开源平台，用于简化在各种数据集上对 RSFM 的评估和微调。|\n|**PANGAEA**|[链接](https:\u002F\u002Fgithub.com\u002Fyurujaja\u002Fpangaea-bench)|一个全球且包容性的地理空间基础模型基准测试。|\n|**GeoFM**|[链接](https:\u002F\u002Fgithub.com\u002Fxiong-zhitong\u002FGeoFM)|用于地球观测的基础模型评估。|\n|**rs-embed**|[链接](https:\u002F\u002Fgithub.com\u002Fcybergis\u002Frs-embed)|一行代码即可获取任何遥感基础模型（RSFM）在任何地点、任何时间的嵌入。|\n\n## 调查\u002F评论论文\n|标题|发表刊物|论文链接|属性|\n|---|:---:|:---:|:---:|\n|**自监督遥感特征学习：学习范式、挑战与未来工作**|TGRS2023|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10126079)|**视觉与视觉-语言**|\n|**Visual ChatGPT在遥感中的潜力**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.13009)|**视觉-语言**|\n|**遥感大模型：进展与前瞻**|武汉大学学报 (信息科学版) 2023|[论文](http:\u002F\u002Fch.whu.edu.cn\u002Fcn\u002Farticle\u002Fdoi\u002F10.13203\u002Fj.whugis20230341?viewType=HTML)|**视觉与视觉-语言**|\n|**地理人工智能样本：模型、质量与服务**|武汉大学学报 (信息科学版) 2023|[论文](http:\u002F\u002Fch.whu.edu.cn\u002Farticle\u002Fid\u002F5e67ed6a-aae5-4ec0-ad1b-f2aba89f4617)|**-**|\n|**受大脑启发的遥感基础模型及开放问题：综合综述**|JSTARS2023|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10254282)|**视觉与视觉-语言**|\n|**重新审视预训练遥感模型基准：尺寸调整和归一化很重要**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.13456)|**视觉**|\n|**面向地球观测的多模态基础模型议程**|IGARSS2023|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10282966)|**视觉**|\n|**环境遥感中的迁移学习**|RSE2024|[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS0034425723004765)|**迁移学习**|\n|**遥感基础模型发展综述与未来设想**|遥感学报2023|[论文](https:\u002F\u002Fwww.ygxb.ac.cn\u002Fzh\u002Farticle\u002Fdoi\u002F10.11834\u002Fjrs.20233313\u002F)|**-**|\n|**关于多模态基础模型在地理、环境、农业和城市规划应用中的前景与挑战**|Arxiv2023|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2312.17016)|**视觉-语言**|\n|**遥感中的视觉-语言模型：当前进展与未来趋势**|IEEE GRSM2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2305.05726)|**视觉-语言**|\n|**地球与气候基础模型的基础**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2405.04285)|**视觉与视觉-语言**|\n|**迈向视觉-语言地理基础模型：综述**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2406.09385)|**视觉-语言**|\n|**遥感中的AI基础模型：综述**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2408.03464)|**视觉**|\n|**通用遥感智能的基础模型：潜力与前景**|Science Bulletin2024|[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS2095927324006510?via%3Dihub)|**-**|\n|**遥感领域视觉语言模型的进展：数据集、能力与增强技术**|Arxiv2024|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.17283)|**视觉-语言**|\n|**遥感与地球观测的基础模型：综述**|IEEE GRSM2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2025.3576766)|**视觉与视觉-语言**|\n|**当遥感遇到基础模型：综述及更进一步**|RS2025|[论文](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17020179)|**视觉、视觉-语言、生成模型与智能体**|\n|**视觉-语言建模与遥感的结合：模型、数据集与视角**|IEEE GRSM2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Fmgrs.2025.3572702)|**视觉-语言**|\n|**面向地球观测下游任务的多模态遥感基础模型进展：综述**|RS2025|[论文](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs17213532)|**视觉与视觉-语言**|\n|**遥感微调：综述**|CVM2025|[论文](https:\u002F\u002Fdoi.org\u002F10.26599\u002Fcvm.2025.9450490)|**视觉与视觉-语言**|\n|**多模态遥感基础大模型：研究现状与未来展望**|测绘学报2024|[论文](http:\u002F\u002Fxb.chinasmp.com\u002FCN\u002F10.11947\u002Fj.AGCS.2024.20240019.)|**视觉、视觉-语言、生成模型与视觉-位置**|\n|**当地球科学遇到基础模型：迈向通用地球科学人工智能系统**|IEEE 
GRSM2024|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10770814)|**视觉与视觉-语言**|\n|**迈向下一代空间人工智能**|JAG2025|[论文](https:\u002F\u002Fwww.sciencedirect.com\u002Fscience\u002Farticle\u002Fpii\u002FS1569843225000159)|**-**|\n|**遥感领域的视觉基础模型：综述**|IEEE GRSM2025|[论文](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F10916803)|**视觉**|\n|**MIMRS：遥感领域掩码图像建模综述**|IGARSS2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss55030.2025.11243448)|**视觉**|\n|**遥感基础模型中的挑战与应用回顾**|IGARSS2025|[论文](https:\u002F\u002Fdoi.org\u002F10.1109\u002Figarss55030.2025.11242732)|**视觉与视觉-语言**|\n|**通过打通数据与计算孤岛释放遥感基础模型的潜力**|The Innovation2025|[论文](https:\u002F\u002Fwww.cell.com\u002Fthe-innovation\u002Ffulltext\u002FS2666-6758(25)00044-X)|**-**|\n|**遥感基础模型综述：从视觉到多模态**|Arxiv2025|[论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2503.22081)|**-**|\n|**遥感领域基础模型的谱系**|ACM TSAS2026|[论文](https:\u002F\u002Fdoi.org\u002F10.1145\u002F3789505)|**视觉与视觉-语言**|\n|**遥感基础模型的机载部署：架构、优化与硬件的全面回顾**|RS2026|[论文](https:\u002F\u002Fdoi.org\u002F10.3390\u002Frs18020298)|**视觉与视觉-语言**|\n|**地球基础模型的基础**|Communications Earth & Environment 2026|[论文](https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs43247-025-03127-x)|**视觉与视觉-语言**|\n\n## 引用\n\n如果您觉得本仓库有用，请考虑给个 star :star: 并引用（BibTeX 条目保留原始英文元数据，以便直接复制使用）：\n\n```\n@inproceedings{guo2024skysense,\n  title={Skysense: A multi-modal remote sensing foundation model towards universal interpretation for earth observation imagery},\n  author={Guo, Xin and Lao, Jiangwei and Dang, Bo and Zhang, Yingying and Yu, Lei and Ru, Lixiang and Zhong, Liheng and Huang, Ziyuan and Wu, Kang and Hu, Dingxiang and others},\n  booktitle={Proceedings of the IEEE\u002FCVF Conference on Computer Vision and Pattern Recognition},\n  pages={27672--27683},\n  year={2024}\n}\n\n@article{li2025unleashing,\n  title={Unleashing the potential of remote sensing foundation models via bridging data and computility islands},\n  author={Li, Yansheng and Tan, Jieyi and Dang, Bo and Ye, Mang and Batalev, Sergey A and Shinkarenko, Stanislav and Wang, Linlin and Zhang, Yingying and Ru, Lixiang and Guo, Xin and others},\n  journal={The Innovation},\n  year={2025},\n  publisher={Elsevier}\n}\n\n@article{wu2025semantic,\n  author = {Wu, Kang and Zhang, Yingying and Ru, Lixiang and Dang, Bo and Lao, Jiangwei and Yu, Lei and Luo, Junwei and Zhu, Zifan and Sun, Yue and Zhang, Jiahao and Zhu, Qi and Wang, Jian and Yang, Ming and Chen, Jingdong and Zhang, Yongjun and Li, Yansheng},\n  title= {A semantic-enhanced multi-modal remote sensing foundation model for Earth observation},\n  journal= {Nature Machine Intelligence},\n  year= {2025},\n  doi= {10.1038\u002Fs42256-025-01078-8},\n  url= {https:\u002F\u002Fdoi.org\u002F10.1038\u002Fs42256-025-01078-8}\n}\n\n@inproceedings{zhu2025skysense,\n  title={Skysense-o: Towards open-world remote sensing interpretation with vision-centric visual-language modeling},\n  author={Zhu, Qi and Lao, Jiangwei and Ji, Deyi and Luo, Junwei and Wu, Kang and Zhang, Yingying and Ru, Lixiang and Wang, Jian and Chen, Jingdong and Yang, Ming and others},\n  booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference},\n  pages={14733--14744},\n  year={2025}\n}\n\n@article{luo2024skysensegpt,\n  title={Skysensegpt: A fine-grained instruction tuning dataset and model for remote sensing vision-language understanding},\n  author={Luo, Junwei and Pang, Zhen and Zhang, Yongjun and Wang, Tingzhu and Wang, Linlin and Dang, Bo and Lao, Jiangwei and Wang, Jian and Chen, Jingdong and Tan, Yihua and others},\n  journal={arXiv preprint arXiv:2406.10100},\n  year={2024}\n}\n```","# Awesome-Remote-Sensing-Foundation-Models 快速上手指南\n\n`Awesome-Remote-Sensing-Foundation-Models` 是一个汇集了遥感基础模型（RSFMs）相关论文、数据集、基准测试、代码及预训练权重的精选列表。本指南将帮助您快速搭建环境并尝试使用列表中推荐的模型。\n\n## 环境准备\n\n在开始之前，请确保您的开发环境满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+) 或 macOS。Windows 用户建议使用 WSL2。\n*   **Python 版本**: 3.8 或更高版本。\n*   **GPU**: 推荐使用 NVIDIA GPU (显存建议 16GB 以上以运行大型基础模型)，并安装对应的 CUDA 驱动。\n*   **前置依赖**:\n    *   `git`: 用于克隆仓库。\n    *   `conda` 或 `venv`: 用于管理虚拟环境。\n    *   `PyTorch`: 大多数模型基于 PyTorch 构建。\n\n> **国内加速建议**：\n> *   推荐使用 **清华大学开源软件镜像站** 或 **阿里云镜像站** 加速 `pip` 和 `conda` 包的下载。\n> *   若访问 GitHub 缓慢，可使用国内镜像站（如 `ghproxy.com`）克隆仓库。\n\n## 安装步骤\n\n由于该仓库是模型集合而非单一软件包，您需要先克隆仓库，然后根据具体想使用的模型（如 SatMAE, RingMo, SpectralGPT 等）进入对应子目录进行安装。以下是通用流程：\n\n### 1. 克隆仓库\n```bash\n# 使用国内加速代理克隆（可选）\ngit clone https:\u002F\u002Fghproxy.com\u002Fhttps:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models.git\ncd Awesome-Remote-Sensing-Foundation-Models\n```\n\n### 2. 创建虚拟环境\n```bash\nconda create -n rsfm python=3.9 -y\nconda activate rsfm\n```\n\n
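> **补充（可选，非仓库要求）**：若使用 NVIDIA GPU，可在安装 PyTorch 前先用驱动自带的 `nvidia-smi` 命令确认驱动状态及其支持的 CUDA 版本，便于下一步选择匹配的 PyTorch 构建：\n\n```bash\n# 查看 GPU 型号、驱动版本以及驱动支持的最高 CUDA 版本\nnvidia-smi\n```\n\n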
### 3. 安装基础依赖\n大多数模型需要 PyTorch 和 torchvision。建议根据官方文档安装匹配您 CUDA 版本的 PyTorch。\n```bash\n# 示例：使用清华源加速安装 PyTorch（默认构建；如需特定 CUDA 版本，请改用 PyTorch 官方对应的 index-url）\npip install torch torchvision torchaudio --index-url https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n### 4. 安装特定模型依赖\n请在仓库的 **Models** 部分选择您感兴趣的模型（例如 `SatMAE`），进入其代码链接指向的子项目目录安装。\n\n以 **SatMAE** 为例：\n```bash\n# 假设已克隆 SatMAE 代码到本地 satmae 目录\ncd satmae\npip install -r requirements.txt\n# 或使用国内源\npip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n```\n\n> **注意**：不同模型（如 `RingMo`, `SpectralGPT`, `Clay`）的依赖项可能不同，请务必查阅具体模型仓库中的 `README.md` 或 `requirements.txt`。\n\n## 基本使用\n\n以下以 **SatMAE**（用于多光谱卫星图像预训练的 Transformer 模型）为例，展示如何加载预训练权重并进行简单的推理。其他模型的使用逻辑类似，请参考各自仓库的具体文档。\n\n### 1. 下载预训练权重\n从对应模型的发布页面或 Hugging Face 下载 `.pth` 或 `.ckpt` 权重文件，放置在项目目录下（例如 `pretrained\u002Fsatmae.pth`）。\n\n### 2. 编写推理脚本\n创建一个名为 `inference.py` 的文件：\n\n```python\nimport torch\nfrom torchvision import transforms\nfrom PIL import Image\n# 导入具体模型的架构定义 (需根据实际模型调整导入路径)\nfrom models.satmae import vit_large_patch16\n\n# 配置设备\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n\n# 初始化模型\nmodel = vit_large_patch16()\ncheckpoint = torch.load('pretrained\u002Fsatmae.pth', map_location=device)\nmodel.load_state_dict(checkpoint['model'], strict=False)\nmodel.to(device)\nmodel.eval()\n\n# 数据预处理 (根据模型要求调整)\ntransform = transforms.Compose([\n    transforms.Resize((224, 224)),\n    transforms.ToTensor(),\n    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n])\n\n# 加载遥感图像\nimage_path = \"example_satellite_image.tif\" # 替换为您的图片路径\nimage = Image.open(image_path).convert(\"RGB\")\ninput_tensor = transform(image).unsqueeze(0).to(device)\n\n# 前向传播\nwith torch.no_grad():\n    features = model(input_tensor)\n\nprint(f\"特征提取完成，输出形状：{features.shape}\")\n```\n\n
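> **补充（可选）**：上述脚本使用 `strict=False` 加载权重，PyTorch 会静默忽略不匹配的键。以下小示例（沿用上文脚本中的 `model` 与 `checkpoint` 变量，基于 PyTorch 公开 API）可帮助确认预训练权重是否真正加载成功：\n\n```python\n# load_state_dict 返回包含 missing_keys 与 unexpected_keys 的结果对象\nresult = model.load_state_dict(checkpoint['model'], strict=False)\nprint(\"模型中未被权重覆盖的键:\", result.missing_keys)\nprint(\"权重文件中未被使用的键:\", result.unexpected_keys)\n# 若两个列表都很长，通常说明权重文件与模型架构不匹配\n```\n\n### 3. 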
运行脚本\n```bash\npython inference.py\n```\n\n---\n**提示**：本仓库涵盖了视觉、视觉 - 语言、生成式等多种类型的遥感基础模型。对于 `SpectralGPT`（高光谱）或 `Prithvi`（时间序列）等专用模型，请确保输入数据的维度（如波段数、时间步长）与模型要求一致。","某省级自然资源监测中心急需利用卫星影像快速识别全省范围内的违规耕地占用情况，以应对季度巡查任务。\n\n### 没有 Awesome-Remote-Sensing-Foundation-Models 时\n- **模型选型盲目**：团队需在海量论文中手动筛选适合多光谱或 SAR 图像的预训练模型，耗时数周仍难以确定最优基线。\n- **数据标注成本高昂**：由于缺乏强大的通用特征提取器，针对特定地块变化检测任务需要标注数万张样本才能训练出可用模型。\n- **跨传感器适配困难**：面对光学与雷达混合数据，自行研发联合表征学习算法技术门槛极高，导致多源数据融合效果不佳。\n- **复现周期漫长**：寻找分散的代码库和权重文件极其困难，环境配置和代码调试往往占据项目 80% 的时间。\n\n### 使用 Awesome-Remote-Sensing-Foundation-Models 后\n- **精准锁定模型**：直接查阅分类清晰的列表，迅速定位到如 SatMAE（多光谱）或 DINO-MM（SAR-光学联合）等成熟模型作为起点。\n- **小样本高效微调**：利用列表中提供的强大预训练权重，仅需少量标注样本即可通过微调实现高精度的违规地块识别。\n- **多源数据无缝融合**：直接调用已验证的视觉 - 语言或多模态基础模型代码，轻松实现光学与雷达数据的互补分析。\n- **即插即用加速落地**：一键获取关联的代码仓库与权重文件，将原本数月的算法研发周期压缩至几天，快速投入业务运行。\n\nAwesome-Remote-Sensing-Foundation-Models 通过一站式聚合前沿模型、数据与代码，将遥感 AI 开发从“重复造轮子”转变为“站在巨人肩膀上”的高效创新。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002FJack-bo1220_Awesome-Remote-Sensing-Foundation-Models_33fb1e17.png","Jack-bo1220","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002FJack-bo1220_9473edaf.jpg","https:\u002F\u002Fgithub.com\u002FJack-bo1220",1757,161,"2026-04-05T01:41:48","","未说明",{"notes":84,"python":82,"dependencies":85},"该仓库是一个遥感基础模型（RSFMs）的论文、数据集、基准测试、代码和预训练权重的集合列表（Awesome List），本身不是一个单一的独立软件工具。因此，README 中未提供统一的运行环境需求。具体的操作系统、GPU、内存、Python 版本及依赖库要求取决于用户选择运行的列表中某个特定模型（如 SatMAE, RingMo, SpectralGPT 等），需参考各模型对应的独立代码仓库链接获取详细安装说明。",[],[14,37],"2026-03-27T02:49:30.150509","2026-04-06T09:25:55.684423",[90,95,100,105,110,115,120,125],{"id":91,"question_zh":92,"answer_zh":93,"source_url":94},18568,"SkySense 的代码什么时候发布？","SkySense 的权重和使用代码已于 2024 年 7 月针对性地开源。如需进一步信息，可联系武汉大学李彦胜教授（yansheng.li@whu.edu.cn）或蚂蚁集团钟立恒博士（zhongliheng.zlh@alibaba-inc.com）。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F29",{"id":96,"question_zh":97,"answer_zh":98,"source_url":99},18569,"如何联系项目维护者进行合作或咨询？","可以通过电子邮件联系维护者 Bo Dang，邮箱地址为：bodang@whu.edu.cn。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F5",{"id":101,"question_zh":102,"answer_zh":103,"source_url":104},18570,"RSP 项目的正确代码仓库地址是什么？","RSP 项目的正确 GitHub 仓库地址是 https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FRSP，而不是 https:\u002F\u002Fgithub.com\u002FViTAE-Transformer\u002FRemote-Sensing-RVSA。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F21",{"id":106,"question_zh":107,"answer_zh":108,"source_url":109},18571,"有哪些推荐的遥感图像 - 文本检索数据集或模型？","推荐关注 LuoJiaHOG（层级导向的地理感知图像描述数据集）和 GeoRSClip（基于 RS5M 训练的 CLIP 模型）。此外，CSMAE 模型也可用于传感器无关的遥感图像检索任务。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F18",{"id":111,"question_zh":112,"answer_zh":113,"source_url":114},18572,"是否有基于 Mamba 架构的遥感基础模型推荐？","是的，推荐关注 RoMA（基于 Mamba 的可扩展自监督预训练框架）和 DynamicVis。这些模型旨在解决 Vision Transformer 在处理高分辨率图像时的二次复杂度问题，适用于大规模无标签数据的预训练。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F38",{"id":116,"question_zh":117,"answer_zh":118,"source_url":119},18573,"有没有关于遥感视觉 - 语言基础模型或智能体（Agents）的相关工作推荐？","推荐参考以下工作：1. 用于遥感图像变化描述的解耦范式与提示学习（PromptCC）；2. 
用于交互式遥感变化解释与分析的智能体（Change-Agent）。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F33",{"id":121,"question_zh":122,"answer_zh":123,"source_url":124},18574,"是否有适合遥感基础模型评测的基准（Benchmark）推荐？","可以参考 Pangaea-bench，这是一个针对遥感基础模型的评测基准项目。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F32",{"id":126,"question_zh":127,"answer_zh":128,"source_url":129},18575,"有哪些大规模地球观测（EO）数据集可用？","推荐 Major TOM 数据集，目前包含超过 60TB 的地球观测数据，并且未来还会持续更新。相关论文可在 arXiv 上查阅。","https:\u002F\u002Fgithub.com\u002FJack-bo1220\u002FAwesome-Remote-Sensing-Foundation-Models\u002Fissues\u002F27",[]]