[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-microsoft--TimeCraft":3,"tool-microsoft--TimeCraft":61},[4,18,26,36,44,53],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 
代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",144730,2,"2026-04-07T23:26:32",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":32,"last_commit_at":50,"category_tags":51,"status":17},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 
助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[52,14],"插件",{"id":54,"name":55,"github_repo":56,"description_zh":57,"stars":58,"difficulty_score":10,"last_commit_at":59,"category_tags":60,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":66,"readme_en":67,"readme_zh":68,"quickstart_zh":69,"use_case_zh":70,"hero_image_url":71,"owner_login":72,"owner_name":73,"owner_avatar_url":74,"owner_bio":75,"owner_company":76,"owner_location":76,"owner_email":77,"owner_twitter":78,"owner_website":79,"owner_url":80,"languages":81,"stars":101,"forks":102,"last_commit_at":103,"license":104,"difficulty_score":10,"env_os":105,"env_gpu":106,"env_ram":105,"env_deps":107,"category_tags":112,"github_topics":113,"view_count":32,"oss_zip_url":76,"oss_zip_packed_at":76,"status":17,"created_at":117,"updated_at":118,"faqs":119,"releases":120},5378,"microsoft\u002FTimeCraft","TimeCraft","Official code for TimeCraft: A Time Series Generation Framework for Real-World Applications","TimeCraft 是一个基于扩散模型的时间序列生成框架，专为医疗、金融、能源等现实应用场景设计。它旨在解决现有方法难以跨领域通用、缺乏可控性以及生成数据对下游任务帮助有限等痛点。通过合成高质量的时间序列数据，TimeCraft 能有效缓解真实数据稀缺问题，在保护隐私的同时支持安全的模拟与预测分析。\n\n该工具特别适合研究人员和开发者使用，尤其是那些需要处理多领域数据或希望利用文本指令定制生成内容的团队。TimeCraft 
拥有三大核心技术亮点：首先，它构建了包含“语义原型”的通用潜在空间，仅需少量样本即可自适应新领域，实现强大的跨域泛化能力；其次，创新性地引入多智能体系统，支持通过自然语言描述（如趋势、季节性）灵活控制生成结果，让过程更具可解释性；最后，采用目标感知适应机制，确保生成的数据能切实提升下游模型的性能，而不仅仅是模仿训练分布。这使得 TimeCraft 成为探索假设场景和训练鲁棒模型的得力助手。","![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_e1bb8cc10134.png)\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F35bc7ee3-f7a2-4949-96fc-1d1b977e0df1\n\n# Time Series Generation for Real-World Applications \nThe rapid advancement of artificial intelligence has increasingly emphasized the critical role of time series data in powering intelligent decision-making across diverse domains, including healthcare, finance, energy, and transportation. In these fields, the ability to generate high-quality synthetic time series has become particularly valuable. **Time series generation** technology plays a vital role in alleviating **data scarcity**, especially in scenarios where collecting real-world data is expensive, time-consuming, or impractical. It also enables **privacy-preserving** analysis by producing realistic but non-identifiable synthetic data, reducing the risks associated with sharing sensitive information. Moreover, it supports **simulation and forecasting in risk-free environments**, allowing researchers and practitioners to safely explore hypothetical scenarios and train robust models. Together, these capabilities make time series generation an essential tool for a wide range of real-world applications.\n\nDespite its potential, most existing methods are **limited to single-domain generation** and struggle to generalize across diverse real-world scenarios, where time series patterns vary significantly. In addition, traditional models often **lack controllability**—they generate data unconditionally, without the ability to guide specific trends, seasonality, or domain characteristics. 
Yet such control is crucial in practical applications, where tailored synthetic data is needed to support specific scenarios. Furthermore, many approaches focus solely on **replicating the training data distribution**, without considering whether the generated data is truly beneficial for downstream tasks\n\nTo address these limitations, we propose **TimeCraft**, a generic **diffusion model-based time series generation framework** designed for real world applications with the following characters:\n\n1. ​**Cross-domain generalization**: \nTimeCraft introduces a ​​universal latent space​​ for time series by learning a shared set of *semantic prototypes* (analogous to a \"dictionary\" of temporal patterns). These prototypes encode domain-invariant features such as trends and seasonality, which are reusable across domains.\nTo adapt to new domains, TimeCraft employs a lightweight ​​Prototype Assignment Module (PAM)​​ that dynamically computes domain-specific weights for the prototypes using few-shot examples. This process constructs a *domain prompt*—a latent representation that captures the target domain’s unique characteristics without explicit labels or retraining.  Leveraging these prompts, TimeCraft generates high-fidelity time series that align with the structure of previously unseen domains.\n→ Jump to details: [✨Time Series Prototypes: The Key to Cross-Domain Generation](#✨1-time-series-prototypes-the-key-to-cross-domain-generation)\n\n2. **Text-based control​**​: Text carries rich semantic information, domain knowledge, and instance-specific cues that can guide time series generation in a more controllable and interpretable way. TimeCraft leverages a *multi-agent text generation system* to produce high-quality textual descriptions of time series patterns. These descriptions are used to construct paired time series–text data for training. 
Building on this, TimeCraft introduces a hybrid framework that combines semantic prototypes with free-form textual prompts, enabling flexible yet domain-grounded control over the generated time series.\n→ Jump to details: [✨Multi-Agent System and Hybrid Conditioning for Text based Control](#✨2-multi-agent-system-and-hybrid-conditioning-for-text-based-control)\n\n3. **Target-aware adaptation**: TimeCraft introduces a novel approach where synthetic samples are generated with the explicit goal of improving downstream model performance—rather than simply mimicking the training data distribution. It incorporates an *influence-guided diffusion mechanism* that optimizes sample generation by quantifying the expected reduction in task-specific loss using *influence functions*. This ensures that the generated data is not only realistic, but also strategically tailored to enhance performance in practical applications such as forecasting, classification, and anomaly detection.\n→ Jump to details: [✨Target-Aware Generation with Influence Function Guidance](#✨3-target-aware-generation-with-influence-function-guidance)\n\n**TimeCraft** offers a unified, practical solution for real-world time series generation—combining cross-domain generalization, text-based control, and task-aware adaptation. It’s designed to produce high-quality, controllable synthetic data that’s both realistic and useful for downstream applications.\n\n\n#### Microsoft Research Blogs:\n1. [TimeCraft: A universal framework for time-series generation](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Farticles\u002Ftimecraft-a-universal-framework-for-time-series-generation\u002F)\n2. [TimeDP: Creating cross-domain synthetic time-series data](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Farticles\u002Ftimedp-creating-cross-domain-synthetic-time-series-data\u002F)\n3. 
[TimeCraft：面向真实世界的跨域泛化、文本可控与任务感知通用时间序列生成框架](https:\u002F\u002Fmp.weixin.qq.com\u002Fs\u002Faq3EqnNykXfNMz9LVyRpnw)\n\n---\n## 🚀 News & Updates (2026)\n\nWe are excited to announce three major research breakthroughs integrated into **TimeCraft**, significantly expanding the frontier of TSG toward **Causality**, **Foundation Models**, and **Continuous-time Modeling**:\n\n*   **[CaTSG] Causal Control via Diffusion Models:** We introduce **CaTSG**, a novel framework that incorporates causal constraints into the diffusion process. By moving beyond mere statistical correlation, CaTSG allows for the generation of realistic time series that adhere to underlying causal structures, facilitating robust \"what-if\" analysis and risk evaluation. \n    [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2509.20846) | [[Code]](.\u002FCaTSG)\n\n*   **[OATS] Online Data Augmentation for TSFMs:** To empower the next generation of Time Series Foundation Models (TSFMs), we developed **OATS**. It provides a dynamic, online data augmentation engine that synthesizes model-tailored samples during pre-training, significantly improving the generalization and zero-shot performance of large-scale temporal models. \n    [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.19040) | [[Code]](.\u002FOATS)\n\n*   **[MN-TSG] Continuous Generation with Irregular Observations:** Real-world data is often sparse and non-uniformly sampled. **MN-TSG** enables continuous-time generation by modeling latent physiological or physical dynamics, allowing the synthesis of realistic, high-fidelity temporal patterns even from highly irregular or incomplete observations. \n    [[Paper]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.13534) | [[Code]](.\u002FDiff-MN)\n\n---\n## 🗺️ Framework Overview\n![TimeDP framework overview.](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_9dc288d31eca.png)\nTimeCraft supports **three flexible input branches**. 
Users can **activate any one, any two, or all three inputs** depending on their application scenario:\n\n1. Inference Example (Few-shot Time Series Prompting)\nProvide a few sample time series from your target domain to guide the generation process.\n\n2. Text Description (Text-based Control)\nUse natural language prompts to control trends, seasonality, or domain-specific styles in generated time series.\n\n3. Downstream Task Model and Data (Target-Aware Guidance)\nLeverage gradients from a downstream model to guide generation toward improving task-specific performance.\n\n## 📊 Performance\nTimeCraft achieves state-of-the-art results across multiple dimensions of time series generation:\n\n#### Best Generation Fidelity (In-domain & Out-of-domain)\nWe conduct evaluation on real-world datasets spanning four major domains: **energy, transportation, meteorology, and finance**. Generation quality is rigorously assessed using statistical metrics like Maximum Mean Discrepancy (MMD) and Kullback-Leibler (KL) divergence. For in-domain generation, TimeCraft achieves the **best performance on 11 out of 12 datasets**, with MMD reduced by 25.9% and KL divergence reduced by 53.0% on average, compared to leading baselines. On unseen domains, TimeCraft also demonstrates the best generalization abilities among baselines.\n\n![Fidelity performance.](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_a3400839e748.png)\n\n#### Strongest Text Controllability\nTimeCraft achieves the highest text-to-series consistency, improving MSE by 12.52% and MAE by 6.34% compared to generation without text input, and also ranks best in human evaluations. 
See detailed results in the [paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.02445).\n\n\n#### Best Downstream Task Performance\nWe tested it on **six medical datasets**, covering tasks like **ICU stay prediction and rare disease diagnosis**.\nCompared to other methods, TarDiff consistently generates data that leads to better or comparable downstream performance — sometimes even outperforms real data. See detailed results in the [paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.17613).\n\n\n\n\n## 📚 Related Papers\n#### Cross Domain Time Series Generation\n- [AAAI 2025] TimeDP: Learning to Generate Multi-Domain Time Series with Domain Prompts, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.05403) \u002F [Code](TimeDP)\n\n\n#### Controllability\n- 🆕🔥[2026] Causal Time Series Generation via Diffusion Models, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2509.20846) \u002F [Code](CaTSG)\n- [ICML 2025] BRIDGE: Bootstrapping Text to Control Time-Series Generation via Multi-Agent Iterative Optimization and Diffusion Modelling, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.02445) \u002F [Code](BRIDGE)\n\n#### Adaptability\n- [KDD 2025] TarDiff: Target-Oriented Diffusion Guidance  for Synthetic Electronic Health Record  Time Series Generation, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.17613) \u002F [Code](TarDiff)\n\n#### General Time Series Techniques\n- 🆕🔥[2026] OATS: Online Data Augmentation for Time Series Foundation Models, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.19040) \u002F [Code](OATS)\n- 🆕🔥[2026] MN-TSG: Continuous Time Series Generation with Irregular Observations, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.13534) \u002F [Code](Diff-MN)\n- [ICLR 2024] MG-TSD: Multi-granularity Time Series Diffusion Models with Guided Learning Process, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.05751) \u002F [Code](https:\u002F\u002Fgithub.com\u002FHundredl\u002FMG-TSD)\n- [TKDE 2025] TimeRAF: 
Retrieval-Augmented Foundation model for Zero-shot Time Series Forecasting, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.20810)\n- [KDD 2025] InvDiff: Invariant Guidance for Bias Mitigation in Diffusion Models, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.08480) \u002F [Code](https:\u002F\u002Fgithub.com\u002FHundredl\u002FInvDiff)\n\n#### Finance Application\n\n- [AAAI 2026] Controllable Financial Market Generation with Diffusion Guided Meta Agent, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.12991) \u002F [Code](DiGA)\n- [ICLR 2025] MarS: a Financial Market Simulation Engine Powered by Generative Foundation Model, [Paper](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.07486)\n\n\n## 🔑 Key Features  \n\n* **Multi-Domain Time Series Generation**: Robust cross-domain generalization enabled by **few-shot learning**, requiring minimal data from new domains.\n* **Controllable Generation**: Natural language **text-based control** allows users to specify desired characteristics like trends or seasonality.\n* **Target-Aware Generation**: Synthesized data is explicitly optimized to improve downstream model performance on tasks like forecasting or classification.\n* **Diffusion-Based Framework**: Ensures high-fidelity, stable, and diverse time series through powerful diffusion modeling.\n* **Automated Time Series Description**: Generates descriptive text to enhance interpretability and support paired training or analysis.\n* **State-of-the-Art Results**: Achieves superior performance across both in-domain and unseen-domain benchmarks for both fidelity and controllability.\n\n\n\n## 🚀Quick Start\n### 1. Environment setups \nClone this repository and set up the environment.\n```bash\nconda env create -f environment.yml\n```\n\n### 2. 
How to Use Data\n#### 2.1 Supported Public Datasets\n\nTimeCraft includes automatic support for downloading and preprocessing several publicly available datasets, for example:\n\n- [Temperature and Rain Dataset (Monash)](https:\u002F\u002Fzenodo.org\u002Frecords\u002F5129091\u002Ffiles\u002Ftemperature_rain_dataset_without_missing_values.zip?download=1)\n- [Wind Dataset (4-Second Interval)](https:\u002F\u002Fzenodo.org\u002Frecords\u002F4656032\u002Ffiles\u002Fwind_4_seconds_dataset.zip?download=1)\n- [Pedestrian Counts Dataset](https:\u002F\u002Fzenodo.org\u002Frecords\u002F4656626\u002Ffiles\u002Fpedestrian_counts_dataset.zip?download=1)\n\nYou can manually download these datasets from the links above, or simply run the `prepare_datasets.py` script, which automates the download, extraction, and transformation into model-ready formats.\n\n#### 2.2 Downloading and Processing Datasets\n\nRun the following command to execute the script:\n\n```bash\npython TimeDP\u002Futils\u002Fprepare_datasets.py\n```\n\nThis script performs several preprocessing steps:\n\n1. **Dataset Download**:\n   - Automatically fetches public datasets from sources like Zenodo (e.g., Monash TSF datasets for temperature\u002Frain, wind, and pedestrian counts).\n   - Loads benchmark datasets (e.g., solar, electricity, traffic) using GluonTS.\n   - Also retrieves example financial time series from the TimeGAN repository (e.g., stock prices).\n\n2. **Data Preprocess**:\n   - Concatenates train and test splits to form a complete time series.\n   - Saves a multivariate series into a time-indexed CSV format under `.\u002Fdata\u002F`.\n   - Converts `.tsf` (Time Series Format) files into pandas DataFrames.\n   - Extracts series based on feature tags like `PRCP_SUM` for rain or `T_MEAN` for temperature.\n\n3. 
**Sliding Window Segmentation**:\n   - For each dataset, applies sliding window segmentation with various sequence lengths (`24, 96, 168, 336`).\n   - Each window forms a data sample of fixed length.\n   - Outputs `.npy` files for training and validation sets (e.g., `electricity_96_train.npy`).\n\n4. **Zero-shot Setup (Optional)**:\n   - For selected datasets like `stock` and `web`, prepares fixed test and prompt samples for zero-shot evaluation.\n   - Saves prompt\u002Ftest slices and exports prompt sequences to CSV for inspection.\n\n### 3. Preparation for text controlled generation (Optional)  \n#### 3.1 Get text templates \n\nWe provide example text templates and you can use them directly to build your dataset [here](process\u002Ftext_templates_example.json).\nThese templates are designed to describe time series data in a structured and diverse manner, covering various domains and statistical characteristics.\n\nYou can also collect and refine your own text templates using our multi-agent framework. \n\n#### 3.2 Apply text templates to generate textual descriptions for time-series data\n\nWe apply text templates to generate textual descriptions of time-series data by extracting statistical features (e.g., mean, standard deviation, trend) from each time window. These features are then filled into predefined templates to create descriptive narratives. Optionally, the descriptions are optimized using a large language model (LLM) for clarity and quality.\n\nThe implementation is available here:  [Code Link](process\u002Fts_to_text.py).\n\nThe results are saved in CSV files with the suffix `_with_descriptions.csv`. \n\nDataset split details can be found here: [Dataset Split](supplementary\u002Fdataset_split.md).\n\n### 4. Preparation for target-aware generation (Optional) \n\n#### 4.1 TarDiff Data & Pre-processing\n\n1. 
**Pre-processing description**\n   Detailed instructions describing how raw MIMIC-III data was processed into the format suitable for our models are provided in `supplementary\u002Fmimiciii_prepare.md`.\n   Follow these instructions to replicate the preprocessing and feature extraction pipeline used in our experiments.\n\n2. **Dataset download**\n   You can access the raw datasets at the following links:\n\n   * [eICU Collaborative Research Database](https:\u002F\u002Feicu-crd.mit.edu\u002F)\n\n   * [MIMIC-III Clinical Database](https:\u002F\u002Fphysionet.org\u002Fcontent\u002Fmimiciii\u002F1.4\u002F)\n\n   > **Note:** Both datasets require prior approval and credentialing before download.\n\n   Our focus is specifically on the multivariate time-series records available in these datasets.\n\n3. **Default data format**\n   By default, the data loaders expect a pickled **tuple** containing:\n\n   * `data`: shape **(N, F, T)**, representing *N* samples, *F* features, and *T* time steps.\n   * `labels`: shape **(N,)**, corresponding labels for each sample.\n\n#### 4.2 Prepare the Guidance Set  \n\nTarDiff requires a **guidance set** whose distribution closely approximates that of the downstream task targets. This distributional alignment allows the model to steer the diffusion process toward generating data that is more relevant and useful for downstream applications.  \n\nIn our demo setting, we simply use the **training set** as a proxy for the guidance set. Users can later replace it with a more customized subset based on attribution methods (e.g., influence scores, gradient similarity) if desired.\n\n#### 4.3 Prepare the downstream model for guidance  \n\nTarDiff requires a downstream model to compute gradients that guide the diffusion process toward generating task-relevant data.  
\nTo achieve optimal utility, users are encouraged to use their **own downstream models** that best reflect the real application scenario (e.g., mortality prediction, sepsis detection).\n\nThe downstream model can be any differentiable architecture (e.g., RNN, Transformer, CNN) and should be trained on the same task as the generation target.  \nDuring inference, TarDiff uses the gradients of the downstream loss with respect to generated samples to guide each denoising step.\n\n**Optional: Use a simple RNN model as downstream guidance**  \nWe provide an example RNN classifier for classification-based tasks. It takes input time series of shape `(batch_size, time_steps, features)`.\n\n### 5. Training the TimeCraft Framework\n\nUse `main.py` for model training and `visualize.py` for domain prompt visualization. \n\nDetailed descriptions of the command-line arguments can be found in [this document](supplementary\u002Ftraining_details.md).\n\n\n###  6. Generation with TimeCraft Framework\n\n####  6.1 Controllable Generation with Domain Prompts\nUse `inference.py` for model inference. TimeCraft can generate cross-domain time series according to the given domain prompts (composed of prototypes). Commands can be found here: [inference details](supplementary\u002Finference_prototype.md).\n\n####  6.2 Controllable Generation with Domain Prompts and Text\nUse `inference.py` for model inference. TimeCraft can generate desired time series according to the given domain prompts (composed of prototypes) and texts. Commands can be found here: [inference details](supplementary\u002Finference_prototype_text.md).\n\n####  6.3 Target-Aware Generation for Specific Downstream Tasks\nUse `inference.py` with the TarDiff module enabled to perform target-aware generation.  \nTimeCraft can generate synthetic time series specifically tailored to improve downstream task performance by integrating guidance signals from your task-specific model and guidance set. 
Commands can be found here: [inference details](supplementary\u002Finference_guidance.md).\n\n## ⚙️ Example Runs and Expected Results\nWe provide example runs on electricity data set: [examples](supplementary\u002Fexamples.md).\n\nTo further demonstrate the utility of our task-specific data generation approach, we also provide an example run on the MIMIC-III ICU Stay prediction task: [examples](supplementary\u002Fexample_for_mimic_icustay.md).\n\n\n## 🔍 Details of Each Component\n\n### ✨1. Time Series Prototypes: The Key to Cross-Domain Generation  \n\nAt the core of **TimeCraft** lies the concept of **Time Series Prototypes**—a foundational mechanism that enables effective cross-domain generalization. Much like how words serve as the fundamental building blocks for large language models, **time series prototypes** act as the smallest units that define time series styles. These prototypes encapsulate essential patterns such as **trends, seasonal variations, and periodic fluctuations**, allowing the model to understand and generate diverse time series data across multiple domains.  \n\nEach prototype represents a fundamental time series component, and by **learning, combining, and reassembling these units**, **TimeCraft** achieves strong **cross-domain adaptability**. This innovative approach enables the model to generate realistic and domain-consistent time series, even in fields with limited available data.  \n\n![Prototype Like Word.](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_dcf155aab4fe.png)\n\n### Few-shot Prompting for Time Series Generation \n\nReal-world applications often require **personalized time series generation**, tailored to specific **domains, styles, or constraints**. However, due to the inherent complexity of time series data, manually describing the desired **trends, periodicity, and stochastic variations** can be highly challenging—especially for **unseen domains**.  
\n\nTo address this, we introduce an **example-driven generation mechanism**, where users can simply provide **a few sample time series from the target domain** instead of manually specifying the style.  \n\n**How It Works:**  \n- The **Prototype Assignment Module (PAM)** extracts key characteristics from the provided samples, automatically constructing **domain prompts** that serve as conditional inputs for the generation process.  \n- These **domain prompts** enable **TimeCraft** to generate time series that accurately reflect the statistical and temporal properties of the target domain.  \n- By leveraging learned **time series prototypes**, the model generalizes well to **new, unseen domains** while maintaining high fidelity and controllability.  \n\nThis approach eliminates the need for explicit domain labels or textual descriptions, making **TimeCraft** a **highly flexible and adaptive** time series generation framework suited for a wide range of real-world applications.  \n\n---\n### ✨2. Multi-Agent System and Hybrid Conditioning for Text based Control\n#### Time Series to Text Data Preparation Through Multi-Agent Systems\n\nGenerating time series from text can be a highly useful technique as text provides clear and intuitive descriptions of desired trends, statistical properties, and domain-specific nuances. \nHowever, real-world applications often face the dilemma of limited domain-specific text data to guide generation. This lack of data restricts the ability to specify desired trends and statistical features for time series generation accurately.\n\nThe critical challenge of **text-controlled time series generation** begins with creating **high-quality text-TS pairings** - a task complicated by the scarcity of domain-specific descriptive data. Our solution introduces a **three-stage multi-agent framework** that revolutionizes text template creation:  \n\n1. 
**Text Template Collection**: We collect diverse sources of time series-related texts, such as articles, reports, and news, to construct a set of general-purpose text templates. These templates are domain-agnostic and can be adapted to different datasets and domains.  \n2. **Automated Evaluation**: The generated text descriptions are evaluated to assess the quality of the descriptions in supporting downstream tasks.  \n3. **Feedback-Driven Refinement**: Based on the evaluation results, the text descriptions are refined iteratively by the system, improving their accuracy and alignment with target domain characteristics.\n\nThrough this iterative process, the system generates **domain-agnostic templates** that can later be customized for specific domains and time series characteristics, ensuring high-quality text-to-time series pairings for controlled generation tasks. Statistical features are programmatically injected into templates, creating text descriptions that preserve essential temporal semantics, enabling the creation of text prompts that precisely capture **latent temporal patterns**, **domain-specific constraints**, and **instance-level characteristics** through natural language.  \n\n![Text Preparation](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_7ee834676956.jpeg)\n\n#### Text to Time Series Control: Bridging Modalities Through Hybrid Conditioning  \n\nThe discrete nature of textual data poses a significant challenge when trying to control the continuous structure of time series data. \nWe address the challenge of **text-controlled time series generation** by integrating **textual descriptions** with **semantic prototypes** in a **hybrid prompt**. This enhances the model’s ability to generalize across domains. Diffusion models are used for their proven capability in generating high-quality time series. 
The **hybrid prompt** is fed into the **cross-attention layers** of the diffusion model, improving control over the generation process. \n\n---\n\n### ✨3. Target-Aware Generation with Influence Function Guidance\n\nTimeCraft includes a lightweight guidance mechanism that enables *task-aware* synthetic time series generation.\nRather than relying solely on stylistic or domain-level prompts, this mechanism integrates feedback from downstream models to actively steer the diffusion process toward generating data that is directly beneficial for the target application.\n\n| Component | Role |\n|-----------|------|\n| **Guidance Set** | A small collection of time-series whose distribution mirrors the target task. For a quick start you can reuse the training set; advanced users may curate or weight the set with influence scores. |\n| **Downstream Model** | Any differentiable network trained on the task of interest (e.g., RNN, Transformer). During generation its loss gradients provide step-by-step direction. |\n| **Guidance Module** | Injects the downstream gradients into each denoising step, gently steering the diffusion trajectory without altering the backbone generator. |\n\nTogether, these core components form a seamless feedback loop where the **guidance set** defines the downstream data distribution, the **downstream model** encodes the specific task requirements, and the **guidance module** translates these signals into actionable gradients. As a result, TimeCraft efficiently guides the diffusion process to produce synthetic data tailored precisely to your downstream objectives.\n\n## Contributing\n\nThis project welcomes contributions and suggestions.  Most contributions require you to agree to a\nContributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us\nthe rights to use your contribution. 
For details, visit https:\u002F\u002Fcla.opensource.microsoft.com.\n\nWhen you submit a pull request, a CLA bot will automatically determine whether you need to provide\na CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions\nprovided by the bot. You will only need to do this once across all repos using our CLA.\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F).\nFor more information see the [Code of Conduct FAQ](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F) or\ncontact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.\n\n## Trademarks\n\nThis project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft\ntrademarks or logos is subject to and must follow\n[Microsoft's Trademark & Brand Guidelines](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral).\nUse of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.\nAny use of third-party trademarks or logos are subject to those third-party's policies.\n\n\n\n","![Logo](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_e1bb8cc10134.png)\n\nhttps:\u002F\u002Fgithub.com\u002Fuser-attachments\u002Fassets\u002F35bc7ee3-f7a2-4949-96fc-1d1b977e0df1\n\n# 
面向真实世界应用的时间序列生成\n人工智能的快速发展使得时间序列数据在医疗、金融、能源和交通等多个领域的智能决策中扮演着越来越重要的角色。在这些领域，生成高质量合成时间序列的能力变得尤为关键。**时间序列生成**技术在缓解**数据稀缺性**方面发挥着重要作用，尤其是在收集真实数据成本高昂、耗时或不切实际的情况下。此外，它还能通过生成既真实又不可识别的合成数据来实现**隐私保护**分析，从而降低敏感信息共享带来的风险。同时，该技术还支持在**无风险环境中进行模拟与预测**，使研究人员和从业者能够安全地探索假设场景并训练稳健的模型。综上所述，时间序列生成已成为广泛应用于现实世界的必备工具。\n\n然而，尽管潜力巨大，现有的大多数方法仍**局限于单一领域的生成**，难以在时间序列模式差异显著的多样化真实场景中实现泛化。此外，传统模型往往**缺乏可控性**——它们以无条件的方式生成数据，无法引导特定的趋势、季节性或领域特征。但在实际应用中，这种控制能力至关重要，因为需要定制化的合成数据来支持特定场景。更进一步，许多方法仅关注于**复制训练数据的分布**，而未考虑生成的数据是否真正有益于下游任务。\n\n为解决上述局限性，我们提出了**TimeCraft**，一个基于扩散模型的通用时间序列生成框架，专为真实世界应用设计，具备以下特点：\n\n1. **跨领域泛化**：\n   TimeCraft通过学习一组共享的*语义原型*（类似于时间模式的“词典”），为时间序列引入了一个通用的潜在空间。这些原型编码了趋势和季节性等领域无关的特征，可在不同领域间重复使用。\n   为了适应新领域，TimeCraft采用了一个轻量级的*原型分配模块（PAM）*，利用少量示例动态计算原型的领域特定权重。这一过程构建出一个*领域提示*——一种无需显式标签或重新训练即可捕捉目标领域独特特征的潜在表示。借助这些提示，TimeCraft能够生成与此前未见领域结构一致的高保真时间序列。\n   → 跳转至详情：[✨时间序列原型：跨领域生成的关键](#✨1-time-series-prototypes-the-key-to-cross-domain-generation)\n\n2. **基于文本的控制**：\n   文本蕴含丰富的语义信息、领域知识以及实例特定的线索，能够以更加可控和可解释的方式指导时间序列生成。TimeCraft利用一个*多智能体文本生成系统*，生成高质量的时间序列模式描述文本。这些描述被用于构建配对的时间序列–文本数据进行训练。在此基础上，TimeCraft引入了一种混合框架，将语义原型与自由形式的文本提示相结合，从而实现对生成时间序列的灵活但具有领域基础的控制。\n   → 跳转至详情：[✨多智能体系统与混合条件设置：基于文本的控制](#✨2-multi-agent-system-and-hybrid-conditioning-for-text-based-control)\n\n3. **目标感知的自适应**：\n   TimeCraft提出了一种新颖的方法，即生成合成样本时明确以提升下游模型性能为目标，而非简单地模仿训练数据的分布。它引入了一种*影响导向的扩散机制*，通过使用*影响函数*量化任务特定损失的预期减少量来优化样本生成。这确保了生成的数据不仅真实，而且经过策略性调整，能够有效提升预测、分类和异常检测等实际应用中的性能。\n   → 跳转至详情：[✨影响函数引导的目标感知生成](#✨3-target-aware-generation-with-influence-function-guidance)\n\n**TimeCraft**提供了一个统一且实用的解决方案，用于真实世界的时间序列生成——结合了跨领域泛化、基于文本的控制以及任务感知的自适应。它旨在生成高质量、可控的合成数据，既真实又对下游应用有用。\n\n\n#### 微软研究院博客：\n1. [TimeCraft：通用的时间序列生成框架](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Farticles\u002Ftimecraft-a-universal-framework-for-time-series-generation\u002F)\n2. [TimeDP：创建跨领域合成时间序列数据](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Fresearch\u002Farticles\u002Ftimedp-creating-cross-domain-synthetic-time-series-data\u002F)\n3. 
[TimeCraft：面向真实世界的跨域泛化、文本可控与任务感知通用时间序列生成框架](https:\u002F\u002Fmp.weixin.qq.com\u002Fs\u002Faq3EqnNykXfNMz9LVyRpnw)\n\n---\n## 🚀 新闻与更新（2026年）\n\n我们很高兴宣布三项重大研究突破已集成到**TimeCraft**中，极大地拓展了TSG的前沿领域，分别涉及**因果关系**、**基础模型**和**连续时间建模**：\n\n*   **[CaTSG] 基于扩散模型的因果控制**：我们推出了**CaTSG**，这是一个将因果约束融入扩散过程的新框架。通过超越单纯的统计相关性，CaTSG能够生成符合潜在因果结构的真实时间序列，从而促进稳健的“假设情景”分析和风险评估。\n    [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2509.20846) | [[代码]](.\u002FCaTSG)\n\n*   **[OATS] 面向TSFM的在线数据增强**：为赋能下一代时间序列基础模型（TSFM），我们开发了**OATS**。它提供了一个动态的在线数据增强引擎，在预训练过程中合成针对模型定制的样本，显著提升大规模时间模型的泛化能力和零样本性能。\n    [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.19040) | [[代码]](.\u002FOATS)\n\n*   **[MN-TSG] 具有不规则观测的连续生成**：真实世界的数据往往是稀疏且采样不均匀的。**MN-TSG**通过建模潜在的生理或物理动力学，实现了连续时间生成，即使在高度不规则或不完整的情况下也能合成逼真的高保真时间序列模式。\n    [[论文]](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.13534) | [[代码]](.\u002FDiff-MN)\n\n---\n\n## 🗺️ 框架概述\n![TimeDP框架概览。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_9dc288d31eca.png)\nTimeCraft支持**三条灵活的输入分支**。用户可以根据应用场景**激活任意一个、任意两个或全部三个输入**：\n\n1. 推理示例（少样本时间序列提示）\n提供来自目标领域的若干示例时间序列，以指导生成过程。\n\n2. 文本描述（基于文本的控制）\n使用自然语言提示来控制生成时间序列中的趋势、季节性或领域特定风格。\n\n3. 
下游任务模型与数据（目标感知引导）\n利用下游模型的梯度来引导生成，以提升任务相关的性能。\n\n## 📊 性能\nTimeCraft在时间序列生成的多个维度上均达到了最先进水平：\n\n#### 最佳生成保真度（域内与域外）\n我们在涵盖四个主要领域的真实世界数据集上进行了评估：**能源、交通、气象和金融**。生成质量通过最大均值差异（MMD）和Kullback-Leibler（KL）散度等统计指标进行严格评估。对于域内生成，TimeCraft在12个数据集中有11个取得了**最佳性能**，与领先的基线方法相比，MMD平均降低了25.9%，KL散度平均降低了53.0%。在未见过的领域中，TimeCraft同样表现出优于其他基线的最佳泛化能力。\n\n![保真度性能。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_a3400839e748.png)\n\n#### 最强的文本可控性\nTimeCraft实现了最高的文本到序列一致性，相较于无文本输入的生成，其均方误差（MSE）改善了12.52%，平均绝对误差（MAE）改善了6.34%，并且在人工评估中也位居第一。详细结果请参见[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.02445)。\n\n#### 最佳的下游任务性能\n我们将其测试于**六个医学数据集**，涵盖了**ICU住院预测和罕见病诊断**等任务。\n与其他方法相比，TarDiff始终能够生成有助于提升或达到同等下游任务性能的数据——有时甚至优于真实数据。详细结果请参见[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.17613)。\n\n\n\n\n## 📚 相关论文\n#### 跨领域时间序列生成\n- [AAAI 2025] TimeDP：学习使用领域提示生成多领域时间序列，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2501.05403) \u002F [代码](TimeDP)\n\n\n#### 可控性\n- 🆕🔥[2026] 基于扩散模型的因果时间序列生成，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2509.20846) \u002F [代码](CaTSG)\n- [ICML 2025] BRIDGE：通过多智能体迭代优化和扩散建模实现文本驱动的时间序列生成，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2503.02445) \u002F [代码](BRIDGE)\n\n#### 适应性\n- [KDD 2025] TarDiff：面向目标的扩散引导技术，用于合成电子健康记录时间序列的生成，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2504.17613) \u002F [代码](TarDiff)\n\n#### 通用时间序列技术\n- 🆕🔥[2026] OATS：面向时间序列基础模型的在线数据增强，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.19040) \u002F [代码](OATS)\n- 🆕🔥[2026] MN-TSG：具有不规则观测的连续时间序列生成，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2601.13534) \u002F [代码](Diff-MN)\n- [ICLR 2024] MG-TSD：具有引导式学习过程的多粒度时间序列扩散模型，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2403.05751) \u002F [代码](https:\u002F\u002Fgithub.com\u002FHundredl\u002FMG-TSD)\n- [TKDE 2025] TimeRAF：用于零样本时间序列预测的检索增强型基础模型，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.20810)\n- [KDD 2025] InvDiff：用于扩散模型中偏见缓解的不变性引导，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2412.08480) \u002F 
[代码](https:\u002F\u002Fgithub.com\u002FHundredl\u002FInvDiff)\n\n#### 金融应用\n\n- [AAAI 2026] 基于扩散引导的元代理实现可控金融市场生成，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2408.12991) \u002F [代码](DiGA)\n- [ICLR 2025] MarS：由生成式基础模型驱动的金融市场模拟引擎，[论文](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2409.07486)\n\n\n## 🔑 主要特性  \n\n* **多领域时间序列生成**：通过**少样本学习**实现强大的跨领域泛化能力，只需少量新领域数据即可。\n* **可控生成**：基于自然语言的**文本控制**使用户能够指定所需的趋势、季节性等特征。\n* **目标感知生成**：合成数据经过显式优化，可提升下游模型在预测或分类等任务上的性能。\n* **基于扩散的框架**：通过强大的扩散建模技术，确保生成的高保真度、稳定性和多样性。\n* **自动化时间序列描述**：自动生成描述性文本，以增强可解释性，并支持配对训练或分析。\n* **最先进成果**：在域内及未见过领域的基准测试中，在保真度和可控性方面均取得卓越性能。\n\n\n\n## 🚀 快速入门\n### 1. 环境设置 \n克隆此仓库并设置环境。\n```bash\nconda env create -f environment.yml\n```\n\n### 2. 如何使用数据\n#### 2.1 支持的公开数据集\n\nTimeCraft 自动支持下载和预处理多个公开可用的数据集，例如：\n\n- [温度与降雨数据集（莫纳什大学）](https:\u002F\u002Fzenodo.org\u002Frecords\u002F5129091\u002Ffiles\u002Ftemperature_rain_dataset_without_missing_values.zip?download=1)\n- [风速数据集（4秒间隔）](https:\u002F\u002Fzenodo.org\u002Frecords\u002F4656032\u002Ffiles\u002Fwind_4_seconds_dataset.zip?download=1)\n- [行人计数数据集](https:\u002F\u002Fzenodo.org\u002Frecords\u002F4656626\u002Ffiles\u002Fpedestrian_counts_dataset.zip?download=1)\n\n您可以从上述链接手动下载这些数据集，或者直接运行 `prepare_datasets.py` 脚本，该脚本会自动完成下载、解压以及转换为模型就绪格式的过程。\n\n#### 2.2 下载并处理数据集\n\n运行以下命令以执行脚本：\n\n```bash\npython TimeDP\u002Futils\u002Fprepare_datasets.py\n```\n\n此脚本会执行多项预处理步骤：\n\n1. **数据集下载**：\n   - 自动从 Zenodo 等来源获取公共数据集（例如莫纳什 TSF 数据集中的温度\u002F降雨、风速和行人计数数据）。\n   - 使用 GluonTS 加载基准数据集（例如太阳能、电力、交通流量等）。\n   - 同时从 TimeGAN 仓库中获取示例金融时间序列数据（例如股票价格）。\n\n2. **数据预处理**：\n   - 将训练集和测试集拼接成完整的时间序列。\n   - 将多变量时间序列保存为带时间索引的 CSV 格式，存储在 `.\u002Fdata\u002F` 目录下。\n   - 将 `.tsf`（时间序列格式）文件转换为 pandas DataFrame。\n   - 根据特征标签提取特定序列，例如 `PRCP_SUM` 表示降雨量，`T_MEAN` 表示平均温度。\n\n3. **滑动窗口分割**：\n   - 对每个数据集应用滑动窗口分割，设置不同的序列长度（`24, 96, 168, 336`）。\n   - 每个窗口形成一个固定长度的数据样本。\n   - 输出用于训练和验证的 `.npy` 文件（例如 `electricity_96_train.npy`）。\n\n4. 
**零样本设置（可选）**：\n   - 对于选定的数据集，如 `stock` 和 `web`，准备固定的测试样本和提示样本，用于零样本评估。\n   - 保存提示\u002F测试切片，并将提示序列导出为 CSV 文件以便检查。\n\n### 3. 文本控制生成的准备工作（可选）\n#### 3.1 获取文本模板\n\n我们提供了示例文本模板，您可以直接使用它们来构建自己的数据集，具体模板请见[这里](process\u002Ftext_templates_example.json)。这些模板旨在以结构化且多样化的方式描述时间序列数据，涵盖不同领域和统计特征。\n\n您也可以利用我们的多智能体框架收集并优化您自己的文本模板。\n\n#### 3.2 应用文本模板生成时间序列数据的文本描述\n\n我们通过提取每个时间窗口的统计特征（例如均值、标准差、趋势），并将这些特征填入预定义的模板中，从而生成时间序列数据的描述性文本。可选地，这些描述还可以使用大型语言模型（LLM）进行优化，以提高清晰度和质量。\n\n实现代码请参见：[代码链接](process\u002Fts_to_text.py)。\n\n生成的结果将保存为后缀为 `_with_descriptions.csv` 的 CSV 文件。数据集划分详情请参阅：[数据集划分](supplementary\u002Fdataset_split.md)。\n\n### 4. 目标感知生成的准备工作（可选）\n\n#### 4.1 TarDiff 数据及预处理\n\n1. **预处理说明**\n   关于如何将原始 MIMIC-III 数据处理为适合我们模型的格式的详细说明，请参阅 `supplementary\u002Fmimiciii_prepare.md`。请按照该文档中的说明操作，以复现我们在实验中使用的预处理和特征提取流程。\n\n2. **数据集下载**\n   您可以通过以下链接访问原始数据集：\n\n   * [eICU 合作研究数据库](https:\u002F\u002Feicu-crd.mit.edu\u002F)\n   * [MIMIC-III 临床数据库](https:\u002F\u002Fphysionet.org\u002Fcontent\u002Fmimiciii\u002F1.4\u002F)\n\n   > **注意：** 两个数据集在下载前均需获得批准并完成认证。\n\n   我们主要关注这些数据集中提供的多变量时间序列记录。\n\n3. **默认数据格式**\n   默认情况下，数据加载器期望一个包含以下内容的 pickle 格式元组：\n\n   * `data`：形状为 `(N, F, T)`，表示 *N* 个样本、*F* 个特征和 *T* 个时间步。\n   * `labels`：形状为 `(N,)`，对应每个样本的标签。\n\n#### 4.2 准备引导集\n\nTarDiff 需要一个引导集，其分布应尽可能接近下游任务的目标分布。这种分布上的对齐有助于模型引导扩散过程，生成更符合下游应用需求的数据。\n\n在我们的演示场景中，我们直接将训练集作为引导集的替代品。用户可以根据需要，后续使用基于归因方法（例如影响分数、梯度相似性）的自定义子集来替换它。\n\n#### 4.3 为引导准备下游模型\n\nTarDiff 需要一个下游模型来计算梯度，以指导扩散过程生成与任务相关的内容。为了达到最佳效果，建议用户使用最能反映实际应用场景的**自有下游模型**（例如死亡率预测、脓毒症检测等）。\n\n下游模型可以是任何可微分架构（例如 RNN、Transformer、CNN），并且应在与生成目标相同的任务上进行训练。在推理过程中，TarDiff 会利用下游损失函数关于生成样本的梯度来指导每一步去噪过程。\n\n**可选：使用简单的 RNN 模型作为下游引导**\n我们提供了一个用于分类任务的 RNN 分类器示例，其输入时间序列的形状为 `(batch_size, time_steps, features)`。\n\n### 5. 训练 TimeCraft 框架\n\n使用 `main.py` 进行模型训练，使用 `visualize.py` 进行领域提示的可视化。有关命令行参数的详细说明，请参阅[本文档](supplementary\u002Ftraining_details.md)。\n\n### 6. 
使用 TimeCraft 框架进行生成\n\n#### 6.1 基于领域提示的可控生成\n使用 `inference.py` 进行模型推理。TimeCraft 可以根据给定的领域提示（由原型组成）生成跨领域的时序数据。相关命令请参见：[推理详情](supplementary\u002Finference_prototype.md)。\n\n#### 6.2 基于领域提示和文本的可控生成\n使用 `inference.py` 进行模型推理。TimeCraft 可以根据给定的领域提示（由原型组成）和文本生成所需的时序数据。相关命令请参见：[推理详情](supplementary\u002Finference_prototype_text.md)。\n\n#### 6.3 针对特定下游任务的目标感知生成\n启用 TarDiff 模块，使用 `inference.py` 进行目标感知生成。  \n通过整合来自特定任务模型和指导集的引导信号，TimeCraft 能够生成专门用于提升下游任务性能的合成时序数据。相关命令请参见：[推理详情](supplementary\u002Finference_guidance.md)。\n\n## ⚙️ 示例运行及预期结果\n我们提供了电力数据集上的示例运行：[示例](supplementary\u002Fexamples.md)。\n\n为进一步展示我们针对特定任务的数据生成方法的实用性，我们也提供了 MIMIC-III ICU 住院预测任务的示例运行：[示例](supplementary\u002Fexample_for_mimic_icustay.md)。\n\n## 🔍 各组件详解\n\n### ✨1. 时序原型：跨领域生成的关键\n\n**TimeCraft** 的核心是 **时序原型** 的概念——这一基础机制使得模型能够有效实现跨领域泛化。正如单词是大型语言模型的基本构建单元一样，**时序原型** 则是定义时序风格的最小单位。这些原型封装了诸如 **趋势、季节性变化和周期性波动** 等关键模式，使模型能够理解和生成多个领域的多样化时序数据。\n\n每个原型代表一个基本的时序组成部分，通过 **学习、组合和重新组装这些单元**，**TimeCraft** 实现了强大的 **跨领域适应能力**。这种创新方法使模型能够在数据有限的领域中生成逼真且符合领域特征的时序数据。\n\n![原型如单词。](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_dcf155aab4fe.png)\n\n### 少样本提示下的时序生成\n\n现实应用中常常需要 **个性化时序生成**，以满足特定的 **领域、风格或约束条件**。然而，由于时序数据本身的复杂性，手动描述期望的 **趋势、周期性和随机波动** 很难实现，尤其是在面对 **未见过的领域** 时更是如此。\n\n为此，我们引入了一种 **基于示例的生成机制**，用户只需提供 **目标领域的几条样本时序**，而无需手动指定风格。\n\n**工作原理：**  \n- **原型分配模块 (PAM)** 会从提供的样本中提取关键特征，自动构建作为生成过程条件输入的 **领域提示**。  \n- 这些 **领域提示** 使 **TimeCraft** 能够生成准确反映目标领域统计和时间特性的时序数据。  \n- 通过利用已学习的 **时序原型**，模型在保持高保真度和可控性的同时，也能很好地推广到 **新的、未见过的领域**。\n\n这种方法消除了对显式领域标签或文本描述的需求，使 **TimeCraft** 成为一个 **高度灵活且自适应** 的时序生成框架，适用于广泛的现实应用场景。\n\n---\n### ✨2. 多智能体系统与混合条件控制文本\n#### 通过多智能体系统准备时序到文本数据\n\n从文本生成时序数据是一项非常有用的技术，因为文本能够清晰直观地描述期望的趋势、统计特性以及领域特有的细微差别。然而，在实际应用中，往往面临可用于指导生成的领域特定文本数据不足的问题。这种数据匮乏限制了准确指定时序生成所需趋势和统计特征的能力。\n\n**文本控制时序生成** 的关键挑战在于创建 **高质量的文本-时序配对**，而这又因领域特定描述性数据的稀缺而变得复杂。我们的解决方案引入了一个 **三阶段的多智能体框架**，彻底革新了文本模板的创建方式：\n\n1. **文本模板收集**：我们收集各类与时序相关的文本资源，如文章、报告和新闻，以构建一套通用的文本模板。这些模板不依赖于特定领域，可以适配不同的数据集和领域。\n2. 
**自动化评估**：对生成的文本描述进行评估，以判断其在支持下游任务方面的质量。\n3. **反馈驱动的优化**：根据评估结果，系统会迭代优化文本描述，提高其准确性和与目标领域特征的一致性。\n\n通过这一迭代过程，系统生成了 **领域无关的模板**，随后可根据具体领域和时序特征进行定制，从而确保高质量的文本-时序配对，用于受控生成任务。同时，系统还会将统计特征程序化地注入模板中，生成既能保留关键时间语义，又能通过自然语言精确捕捉 **潜在的时间模式**、**领域特定约束** 和 **实例级特征** 的文本提示。\n\n![文本准备](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_readme_7ee834676956.jpeg)\n\n#### 文本到时序控制：通过混合条件桥接模态\n\n文本数据的离散性质给控制连续结构的时序数据带来了巨大挑战。为了解决 **文本控制时序生成** 的难题，我们采用 **混合提示** 的方式，将 **文本描述** 与 **语义原型** 结合起来，从而增强模型的跨领域泛化能力。扩散模型因其在生成高质量时序数据方面的卓越能力而被选用。**混合提示** 被输入到扩散模型的 **交叉注意力层** 中，进一步提升了对生成过程的控制能力。\n\n---\n\n### ✨3. 基于影响力函数指导的目标感知生成\n\nTimeCraft 包含一种轻量级的指导机制，能够实现*任务感知*的合成时间序列生成。该机制不依赖于单纯的风格或领域级别的提示词，而是通过整合下游模型的反馈，主动引导扩散过程，生成对目标任务直接有益的数据。\n\n| 组件 | 作用 |\n|-----------|------|\n| **指导集** | 一小批时间序列数据，其分布与目标任务一致。为了快速上手，您可以复用训练集；高级用户则可以根据影响力得分对数据集进行筛选或加权。 |\n| **下游模型** | 任何针对目标任务训练的可微网络（例如 RNN、Transformer）。在生成过程中，其损失梯度会提供逐步的指导方向。 |\n| **指导模块** | 将下游梯度注入到每一步去噪过程中，温和地引导扩散轨迹，而不会改变基础生成器的结构。 |\n\n这些核心组件共同构成了一个无缝的反馈回路：**指导集**定义了下游数据的分布，**下游模型**编码了具体的任务需求，而**指导模块**则将这些信号转化为可操作的梯度。因此，TimeCraft 能够高效地引导扩散过程，生成完全贴合您下游目标的合成数据。\n\n## 贡献说明\n\n本项目欢迎各类贡献和建议。大多数贡献都需要您同意一份贡献者许可协议（CLA），声明您有权且确实授予我们使用您贡献的权利。有关详情，请访问 https:\u002F\u002Fcla.opensource.microsoft.com。\n\n当您提交拉取请求时，CLA 机器人会自动判断您是否需要提供 CLA，并相应地标记您的 PR（例如添加状态检查或评论）。请按照机器人提供的指示操作即可。对于所有使用我们 CLA 的仓库，您只需完成一次此步骤。\n\n本项目已采纳 [微软开源行为准则](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002F)。如需更多信息，请参阅 [行为准则常见问题解答](https:\u002F\u002Fopensource.microsoft.com\u002Fcodeofconduct\u002Ffaq\u002F) 或发送邮件至 [opencode@microsoft.com](mailto:opencode@microsoft.com) 提出进一步的问题或意见。\n\n## 商标声明\n\n本项目可能包含项目、产品或服务相关的商标或标识。经授权使用微软商标或标识时，必须遵守并遵循 [微软商标与品牌指南](https:\u002F\u002Fwww.microsoft.com\u002Fen-us\u002Flegal\u002Fintellectualproperty\u002Ftrademarks\u002Fusage\u002Fgeneral)。在本项目的修改版本中使用微软商标或标识时，不得造成混淆或暗示微软的赞助关系。任何第三方商标或标识的使用均应遵守相关第三方的政策。","# TimeCraft 快速上手指南\n\nTimeCraft 
是一个基于扩散模型的通用时间序列生成框架，专为真实世界应用设计。它支持**跨域泛化**（少样本学习）、**文本可控生成**以及**任务感知适配**，能够生成高质量、可控制且对下游任务有益的合成时间序列数据。\n\n## 1. 环境准备\n\n在开始之前，请确保您的系统满足以下要求：\n\n*   **操作系统**: Linux (推荐 Ubuntu 20.04+) 或 macOS\n*   **Python 版本**: 3.8 - 3.10\n*   **硬件要求**: 建议使用配备 NVIDIA GPU 的环境以加速扩散模型训练与推理（需安装 CUDA 驱动）\n*   **前置依赖**:\n    *   Conda (推荐用于环境管理)\n    *   Git\n\n## 2. 安装步骤\n\n### 克隆仓库\n首先，从 GitHub 克隆 TimeCraft 项目代码：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FTimeCraft.git\ncd TimeCraft\n```\n\n### 创建并激活环境\n项目提供了 `environment.yml` 文件以简化依赖安装。推荐使用 Conda 创建独立环境：\n\n```bash\nconda env create -f environment.yml\nconda activate timecraft\n```\n\n> **提示**：如果下载依赖速度较慢，可配置国内镜像源（如清华源或阿里源）加速安装：\n> ```bash\n> conda config --add channels https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fanaconda\u002Fpkgs\u002Fmain\u002F\n> conda config --add channels https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fanaconda\u002Fpkgs\u002Ffree\u002F\n> conda config --set show_channel_urls yes\n> ```\n\n## 3. 
基本使用\n\nTimeCraft 支持三种灵活的输入模式，用户可根据场景单独或组合使用：**少样本示例**、**文本描述**、**下游任务引导**。\n\n### 模式一：基于少样本的跨域生成 (Few-shot Prompting)\n只需提供目标领域的少量样本，即可生成该领域的高保真时间序列。\n\n```python\nfrom timecraft import TimeCraftGenerator\n\n# 初始化模型\ngenerator = TimeCraftGenerator(model_path=\"pretrained\u002Funiversal_checkpoint.pt\")\n\n# 准备少量目标域样本 (例如：能源数据)\nfew_shot_samples = load_data(\"path\u002Fto\u002Fenergy_samples.csv\") \n\n# 生成新数据\nsynthetic_data = generator.generate(\n    prompt_samples=few_shot_samples,\n    num_samples=1000\n)\n\nsave_data(synthetic_data, \"generated_energy_series.csv\")\n```\n\n### 模式二：基于文本的可控生成 (Text-based Control)\n使用自然语言描述期望的趋势、季节性或特定领域特征。\n\n```python\n# 定义文本提示\ntext_prompt = \"Generate a financial time series with an upward trend, high volatility, and weekly seasonality.\"\n\n# 执行生成\nsynthetic_data = generator.generate(\n    text_prompt=text_prompt,\n    num_samples=500\n)\n```\n\n### 模式三：任务感知生成 (Target-Aware Guidance)\n结合下游任务模型（如分类器或预测器），生成能显著提升该任务性能的数据。\n\n```python\n# 加载下游任务模型和数据\ndownstream_model = load_classifier(\"path\u002Fto\u002Fclassifier.pth\")\ntarget_data = load_data(\"path\u002Fto\u002Fscare_medical_data.csv\")\n\n# 启用影响函数引导进行生成\nsynthetic_data = generator.generate_target_aware(\n    model=downstream_model,\n    reference_data=target_data,\n    task_type=\"classification\",\n    num_samples=2000\n)\n```\n\n### 进阶功能：因果控制与连续时间生成\nTimeCraft 还集成了最新的研究模块（需单独导入）：\n*   **CaTSG**: 用于符合因果结构的生成。\n*   **MN-TSG**: 用于处理不规则采样数据的连续时间生成。\n\n```python\n# 示例：使用 MN-TSG 处理不规则观测数据\nfrom timecraft.modules import MNTSG\n\ncontinuous_generator = MNTSG()\nirregular_data = load_irregular_observations(\"patient_vitals.log\")\nsynthetic_continuous = continuous_generator.generate(irregular_data)\n```\n\n现在您可以利用生成的合成数据进行数据增强、隐私保护分析或风险模拟实验了。","某新能源电网运营商正试图构建一个能够预测极端天气下区域负荷波动的 AI 模型，但缺乏足够的历史故障数据。\n\n### 没有 TimeCraft 时\n- **数据极度匮乏**：真实的极端天气停电记录寥寥无几，导致模型因训练样本不足而无法捕捉罕见风险模式。\n- **跨域泛化困难**：现有生成工具仅针对单一区域训练，无法将成熟电网的波动规律迁移到新接入的偏远地区电网。\n- 
**缺乏可控性**：传统方法只能随机生成数据，无法通过指令定制“高温叠加设备老化”等特定复合场景的负荷曲线。\n- **隐私合规风险**：直接使用包含用户用电习惯的真实敏感数据进行联合建模，面临严格的数据隐私法律限制。\n\n### 使用 TimeCraft 后\n- **高质量小样本增强**：利用少量极端案例，TimeCraft 基于扩散模型生成高保真合成数据，显著提升了模型对罕见故障的识别率。\n- **无缝跨域迁移**：通过其通用的语义原型空间，仅需几个新区域的样本即可动态调整权重，快速生成适配新电网特征的时序数据。\n- **文本精准控制**：运维人员可直接输入“模拟台风过境导致的阶梯式负荷骤降”，TimeCraft 即生成符合该描述的具体时序曲线。\n- **安全隐私保护**：生成的合成数据保留了统计特征但剥离了个人身份信息，使得跨部门数据共享与模型训练完全合规。\n\nTimeCraft 通过跨域泛化与文本可控生成能力，将稀缺且敏感的实时数据转化为丰富、安全且可定制的资产，彻底打破了现实世界时序应用的数据瓶颈。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fmicrosoft_TimeCraft_e1bb8cc1.png","microsoft","Microsoft","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fmicrosoft_4900709c.png","Open source projects and samples from Microsoft",null,"opensource@microsoft.com","OpenAtMicrosoft","https:\u002F\u002Fopensource.microsoft.com","https:\u002F\u002Fgithub.com\u002Fmicrosoft",[82,86,90,94,98],{"name":83,"color":84,"percentage":85},"Python","#3572A5",96.6,{"name":87,"color":88,"percentage":89},"JavaScript","#f1e05a",2.7,{"name":91,"color":92,"percentage":93},"HTML","#e34c26",0.7,{"name":95,"color":96,"percentage":97},"Shell","#89e051",0,{"name":99,"color":100,"percentage":97},"CSS","#663399",1067,62,"2026-04-05T08:05:38","MIT","未说明","未说明 (基于扩散模型 Diffusion Model 和多智能体系统，通常强烈建议使用 NVIDIA GPU)",{"notes":108,"python":109,"dependencies":110},"README 中未列出具体的硬件和软件版本要求，仅指示通过 'conda env create -f environment.yml' 命令自动创建环境。该工具包含扩散模型、多智能体文本生成系统及跨域泛化模块，实际运行通常需要较高的 GPU 显存（建议 16GB+）以支持模型训练和推理。项目集成了多个子模块（如 CaTSG, OATS, MN-TSG），具体依赖需查看仓库中的 environment.yml 文件。","未说明 (通过 environment.yml 配置)",[111],"未说明 (需参考 environment.yml)",[14],[114,115,116],"generative-ai","machine-learning","timeseries-analysis","2026-03-27T02:49:30.150509","2026-04-08T12:51:42.948703",[],[]]