# Multimodal Deep Learning

🎆 🎆 🎆 Announcing the multimodal deep learning repository that contains implementations of various deep learning-based models for multimodal problems such as multimodal representation learning and multimodal fusion for downstream tasks, e.g., multimodal sentiment analysis.

```For those enquiring about how to extract visual and audio features, please check this out: https://github.com/soujanyaporia/MUStARD```

- [Models](#models)
  * [Multimodal-Infomax (PyTorch)](#improving-multimodal-fusion-with-hierarchical-mutual-information-maximization-for-multimodal-sentiment-analysis)
  * [MISA (PyTorch)](#MISA-Modality--Invariant-and--Specific-Representations-for-Multimodal-Sentiment-Analysis)
  * [BBFN (PyTorch)](#Bi-Bimodal-Modality-Fusion-for-Correlation-Controlled-Multimodal-Sentiment-Analysis)
  * [Hfusion (Keras)](#hfusion)
  * [contextual-attention-based-LSTM (Tensorflow)](#Attention-based-multimodal-fusion-for-sentiment-analysis)
  * [bc-LSTM (Keras)](#Context--Dependent-Sentiment-Analysis-in-User-Generated-Videos)
  * [Contextual-Multimodal-Fusion (Keras)](#Contextual-Inter--modal-Attention-for-Multimodal-Sentiment-Analysis)
  * [Tensor Fusion Network (PyTorch)](#tensor-fusion-network-tfn)
  * [Low-rank-Multimodal-Fusion (PyTorch)](#low-rank-multimodal-fusion)

- [Datasets](#datasets)
  * [MELD](#meld-a-multimodal-multi-party-dataset-for-emotion-recognition-in-conversation)
  * [MUStARD](#MUStARD-Multimodal-Sarcasm-Detection-Dataset)
  * [M2H2](#M2H2-A-Multimodal-Multiparty-Hindi-Dataset-For-Humor-Recognition-in-Conversations)
# Models

## Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis

This repository contains the official implementation code of the paper [Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis](https://arxiv.org/pdf/2109.00412.pdf), accepted to **EMNLP 2021**.

:fire: If you are interested in other multimodal works from our DeCLaRe Lab, you are welcome to visit the [clustered repository](https://github.com/declare-lab/multimodal-deep-learning)

## Introduction
Multimodal-Infomax (MMIM) synthesizes fusion results from multi-modality input through a two-level mutual information (MI) maximization. We use the BA (Barber-Agakov) lower bound and contrastive predictive coding as the target functions to be maximized. To facilitate the computation of the BA lower bound and the training process, we design an entropy estimation module with an associated history data memory.

![Alt text](https://github.com/declare-lab/Multimodal-Infomax/blob/main/img/ModelFigSingle.png?raw=true "Model")
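For intuition, the contrastive predictive coding term above is, in spirit, an InfoNCE-style objective between the fused representation and each unimodal representation. Below is a minimal illustrative sketch of such a loss, not the code shipped in Multimodal-Infomax; the tensor names, shapes, and temperature are assumptions.

```python
import torch
import torch.nn.functional as F

def infonce_loss(fusion, modality, temperature=0.1):
    """CPC-style InfoNCE: each fusion vector should score highest against
    its own modality vector within the batch (illustrative sketch only)."""
    fusion = F.normalize(fusion, dim=-1)          # (batch, dim); normalize so scores are cosines
    modality = F.normalize(modality, dim=-1)      # (batch, dim)
    logits = fusion @ modality.t() / temperature  # (batch, batch) similarity matrix
    targets = torch.arange(fusion.size(0))        # positives sit on the diagonal
    return F.cross_entropy(logits, targets)

# Toy usage: maximizing MI corresponds to minimizing this loss per (fusion, modality) pair.
fusion_repr, text_repr = torch.randn(8, 128), torch.randn(8, 128)
loss = infonce_loss(fusion_repr, text_repr)
```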
## Usage
1. Download the CMU-MOSI and CMU-MOSEI datasets from [Google Drive]() or [Baidu Disk](). Please place them under the folder `Multimodal-Infomax/datasets`

2. Set up the environment (needs conda as a prerequisite)
```
conda env create -f environment.yml
conda activate MMIM
```

3. Start training
```
python main.py --dataset mosi --contrast
```

## Citation
Please cite our paper if you find our work useful for your research:
```bibtex
@article{han2021improving,
  title={Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis},
  author={Han, Wei and Chen, Hui and Poria, Soujanya},
  journal={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year={2021}
}
```

## Contact
Should you have any questions, feel free to contact me at [henryhan88888@gmail.com](mailto:henryhan88888@gmail.com)

## MISA: Modality-Invariant and -Specific Representations for Multimodal Sentiment Analysis
Code for the [ACM MM 2020](https://2020.acmmm.org) paper [MISA: Modality-Invariant and -Specific Representations for Multimodal Sentiment Analysis](https://arxiv.org/pdf/2005.03545.pdf)

<p align="center">
  <img width="600" src="https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_779aef8e9748.png">
</p>

### Setup the environment

We work with a conda environment.

```
conda env create -f environment.yml
conda activate misa-code
```

### Data Download

- Install the [CMU Multimodal SDK](https://github.com/A2Zadeh/CMU-MultimodalSDK). Ensure you can perform ```from mmsdk import mmdatasdk```.
- Option 1: Download the [pre-computed splits](https://drive.google.com/drive/folders/1IBwWNH0XjPnZWaAlP1U2tIJH6Rb3noMI?usp=sharing) and place the contents inside the ```datasets``` folder.
- Option 2: Re-create the splits by downloading data from MMSDK. For this, simply run the code as detailed next.

### Running the code

1. ```cd src```
2. Set ```word_emb_path``` in ```config.py``` to the [glove file](http://nlp.stanford.edu/data/glove.840B.300d.zip).
3. Set ```sdk_dir``` to the path of CMU-MultimodalSDK.
4. ```python train.py --data mosi```. Replace ```mosi``` with ```mosei``` or ```ur_funny``` for other datasets.
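The core idea in MISA, projecting each modality into a shared (modality-invariant) and a private (modality-specific) subspace kept apart by a difference loss, can be pictured in a few lines. This is an illustrative simplification under assumed dimensions, not the code in this repo.

```python
import torch
import torch.nn as nn

# One shared projector reused by all modalities, one private projector per modality.
shared_proj = nn.Linear(300, 128)
private_proj = {m: nn.Linear(300, 128) for m in ("text", "audio", "video")}

def encode(modality, x):
    """Split one modality into its invariant and specific representations."""
    return shared_proj(x), private_proj[modality](x)

def difference_loss(shared, private):
    """Soft orthogonality penalty pushing the two subspaces apart."""
    return (shared.t() @ private).pow(2).mean()

x_text = torch.randn(8, 300)  # assumed: utterance features pre-projected to 300-d
s, p = encode("text", x_text)
loss = difference_loss(s, p)
```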
### Citation

If this paper is useful for your research, please cite us at:

```
@article{hazarika2020misa,
  title={MISA: Modality-Invariant and-Specific Representations for Multimodal Sentiment Analysis},
  author={Hazarika, Devamanyu and Zimmermann, Roger and Poria, Soujanya},
  journal={arXiv preprint arXiv:2005.03545},
  year={2020}
}
```

### Contact

For any questions, please email [hazarika@comp.nus.edu.sg](mailto:hazarika@comp.nus.edu.sg)

## Bi-Bimodal Modality Fusion for Correlation-Controlled Multimodal Sentiment Analysis

This repository contains the official implementation of the paper: [Bi-Bimodal Modality Fusion for Correlation-Controlled Multimodal Sentiment Analysis (ICMI 2021)](https://arxiv.org/abs/2107.13669)

### Model Architecture

Overview of our Bi-Bimodal Fusion Network (BBFN). It learns two text-related pairs of representations, text-acoustic and text-visual, by enforcing each pair of modalities to complement each other mutually. Finally, the four (two pairs of) head representations are concatenated to generate the final prediction.

![Alt text](https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_bb8462098e9a.png)

A single complementation layer: two identical pipelines (left and right) propagate the main modality and fuse it with the complementary modality under regularization and gated control.

![Alt text](https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_bbe895a49416.png)

### Results

Results on the test sets of the CMU-MOSI and CMU-MOSEI datasets. Notation: △ indicates results in the corresponding line are excerpted from previous papers; † means the results are reproduced with publicly visible source code and applicable hyperparameter settings; ‡ shows the results have passed a paired t-test with 𝑝 < 0.05 and demonstrate significant improvement over MISA, the state-of-the-art model.

![Alt text](https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_0e9ced7f82f5.png)

### Usage
1. Set up the conda environment
```
conda env create -f environment.yml
conda activate BBFN
```

2. Install the [CMU Multimodal SDK](https://github.com/A2Zadeh/CMU-MultimodalSDK)

3. Set `sdk_dir` in `src/config.py` to the path of CMU-MultimodalSDK

4. Train the model
```
cd src
python main.py --dataset <dataset_name> --data_path <path_to_dataset>
```
We provide a script `scripts/run.sh` for your reference.
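The gated control in each complementation layer can be read as a learned interpolation between the main modality and the fused pair. A minimal sketch with assumed layer names and dimensions, not the BBFN source:

```python
import torch
import torch.nn as nn

class GatedComplementation(nn.Module):
    """One direction of a complementation layer: fuse a complementary
    modality into the main one under a learned gate (illustrative sketch)."""
    def __init__(self, dim):
        super().__init__()
        self.fuse = nn.Linear(2 * dim, dim)
        self.gate = nn.Linear(2 * dim, dim)

    def forward(self, main, comp):
        pair = torch.cat([main, comp], dim=-1)
        g = torch.sigmoid(self.gate(pair))  # in [0, 1]: how much fused signal to admit
        return g * torch.tanh(self.fuse(pair)) + (1 - g) * main

layer = GatedComplementation(128)
text, audio = torch.randn(8, 128), torch.randn(8, 128)
text_acoustic = layer(text, audio)  # text as main modality, audio as complement
```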
### Citation
Please cite our paper if you find our work useful for your research:
```bibtex
@article{han2021bi,
  title={Bi-Bimodal Modality Fusion for Correlation-Controlled Multimodal Sentiment Analysis},
  author={Han, Wei and Chen, Hui and Gelbukh, Alexander and Zadeh, Amir and Morency, Louis-philippe and Poria, Soujanya},
  journal={ICMI 2021},
  year={2021}
}
```

### Contact
Should you have any questions, feel free to contact me at [henryhan88888@gmail.com](mailto:henryhan88888@gmail.com)

# Hfusion
Code for the paper ``Multimodal sentiment analysis using hierarchical fusion with context modeling``

## How to run
``python3 hfusion.py``

## Requirements

Keras >= 2.0, Tensorflow >= 1.7, Numpy, Scikit-learn

## Citation

``Majumder, N., Hazarika, D., Gelbukh, A., Cambria, E. and Poria, S., 2018. Multimodal sentiment analysis using hierarchical fusion with context modeling. Knowledge-Based Systems, 161, pp.124-133.``

# Attention-based multimodal fusion for sentiment analysis

Code for the papers

[Context-Dependent Sentiment Analysis in User-Generated Videos](http://sentic.net/context-dependent-sentiment-analysis-in-user-generated-videos.pdf) (ACL 2017) and

[Multi-level Multiple Attentions for Contextual Multimodal Sentiment Analysis](https://ieeexplore.ieee.org/abstract/document/8215597/) (ICDM 2017).

![Alt text](contextual-attention-based-LSTM/atlstm3.jpg?raw=true "The attention based fusion mechanism (ICDM 2017)")

### Preprocessing
**Edit:** create_data.py is obsolete. The pre-processed datasets are already provided in the dataset/ folder in the repo. Use them directly.

As data is typically present in utterance format, we combine all the utterances belonging to a video using the following code

```
python create_data.py
```

Note: This will create speaker-independent train and test splits.
In dataset/mosei, extract the zip into a folder named 'raw'.
Also, extract 'unimodal_mosei_3way.pickle.zip'

### Running the model

Sample commands:

With attention-based fusion:
```
python run.py --unimodal True --fusion True
python run.py --unimodal False --fusion True
```
Without attention-based fusion and with concatenation-based fusion:
```
python run.py --unimodal True --fusion False
python run.py --unimodal False --fusion False
```
Utterance-level attention:
```
python run.py --unimodal False --fusion True --attention_2 True
```
Note:
1. Keeping the unimodal flag as True (default False) will train all unimodal LSTMs first (level 1 of the network mentioned in the paper).
2. Setting --fusion True applies only to the multimodal network.

### Datasets:
We provide results on the [MOSI](https://arxiv.org/pdf/1606.06259.pdf), [MOSEI](http://aclweb.org/anthology/P18-1208) and [IEMOCAP](https://sail.usc.edu/iemocap/) datasets.<br>
Please cite the creators.

We are adding more datasets, stay tuned.

Use ```--data [mosi|mosei|iemocap] and --classes [2|3|6]``` in the above commands to test different configurations on different datasets.

mosi: 2 classes<br>
mosei: 3 classes<br>
iemocap: 6 classes<br>

Example:
```
python run.py --unimodal False --fusion True --attention_2 True --data mosei --classes 3
```

#### Dataset details
##### MOSI:
2 classes: Positive/Negative <br>
Raw Features: (Pickle files) <br>
Audio: dataset/mosi/raw/audio_2way.pickle <br>
Text: dataset/mosi/raw/text_2way.pickle <br>
Video: dataset/mosi/raw/video_2way.pickle <br>

**Each file contains: <br>**
train_data, train_label, test_data, test_label, maxlen, train_length, test_length

train_data - np.array of dim (62, 63, feature_dim) <br>
train_label - np.array of dim (62, 63, 2) <br>
test_data - np.array of dim (31, 63, feature_dim) <br>
test_label - np.array of dim (31, 63, 2) <br>
maxlen - max utterance length, int of value 63 <br>
train_length - utterance length of each video in train data. <br>
test_length - utterance length of each video in test data. <br>

Train/Test split: 62/31 videos. Each video has utterances. The videos are padded to 63 utterances.
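To sanity-check a download, the pickle layout described above can be inspected directly. A sketch assuming the MOSI files are in place and that the seven elements unpack in the listed order; pickles written under Python 2 may additionally need `encoding='latin1'`:

```python
import pickle

# Load one MOSI raw-feature pickle and verify its documented layout.
with open("dataset/mosi/raw/text_2way.pickle", "rb") as f:
    (train_data, train_label, test_data, test_label,
     maxlen, train_length, test_length) = pickle.load(f)

print(train_data.shape)  # expected: (62, 63, feature_dim)
print(test_label.shape)  # expected: (31, 63, 2)
print(maxlen)            # expected: 63
```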
##### IEMOCAP:
6 classes: happy/sad/neutral/angry/excited/frustrated<br>
Raw Features: dataset/iemocap/raw/IEMOCAP_features_raw.pkl (Pickle files) <br>
The file contains:
videoIDs[vid] = List of utterance IDs in this video in the order of occurrence <br>
videoSpeakers[vid] = List of speaker turns. e.g. [M, M, F, M, F]. Here M = Male, F = Female <br>
videoText[vid] = List of textual features for each utterance in video vid. <br>
videoAudio[vid] = List of audio features for each utterance in video vid. <br>
videoVisual[vid] = List of visual features for each utterance in video vid. <br>
videoLabels[vid] = List of label indices for each utterance in video vid. <br>
videoSentence[vid] = List of sentences for each utterance in video vid. <br>
trainVid = List of videos (video IDs) in the train set. <br>
testVid = List of videos (video IDs) in the test set. <br>

Refer to the file dataset/iemocap/raw/loadIEMOCAP.py for more information.
We use this data to create speaker-independent train and test splits in the format (videos x utterances x features).

Train/Test split: 120/31 videos. Each video has utterances. The videos are padded to 110 utterances.

##### MOSEI:
3 classes: positive/negative/neutral <br>
Raw Features: (Pickle files) <br>
Audio: dataset/mosei/raw/audio_3way.pickle <br>
Text: dataset/mosei/raw/text_3way.pickle <br>
Video: dataset/mosei/raw/video_3way.pickle <br>

Each file contains:
train_data, train_label, test_data, test_label, maxlen, train_length, test_length

train_data - np.array of dim (2250, 98, feature_dim) <br>
train_label - np.array of dim (2250, 98, 3) <br>
test_data - np.array of dim (678, 98, feature_dim) <br>
test_label - np.array of dim (678, 98, 3) <br>
maxlen - max utterance length, int of value 98 <br>
train_length - utterance length of each video in train data. <br>
test_length - utterance length of each video in test data. <br>

Train/Test split: 2250/678 videos. Each video has utterances. The videos are padded to 98 utterances.

### Citation

If you use this code, please cite our work:
```
@inproceedings{soujanyaacl17,
  title={Context-dependent sentiment analysis in user-generated videos},
  author={Poria, Soujanya and Cambria, Erik and Hazarika, Devamanyu and Mazumder, Navonil and Zadeh, Amir and Morency, Louis-Philippe},
  booktitle={Association for Computational Linguistics},
  year={2017}
}

@inproceedings{poriaicdm17,
author={S. Poria and E. Cambria and D. Hazarika and N. Mazumder and A. Zadeh and L. P. Morency},
booktitle={2017 IEEE International Conference on Data Mining (ICDM)},
title={Multi-level Multiple Attentions for Contextual Multimodal Sentiment Analysis},
year={2017},
pages={1033-1038},
keywords={data mining;feature extraction;image classification;image fusion;learning (artificial intelligence);sentiment analysis;attention-based networks;context learning;contextual information;contextual multimodal sentiment;dynamic feature fusion;multilevel multiple attentions;multimodal sentiment analysis;recurrent model;utterances;videos;Context modeling;Feature extraction;Fuses;Sentiment analysis;Social network services;Videos;Visualization},
doi={10.1109/ICDM.2017.134},
month={Nov},}
```

### Credits

[Soujanya Poria](http://sporia.info/)

[Gangeshwar Krishnamurthy](http://www.gangeshwark.com/) (gangeshwark@gmail.com; Github: @gangeshwark)
# Context-Dependent Sentiment Analysis in User Generated Videos
Code for the paper [Context-Dependent Sentiment Analysis in User-Generated Videos](http://sentic.net/context-dependent-sentiment-analysis-in-user-generated-videos.pdf) (ACL 2017).

### Requirements
Code is written in Python (2.7) and requires Keras (2.0.6) with the Theano backend.

### Description
In this paper, we propose an LSTM-based model that enables utterances to capture contextual information from their surroundings in the same video, thus aiding the classification process in multimodal sentiment analysis.

![Alt text](bc-LSTM/network.jpg?raw=true "Title")

This repository contains the code for the mentioned paper. Each contextual LSTM (Figure 2 in the paper) is implemented as shown in the above figure. For more details, please refer to the paper.
Note: Unlike the paper, we haven't used an SVM on the penultimate layer. This is in an effort to keep the whole network differentiable, at some performance cost.

### Dataset
We provide results on the [MOSI dataset](https://arxiv.org/pdf/1606.06259.pdf).
Please cite the creators.

### Preprocessing
As data is typically present in utterance format, we combine all the utterances belonging to a video using the following code

```
python create_data.py
```

Note: This will create speaker-independent train and test splits.

### Running sc-lstm

Sample commands:

```
python lstm.py --unimodal True
python lstm.py --unimodal False
```

Note: Keeping the unimodal flag as True (default False) will train all unimodal LSTMs first (level 1 of the network mentioned in the paper).

### Citation

If you use this code, please cite our work:
```
@inproceedings{soujanyaacl17,
  title={Context-dependent sentiment analysis in user-generated videos},
  author={Poria, Soujanya and Cambria, Erik and Hazarika, Devamanyu and Mazumder, Navonil and Zadeh, Amir and Morency, Louis-Philippe},
  booktitle={Association for Computational Linguistics},
  year={2017}
}
```

### Credits

Devamanyu Hazarika, Soujanya Poria

# Contextual Inter-modal Attention for Multimodal Sentiment Analysis
Code for the paper [Contextual Inter-modal Attention for Multi-modal Sentiment Analysis](http://www.aclweb.org/anthology/D18-1382) (EMNLP 2018).

### Dataset
We provide results on the [MOSI dataset](https://arxiv.org/pdf/1606.06259.pdf).
Please cite the creators.

## Requirements:
Python 3.5
Keras (Tensorflow backend) 2.2.4
Scikit-learn 0.20.0

### Experiments

```
python create_data.py
python trimodal_attention_models.py
```

### Citation

If you use this code in your research, please cite our work:
```
@inproceedings{ghosal2018contextual,
  title={Contextual Inter-modal Attention for Multi-modal Sentiment Analysis},
  author={Ghosal, Deepanway and Akhtar, Md Shad and Chauhan, Dushyant and Poria, Soujanya and Ekbal, Asif and Bhattacharyya, Pushpak},
  booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  pages={3454--3466},
  year={2018}
}
```

### Credits

Some of the functionality in this repo is borrowed from https://github.com/soujanyaporia/contextual-utterance-level-multimodal-sentiment-analysis

### Authors

[Deepanway Ghosal](https://github.com/deepanwayx), [Soujanya Poria](https://github.com/soujanyaporia)

## Tensor Fusion Network (TFN)

## IMPORTANT NOTICE

The CMU-MultimodalSDK on which this repo depends has drastically changed its API since this code was written. Hence the code in this repo cannot be run off-the-shelf anymore. However, the code for the model itself can still serve as a reference.

This is a PyTorch implementation of:

Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.

It requires PyTorch and the CMU Multimodal Data SDK (https://github.com/A2Zadeh/CMU-MultimodalDataSDK) to function properly. The training data (CMU-MOSI dataset) will be downloaded automatically the first time you run the script.

The model is defined in `model.py`, and the training script is `train.py`. Here's a list of command-line arguments for `train.py`:

```
--dataset: default is 'MOSI'; other datasets are currently not really supported, so just ignore this option

--epochs: max number of epochs, default is 50

--batch_size: batch size, default is 32

--patience: specifies the early stopping condition, similar to that in Keras, default 20

--cuda: whether or not to use GPU, default False

--model_path: a string that specifies the location for storing trained models, default='models'

--max_len: max sequence length when preprocessing data, default=20
```

In a nutshell, you can train the model using the following command:

```
python train.py --epochs 100 --patience 10
```

The script starts with a randomly selected set of hyper-parameters. If you want to tune them, you can change them yourself in the script.
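The fusion step that gives TFN its name is an outer product of the unimodal embeddings, each extended with a constant 1 so that unimodal and bimodal interaction terms survive alongside the trimodal one. A minimal sketch of that operation, with assumed dimensions rather than the ones in `model.py`:

```python
import torch

def tensor_fusion(text, audio, video):
    """Outer-product fusion of three modality embeddings, TFN-style.
    Appending a 1 to each vector keeps uni- and bimodal terms in the tensor."""
    batch = text.size(0)
    one = torch.ones(batch, 1)
    t = torch.cat([text, one], dim=1)   # (batch, dt+1)
    a = torch.cat([audio, one], dim=1)  # (batch, da+1)
    v = torch.cat([video, one], dim=1)  # (batch, dv+1)
    fused = torch.einsum("bi,bj,bk->bijk", t, a, v)  # (batch, dt+1, da+1, dv+1)
    return fused.reshape(batch, -1)     # flattened for a downstream classifier

z = tensor_fusion(torch.randn(8, 32), torch.randn(8, 16), torch.randn(8, 16))
print(z.shape)  # torch.Size([8, 9537]), i.e. 33 * 17 * 17
```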
### Citation

If you use this code in your research, please cite our work:
```
@inproceedings{tensoremnlp17,
title={Tensor Fusion Network for Multimodal Sentiment Analysis},
author={Zadeh, Amir and Chen, Minghai and Poria, Soujanya and Cambria, Erik and Morency, Louis-Philippe},
booktitle={Empirical Methods in Natural Language Processing, EMNLP},
year={2017}
}
```
## Low-rank Multimodal Fusion

This is the repository for "Efficient Low-rank Multimodal Fusion with Modality-Specific Factors", Liu and Shen, et al., ACL 2018.

## Dependencies

Python 2.7 (now experimentally has Python 3.6+ support)

```
torch=0.3.1
sklearn
numpy
```

You can install the libraries via `python -m pip install -r requirements.txt`.

## Data for Experiments

The processed data for the experiments (CMU-MOSI, IEMOCAP, POM) can be downloaded here:

https://drive.google.com/open?id=1CixSaw3dpHESNG0CaCJV6KutdlANP_cr

To run the code, you should download the pickled datasets and put them in the `data` directory.

Note that there might be NaN values in the acoustic features; you can replace them with 0s.

## Training Your Model

To run the code for experiments (grid search), use the scripts `train_xxx.py`. They have some command-line arguments as listed here:

```
`--run_id`: a user-specified unique ID to ensure that saved results/models don't override each other.

`--epochs`: the maximum number of epochs in training. Since early stopping is used to prevent overfitting, in actual training the number of epochs could be less than what you specify here.

`--patience`: if the model performance does not improve in `--patience` many consecutive validation evaluations, the training will early-stop.

`output_dim`: output dimension of the model. The default value in each script should work.

`signiture`: an optional string that's added to the output file name. Intended to be used as some sort of comment.

`cuda`: whether or not to use GPU in training. If not specified, will use CPU.

`data_path`: the path to the data directory. Defaults to './data', but if you prefer storing the data elsewhere you can change this.

`model_path`: the path to the directory where models will be saved.

`output_path`: the path to the directory where the grid search results will be saved.

`max_len`: the maximum length of training data sequences. Longer/shorter sequences will be truncated/padded.

`emotion`: (exclusive to IEMOCAP) specifies which emotion category you want to train the model to predict. Can be 'happy', 'sad', 'angry', 'neutral'.
```

An example would be

`python train_mosi.py --run_id 19260817 --epochs 50 --patience 20 --output_dim 1 --signiture test_run_big_model`

## Hyperparameters

Some hyperparameters for reproducing the results in the paper are in the `hyperparams.txt` file.
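LMF avoids materializing the full TFN tensor above by factorizing the fusion weights into modality-specific low-rank factors: each modality is projected through its own rank-R factor and the projections are multiplied elementwise, then summed over the rank. An illustrative sketch under assumed dimensions, not this repository's implementation:

```python
import torch
import torch.nn as nn

class LowRankFusion(nn.Module):
    """Low-rank multimodal fusion: per-modality rank-R factors stand in
    for the full (dt+1)x(da+1)x(dv+1) fusion tensor (illustrative sketch)."""
    def __init__(self, dims, out_dim, rank=4):
        super().__init__()
        # One factor per modality: (rank, in_dim + 1, out_dim); +1 for the appended constant.
        self.factors = nn.ParameterList(
            [nn.Parameter(torch.randn(rank, d + 1, out_dim) * 0.1) for d in dims])

    def forward(self, *modalities):
        batch = modalities[0].size(0)
        one = torch.ones(batch, 1)
        fused = 1.0
        for x, factor in zip(modalities, self.factors):
            x1 = torch.cat([x, one], dim=1)  # (batch, d+1)
            fused = fused * torch.einsum("bd,rdo->bro", x1, factor)
        return fused.sum(dim=1)              # sum over rank -> (batch, out_dim)

lmf = LowRankFusion(dims=(32, 16, 16), out_dim=1, rank=4)
y = lmf(torch.randn(8, 32), torch.randn(8, 16), torch.randn(8, 16))
print(y.shape)  # torch.Size([8, 1])
```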
## Citation

```
@misc{liu2018efficient,
      title={Efficient Low-rank Multimodal Fusion with Modality-Specific Factors},
      author={Zhun Liu and Ying Shen and Varun Bharadhwaj Lakshminarasimhan and Paul Pu Liang and Amir Zadeh and Louis-Philippe Morency},
      year={2018},
      eprint={1806.00064},
      archivePrefix={arXiv},
      primaryClass={cs.AI}
}
```

# Datasets

# MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversation

## Note

:fire: :fire: :fire: For updated baselines, please visit this link: [conv-emotion](https://github.com/declare-lab/conv-emotion)

## Leaderboard

![](https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_1491652a74b4.png)

## Updates

10/10/2020: New paper and SOTA in Emotion Recognition in Conversations on the MELD dataset. Refer to the directory [COSMIC](https://github.com/declare-lab/conv-emotion/tree/master/COSMIC) for the code. Read the paper -- [COSMIC: COmmonSense knowledge for eMotion Identification in Conversations](https://arxiv.org/pdf/2010.02795.pdf).

22/05/2019: MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversation has been accepted as a full paper at ACL 2019. The updated paper can be found here - https://arxiv.org/pdf/1810.02508.pdf

22/05/2019: Dyadic MELD has been released. It can be used to test dyadic conversational models.

15/11/2018: The problem in train.tar.gz has been fixed.

## Research Works using MELD

Zhang, Yazhou, Qiuchi Li, Dawei Song, Peng Zhang, and Panpan Wang. "Quantum-Inspired Interactive Networks for Conversational Sentiment Analysis." IJCAI 2019.

Zhang, Dong, Liangqing Wu, Changlong Sun, Shoushan Li, Qiaoming Zhu, and Guodong Zhou. "Modeling both Context- and Speaker-Sensitive Dependence for Emotion Detection in Multi-speaker Conversations." IJCAI 2019.

Ghosal, Deepanway, Navonil Majumder, Soujanya Poria, Niyati Chhaya, and Alexander Gelbukh. "DialogueGCN: A Graph Convolutional Neural Network for Emotion Recognition in Conversation." EMNLP 2019.

----------------------------------------------------

## Introduction
The Multimodal EmotionLines Dataset (MELD) has been created by enhancing and extending the EmotionLines dataset. MELD contains the same dialogue instances available in EmotionLines, but it also encompasses the audio and visual modalities along with text. MELD has more than 1400 dialogues and 13000 utterances from the Friends TV series. Multiple speakers participated in the dialogues. Each utterance in a dialogue has been labeled with one of these seven emotions -- Anger, Disgust, Sadness, Joy, Neutral, Surprise and Fear. MELD also has sentiment (positive, negative and neutral) annotations for each utterance.
### Example Dialogue
![](https://oss.gittoolsai.com/images/declare-lab_multimodal-deep-learning_readme_fd3b5c995f4e.jpeg)

### Dataset Statistics
| Statistics                      | Train   | Dev     | Test    |
|---------------------------------|---------|---------|---------|
| # of modalities                 | {a,v,t} | {a,v,t} | {a,v,t} |
| # of unique words               | 10,643  | 2,384   | 4,361   |
| Avg. utterance length           | 8.03    | 7.99    | 8.28    |
| Max. utterance length           | 69      | 37      | 45      |
| Avg. # of emotions per dialogue | 3.30    | 3.35    | 3.24    |
| # of dialogues                  | 1039    | 114     | 280     |
| # of utterances                 | 9989    | 1109    | 2610    |
| # of speakers                   | 260     | 47      | 100     |
| # of emotion shifts             | 4003    | 427     | 1003    |
| Avg. duration of an utterance   | 3.59s   | 3.59s   | 3.58s   |

Please visit https://affective-meld.github.io for more details.

### Dataset Distribution

|          | Train | Dev | Test |
|----------|-------|-----|------|
| Anger    | 1109  | 153 | 345  |
| Disgust  | 271   | 22  | 68   |
| Fear     | 268   | 40  | 50   |
| Joy      | 1743  | 163 | 402  |
| Neutral  | 4710  | 470 | 1256 |
| Sadness  | 683   | 111 | 208  |
| Surprise | 1205  | 150 | 281  |

## Purpose
Multimodal data analysis exploits information from multiple parallel data channels for decision making. With the rapid growth of AI, multimodal emotion recognition has gained major research interest, primarily due to its potential applications in many challenging tasks, such as dialogue generation and multimodal interaction. A conversational emotion recognition system can be used to generate appropriate responses by analysing user emotions. Although numerous works have been carried out on multimodal emotion recognition, only a very few actually focus on understanding emotions in conversations, and that work is limited to dyadic conversation understanding, hence not scalable to emotion recognition in multi-party conversations with more than two participants. EmotionLines can be used as a resource for emotion recognition in text only, as it does not include data from other modalities such as visual and audio. At the same time, no multimodal multi-party conversational dataset has been available for emotion recognition research.

In this work, we have extended, improved, and further developed the EmotionLines dataset for the multimodal scenario. Emotion recognition in sequential turns poses several challenges, and context understanding is one of them. The emotion change and emotion flow in the sequence of turns in a dialogue make accurate context modelling a difficult task. In this dataset, as we have access to the multimodal data sources for each dialogue, we hypothesise that they will improve context modelling, thus benefiting the overall emotion recognition performance. This dataset can also be used to develop a multimodal affective dialogue system. IEMOCAP and SEMAINE are multimodal conversational datasets which contain emotion labels for each utterance. However, these datasets are dyadic in nature, which justifies the importance of our Multimodal-EmotionLines dataset. The other publicly available multimodal emotion and sentiment recognition datasets are MOSEI, MOSI, and MOUD. However, none of those datasets is conversational.
## Dataset Creation
The first step deals with finding the timestamp of every utterance in each of the dialogues present in the EmotionLines dataset. To accomplish this, we crawled through the subtitle files of all the episodes, which contain the beginning and end timestamps of the utterances. This process enabled us to obtain the season ID, episode ID, and timestamp of each utterance in the episode. We put two constraints on the timestamps: (a) timestamps of the utterances in a dialogue must be in increasing order, (b) all the utterances in a dialogue have to belong to the same episode and scene.
Constraining with these two conditions revealed that a few dialogues in EmotionLines consist of multiple natural dialogues. We filtered out those cases from the dataset. Because of this error-correction step, we have a different number of dialogues compared to EmotionLines. After obtaining the timestamp of each utterance, we extracted the corresponding audio-visual clips from the source episodes. Separately, we also extracted the audio content from those video clips. Finally, the dataset contains the visual, audio, and textual modalities for each dialogue.

## Paper
The paper explaining this dataset can be found here - https://arxiv.org/pdf/1810.02508.pdf

## Download the data
Please visit - http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz to download the raw data. Data are stored in .mp4 format and can be found in XXX.tar.gz files. Annotations can be found at https://github.com/declare-lab/MELD/tree/master/data/MELD.

## Description of the .csv files

### Column Specification
| Column Name  | Description |
|--------------|-------------|
| Sr No.       | Serial numbers of the utterances, mainly for referencing the utterances in case of different versions or multiple copies with different subsets |
| Utterance    | Individual utterances from EmotionLines as a string. |
| Speaker      | Name of the speaker associated with the utterance. |
| Emotion      | The emotion (neutral, joy, sadness, anger, surprise, fear, disgust) expressed by the speaker in the utterance. |
| Sentiment    | The sentiment (positive, neutral, negative) expressed by the speaker in the utterance. |
| Dialogue_ID  | The index of the dialogue, starting from 0. |
| Utterance_ID | The index of the particular utterance in the dialogue, starting from 0. |
| Season       | The season no. of the Friends TV show to which a particular utterance belongs. |
| Episode      | The episode no. of the Friends TV show in a particular season to which the utterance belongs. |
| StartTime    | The starting time of the utterance in the given episode in the format 'hh:mm:ss,ms'. |
| EndTime      | The ending time of the utterance in the given episode in the format 'hh:mm:ss,ms'. |

### The files
- /data/MELD/train_sent_emo.csv - contains the utterances in the training set along with Sentiment and Emotion labels.
- /data/MELD/dev_sent_emo.csv - contains the utterances in the dev set along with Sentiment and Emotion labels.
- /data/MELD/test_sent_emo.csv - contains the utterances in the test set along with Sentiment and Emotion labels.
- /data/MELD_Dyadic/train_sent_emo_dya.csv - contains the utterances in the training set of the dyadic variant of MELD along with Sentiment and Emotion labels. To get the video clip corresponding to a particular utterance, refer to the columns 'Old_Dialogue_ID' and 'Old_Utterance_ID'.
- /data/MELD_Dyadic/dev_sent_emo_dya.csv - contains the utterances in the dev set of the dyadic variant along with Sentiment and Emotion labels. To get the video clip corresponding to a particular utterance, refer to the columns 'Old_Dialogue_ID' and 'Old_Utterance_ID'.
- /data/MELD_Dyadic/test_sent_emo_dya.csv - contains the utterances in the test set of the dyadic variant along with Sentiment and Emotion labels. To get the video clip corresponding to a particular utterance, refer to the columns 'Old_Dialogue_ID' and 'Old_Utterance_ID'.
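A quick way to get oriented in these files; a sketch assuming pandas is installed and the repo's /data/MELD/ layout, with column names taken from the specification above:

```python
import pandas as pd

# Peek at the training split; columns follow the specification above.
df = pd.read_csv("data/MELD/train_sent_emo.csv")
print(df[["Speaker", "Utterance", "Emotion", "Sentiment"]].head())

# Utterances group into dialogues via Dialogue_ID / Utterance_ID.
dialogue0 = df[df["Dialogue_ID"] == 0].sort_values("Utterance_ID")
print(len(dialogue0), "utterances in dialogue 0")
```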
## Description of Pickle Files
There are 13 pickle files comprising the data and features used for training the baseline models. Following is a brief description of each of the pickle files.

### Data pickle files:

* **data_emotion.p, data_sentiment.p** - These are the primary data files, which contain the following elements stored as a list.
    * *data*: It consists of a dictionary with the following key/value pairs.
        * *text*: original sentence.
        * *split*: train/val/test - denotes which split the tuple belongs to.
        * *y*: label of the sentence.
        * *dialog*: ID of the dialog the utterance belongs to.
        * *utterance*: utterance number of the dialog ID.
        * *num_words*: number of words in the utterance.
    * W: glove embedding matrix
    * vocab: the vocabulary of the dataset
    * word_idx_map: mapping of each word from vocab to its index in W.
    * max_sentence_length: maximum number of tokens in an utterance in the dataset.
    * label_index: mapping of each label (emotion or sentiment) to its assigned index, e.g. label_index['neutral']=0
```python
import pickle
data, W, vocab, word_idx_map, max_sentence_length, label_index = pickle.load(open(filepath, 'rb'))
```

* **text_glove_average_emotion.pkl, text_glove_average_sentiment.pkl** - These consist of 300-dimensional textual feature vectors for each utterance, initialized as the average of the GloVe embeddings of all tokens per utterance. Each is a list comprising 3 dictionaries for the train, val and test sets, with each dictionary indexed in the format *dia_utt*, where dia is the dialogue id and utt is the utterance id. E.g., train_text_avg_emb['0_0'].shape = (300, )
```python
import pickle
train_text_avg_emb, val_text_avg_emb, test_text_avg_emb = pickle.load(open(filepath, 'rb'))
```

* **audio_embeddings_feature_selection_emotion.pkl, audio_embeddings_feature_selection_sentiment.pkl** - These consist of 1611/1422-dimensional audio feature vectors for each utterance, trained for emotion/sentiment classification. These features are originally extracted with [openSMILE](https://www.audeering.com/opensmile/) and then followed by L2-based feature selection using SVM. Each is a list comprising 3 dictionaries for the train, val and test sets, with each dictionary indexed in the format *dia_utt*, where dia is the dialogue id and utt is the utterance id. E.g., train_audio_emb['0_0'].shape = (1611, ) or (1422, )
```python
import pickle
train_audio_emb, val_audio_emb, test_audio_emb = pickle.load(open(filepath, 'rb'))
```

### Model output pickle files:

* **text_glove_CNN_emotion.pkl, text_glove_CNN_sentiment.pkl** - These consist of 100-dimensional textual features obtained after training a CNN-based [network](https://github.com/dennybritz/cnn-text-classification-tf) for emotion/sentiment classification. Each is a list comprising 3 dictionaries for the train, val and test sets, with each dictionary indexed in the format *dia_utt*, where dia is the dialogue id and utt is the utterance id. E.g., train_text_CNN_emb['0_0'].shape = (100, )
```python
import pickle
train_text_CNN_emb, val_text_CNN_emb, test_text_CNN_emb = pickle.load(open(filepath, 'rb'))
```
* **text_emotion.pkl, text_sentiment.pkl** - These files contain the contextual feature representations produced by the unimodal bcLSTM model. They consist of a 600-dimensional textual feature vector for each utterance for emotion/sentiment classification, stored as a dictionary indexed by dialogue id. Each is a list comprising 3 dictionaries for the train, val and test sets. E.g., train_text_emb['0'].shape = (33, 600), where 33 is the maximum number of utterances in a dialogue. Dialogues with fewer utterances are padded with zero-vectors.
```python
import pickle
train_text_emb, val_text_emb, test_text_emb = pickle.load(open(filepath, 'rb'))
```

* **audio_emotion.pkl, audio_sentiment.pkl** - These files contain the contextual feature representations produced by the unimodal bcLSTM model. They consist of a 300/600-dimensional audio feature vector for each utterance for emotion/sentiment classification, stored as a dictionary indexed by dialogue id. Each is a list comprising 3 dictionaries for the train, val and test sets. E.g., train_audio_emb['0'].shape = (33, 300) or (33, 600), where 33 is the maximum number of utterances in a dialogue. Dialogues with fewer utterances are padded with zero-vectors.
```python
import pickle
train_audio_emb, val_audio_emb, test_audio_emb = pickle.load(open(filepath, 'rb'))
```

* **bimodal_sentiment.pkl** - This file contains the contextual feature representations produced by the bimodal bcLSTM model. It consists of a 600-dimensional bimodal (text, audio) feature vector for each utterance for sentiment classification, stored as a dictionary indexed by dialogue id. It is a list comprising 3 dictionaries for the train, val and test sets. E.g., train_bimodal_emb['0'].shape = (33, 600), where 33 is the maximum number of utterances in a dialogue. Dialogues with fewer utterances are padded with zero-vectors.
```python
import pickle
train_bimodal_emb, val_bimodal_emb, test_bimodal_emb = pickle.load(open(filepath, 'rb'))
```

## Description of Raw Data
- There are 3 folders (.tar.gz files) - train, dev and test - each of which corresponds to video clips of the utterances in the 3 .csv files.
- In any folder, each video clip in the raw data corresponds to one utterance in the corresponding .csv file. The video clips are named in the format: diaX1\_uttX2.mp4, where X1 is the Dialogue\_ID and X2 is the Utterance_ID as provided in the corresponding .csv file, denoting the particular utterance.
- For example, consider the video clip **dia6_utt1.mp4** in **train.tar.gz**. The corresponding utterance for this video clip will be in the file **train_sent_emo.csv** with **Dialogue_ID=6** and **Utterance_ID=1**, which is *'You liked it? You really liked it?'*

## Reading the Data
There are 2 python scripts provided in './utils/':
- read_meld.py - displays the path of the video file corresponding to an utterance in the .csv file from MELD.
- read_emorynlp - displays the path of the video file corresponding to an utterance in the .csv file from the Multimodal EmoryNLP Emotion Detection dataset.

## Labelling
For experimentation, all the labels are represented as one-hot encodings, the indices for which are as follows:
- **Emotion** - {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}. Therefore, the label corresponding to the emotion *'joy'* would be [0., 0., 0., 0., 1., 0., 0.]
- **Sentiment** - {'neutral': 0, 'positive': 1, 'negative': 2}. Therefore, the label corresponding to the sentiment *'positive'* would be [0., 1., 0.]
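A minimal sketch of building those one-hot vectors from the label strings, using only numpy; the index map is the emotion mapping listed above:

```python
import numpy as np

# Emotion index map exactly as specified in the Labelling section.
EMOTION_IDX = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3,
               'joy': 4, 'disgust': 5, 'anger': 6}

def one_hot(label, index_map):
    """Turn a label string into its one-hot vector under the given index map."""
    vec = np.zeros(len(index_map))
    vec[index_map[label]] = 1.0
    return vec

print(one_hot('joy', EMOTION_IDX))  # [0. 0. 0. 0. 1. 0. 0.]
```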
## Class Weights
For the baseline on emotion classification, the following class weights were used. The indexing is the same as mentioned above.
Class Weights: [4.0, 15.0, 15.0, 3.0, 1.0, 6.0, 3.0].

## Run the baseline

Please follow these steps to run the baseline -

1. Download the features from [here](http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Features.Models.tar.gz).
2. Copy these features into `./data/pickles/`
3. To train/test the baseline model, run the file `baseline/baseline.py` as follows:
    - `python baseline.py -classify [Sentiment|Emotion] -modality [text|audio|bimodal] [-train|-test]`
    - example command to train the text unimodal model for sentiment classification: `python baseline.py -classify Sentiment -modality text -train`
    - use `python baseline.py -h` to get help text for the parameters.
4. For pre-trained models, download the model weights from [here](http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Features.Models.tar.gz) and place the pickle files inside `./data/models/`.

## Citation
Please cite the following papers if you find this dataset useful in your research:

S. Poria, D. Hazarika, N. Majumder, G. Naik, E. Cambria, R. Mihalcea. MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversation. ACL 2019.

Chen, S.Y., Hsu, C.C., Kuo, C.C. and Ku, L.W. EmotionLines: An Emotion Corpus of Multi-Party Conversations. arXiv preprint arXiv:1802.08379 (2018).

# Multimodal EmoryNLP Emotion Recognition Dataset
----------------------------------------------------
## Description
The Multimodal EmoryNLP Emotion Detection Dataset has been created by enhancing and extending the EmoryNLP Emotion Detection dataset. It contains the same dialogue instances available in the EmoryNLP Emotion Detection dataset, but it also encompasses the audio and visual modalities along with text. There are more than 800 dialogues and 9000 utterances from the Friends TV series in the multimodal EmoryNLP dataset. Multiple speakers participated in the dialogues. Each utterance in a dialogue has been labeled with one of these seven emotions -- Neutral, Joyful, Peaceful, Powerful, Scared, Mad and Sad. The annotations are borrowed from the original dataset.

### Dataset Statistics
| Statistics                    | Train   | Dev     | Test    |
|-------------------------------|---------|---------|---------|
| # of modalities               | {a,v,t} | {a,v,t} | {a,v,t} |
| # of unique words             | 9,744   | 2,123   | 2,345   |
| Avg. utterance length         | 7.86    | 6.97    | 7.79    |
| Max. utterance length         | 78      | 60      | 61      |
| Avg. # of emotions per scene  | 4.10    | 4.00    | 4.40    |
| # of dialogues                | 659     | 89      | 79      |
| # of utterances               | 7551    | 954     | 984     |
| # of speakers                 | 250     | 46      | 48      |
| # of emotion shifts           | 4596    | 575     | 653     |
| Avg. duration of an utterance | 5.55s   | 5.46s   | 5.27s   |
duration of an utterance   | 5.55s   | 5.46s   | 5.27s   |\n\n### Dataset Distribution\n\n|          | Train | Dev | Test |\n|----------|-------|-----|------|\n| Joyful   | 1677  | 205 | 217  |\n| Mad      | 785   | 97  | 86   |\n| Neutral  | 2485  | 322 | 288  |\n| Peaceful | 638   | 82  | 111  |\n| Powerful | 551   | 70  | 96   |\n| Sad      | 474   | 51  | 70   |\n| Scared   | 941   | 127 | 116  |\n\n## Data\nVideo clips of this dataset can be download from [this link](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1UQduKw8QTqGf3RafxrTDfI1NyInYK3fr\u002Fview?usp=sharing).\nThe annotation files can be found in https:\u002F\u002Fgithub.com\u002FSenticNet\u002FMELD\u002Ftree\u002Fmaster\u002Fdata\u002Femorynlp. There are 3 .csv files. Each entry in the first column of these csv files contain an utterance whose corresponding video clip can be found [here](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1UQduKw8QTqGf3RafxrTDfI1NyInYK3fr\u002Fview?usp=sharing). Each utterance and its video clip is indexed by the season no., episode no., scene id and utterance id. For example, **sea1\\_ep2\\_sc6\\_utt3.mp4** implies the clip corresponds to the utterance with season no. 1, episode no. 2, scene\\_id 6 and utterance\\_id 3. A scene is simply a dialogue. This indexing is consistent with the original dataset. The .csv files and the video files are divided into the train, validation and test set in accordance with the original dataset. Annotations have been directly borrowed from the original EmoryNLP dataset (Zahiri et al. (2018)).\n\n### Description of the .csv files\n\n#### Column Specification\n| Column Name  | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |\n|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| Utterance    | Individual utterances from EmoryNLP as a string.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| Speaker      | Name of the speaker associated with the utterance.                                                                                                                                                                                                
### Description of the .csv files\n\n#### Column Specification\n| Column Name  | Description |\n|--------------|-------------|\n| Utterance    | Individual utterances from EmoryNLP as a string. |\n| Speaker      | Name of the speaker associated with the utterance. |\n| Emotion      | The emotion (Neutral, Joyful, Peaceful, Powerful, Scared, Mad and Sad) expressed by the speaker in the utterance. |\n| Scene_ID     | The index of the dialogue, starting from 0. |\n| Utterance_ID | The index of the particular utterance in the dialogue, starting from 0. |\n| Season       | The season no. of the Friends TV show to which a particular utterance belongs. |\n| Episode      | The episode no. within a particular season to which the utterance belongs. |\n| StartTime    | The starting time of the utterance in the given episode, in the format 'hh:mm:ss,ms'. |\n| EndTime      | The ending time of the utterance in the given episode, in the format 'hh:mm:ss,ms'. |\n\n***Note***: There are a few utterances for which we were not able to find the start and end times due to inconsistencies in the subtitles. Such utterances have been omitted from the dataset. However, we encourage users to locate the corresponding utterances in the original dataset and generate video clips for them.\n## Citation\nPlease cite the following papers if you find this dataset useful in your research:\n\nS. Zahiri and J. D. Choi. Emotion Detection on TV Show Transcripts with Sequence-based Convolutional Neural Networks. In The AAAI Workshop on Affective Content Analysis, AFFCON'18, 2018.\n\nS. Poria, D. Hazarika, N. Majumder, G. Naik, E. Cambria, R. Mihalcea. MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversation. ACL 2019.\n\n# MUStARD: Multimodal Sarcasm Detection Dataset\n\nThis repository contains the dataset and code for our ACL 2019 paper:\n\n[Towards Multimodal Sarcasm Detection (An _Obviously_ Perfect Paper)](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1455\u002F)\n\nWe release the MUStARD dataset, a multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular TV shows including *Friends*, *The Golden Girls*, *The Big Bang Theory*, and *Sarcasmaholics Anonymous*. MUStARD consists of audiovisual utterances annotated with sarcasm labels. Each utterance is accompanied by its context, which provides additional information on the scenario where the utterance occurs.\n\n## Example Instance\n\n![Example instance](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_647c90660a10.jpg)\n\n\u003Cp align=\"center\"> Example sarcastic utterance from the dataset along with its context and transcript. \u003C\u002Fp>\n\n## Raw Videos\n\nWe provide a [Google Drive folder with the raw video clips](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1i9ixalVcXskA5_BkNnbR60sqJqvGyi6E\u002Fview?usp=sharing), including both the utterances and their respective context.\n\n## Data Format\n\nThe annotations and transcripts of the audiovisual clips are available at [`data\u002Fsarcasm_data.json`](data\u002Fsarcasm_data.json). Each instance in the JSON file is allotted one identifier (e.g. \"1\_60\") which is a dictionary with the following items:\n\n| Key | Value |\n|-----|-------|\n| `utterance` | The text of the target utterance to classify. |\n| `speaker` | Speaker of the target utterance. |\n| `context` | List of utterances (in chronological order) preceding the target utterance. |\n| `context_speakers` | Respective speakers of the context utterances. |\n| `sarcasm` | Binary label for the sarcasm tag. |\n\nExample format in JSON:\n\n```json\n{\n  \"1_60\": {\n    \"utterance\": \"It's just a privilege to watch your mind at work.\",\n    \"speaker\": \"SHELDON\",\n    \"context\": [\n      \"I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.\",\n      \"My apologies. What's your plan?\"\n    ],\n    \"context_speakers\": [\n      \"LEONARD\",\n      \"SHELDON\"\n    ],\n    \"sarcasm\": true\n  }\n}\n```
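Loading the annotations needs only the standard library; a minimal sketch using just the documented keys (the variable names are ours):\n\n```python\nimport json\n\n# Each top-level key (e.g. \"1_60\") maps to one annotated instance.\nwith open(\"data\u002Fsarcasm_data.json\") as f:\n    data = json.load(f)\n\nfor clip_id, inst in data.items():\n    label = \"sarcastic\" if inst[\"sarcasm\"] else \"non-sarcastic\"\n    # Context utterances precede the target utterance in chronological order.\n    print(f\"{clip_id}: {inst['speaker']} ({label}), \"\n          f\"{len(inst['context'])} context utterance(s)\")\n```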
\n## Citation\n\nPlease cite the following paper if you find this dataset useful in your research:\n\n```bibtex\n@inproceedings{mustard,\n    title = \"Towards Multimodal Sarcasm Detection (An  \_Obviously\_ Perfect Paper)\",\n    author = \"Castro, Santiago  and\n      Hazarika, Devamanyu  and\n      P{\'e}rez-Rosas, Ver{\'o}nica  and\n      Zimmermann, Roger  and\n      Mihalcea, Rada  and\n      Poria, Soujanya\",\n    booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)\",\n    month = \"7\",\n    year = \"2019\",\n    address = \"Florence, Italy\",\n    publisher = \"Association for Computational Linguistics\",\n}\n```\n\n## Run the code\n\n1. Set up an environment with Conda:\n\n    ```bash\n    conda env create -f environment.yml\n    conda activate mustard\n    python -c \"import nltk; nltk.download('punkt')\"\n    ```\n\n2. Download the [Common Crawl pretrained GloVe word vectors (300d, 840B tokens)](http:\u002F\u002Fnlp.stanford.edu\u002Fdata\u002Fglove.840B.300d.zip) to a location of your choice.\n\n3. [Download the pre-extracted visual features](https:\u002F\u002Fdrive.google.com\u002Fopen?id=1Ff1WDObGKqpfbvy7-H1mD8YWvBS-Kf26) to the `data\u002F` folder (so that `data\u002Ffeatures\u002F` contains the folders `context_final\u002F` and `utterances_final\u002F` with the features), or [extract the visual features](visual) yourself.\n\n4. [Download the pre-extracted BERT features](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1GYv74vN80iX_IkEmkJhkjDRGxLvraWuZ\u002Fview?usp=sharing) and place the two files directly under the folder `data\u002F` (so they are `data\u002Fbert-output.jsonl` and `data\u002Fbert-output-context.jsonl`), or extract the BERT features in another environment with Python 2 and TensorFlow 1.11.0, following [\"Using BERT to extract fixed feature vectors (like ELMo)\" from BERT's repo](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Ftree\u002Fd66a146741588fb208450bde15aa7db143baaa69#using-bert-to-extract-fixed-feature-vectors-like-elmo) and running:\n\n    ```bash\n    # Download BERT-base uncased in some dir:\n    wget https:\u002F\u002Fstorage.googleapis.com\u002Fbert_models\u002F2018_10_18\u002Funcased_L-12_H-768_A-12.zip\n    # Then put the location in this var:\n    BERT_BASE_DIR=...\n    \n    python extract_features.py \\\n      --input_file=data\u002Fbert-input.txt \\\n      --output_file=data\u002Fbert-output.jsonl \\\n      --vocab_file=${BERT_BASE_DIR}\u002Fvocab.txt \\\n      --bert_config_file=${BERT_BASE_DIR}\u002Fbert_config.json \\\n      --init_checkpoint=${BERT_BASE_DIR}\u002Fbert_model.ckpt \\\n      --layers=-1,-2,-3,-4 \\\n      --max_seq_length=128 \\\n      --batch_size=8\n    ```\n\n5. Check the options in `python train_svm.py -h` to select a run configuration (or modify [`config.py`](config.py)) and then run it:\n\n    ```bash\n    python train_svm.py  # add the flags you want\n    ```\n\n6. Evaluation: we evaluate using the weighted F-score metric in a 5-fold cross-validation scheme. The fold indices are available at `data\u002Fsplit_incides.p`. Refer to our baseline scripts for more details.
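For reference, evaluation in that scheme could be wired up as in the sketch below. The internal layout of the fold-index pickle is an assumption on our part (a list of (train_indices, test_indices) pairs), and `train_and_predict` is a placeholder for the SVM baseline in `train_svm.py`:\n\n```python\nimport pickle\n\nfrom sklearn.metrics import f1_score\n\ndef train_and_predict(train_idx, test_idx):\n    \"\"\"Placeholder: plug in the SVM baseline from train_svm.py here.\"\"\"\n    raise NotImplementedError\n\n# Assumed layout: one (train_indices, test_indices) pair per fold;\n# the path is the one given above.\nwith open(\"data\u002Fsplit_incides.p\", \"rb\") as f:\n    folds = pickle.load(f)\n\nscores = []\nfor train_idx, test_idx in folds:\n    y_true, y_pred = train_and_predict(train_idx, test_idx)\n    scores.append(f1_score(y_true, y_pred, average=\"weighted\"))\n\nprint(\"Weighted F-score (5-fold):\", sum(scores) \u002F len(scores))\n```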
# M2H2: A Multimodal Multiparty Hindi Dataset For Humor Recognition in Conversations\n\n:zap: :zap: :zap: Baseline code will be released soon!\n\n:fire::fire::fire: [Read the paper](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.01260)\n\nThe M2H2 dataset is compiled from the famous TV show \"Shrimaan Shrimati Phir Se\" (4.46 hours in total) and annotated manually. We group the samples (utterances) into scenes based on their context. Each utterance in each scene carries a label indicating whether that utterance is humorous, i.e., humor or non-humor. In addition, each utterance is annotated with its speaker and listener information. In multiparty conversation, listener identification poses a great challenge. In our dataset, we define the listener as the party in the conversation to whom the speaker is replying. Each utterance in each scene is coupled with its context utterances, which are the preceding turns by the speakers participating in the conversation. The dataset consists of multi-party conversations, which are more challenging to classify than their dyadic variants.\n\n# Data Format\n\n![Alt text](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_f9d63f69ce49.png)\n\n## Text Data\n\n:fire::fire::fire: ***Each ``Raw-Text\u002FEp-NUMBER.tsv`` file acts as a master annotation file: it contains not only the textual data but also the other metadata described below, including the manually annotated label of each utterance. Using the episode id and scene id, one can map the utterances in the ``Raw-Text`` folder to the corresponding audio and visual segments in ``Raw-Audio`` and ``Raw-Visual``, which yields the multimodal data. The ``Label`` column in the TSV files contains the desired manually annotated label for each utterance.***\n\nThe text data are stored in TSV format. Each file is named ``Raw-Text\u002FEp-NUMBER.tsv``, where ``NUMBER`` is the episode number that should be used to map the file to the corresponding audio and visual segments. The text data contain the following fields:\n\n```\nScenes: The scene id. It matches the corresponding audio and visual segments.\nSI. No.: Utterance number.\nStart_time: Start time of the utterance in the video.\nEnd_time: End time of the utterance in the video.\nUtterance: The spoken utterance.\nLabel: The annotated label of the utterance, either humor or non-humor.\nSpeaker: Speaker and listener information in the form \"Speaker_name,utterance_id\", e.g., \"Dilruba,u3\", meaning the speaker is Dilruba and they are responding to utterance no. 3. This is particularly useful for resolving coreferences in a multiparty conversation.\n```
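Read with pandas, one episode file can be unpacked into its conversational structure; a small sketch assuming the field names exactly as listed above (episode 22 is just a hypothetical example number):\n\n```python\nimport pandas as pd\n\n# Any Raw-Text\u002FEp-NUMBER.tsv works; episode 22 is an example.\ndf = pd.read_csv(\"Raw-Text\u002FEp-22.tsv\", sep=\"\t\")\n\n# The Speaker column packs \"Speaker_name,utterance_id\", e.g. \"Dilruba,u3\".\ndf[[\"speaker_name\", \"replies_to\"]] = df[\"Speaker\"].str.split(\",\", n=1, expand=True)\n\nhumorous = df[df[\"Label\"] == \"humor\"]\nprint(humorous[[\"Scenes\", \"Utterance\", \"speaker_name\", \"replies_to\"]].head())\n```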
## Audio Data\n\nEvery episode has a dedicated folder, e.g., ``Raw-Audio\u002F22\u002F`` contains all the annotated audio samples for Episode no. 22.\n\nFor every episode, each scene has a dedicated folder, e.g., ``Raw-Audio\u002F22\u002FScene_1`` contains all the annotated audio samples for Episode no. 22, Scene 1.\n\n## Visual Data\n\nEvery episode has a dedicated folder, e.g., ``Raw-Visual\u002F22\u002F`` contains all the annotated visual samples for Episode no. 22.\n\nFor every episode, each scene has a dedicated folder, e.g., ``Raw-Visual\u002F22\u002FScene_1`` contains all the annotated visual samples for Episode no. 22, Scene 1.
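Combining the three modalities then reduces to path construction; a hypothetical helper following the directory layout described above:\n\n```python\nfrom pathlib import Path\n\ndef scene_paths(episode, scene, root=\".\"):\n    \"\"\"Return (text_tsv, audio_dir, visual_dir) for one episode\u002Fscene.\"\"\"\n    root = Path(root)\n    return (\n        root \u002F f\"Raw-Text\u002FEp-{episode}.tsv\",\n        root \u002F f\"Raw-Audio\u002F{episode}\u002FScene_{scene}\",\n        root \u002F f\"Raw-Visual\u002F{episode}\u002FScene_{scene}\",\n    )\n\n# Episode 22, Scene 1, as in the examples above.\ntext_tsv, audio_dir, visual_dir = scene_paths(22, 1)\nprint(text_tsv, audio_dir, visual_dir)\n```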
# Baselines\n\n:zap: :zap: :zap: Baseline code will be released soon!\n\n# Citation\n\nDushyant Singh Chauhan, Gopendra Vikram Singh, Navonil Majumder, Amir Zadeh, Asif Ekbal, Pushpak Bhattacharyya, Louis-Philippe Morency, and Soujanya Poria. 2021. [M2H2: A Multimodal Multiparty Hindi Dataset For Humor Recognition in Conversations. In ICMI ’21: 23rd ACM International Conference on Multimodal Interaction](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.01260), Montreal, Canada. ACM, New York, NY, USA, 5 pages.\n\n","# 多模态深度学习\n\n🎆 🎆 🎆 宣布推出多模态深度学习仓库，其中包含多种基于深度学习的模型实现，用于解决不同的多模态问题，例如多模态表示学习、面向下游任务的多模态融合（如多模态情感分析）。\n\n```对于那些询问如何提取视觉和音频特征的人，请查看这里：https:\u002F\u002Fgithub.com\u002Fsoujanyaporia\u002FMUStARD```\n\n- [模型](#models)\n  * [多模态信息最大化（PyTorch）](#improving-multimodal-fusion-with-hierarchical-mutual-information-maximization-for-multimodal-sentiment-analysis)\n  * [MISA（PyTorch）](#MISA-Modality--Invariant-and--Specific-Representations-for-Multimodal-Sentiment-Analysis)\n  * [BBFN（PyTorch）](#Bi-Bimodal-Modality-Fusion-for-Correlation-Controlled-Multimodal-Sentiment-Analysis)\n  * [Hfusion（Keras）](#hfusion)\n  * [基于上下文注意力的LSTM（TensorFlow）](#Attention-based-multimodal-fusion-for-sentiment-analysis)\n  * [bc-LSTM（Keras）](#Context--Dependent-Sentiment-Analysis-in-User-Generated-Videos)\n  * [上下文多模态融合（Keras）](#Contextual-Inter--modal-Attention-for-Multimodal-Sentiment-Analysis)\n  * [张量融合网络（PyTorch）](tensor-fusion-network-tfn)\n  * [低秩多模态融合（PyTorch）](Low-rank-Multimodal-Fusion)\n\n- [数据集](#datasets)\n  * [MELD](#meld-a-multimodal-multi-party-dataset-for-emotion-recognition-in-conversation)\n  * [MUStARD](#MUStARD-Multimodal-Sarcasm-Detection-Dataset)\n  * [M2H2](#M2H2-A-Multimodal-Multiparty-Hindi-Dataset-For-Humor-Recognition-in-Conversations)\n\n# 模型\n\n## 通过层次化互信息最大化改进多模态融合以进行多模态情感分析\n\n本仓库包含论文《通过层次化互信息最大化改进多模态融合以进行多模态情感分析》的官方实现代码，该论文已被**EMNLP 2021**接收。\n\n:fire: 如果您对我们DeCLaRe实验室的其他多模态研究感兴趣，欢迎访问[聚类仓库](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fmultimodal-deep-learning)。\n\n## 简介\n多模态信息最大化（MMIM）通过两级互信息（MI）最大化来综合多模态输入的融合结果。我们使用Barber-Agakov下界和对比预测编码作为需要最大化的目标函数。为了便于计算，我们设计了一个带有历史数据存储的熵估计模块，以帮助计算BA下界并加速训练过程。\n\n![Alt text](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002FMultimodal-Infomax\u002Fblob\u002Fmain\u002Fimg\u002FModelFigSingle.png?raw=true \"模型\")\n\n## 使用方法\n1. 从[Google Drive]()或[百度网盘]()下载CMU-MOSI和CMU-MOSEI数据集，并将其放置在`Multimodal-Infomax\u002Fdatasets`文件夹下。\n2. 设置环境（需先安装conda）\n```\nconda env create -f environment.yml\nconda activate MMIM\n```\n3. 
开始训练\n```\npython main.py --dataset mosi --contrast\n```\n\n## 引用\n如果您认为我们的工作对您的研究有帮助，请引用我们的论文：\n```bibtex\n@article{han2021improving,\n  title={Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis},\n  author={Han, Wei and Chen, Hui and Poria, Soujanya},\n  journal={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)},\n  year={2021}\n}\n```\n\n## 联系方式\n如有任何问题，请随时通过[henryhan88888@gmail.com](mailto:henryhan88888@gmail.com)与我联系。\n\n## MISA：面向多模态情感分析的模态不变与特定表示\n这是ACM MM 2020会议论文《MISA：面向多模态情感分析的模态不变与特定表示》的代码实现（论文链接：https:\u002F\u002Farxiv.org\u002Fpdf\u002F2005.03545.pdf）。\n\n\u003Cp align=\"center\">\n  \u003Cimg width=\"600\" src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_779aef8e9748.png\">\n\u003C\u002Fp>\n\n### 环境设置\n\n我们使用conda环境。\n\n```\nconda env create -f environment.yml\nconda activate misa-code\n```\n\n### 数据下载\n\n- 安装[CMU多模态SDK](https:\u002F\u002Fgithub.com\u002FA2Zadeh\u002FCMU-MultimodalSDK)。确保可以执行```from mmsdk import mmdatasdk```。\n- 方案一：下载[预计算好的划分](https:\u002F\u002Fdrive.google.com\u002Fdrive\u002Ffolders\u002F1IBwWNH0XjPnZWaAlP1U2tIJH6Rb3noMI?usp=sharing)，并将内容放入`datasets`文件夹中。\n- 方案二：从MMSDK下载数据并重新创建划分。为此，只需按照以下说明运行代码。\n\n### 运行代码\n\n1. ```cd src```\n2. 将```config.py```中的```word_emb_path```设置为[glove词向量文件](http:\u002F\u002Fnlp.stanford.edu\u002Fdata\u002Fglove.840B.300d.zip)。\n3. 将```sdk_dir```设置为CMU-MultimodalSDK的路径。\n4. ```python train.py --data mosi```。对于其他数据集，将```mosi```替换为```mosei```或```ur_funny```。\n\n### 引用\n\n如果本文对您的研究有所帮助，请引用我们：\n\n```\n@article{hazarika2020misa,\n  title={MISA: Modality-Invariant and-Specific Representations for Multimodal Sentiment Analysis},\n  author={Hazarika, Devamanyu and Zimmermann, Roger and Poria, Soujanya},\n  journal={arXiv preprint arXiv:2005.03545},\n  year={2020}\n}\n```\n\n### 联系方式\n\n如有任何问题，请发送邮件至[hazarika@comp.nus.edu.sg](mailto:hazarika@comp.nus.edu.sg)。\n\n## 双双模态融合用于相关性控制的多模态情感分析\n\n本仓库包含论文《双双模态融合用于相关性控制的多模态情感分析》（ICMI 2021）的官方实现。\n\n### 模型架构\n\n我们的双双模态融合网络（BBFN）概述。它通过强制每对模态相互补充，学习两组与文本相关的表示：文本-声学和文本-视觉。最后，将这四组（两对）主要表示拼接在一起，生成最终的预测。\n\n![Alt text](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_bb8462098e9a.png)\n\n单个互补层：左右两条相同的管道分别传播主模态，并在正则化和门控控制下将其与互补模态融合。\n\n![Alt text](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_bbe895a49416.png)\n\n### 结果\n\nCMU-MOSI和CMU-MOSEI数据集测试集上的结果。符号说明：△表示相应行的结果摘自先前的论文；†表示结果是根据公开的源代码和适用的超参数设置复现的；‡表示结果经过配对t检验，p值小于0.05，表明相比当前最先进的模型MISA有显著提升。\n\n![Alt text](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_0e9ced7f82f5.png)\n\n### 使用方法\n1. 设置 conda 环境\n```\nconda env create -f environment.yml\nconda activate BBFN\n```\n\n2. 安装 [CMU 多模态 SDK](https:\u002F\u002Fgithub.com\u002FA2Zadeh\u002FCMU-MultimodalSDK)\n\n3. 将 `src\u002Fconfig.py` 中的 `sdk_dir` 设置为 CMU-MultimodalSDK 的路径\n\n4. 
训练模型\n```\ncd src\npython main.py --dataset \u003C数据集名称> --data_path \u003C数据集路径>\n```\n我们提供了一个脚本 `scripts\u002Frun.sh` 供您参考。\n\n### 引用\n如果您认为我们的工作对您的研究有帮助，请引用我们的论文：\n```bibtex\n@article{han2021bi,\n  title={Bi-Bimodal Modality Fusion for Correlation-Controlled Multimodal Sentiment Analysis},\n  author={Han, Wei and Chen, Hui and Gelbukh, Alexander and Zadeh, Amir and Morency, Louis-Philippe and Poria, Soujanya},\n  journal={ICMI 2021},\n  year={2021}\n}\n```\n\n### 联系方式\n如有任何问题，欢迎通过 [henryhan88888@gmail.com](mailto:henryhan88888@gmail.com) 与我联系。\n\n# Hfusion\n用于论文《使用上下文建模的层次融合进行多模态情感分析》的代码\n\n## 运行方法\n``python3 hfusion.py``\n\n## 需求\nKeras >= 2.0, Tensorflow >= 1.7, Numpy, Scikit-learn\n\n## 引用\n``Majumder, N., Hazarika, D., Gelbukh, A., Cambria, E. and Poria, S., 2018. Multimodal sentiment analysis using hierarchical fusion with context modeling. Knowledge-Based Systems, 161, pp.124-133.``\n\n# 基于注意力的多模态情感分析融合\n\n以下论文的代码：\n\n[用户生成视频中的上下文依赖情感分析](http:\u002F\u002Fsentic.net\u002Fcontext-dependent-sentiment-analysis-in-user-generated-videos.pdf)（ACL 2017）。\n\n[用于上下文多模态情感分析的多级多注意力机制](https:\u002F\u002Fieeexplore.ieee.org\u002Fabstract\u002Fdocument\u002F8215597\u002F)（ICDM 2017）。\n\n![Alt text](contextual-attention-based-LSTM\u002Fatlstm3.jpg?raw=true \"基于注意力的融合机制（ICDM 2017）\")\n\n### 预处理\n**编辑：** create_data.py 已经过时。预处理后的数据集已经放在仓库的 dataset\u002F 文件夹中，请直接使用。\n\n由于数据通常以话语形式存在，我们使用以下代码将属于同一视频的所有话语合并在一起：\n\n```\npython create_data.py\n```\n\n注意：这将创建与说话者无关的训练和测试划分。\n\n在 dataset\u002Fmosei 中，将压缩包解压到名为 'raw' 的文件夹中。同时解压 'unimodal_mosei_3way.pickle.zip'。\n\n### 运行模型\n\n示例命令：\n\n使用基于注意力的融合：\n```\npython run.py --unimodal True --fusion True\npython run.py --unimodal False --fusion True\n```\n\n不使用基于注意力、采用拼接融合的方式：\n```\npython run.py --unimodal True --fusion False\npython run.py --unimodal False --fusion False\n```\n\n话语级别的注意力：\n```\npython run.py --unimodal True --fusion True --attention_2 True\npython run.py --unimodal False --fusion True --attention_2 True\n```\n\n注意：\n1. 将 unimodal 标志保持为 True（默认为 False）会先训练所有单模态 LSTM（论文中提到的网络第一层）。\n2. 
设置 --fusion True 仅适用于多模态网络。\n\n### 数据集：\n我们提供了在 [MOSI](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.06259.pdf)、[MOSEI](http:\u002F\u002Faclweb.org\u002Fanthology\u002FP18-1208) 和 [IEMOCAP](https:\u002F\u002Fsail.usc.edu\u002Fiemocap\u002F) 数据集上的结果。\u003Cbr>\n请引用数据集的创建者。\n\n我们正在添加更多数据集，敬请期待。\n\n在上述命令中使用 ```--data [mosi|mosei|iemocap] 和 --classes [2|3|6]``` 来测试不同配置在不同数据集上的表现。\n\nmosi：2 类\u003Cbr>\nmosei：3 类\u003Cbr>\niemocap：6 类\u003Cbr>\n\n示例：\n```\npython run.py --unimodal False --fusion True --attention_2 True --data mosei --classes 3\n```\n\n#### 数据集详情\n##### MOSI：\n2 类：正面\u002F负面 \u003Cbr>\n原始特征：（Pickle 文件） \u003Cbr>\n音频：dataset\u002Fmosi\u002Fraw\u002Faudio_2way.pickle \u003Cbr>\n文本：dataset\u002Fmosi\u002Fraw\u002Ftext_2way.pickle \u003Cbr>\n视频：dataset\u002Fmosi\u002Fraw\u002Fvideo_2way.pickle \u003Cbr>\n\n**每个文件包含：\u003Cbr>**\ntrain_data、train_label、test_data、test_label、maxlen、train_length、test_length\n\ntrain_data - 维度为 (62, 63, 特征维度) 的 np.array \u003Cbr>\ntrain_label - 维度为 (62, 63, 2) 的 np.array \u003Cbr>\ntest_data - 维度为 (31, 63, 特征维度) 的 np.array \u003Cbr>\ntest_label - 维度为 (31, 63, 2) 的 np.array \u003Cbr>\nmaxlen - 最大话语长度，值为 63 \u003Cbr>\ntrain_length - 训练数据中每段视频的话语长度。\u003Cbr>\ntest_length - 测试数据中每段视频的话语长度。\u003Cbr>\n\n训练\u002F测试划分：62\u002F31 段视频。每段视频都有话语。视频被填充至 63 个话语。\n\n##### IEMOCAP：\n6 类：高兴\u002F悲伤\u002F中性\u002F愤怒\u002F兴奋\u002F沮丧\u003Cbr>\n原始特征：dataset\u002Fiemocap\u002Fraw\u002FIEMOCAP_features_raw.pkl（Pickle 文件）\u003Cbr>\n该文件包含：\u003Cbr>\nvideoIDs[vid] = 该视频中话语 ID 列表，按出现顺序排列 \u003Cbr>\nvideoSpeakers[vid] = 说话人轮次列表。例如 [M, M, F, M, F]。其中 M = 男性，F = 女性 \u003Cbr>\nvideoText[vid] = 该视频中每段话语的文本特征列表。\u003Cbr>\nvideoAudio[vid] = 该视频中每段话语的音频特征列表。\u003Cbr>\nvideoVisual[vid] = 该视频中每段话语的视觉特征列表。\u003Cbr>\nvideoLabels[vid] = 该视频中每段话语的标签索引列表。\u003Cbr>\nvideoSentence[vid] = 该视频中每段话语的句子列表。\u003Cbr>\ntrainVid = 训练集中视频（视频 ID）列表。\u003Cbr>\ntestVid = 测试集中视频（视频 ID）列表。\u003Cbr>\n\n更多信息请参阅文件 dataset\u002Fiemocap\u002Fraw\u002FloadIEMOCAP.py。\n我们使用这些数据创建了格式化的、与说话者无关的训练和测试划分。（视频 x 话语 x 特征）\n\n训练\u002F测试划分：120\u002F31 段视频。每段视频都有话语。视频被填充至 110 个话语。\n\n##### MOSEI：\n3 类：正面\u002F负面\u002F中性 \u003Cbr>\n原始特征：（Pickle 文件） \u003Cbr>\n音频：dataset\u002Fmosei\u002Fraw\u002Faudio_3way.pickle \u003Cbr>\n文本：dataset\u002Fmosei\u002Fraw\u002Ftext_3way.pickle \u003Cbr>\n视频：dataset\u002Fmosei\u002Fraw\u002Fvideo_3way.pickle \u003Cbr>\n\n该文件包含：\ntrain_data、train_label、test_data、test_label、maxlen、train_length、test_length\n\ntrain_data - 维度为 (2250, 98, 特征维度) 的 np.array \u003Cbr>\ntrain_label - 维度为 (2250, 98, 3) 的 np.array \u003Cbr>\ntest_data - 维度为 (678, 98, 特征维度) 的 np.array \u003Cbr>\ntest_label - 维度为 (678, 98, 3) 的 np.array \u003Cbr>\nmaxlen - 最大话语长度，值为 98 \u003Cbr>\ntrain_length - 训练数据中每段视频的话语长度。\u003Cbr>\ntest_length - 测试数据中每段视频的话语长度。\u003Cbr>\n\n训练\u002F测试划分：2250\u002F678 段视频。每段视频都有话语。视频被填充至 98 个话语。\n\n### 引用\n\n如果您使用此代码，请使用以下方式引用我们的工作：\n```\n@inproceedings{soujanyaacl17,\n  title={上下文相关的用户生成视频情感分析},\n  author={Poria, Soujanya  and Cambria, Erik and Hazarika, Devamanyu and Mazumder, Navonil and Zadeh, Amir and Morency, Louis-Philippe},\n  booktitle={计算语言学协会},\n  year={2017}\n}\n\n@inproceedings{poriaicdm17, \nauthor={S. Poria and E. Cambria and D. Hazarika and N. Mazumder and A. Zadeh and L. P. 
Morency}, \nbooktitle={2017 IEEE数据挖掘国际会议（ICDM）}, \ntitle={用于情境多模态情感分析的多级多注意力机制},\nyear={2017},  \npages={1033-1038}, \nkeywords={数据挖掘；特征提取；图像分类；图像融合；机器学习（人工智能）；情感分析；基于注意力的网络；情境学习；情境信息；情境多模态情感；动态特征融合；多级多注意力机制；多模态情感分析；循环模型；话语；视频；情境建模；特征提取；熔接器；情感分析；社交网络服务；视频；可视化}, \ndoi={10.1109\u002FICDM.2017.134}, \nmonth={11月},}\n```\n\n### 致谢\n\n[Soujanya Poria](http:\u002F\u002Fsporia.info\u002F)\n\n[Gangeshwar Krishnamurthy](http:\u002F\u002Fwww.gangeshwark.com\u002F) (gangeshwark@gmail.com; Github: @gangeshwark)\n\n# 用户生成视频中的上下文相关情感分析\n用于论文《用户生成视频中的上下文相关情感分析》（ACL 2017）的代码。\n\n### 要求\n代码使用 Python 2.7 编写，需要 Keras 2.0.6 和 Theano 后端。\n\n### 描述\n在本文中，我们提出了一种基于 LSTM 的模型，该模型使话语能够从同一视频中的周围环境中捕获上下文信息，从而帮助进行多模态情感分析的分类过程。\n\n![替代文本](bc-LSTM\u002Fnetwork.jpg?raw=true \"标题\")\n\n此仓库包含上述论文的代码。每个上下文 LSTM（论文中的图 2）都如上图所示实现。更多细节请参阅论文。  \n注意：与论文不同的是，我们在倒数第二层未使用 SVM。这是为了保持整个网络的可微性，尽管可能会牺牲一些性能。\n\n### 数据集\n我们在 [MOSI 数据集](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.06259.pdf) 上提供了结果。  \n请引用数据集的创建者。\n\n### 预处理\n由于数据通常以话语形式存在，我们使用以下代码将属于同一视频的所有话语合并在一起：\n\n```\npython create_data.py\n```\n\n注意：这将创建与说话人无关的训练和测试分割。\n\n### 运行 sc-lstm\n\n示例命令：\n\n```\npython lstm.py --unimodal True\npython lstm.py --unimodal False\n```\n\n注意：将 unimodal 标志保持为 True（默认为 False）将首先训练所有单模态 LSTM（论文中提到的网络第 1 层）。\n\n### 引用\n\n如果您使用此代码，请使用以下方式引用我们的工作：\n```\n@inproceedings{soujanyaacl17,\n  title={上下文相关的用户生成视频情感分析},\n  author={Poria, Soujanya  and Cambria, Erik and Hazarika, Devamanyu and Mazumder, Navonil and Zadeh, Amir and Morency, Louis-Philippe},\n  booktitle={计算语言学协会},\n  year={2017}\n}\n```\n\n### 致谢\n\nDevamanyu Hazarika、Soujanya Poria\n\n# 多模态情感分析中的情境跨模态注意力\n用于论文《多模态情感分析中的情境跨模态注意力》（EMNLP 2018）的代码。\n\n### 数据集\n我们在 [MOSI 数据集](https:\u002F\u002Farxiv.org\u002Fpdf\u002F1606.06259.pdf) 上提供了结果。  \n请引用数据集的创建者。\n\n## 需求：\nPython 3.5  \nKeras（TensorFlow 后端）2.2.4  \nScikit-learn 0.20.0  \n\n### 实验\n\n```\npython create_data.py\npython trimodal_attention_models.py\n```\n\n### 引用\n\n如果您在研究中使用此代码，请使用以下方式引用我们的工作：\n```\n@inproceedings{ghosal2018contextual,\n  title={多模态情感分析中的情境跨模态注意力},\n  author={Ghosal, Deepanway and Akhtar, Md Shad and Chauhan, Dushyant and Poria, Soujanya and Ekbal, Asif and Bhattacharyya, Pushpak},\n  booktitle={2018年自然语言处理经验方法会议论文集},\n  pages={3454--3466},\n  year={2018}\n}\n```\n\n### 致谢\n\n此仓库中的部分功能借鉴自 https:\u002F\u002Fgithub.com\u002Fsoujanyaporia\u002Fcontextual-utterance-level-multimodal-sentiment-analysis。\n\n### 作者\n\n[Deepanway Ghosal](https:\u002F\u002Fgithub.com\u002Fdeepanwayx)、[Soujanya Poria](https:\u002F\u002Fgithub.com\u002Fsoujanyaporia)\n\n## 张量融合网络（TFN）\n\n## 重要通知\n\n此仓库所依赖的 CMU 多模态 SDK 自编写此代码以来，其 API 已发生重大变化。因此，此仓库中的代码已无法直接运行。然而，模型本身的代码仍可供参考。\n\n# 张量融合网络\n\n这是对以下内容的 PyTorch 实现：\n\nZadeh, Amir, 等人。“用于多模态情感分析的张量融合网络。” EMNLP 2017 口头报告。\n\n它需要 PyTorch 和 CMU 多模态数据 SDK（https:\u002F\u002Fgithub.com\u002FA2Zadeh\u002FCMU-MultimodalDataSDK）才能正常运行。如果您首次运行脚本，训练数据（CMU-MOSI 数据集）将自动下载。\n\n模型定义在 `model.py` 中，训练脚本是 `train.py`。以下是 `train.py` 的命令行参数列表：\n\n```\n--dataset：默认为 'MOSI'，目前不支持其他数据集。请忽略此选项。\n\n--epochs：最大训练轮数，默认为 50。\n\n--batch_size：批量大小，默认为 32。\n\n--patience：指定早停条件，类似于 Keras 中的设置，默认为 20。\n\n--cuda：是否使用 GPU，默认为 False。\n\n--model_path：指定保存训练模型的位置的字符串，默认为 'models'。\n\n--max_len：预处理数据时的最大序列长度，默认为 20。\n```\n\n简而言之，您可以使用以下命令训练模型：\n\n```\npython train.py --epochs 100 --patience 10\n```\n\n该脚本最初会随机选择一组超参数。如果您想调整它们，可以自行在脚本中修改。\n\n### 引用\n\n如果您在研究中使用此代码，请使用以下方式引用我们的工作：\n```\n@inproceedings{tensoremnlp17,\ntitle={用于多模态情感分析的张量融合网络},\nauthor={Zadeh, Amir and Chen, Minghai and Poria, Soujanya and Cambria, Erik and Morency, 
Louis-Philippe},\nbooktitle={自然语言处理经验方法会议，EMNLP},\nyear={2017}\n}\n```\n## 低秩多模态融合\n\n这是 Liu 和 Shen 等人 ACL 2018 论文“具有模态特异性因子的有效低秩多模态融合”的代码库。\n\n## 依赖项\n\nPython 2.7（现已实验性地支持 Python 3.6+）\n\n```\ntorch=0.3.1\nsklearn\nnumpy\n```\n\n您可以通过 `python -m pip install -r requirements.txt` 安装这些库。\n\n## 实验数据\n\n用于实验的处理后数据（CMU-MOSI、IEMOCAP、POM）可在此下载：\n\nhttps:\u002F\u002Fdrive.google.com\u002Fopen?id=1CixSaw3dpHESNG0CaCJV6KutdlANP_cr\n\n要运行代码，您需要下载这些序列化数据集，并将其放置在 `data` 目录中。\n\n请注意，声学特征中可能存在 NaN 值，您可以将其替换为 0。\n\n## 训练您的模型\n\n要运行实验代码（网格搜索），请使用脚本 `train_xxx.py`。这些脚本包含以下命令行参数：\n\n```\n`--run_id`: 用户指定的唯一 ID，以确保保存的结果\u002F模型不会相互覆盖。\n\n`--epochs`: 训练的最大轮数。由于使用了早停机制来防止过拟合，实际训练的轮数可能少于此处指定的值。\n\n`--patience`: 如果模型性能在连续 `--patience` 次验证评估中没有提升，则训练将提前停止。\n\n`output_dim`: 模型的输出维度。每个脚本中的默认值通常适用。\n\n`signiture`: 可选字符串，会添加到输出文件名中，用作某种注释。\n\n`cuda`: 是否在训练中使用 GPU。如果未指定，则使用 CPU。\n\n`data_path`: 数据目录的路径。默认为 '.\u002Fdata'，但如果您希望将数据存储在其他位置，可以更改此设置。\n\n`model_path`: 用于保存模型的目录路径。\n\n`output_path`: 用于保存网格搜索结果的目录路径。\n\n`max_len`: 训练数据序列的最大长度。超过此长度的序列会被截断或填充。\n\n`emotion`: （仅适用于 IEMOCAP）指定您希望模型预测的情绪类别。可以是 'happy'、'sad'、'angry' 或 'neutral'。\n```\n\n示例命令如下：\n\n`python train_mosi.py --run_id 19260817 --epochs 50 --patience 20 --output_dim 1 --signiture test_run_big_model`\n\n## 超参数\n\n用于复现论文中结果的一些超参数位于 `hyperparams.txt` 文件中。\n\n## 引用\n\n```\n@misc{liu2018efficient,\n      title={Efficient Low-rank Multimodal Fusion with Modality-Specific Factors}, \n      author={Zhun Liu and Ying Shen and Varun Bharadhwaj Lakshminarasimhan and Paul Pu Liang and Amir Zadeh and Louis-Philippe Morency},\n      year={2018},\n      eprint={1806.00064},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI}\n}\n\n```\n\n# 数据集\n\n# MELD：多模态多人对话情感识别数据集\n\n## 注意\n\n:fire: :fire: :fire: 如需更新的基线，请访问此链接：[conv-emotion](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fconv-emotion)\n\n## 排行榜\n\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_1491652a74b4.png)\n\n## 更新\n\n2020年10月10日：关于 MELD 数据集上对话情感识别的新论文及 SOTA 结果。相关代码请参阅 [COSMIC](https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fconv-emotion\u002Ftree\u002Fmaster\u002FCOSMIC) 目录。论文标题为——[COSMIC: COmmonSense knowledge for eMotion Identification in Conversations](https:\u002F\u002Farxiv.org\u002Fpdf\u002F2010.02795.pdf)。\n\n2019年5月22日：MELD：多模态多人对话情感识别数据集已被 ACL 2019 全文接收。更新后的论文可在 https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.02508.pdf 查看。\n\n2019年5月22日：双人版 MELD 已发布，可用于测试双人对话模型。\n\n2018年11月15日：修复了 train.tar.gz 中的问题。\n\n## 使用 MELD 的研究工作\n\nZhang, Yazhou, Qiuchi Li, Dawei Song, Peng Zhang, and Panpan Wang. “受量子启发的交互网络用于对话情感分析。” IJCAI 2019。\n\nZhang, Dong, Liangqing Wu, Changlong Sun, Shoushan Li, Qiaoming Zhu, and Guodong Zhou. “建模上下文与说话者敏感依赖性以进行多说话者对话中的情感检测。” IJCAI 2019。\n\nGhosal, Deepanway, Navonil Majumder, Soujanya Poria, Niyati Chhaya, and Alexander Gelbukh. 
“DialogueGCN：用于对话情感识别的图卷积神经网络。” EMNLP 2019。\n\n----------------------------------------------------\n\n## 简介\n多模态 EmotionLines 数据集（MELD）是在 EmotionLines 数据集的基础上扩展和增强而创建的。MELD 包含 EmotionLines 中的所有对话实例，同时还增加了音频和视觉模态信息。MELD 包含来自电视剧《老友记》的 1400 多段对话和 13000 条话语。对话中有多个说话者参与。每条话语都被标注为七种情绪之一——愤怒、厌恶、悲伤、喜悦、中性、惊讶和恐惧。此外，MELD 还为每条话语提供了情感标注（正面、负面和中性）。\n\n### 示例对话\n![](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_fd3b5c995f4e.jpeg)\n\n### 数据集统计\n| 统计指标                      | 训练集   | 验证集     | 测试集    |\n|---------------------------------|---------|---------|---------|\n| 模态数量                   | {a,v,t} | {a,v,t} | {a,v,t} |\n| 唯一词数               | 10,643  | 2,384   | 4,361   |\n| 平均话语长度           | 8.03    | 7.99    | 8.28    |\n| 最大话语长度           | 69      | 37      | 45      |\n| 每段对话平均情绪数     | 3.30    | 3.35    | 3.24    |\n| 对话数量                  | 1039    | 114     | 280     |\n| 话语数量                 | 9989    | 1109    | 2610    |\n| 说话者数量               | 260     | 47      | 100     |\n| 情绪转换次数              | 4003    | 427     | 1003    |\n| 平均话语时长             | 3.59s   | 3.59s   | 3.58s   |\n\n更多详情请访问 https:\u002F\u002Faffective-meld.github.io。\n\n### 数据分布\n\n|          | 训练集 | 验证集 | 测试集 |\n|----------|-------|-----|------|\n| 愤怒    | 1109  | 153 | 345  |\n| 厌恶  | 271   | 22  | 68   |\n| 恐惧     | 268   | 40  | 50   |\n| 喜悦      | 1743  | 163 | 402  |\n| 中性  | 4710  | 470 | 1256 |\n| 悲伤  | 683   | 111 | 208  |\n| 惊讶 | 1205  | 150 | 281  |\n\n## 目的\n多模态数据分析利用多个并行数据通道中的信息来进行决策。随着人工智能的快速发展，多模态情感识别已成为研究热点，这主要得益于其在对话生成、多模态交互等众多挑战性任务中的潜在应用。对话式情感识别系统可以通过分析用户的情感来生成适当的回应。尽管已有大量关于多模态情感识别的研究工作，但真正专注于理解对话中情感的研究却寥寥无几。然而，这些研究仅限于两人之间的对话理解，因此无法扩展到包含两个以上参与者的多方对话情感识别。EmotionLines 可以作为纯文本情感识别的资源，因为它不包含视觉和音频等其他模态的数据。同时需要注意的是，目前尚不存在用于情感识别研究的多模态多方对话数据集。在本工作中，我们对 EmotionLines 数据集进行了扩展、改进，并进一步开发以适应多模态场景。在连续轮次中进行情感识别存在诸多挑战，上下文理解便是其中之一。对话中轮次序列的情感变化与情感流动使得准确建模上下文成为一项艰巨的任务。在本数据集中，由于我们能够获取每个对话的多模态数据源，我们假设这将有助于改善上下文建模，从而提升整体的情感识别性能。该数据集还可用于开发多模态情感对话系统。IEMOCAP 和 SEMAINE 是包含每句话情感标签的多模态对话数据集。然而，这些数据集本质上是两人对话，这也凸显了我们提出的 Multimodal-EmotionLines 数据集的重要性。其他公开可用的多模态情感与情绪识别数据集包括 MOSEI、MOSI 和 MOUD，但这些数据集均非对话形式。\n\n## 数据集创建\n第一步是为 EmotionLines 数据集中每个对话中的每一句台词找到时间戳。为此，我们遍历了所有剧集的字幕文件，从中提取了每句台词的开始和结束时间戳。通过这一过程，我们获得了每集的季号、集号以及每句台词的时间戳。在获取时间戳时，我们设定了两个约束条件：(a) 对话中各句台词的时间戳必须按顺序递增；(b) 对话中的所有台词必须属于同一集和同一场景。基于这两个条件的限制，我们发现 EmotionLines 中有少数对话由多个自然对话组成。我们将这些情况从数据集中剔除。由于这一纠错步骤，我们的数据集所包含的对话数量与原始 EmotionLines 数据集有所不同。在获取每句台词的时间戳后，我们从原始剧集中提取了相应的视听片段，并单独截取出这些视频片段中的音频内容。最终，本数据集为每个对话提供了视觉、音频和文本三种模态的数据。\n\n## 论文\n有关该数据集的论文可参见：https:\u002F\u002Farxiv.org\u002Fpdf\u002F1810.02508.pdf\n\n## 下载数据\n请访问 http:\u002F\u002Fweb.eecs.umich.edu\u002F~mihalcea\u002Fdownloads\u002FMELD.Raw.tar.gz 下载原始数据。数据以 .mp4 格式存储，压缩包为 XXX.tar.gz 文件。标注信息可在 https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002FMELD\u002Ftree\u002Fmaster\u002Fdata\u002FMELD 找到。\n\n## .csv 文件说明\n\n### 列说明\n| 列名       | 说明
|\n|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| 序号       | 发言的序号，主要用于在不同版本或包含不同子集的多个副本中引用具体的发言。 |\n| 发言内容   | EmotionLines 数据集中以字符串形式表示的单个发言。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| 发言者     | 与该发言相关的发言者姓名。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |\n| 情感       | 发言者在该发言中表达的情感（中性、喜悦、悲伤、愤怒、惊讶、恐惧、厌恶）。                                                                                                                                                                                                                                                                                                                                                                                                                                         |\n| 情感倾向   | 发言者在该发言中表达的情感倾向（正面、中性、负面）。                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |\n| 对话语篇ID | 对话语篇的索引，从0开始。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n| 发言ID     | 该发言在对话语篇中的索引，从0开始。                                                                                                                                                                                                                                                                                                                                                                                                    
                                                                             |\n| 季节数     | 该发言所属的《老友记》电视剧的季节数。                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n| 集数       | 该发言所属的某一季中具体剧集的集数。                                                                                                                                                                                                                                                                                                                                                                                                                                                              |\n| 开始时间   | 该发言在给定剧集中开始的时间，格式为“时:分:秒,毫秒”。                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| 结束时间   | 该发言在给定剧集中结束的时间，格式为“时:分:秒,毫秒”。                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |\n\n### 文件说明\n- \u002Fdata\u002FMELD\u002Ftrain_sent_emo.csv - 包含训练集中的话语及其情感和情绪标签。\n- \u002Fdata\u002FMELD\u002Fdev_sent_emo.csv - 包含验证集中的话语及其情感和情绪标签。\n- \u002Fdata\u002FMELD\u002Ftest_sent_emo.csv - 包含测试集中的话语及其情感和情绪标签。\n- \u002Fdata\u002FMELD_Dyadic\u002Ftrain_sent_emo_dya.csv - 包含MELD双人对话变体训练集中的话语及其情感和情绪标签。要获取与特定话语对应的视频片段，请参考“Old_Dialogue_ID”和“Old_Utterance_ID”两列。\n- \u002Fdata\u002FMELD_Dyadic\u002Fdev_sent_emo_dya.csv - 包含双人对话变体验证集中的话语及其情感和情绪标签。要获取与特定话语对应的视频片段，请参考“Old_Dialogue_ID”和“Old_Utterance_ID”两列。\n- \u002Fdata\u002FMELD_Dyadic\u002Ftest_sent_emo_dya.csv - 包含双人对话变体测试集中的话语及其情感和情绪标签。要获取与特定话语对应的视频片段，请参考“Old_Dialogue_ID”和“Old_Utterance_ID”两列。\n\n## Pickle 文件说明\n共有13个Pickle文件，包含了用于训练基线模型的数据和特征。以下是每个Pickle文件的简要说明。\n\n### 数据Pickle文件：\n\n* **data_emotion.p、data_sentiment.p** - 这是主要的数据文件，包含以列表形式存储的5个不同元素。\n    * *data*: 一个字典，包含以下键值对：\n        * *text*: 原始句子。\n        * *split*: train\u002Fval\u002Ftest - 表示该元组所属的划分（训练集、验证集或测试集）。\n        * *y*: 句子的标签。\n        * *dialog*: 该话语所属对话的ID。\n        * *utterance*: 对话ID中的话语编号。\n        * *num_words*: 话语中的单词数。\n    * W: Glove嵌入矩阵。\n    * vocab: 数据集的词汇表。\n    * word_idx_map: 词汇表中每个词到其在W中索引的映射。\n    * max_sentence_length: 数据集中话语的最大标记数。\n    * label_index: 每个标签（情绪或情感）与其分配索引的映射，例如label_index['neutral']=0。\n```python\nimport pickle\ndata, W, vocab, word_idx_map, max_sentence_length, label_index = pickle.load(open(filepath, 'rb'))\n```\n\n* **text_glove_average_emotion.pkl、text_glove_average_sentiment.pkl** - 
包含每个话语的300维文本特征向量，这些特征向量由该话语中所有标记的Glove嵌入取平均得到。它是一个列表，包含针对训练集、验证集和测试集的3个字典，每个字典以*dia_utt*格式索引，其中dia为对话ID，utt为话语ID。例如，train_text_avg_emb['0_0'].shape = (300, )。\n```python\nimport pickle\ntrain_text_avg_emb, val_text_avg_emb, test_text_avg_emb = pickle.load(open(filepath, 'rb'))\n```\n\n* **audio_embeddings_feature_selection_emotion.pkl、audio_embeddings_feature_selection_sentiment.pkl** - 包含每个话语的1611\u002F1422维音频特征向量，这些特征是为情绪\u002F情感分类训练得到的。这些特征最初是从[openSMILE](https:\u002F\u002Fwww.audeering.com\u002Fopensmile\u002F)提取的，随后使用基于L2范数的特征选择方法，并通过SVM进行筛选。它是一个列表，包含针对训练集、验证集和测试集的3个字典，每个字典以*dia_utt*格式索引，其中dia为对话ID，utt为话语ID。例如，train_audio_emb['0_0'].shape = (1611, )或(1422, )。\n```python\nimport pickle\ntrain_audio_emb, val_audio_emb, test_audio_emb = pickle.load(open(filepath, 'rb'))\n```\n\n### 模型输出Pickle文件：\n\n* **text_glove_CNN_emotion.pkl、text_glove_CNN_sentiment.pkl** - 包含在基于CNN的[网络](https:\u002F\u002Fgithub.com\u002Fdennybritz\u002Fcnn-text-classification-tf)上训练后得到的100维文本特征，用于情绪\u002F情感分类。它是一个列表，包含针对训练集、验证集和测试集的3个字典，每个字典以*dia_utt*格式索引，其中dia为对话ID，utt为话语ID。例如，train_text_CNN_emb['0_0'].shape = (100, )。\n```python\nimport pickle\ntrain_text_CNN_emb, val_text_CNN_emb, test_text_CNN_emb = pickle.load(open(filepath, 'rb'))\n```\n\n* **text_emotion.pkl、text_sentiment.pkl** - 这些文件包含由单模态bcLSTM模型生成的上下文特征表示。它们为每个话语存储600维文本特征向量，用于情绪\u002F情感分类，以对话ID为索引的字典形式呈现。它是一个列表，包含针对训练集、验证集和测试集的3个字典。例如，train_text_emb['0'].shape = (33, 600)，其中33是对话中最多的话语数量。话语较少的对话会用零向量填充。\n```python\nimport pickle\ntrain_text_emb, val_text_emb, test_text_emb = pickle.load(open(filepath, 'rb'))\n```\n\n* **audio_emotion.pkl、audio_sentiment.pkl** - 这些文件包含由单模态bcLSTM模型生成的上下文特征表示。它们为每个话语存储300\u002F600维音频特征向量，用于情绪\u002F情感分类，以对话ID为索引的字典形式呈现。它是一个列表，包含针对训练集、验证集和测试集的3个字典。例如，train_audio_emb['0'].shape = (33, 300)或(33, 600)，其中33是对话中最多的话语数量。话语较少的对话会用零向量填充。\n```python\nimport pickle\ntrain_audio_emb, val_audio_emb, test_audio_emb = pickle.load(open(filepath, 'rb'))\n```\n\n* **bimodal_sentiment.pkl** - 该文件包含由双模态bcLSTM模型生成的上下文特征表示。它为每个话语存储600维双模态（文本、音频）特征向量，用于情感分类，以对话ID为索引的字典形式呈现。它是一个列表，包含针对训练集、验证集和测试集的3个字典。例如，train_bimodal_emb['0'].shape = (33, 600)，其中33是对话中最多的话语数量。话语较少的对话会用零向量填充。\n```python\nimport pickle\ntrain_bimodal_emb, val_bimodal_emb, test_bimodal_emb = pickle.load(open(filepath, 'rb'))\n```\n\n## 原始数据描述\n- 数据集包含3个文件夹（.tar.gz文件）：train、dev和test；每个文件夹分别对应3个.csv文件中的语音片段。\n- 在任何一个文件夹中，原始数据中的每个视频片段都对应于相应.csv文件中的一个语句。视频片段的命名格式为：diaX1\_uttX2.mp4，其中X1是对话ID，X2是语句ID，与相应.csv文件中的信息一致，用于标识特定的语句。\n- 例如，考虑train.tar.gz中的视频片段dia6_utt1.mp4。该视频片段对应的语句将在train_sent_emo.csv文件中，其Dialogue_ID=6且Utterance_ID=1，即*'You liked it? You really liked it?'*。\n\n## 数据读取\n在 `.\u002Futils\u002F` 目录下提供了2个Python脚本：\n- read_meld.py \- 显示MELD数据集中.csv文件中某一语句对应的视频文件路径。\n- read_emorynlp \- 显示多模态EmoryNLP情感识别数据集中.csv文件中某一语句对应的视频文件路径。\n\n## 标签定义\n在实验中，所有标签均以独热编码表示，其索引如下：\n- **情感** - {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}。因此，情感为*'joy'*的标签为[0., 0., 0., 0., 1., 0., 0.]\n- **情绪** - {'neutral': 0, 'positive': 1, 'negative': 2}。因此，情绪为*'positive'*的标签为[0., 1., 0.]\n\n## 类别权重\n在情感分类的基线模型中，使用了以下类别权重。索引方式与上述相同。\n类别权重：[4.0, 15.0, 15.0, 3.0, 1.0, 6.0, 3.0]。\n\n## 运行基线模型\n\n请按照以下步骤运行基线模型：\n\n1. 从[这里](http:\u002F\u002Fweb.eecs.umich.edu\u002F~mihalcea\u002Fdownloads\u002FMELD.Features.Models.tar.gz)下载特征文件。\n2. 将这些特征文件复制到`.\u002Fdata\u002Fpickles\u002F`目录下。\n3. 
要训练或测试基线模型，请运行文件：`baseline\u002Fbaseline.py`，命令如下：\n    - `python baseline.py -classify [Sentiment|Emotion] -modality [text|audio|bimodal] [-train|-test]`\n    - 示例命令：训练文本单模态的情感分类模型：`python baseline.py -classify Sentiment -modality text -train`\n    - 使用`python baseline.py -h`可获取参数帮助信息。\n4. 对于预训练模型，从[这里](http:\u002F\u002Fweb.eecs.umich.edu\u002F~mihalcea\u002Fdownloads\u002FMELD.Features.Models.tar.gz)下载模型权重，并将pickle文件放入`.\u002Fdata\u002Fmodels\u002F`目录中。\n\n## 引用\n如果您在研究中使用了本数据集，请引用以下论文：\n\nS. Poria, D. Hazarika, N. Majumder, G. Naik, E. Cambria, R. Mihalcea. MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversation. ACL 2019.\n\nChen, S.Y., Hsu, C.C., Kuo, C.C. and Ku, L.W. EmotionLines: An Emotion Corpus of Multi-Party Conversations. arXiv preprint arXiv:1802.08379 (2018).\n\n# 多模态EmoryNLP情感识别数据集\n----------------------------------------------------\n## 描述\n多模态EmoryNLP情感检测数据集是在EmoryNLP情感检测数据集的基础上扩展和增强而创建的。它包含了EmoryNLP情感检测数据集中相同的对话实例，同时还增加了音频和视觉模态，与文本模态一起构成了多模态数据。多模态EmoryNLP数据集中包含来自电视剧《老友记》的800多个对话和9000多个语句。对话中有多个说话者参与。每个语句都被标注为以下七种情感之一——中性、喜悦、平静、强大、恐惧、愤怒和悲伤。这些标注直接来源于原始数据集。\n### 数据集统计信息\n| 统计指标                      | Train   | Dev     | Test    |\n|---------------------------------|---------|---------|---------|\n| 模态数量                   | {a,v,t} | {a,v,t} | {a,v,t} |\n| 独特词汇数               | 9,744  | 2,123   | 2,345   |\n| 平均语句长度           | 7.86    | 6.97    | 7.79    |\n| 最大语句长度           | 78      | 60      | 61      |\n| 每个场景平均情感数量 | 4.10    | 4.00    | 4.40    |\n| 对话数量                  | 659    | 89     | 79     |\n| 语句数量                 | 7551    | 954    | 984    |\n| 说话者数量               | 250     | 46      | 48     |\n| 情感转换次数              | 4596    | 575     | 653    |\n| 平均语句时长             | 5.55s   | 5.46s   | 5.27s   |\n\n### 数据分布\n\n|          | Train | Dev | Test |\n|----------|-------|-----|------|\n| 喜悦   | 1677  | 205 | 217  |\n| 愤怒      | 785   | 97  | 86   |\n| 中性  | 2485  | 322 | 288  |\n| 平静 | 638   | 82  | 111  |\n| 强大 | 551   | 70  | 96   |\n| 悲伤 | 474   | 51  | 70   |\n| 恐惧 | 941   | 127 | 116  |\n\n## 数据\n本数据集的视频片段可以从[此链接](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1UQduKw8QTqGf3RafxrTDfI1NyInYK3fr\u002Fview?usp=sharing)下载。\n标注文件可在https:\u002F\u002Fgithub.com\u002FSenticNet\u002FMELD\u002Ftree\u002Fmaster\u002Fdata\u002Femorynlp找到。共有3个.csv文件。这些csv文件的第一列每条记录都包含一个语句，其对应的视频片段可以在[此处](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1UQduKw8QTqGf3RafxrTDfI1NyInYK3fr\u002Fview?usp=sharing)找到。每个语句及其视频片段都按季号、集号、场景编号和语句编号进行索引。例如，**sea1\\_ep2\\_sc6\\_utt3.mp4**表示该片段对应于第1季第2集第6场景第3语句。一个场景即为一段对话。这种索引方式与原始数据集保持一致。csv文件和视频文件均按照原始数据集的划分方式分为训练集、验证集和测试集。标注直接沿用了原始的EmoryNLP数据集（Zahiri等，2018）。\n\n### .csv 文件说明\n\n#### 列规范\n| 列名         | 说明                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
|\n|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| Utterance    | 来自 EmoryNLP 的单个话语，以字符串形式表示。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| Speaker      | 与该话语相关联的说话者姓名。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |\n| Emotion      | 说话者在该话语中表达的情绪（中性、喜悦、平静、强大、害怕、愤怒和悲伤）。                                                                                                                                                                                                                                                                                                                                                                                                                                         |\n| Scene_ID     | 对话的索引，从 0 开始。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |\n| Utterance_ID | 对话中特定话语的索引，从 0 开始。                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |\n| Season       | 该话语所属的《老友记》电视剧的季数。                                                                                                                                                                                                                                                                                                                                                                                                                   
                                                          |\n| Episode      | 该话语所属的某一季中的集数。                                                                                                                                                                                                                                                                                                                                                                                                                                                              |\n| StartTime    | 该话语在相应集中的开始时间，格式为“hh:mm:ss,ms”。                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |\n| EndTime      | 该话语在相应集中的结束时间，格式为“hh:mm:ss,ms”。\n\n***注***：由于字幕存在一些不一致之处，我们未能找到少数话语的开始和结束时间。这些话语已被从数据集中剔除。不过，我们鼓励用户从原始数据集中找到相应的话语，并为其生成视频片段。\n## 引用\n如果您在研究中使用了本数据集，请引用以下论文：\n\nS. Zahiri 和 J. D. Choi. 基于序列卷积神经网络的电视剧剧本情感检测。载于 AAAI 情感内容分析研讨会，AFFCON'18，2018 年。\n\nS. Poria、D. Hazarika、N. Majumder、G. Naik、E. Cambria、R. Mihalcea. MELD：用于对话中情感识别的多模态多人数据集。ACL 2019。\n\n# MUStARD：多模态讽刺检测数据集\n\n本仓库包含我们2019年ACL论文的数据集和代码：\n\n[迈向多模态讽刺检测（一篇_显然_完美的论文）](https:\u002F\u002Fwww.aclweb.org\u002Fanthology\u002FP19-1455\u002F)\n\n我们发布了MUStARD数据集，这是一个用于自动化讽刺识别研究的多模态视频语料库。该数据集来源于热门电视剧，包括《老友记》、《黄金女郎》、《生活大爆炸》和《讽刺狂人匿名会》。MUStARD由带有讽刺标签的视听话语组成。每个话语都附有其上下文，提供了话语发生场景的额外信息。\n\n## 示例实例\n\n![示例实例](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_647c90660a10.jpg)\n\n\u003Cp align=\"center\"> 数据集中的一段讽刺性话语及其上下文和文字稿。 \u003C\u002Fp>     \n\n## 原始视频\n\n我们提供了一个包含原始视频片段的[Google Drive文件夹](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1i9ixalVcXskA5_BkNnbR60sqJqvGyi6E\u002Fview?usp=sharing)，其中包括话语及其各自的上下文。\n\n## 数据格式\n\n视听片段的标注和文字稿可在[`data\u002Fsarcasm_data.json`](data\u002Fsarcasm_data.json)中找到。JSON文件中的每个实例都被分配了一个标识符（例如“1_60”），它是一个包含以下项的字典：\n\n| 键                     | 值                                                                          | \n| ----------------------- |:------------------------------------------------------------------------------:| \n| `utterance`             | 待分类目标话语的文本。                                  | \n| `speaker`               | 目标话语的说话者。                                               | \n| `context`               | 在目标话语之前按时间顺序排列的话语列表。    | \n| `context_speakers`      | 上下文话语各自的说话者。                                 | \n| `sarcasm`               | 讽刺标签的二值标记。                                                  | \n\nJSON中的示例格式：\n\n```json\n{\n  \"1_60\": {\n    \"utterance\": \"能亲眼见证你的思维运作，真是莫大的荣幸。\",\n    \"speaker\": \"谢尔顿\",\n    \"context\": [\n      \"我从未想到能在宇宙大爆炸的余波中发现弦理论的指纹。\",\n      \"抱歉，请问你的计划是什么？\"\n    ],\n    \"context_speakers\": [\n      \"莱纳德\",\n      \"谢尔顿\"\n    ],\n    \"sarcasm\": true\n  }\n}\n```\n\n## 引用\n\n如果您在研究中使用了此数据集，请引用以下论文：\n\n```bibtex\n@inproceedings{mustard,\n    title = \"迈向多模态讽刺检测（一篇\\_显然\\_完美的论文）\",\n    author = \"Castro, Santiago  and\n      Hazarika, Devamanyu  and\n      P{\\'e}rez-Rosas, Ver{\\'o}nica  and\n      Zimmermann, Roger  and\n      Mihalcea, Rada  and\n      Poria, Soujanya\",\n    booktitle = \"第57届计算语言学协会年会论文集（第一卷：长篇论文）\",\n    month = \"7\",\n    year = \"2019\",\n    
address = \"意大利佛罗伦萨\",\n    publisher = \"计算语言学协会\",\n}\n```\n\n## 运行代码\n\n1. 使用Conda设置环境：\n\n    ```bash\n    conda env create -f environment.yml\n    conda activate mustard\n    python -c \"import nltk; nltk.download('punkt')\"\n    ```\n\n2. 将[Common Crawl预训练的300维、8400亿词的GloVe词向量](http:\u002F\u002Fnlp.stanford.edu\u002Fdata\u002Fglove.840B.300d.zip)下载到某个位置。\n\n3. [下载预提取的视觉特征](https:\u002F\u002Fdrive.google.com\u002Fopen?id=1Ff1WDObGKqpfbvy7-H1mD8YWvBS-Kf26)到`data\u002F`文件夹中（使`data\u002Ffeatures\u002F`包含`context_final\u002F`和`utterances_final\u002F`两个文件夹，内含特征），或者自行[提取视觉特征](visual)。\n\n4. [下载预提取的BERT特征](https:\u002F\u002Fdrive.google.com\u002Ffile\u002Fd\u002F1GYv74vN80iX_IkEmkJhkjDRGxLvraWuZ\u002Fview?usp=sharing)，并将两个文件直接放置在`data\u002F`文件夹下（即`data\u002Fbert-output.jsonl`和`data\u002Fbert-output-context.jsonl`），或在另一个配备Python 2和TensorFlow 1.11.0的环境中，按照[BERT仓库中的“使用BERT提取固定特征向量（如ELMo）”](https:\u002F\u002Fgithub.com\u002Fgoogle-research\u002Fbert\u002Ftree\u002Fd66a146741588fb208450bde15aa7db143baaa69#using-bert-to-extract-fixed-feature-vectors-like-elmo)方法提取BERT特征，并运行：\n\n    ```bash\n    # 在某个目录下载BERT-base uncased模型：\n    wget https:\u002F\u002Fstorage.googleapis.com\u002Fbert_models\u002F2018_10_18\u002Funcased_L-12_H-768_A-12.zip\n    # 然后将路径赋给这个变量：\n    BERT_BASE_DIR=...\n    \n    python extract_features.py \\\n      --input_file=data\u002Fbert-input.txt \\\n      --output_file=data\u002Fbert-output.jsonl \\\n      --vocab_file=${BERT_BASE_DIR}\u002Fvocab.txt \\\n      --bert_config_file=${BERT_BASE_DIR}\u002Fbert_config.json \\\n      --init_checkpoint=${BERT_BASE_DIR}\u002Fbert_model.ckpt \\\n      --layers=-1,-2,-3,-4 \\\n      --max_seq_length=128 \\\n      --batch_size=8\n    ```\n\n5. 检查`python train_svm.py -h`中的选项以选择运行配置（或修改[`config.py`](config.py)），然后运行：\n\n    ```bash\n    python train_svm.py  # 添加您需要的标志\n    ```\n\n6. 评估：我们使用加权F分数指标，在5折交叉验证方案中进行评估。折索引可在`data\u002Fsplit_incides.p`中找到。更多细节请参考我们的基线脚本。\n\n# M2H2：多模态多人印地语对话幽默识别数据集\n\n:zap: :zap: :zap: 基线代码即将发布！\n\n:fire::fire::fire: [阅读论文](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.01260)\n\nM2H2数据集来源于著名电视剧《Shrimaan Shrimati Phir Se》（总时长4.46小时），并进行了人工标注。我们根据上下文将这些样本（话语）分组为场景。每个场景中的每条话语都带有标签，指示该话语是否具有幽默性，即幽默或非幽默。此外，每条话语还标注了说话者和听者的信息。在多人对话中，识别听者是一项重大挑战。在我们的数据集中，我们将听者定义为对话中说话者正在回应的那一方。每个场景中的每条话语都与其上下文话语相关联，这些上下文话语是由参与对话的各方轮流说出的。该数据集还包括多人对话，其分类难度比两人对话更高。\n\n# 数据格式\n\n![替代文本](https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_readme_f9d63f69ce49.png)\n\n## 文本数据\n\n:fire::fire::fire: ***``Raw-Text\u002FEp-NUMBER.tsv`` 文件充当主标注文件，不仅包含文本数据，还包含如下所述的其他元数据。它同样包含了对话语句的手动标注标签。通过使用剧集 ID 和场景 ID，可以将 ``Raw-Text`` 文件夹中的话语句映射到 ``Raw-Audio`` 和 ``Raw-Visual`` 中对应的音频和视觉片段，从而形成多模态数据。TSV 文件中的 ``Label`` 列，例如 ``Raw-Text\u002FEp-NUMBER.tsv``，包含了每条话语句所需的手动标注标签。***\n\n文本数据以 TSV 格式存储。每个文件的命名格式为 ``Raw-Text\u002FEp-NUMBER.tsv``。其中的 ``NUMBER`` 表示剧集编号，可用于与相应的音频和视觉片段进行匹配。文本数据包含以下字段：\n\n```\nScenes：场景 ID，用于与对应的音频和视觉片段相匹配。\nSI. 
No.：话语句编号。\nStart_time：话语句在视频中的开始时间。\nEnd_time：话语句在视频中的结束时间。\nUtterance：口头话语句。\nLabel：话语句的标注标签，可为幽默或非幽默。\nSpeaker：格式为“说话者,听者”，形式为“说话者姓名,话语句编号”，例如“Dilruba,u3”，表示说话者是 Dilruba，且其回应的是第 3 号话语句。这一信息对于解析多方对话中的指代关系尤为有用。\n```\n\n## 音频数据\n\n每个剧集都有一个专用文件夹，例如 ``Raw-Audio\u002F22\u002F`` 包含第 22 集的所有已标注音频样本。\n\n对于每集而言，每个场景也有一个专用文件夹，例如 ``Raw-Audio\u002F22\u002FScene_1`` 包含第 22 集第 1 场的所有已标注音频样本。\n\n## 视觉数据\n\n每个剧集都有一个专用文件夹，例如 ``Raw-Visual\u002F22\u002F`` 包含第 22 集的所有已标注视觉样本。\n\n对于每集而言，每个场景也有一个专用文件夹，例如 ``Raw-Visual\u002F22\u002FScene_1`` 包含第 22 集第 1 场的所有已标注视觉样本。\n\n# 基线模型\n\n:zap: :zap: :zap: 基线代码即将发布！\n\n# 引用\n\nDushyant Singh Chauhan, Gopendra Vikram Singh, Navonil Majumder, Amir Zadeh, Asif Ekbal, Pushpak Bhattacharyya, Louis-philippe Morency, and Soujanya Poria. 2021. [M2H2：用于对话中幽默识别的多模态多方印地语数据集。收录于 ICMI ’21：第 23 届 ACM 国际多模态交互会议](https:\u002F\u002Farxiv.org\u002Fabs\u002F2108.01260)，加拿大蒙特利尔。ACM，美国纽约州纽约市，5 页。","# multimodal-deep-learning 快速上手指南\n\n本仓库集成了多种基于深度学习的多模态模型实现，主要用于解决多模态表示学习、多模态融合及情感分析等任务（如 CMU-MOSI, CMU-MOSEI 数据集）。\n\n## 环境准备\n\n*   **操作系统**: Linux \u002F macOS (Windows 需使用 WSL)\n*   **核心依赖**:\n    *   Python 3.x\n    *   Conda (推荐用于环境管理)\n    *   PyTorch \u002F TensorFlow \u002F Keras (具体版本视所选模型而定，通常由 `environment.yml` 自动配置)\n*   **数据预处理工具**:\n    *   [CMU Multimodal SDK](https:\u002F\u002Fgithub.com\u002FA2Zadeh\u002FCMU-MultimodalSDK) (部分模型如 MISA, BBFN 必需)\n    *   GloVe 词向量文件 (部分模型必需)\n\n> **注意**：部分模型（如 MMIM, MISA, BBFN）主要基于 PyTorch，而 Hfusion 和 bc-LSTM 基于 Keras\u002FTensorFlow。请根据你要运行的具体模型选择对应的环境配置。\n\n## 安装步骤\n\n以下以主流的 **PyTorch 系列模型**（如 MMIM, MISA, BBFN）为例进行安装。\n\n### 1. 克隆仓库\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fmultimodal-deep-learning.git\ncd multimodal-deep-learning\n```\n\n### 2. 配置 Conda 环境\n进入你想要运行的具体模型目录（例如 `Multimodal-Infomax`, `MISA`, 或 `BBFN`），然后创建环境。\n\n**示例：安装 MMIM (Multimodal-Infomax) 环境**\n```bash\ncd Multimodal-Infomax\nconda env create -f environment.yml\nconda activate MMIM\n```\n\n**示例：安装 MISA 环境**\n```bash\ncd MISA\nconda env create -f environment.yml\nconda activate misa-code\n```\n\n**示例：安装 BBFN 环境**\n```bash\ncd BBFN\nconda env create -f environment.yml\nconda activate BBFN\n```\n\n### 3. 安装额外依赖 (如需)\n对于 **MISA** 和 **BBFN** 模型，需要手动安装 CMU Multimodal SDK：\n```bash\n# 确保在激活的 conda 环境中执行\npip install git+https:\u002F\u002Fgithub.com\u002FA2Zadeh\u002FCMU-MultimodalSDK.git\n```\n*注：对于 MISA，还需在 `src\u002Fconfig.py` 中配置 `word_emb_path` 指向下载的 [GloVe](http:\u002F\u002Fnlp.stanford.edu\u002Fdata\u002Fglove.840B.300d.zip) 文件路径，并设置 `sdk_dir`。*\n\n## 基本使用\n\n使用前请确保已下载相应数据集（如 CMU-MOSI, CMU-MOSEI）并放置在指定文件夹（通常为 `datasets` 或通过代码配置路径）。\n\n### 1. 训练 Multimodal-Infomax (MMIM)\n```bash\npython main.py --dataset mosi --contrast\n```\n*参数说明：`--dataset` 可选 `mosi` 或 `mosei`。*\n\n### 2. 训练 MISA\n进入源码目录后运行：\n```bash\ncd src\npython train.py --data mosi\n```\n*参数说明：`--data` 可选 `mosi`, `mosei` 或 `ur_funny`。*\n\n### 3. 训练 BBFN\n```bash\ncd src\npython main.py --dataset \u003Cdataset_name> --data_path \u003Cpath_to_dataset>\n```\n*参考脚本：可查阅仓库内的 `scripts\u002Frun.sh` 获取完整参数示例。*\n\n### 4. 运行 Hfusion (Keras 版本)\n无需复杂配置，直接运行：\n```bash\npython3 hfusion.py\n```\n\n### 5. 
运行 Attention-based LSTM (TensorFlow 版本)\n```bash\n# 使用注意力机制融合\npython run.py --unimodal False --fusion True --data mosei --classes 3\n\n# 仅使用单模态\npython run.py --unimodal True --fusion False --data mosi --classes 2\n```\n*参数说明：`--classes` 对应不同数据集的分类数 (mosi: 2, mosei: 3, iemocap: 6)。*\n\n---\n**数据获取提示**：\n若下载官方数据集速度较慢，可尝试在相关论文页面或国内学术镜像站搜索 \"CMU-MOSI\" 或 \"CMU-MOSEI\" 的百度网盘资源。对于视觉和音频特征提取的具体方法，请参考作者提供的辅助仓库：[MUStARD](https:\u002F\u002Fgithub.com\u002Fsoujanyaporia\u002FMUStARD)。","某电商团队正致力于升级其视频评论分析系统，旨在从用户上传的评测视频中精准识别对产品的真实情感倾向。\n\n### 没有 multimodal-deep-learning 时\n- **模态割裂导致误判**：团队只能单独分析字幕文本或音频语调，无法捕捉用户“嘴上说喜欢但表情嫌弃”的反讽场景，导致情感打分严重失真。\n- **特征融合粗糙**：自行编写的简单拼接代码无法处理视觉、听觉和文本间的复杂非线性关系，模型在噪声较大的真实视频数据上泛化能力极差。\n- **研发周期漫长**：复现论文中的多模态注意力机制或互信息最大化算法需要从零搭建底层架构，耗费数周时间调试且难以保证效果对齐。\n- **缺乏专用数据集支持**：找不到针对多模态幽默或讽刺检测的标准数据集（如 MUStARD），导致模型训练缺乏高质量的标注数据支撑。\n\n### 使用 multimodal-deep-learning 后\n- **精准捕捉细微情感**：直接调用 MISA 或 BBFN 等预置模型，成功分离并融合了模态不变性与特异性特征，准确识别出用户视频中的反讽与隐含情绪。\n- **高级融合策略落地**：利用内置的 Multimodal-Infomax 算法，通过层级互信息最大化自动优化多源数据融合，显著提升了在嘈杂环境下的分析准确率。\n- **快速验证与部署**：基于提供的 PyTorch\u002FTensorFlow 实现代码和详细环境配置，团队在两天内即可完成从数据加载到模型训练的全流程，大幅缩短上线时间。\n- **丰富数据资源集成**：直接接入仓库集成的 MELD 和 MUStARD 等多模态数据集，快速构建了针对特定场景（如多人对话、幽默检测）的鲁棒模型。\n\nmultimodal-deep-learning 将复杂的多模态算法研发转化为高效的模块化调用，让团队能专注于业务逻辑而非底层数学实现的重复造轮子。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fdeclare-lab_multimodal-deep-learning_7eee81a1.png","declare-lab","Deep Cognition and Language Research (DeCLaRe) Lab","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fdeclare-lab_27a2ce73.png","",null,"https:\u002F\u002Fdeclare-lab.github.io","https:\u002F\u002Fgithub.com\u002Fdeclare-lab",[80,84,88],{"name":81,"color":82,"percentage":83},"OpenEdge ABL","#5ce600",98.9,{"name":85,"color":86,"percentage":87},"Python","#3572A5",1.1,{"name":89,"color":90,"percentage":91},"Shell","#89e051",0,913,166,"2026-04-12T03:25:05","MIT",4,"未说明","未说明 (项目涉及深度学习模型训练，通常建议配备 NVIDIA GPU，但 README 中未明确具体型号或显存要求)",{"notes":100,"python":101,"dependencies":102},"该项目包含多个不同子模型（如 MMIM, MISA, BBFN 等），分别基于 PyTorch、TensorFlow 或 Keras 实现，需根据具体使用的模型安装对应框架。所有子项目均强烈建议使用 Conda 管理环境（通过 environment.yml 创建）。运行前需手动下载 CMU-MOSI、CMU-MOSEI 等数据集以及预训练的 GloVe 词向量文件，并配置相应的数据路径。部分模块依赖 CMU Multimodal SDK 进行数据预处理。","未说明 (部分脚本使用 python3 调用，且依赖 Conda 环境)",[103,104,105,106,107,108,109],"PyTorch","TensorFlow>=1.7","Keras>=2.0","NumPy","Scikit-learn","CMU-MultimodalSDK","GloVe embeddings",[14,111,35],"其他",[64,113,114,115],"multimodal-learning","multimodal-interactions","multimodal-sentiment-analysis","2026-03-27T02:49:30.150509","2026-04-18T11:11:36.162377",[119,124],{"id":120,"question_zh":121,"answer_zh":122,"source_url":123},38971,"运行代码时提示找不到 'bimodal.pickle' 文件，该如何解决？","该文件不需要单独下载。您可以利用现有的 'unimodal.pickle' 文件，通过运行 'hfusion.py' 脚本来生成 'bimodal.pickle' 文件。","https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fmultimodal-deep-learning\u002Fissues\u002F1",{"id":125,"question_zh":126,"answer_zh":127,"source_url":128},38970,"提供的数据集下载链接无法访问（显示 404 错误），是否有其他来源？","维护者已提供新的 Google Drive 链接，您可以在此处找到所有数据：https:\u002F\u002Fdrive.google.com\u002Fdrive\u002Ffolders\u002F1djN_EkrwoRLUt7Vq_QfNZgCl_24wBiIK。如果原链接失效，请尝试使用此备用链接。","https:\u002F\u002Fgithub.com\u002Fdeclare-lab\u002Fmultimodal-deep-learning\u002Fissues\u002F6",[]]