peter-wang321 committed on
Commit 9157432 · 1 Parent(s): ab35d3d

Initial DLF commit

.gitignore ADDED
@@ -0,0 +1,3 @@
+ log/*.log
+ pt/*.pth
+ __pycache__/
config.py ADDED
@@ -0,0 +1,40 @@
+ import json
+ import os
+ from pathlib import Path
+ from easydict import EasyDict as edict
+
+
+ def get_config_regression(model_name, dataset_name, config_file=""):
+     """
+     Get the regression config of the given dataset and model from a config file.
+
+     Parameters:
+         config_file (str): Path to the config file. If an empty string is given, the default config file is used.
+         model_name (str): Name of the model.
+         dataset_name (str): Name of the dataset.
+
+     Returns:
+         config (dict): config of the given dataset and model
+     """
+     if config_file == "":
+         config_file = Path(__file__).parent / "config" / "config_regression.json"
+     with open(config_file, 'r') as f:
+         config_all = json.load(f)
+     model_common_args = config_all[model_name]['commonParams']
+     model_dataset_args = config_all[model_name]['datasetParams'][dataset_name]
+     dataset_args = config_all['datasetCommonParams'][dataset_name]
+     # use aligned features if the model requires them, otherwise use unaligned features
+     dataset_args = dataset_args['aligned'] if (model_common_args['need_data_aligned'] and 'aligned' in dataset_args) else dataset_args['unaligned']
+
+     config = {}
+     config['model_name'] = model_name
+     config['dataset_name'] = dataset_name
+     config.update(dataset_args)
+     config.update(model_common_args)
+     config.update(model_dataset_args)
+     config['featurePath'] = os.path.join(config_all['datasetCommonParams']['dataset_root_dir'], config['featurePath'])
+     config = edict(config)
+
+     return config
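A minimal usage sketch of this helper (not part of the commit; the explicit config path mirrors run.py, since the default config_regression.json referenced inside the function is not included here):

from config import get_config_regression

args = get_config_regression("DLF", "mosi", config_file="config/config.json")
# The returned EasyDict merges datasetCommonParams, commonParams and
# datasetParams, so dataset fields and model hyperparameters are all
# attribute-accessible.
print(args.featurePath, args.batch_size, args.KeyEval)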
config/config.json ADDED
@@ -0,0 +1,98 @@
+ {
+     "datasetCommonParams": {
+         "dataset_root_dir": "../dataset",
+         "mosi": {
+             "aligned": {
+                 "featurePath": "MOSI/Processed/aligned_50.pkl",
+                 "feature_dims": [768, 5, 20],
+                 "train_samples": 1284,
+                 "num_classes": 3,
+                 "language": "en",
+                 "KeyEval": "Loss"
+             },
+             "unaligned": {
+                 "featurePath": "MOSI/Processed/unaligned_50.pkl",
+                 "feature_dims": [768, 5, 20],
+                 "train_samples": 1284,
+                 "num_classes": 3,
+                 "language": "en",
+                 "KeyEval": "Loss"
+             }
+         },
+         "mosei": {
+             "aligned": {
+                 "featurePath": "MOSEI/Processed/aligned_50.pkl",
+                 "feature_dims": [768, 74, 35],
+                 "train_samples": 16326,
+                 "num_classes": 3,
+                 "language": "en",
+                 "KeyEval": "Loss"
+             },
+             "unaligned": {
+                 "featurePath": "MOSEI/Processed/unaligned_50.pkl",
+                 "feature_dims": [768, 74, 35],
+                 "train_samples": 16326,
+                 "num_classes": 3,
+                 "language": "en",
+                 "KeyEval": "Loss"
+             }
+         }
+     },
+     "DLF": {
+         "commonParams": {
+             "need_data_aligned": true,
+             "need_model_aligned": true,
+             "early_stop": 10,
+             "use_bert": true,
+             "use_finetune": true,
+             "attn_mask": true,
+             "update_epochs": 10
+         },
+         "datasetParams": {
+             "mosi": {
+                 "attn_dropout_a": 0.2,
+                 "attn_dropout_v": 0.0,
+                 "relu_dropout": 0.0,
+                 "embed_dropout": 0.2,
+                 "res_dropout": 0.0,
+                 "dst_feature_dim_nheads": [50, 10],
+                 "batch_size": 16,
+                 "learning_rate": 0.0001,
+                 "nlevels": 2,
+                 "conv1d_kernel_size_l": 5,
+                 "conv1d_kernel_size_a": 5,
+                 "conv1d_kernel_size_v": 5,
+                 "text_dropout": 0.5,
+                 "attn_dropout": 0.3,
+                 "output_dropout": 0.5,
+                 "grad_clip": 0.6,
+                 "patience": 5,
+                 "weight_decay": 0.005,
+                 "transformers": "bert",
+                 "pretrained": "bert-base-uncased"
+             },
+             "mosei": {
+                 "attn_dropout_a": 0.0,
+                 "attn_dropout_v": 0.0,
+                 "relu_dropout": 0.0,
+                 "embed_dropout": 0.0,
+                 "res_dropout": 0.0,
+                 "dst_feature_dim_nheads": [50, 10],
+                 "batch_size": 16,
+                 "learning_rate": 0.0001,
+                 "nlevels": 2,
+                 "conv1d_kernel_size_l": 3,
+                 "conv1d_kernel_size_a": 3,
+                 "conv1d_kernel_size_v": 3,
+                 "text_dropout": 0.1,
+                 "attn_dropout": 0.5,
+                 "output_dropout": 0.5,
+                 "grad_clip": 0.6,
+                 "patience": 5,
+                 "weight_decay": 0.001,
+                 "transformers": "bert",
+                 "pretrained": "bert-base-uncased"
+             }
+         }
+     }
+ }
config/readme.md ADDED
@@ -0,0 +1 @@
+ Config file for DLF.
data_loader.py ADDED
@@ -0,0 +1,156 @@
1
+ import logging
2
+ import pickle
3
+ import numpy as np
4
+ import torch
5
+ from torch.utils.data import DataLoader, Dataset
6
+ __all__ = ['MMDataLoader']
7
+ logger = logging.getLogger('MMSA')
8
+
9
+ class MMDataset(Dataset):
10
+ def __init__(self, args, mode='train'):
11
+ self.mode = mode
12
+ self.args = args
13
+ DATASET_MAP = {
14
+ 'mosi': self.__init_mosi,
15
+ 'mosei': self.__init_mosei,
16
+ }
17
+ DATASET_MAP[args['dataset_name']]()
18
+
19
+ def __init_mosi(self):
20
+ with open(self.args['featurePath'], 'rb') as f:
21
+ data = pickle.load(f)
22
+ if 'use_bert' in self.args and self.args['use_bert']:
23
+ self.text = data[self.mode]['text_bert'].astype(np.float32)
24
+ else:
25
+ self.text = data[self.mode]['text'].astype(np.float32)
26
+ self.vision = data[self.mode]['vision'].astype(np.float32)
27
+ self.audio = data[self.mode]['audio'].astype(np.float32)
28
+ self.raw_text = data[self.mode]['raw_text']
29
+ self.ids = data[self.mode]['id']
30
+
31
+
32
+ if self.args['feature_T'] != "":
33
+ with open(self.args['feature_T'], 'rb') as f:
34
+ data_T = pickle.load(f)
35
+ if 'use_bert' in self.args and self.args['use_bert']:
36
+ self.text = data_T[self.mode]['text_bert'].astype(np.float32)
37
+ self.args['feature_dims'][0] = 768
38
+ else:
39
+ self.text = data_T[self.mode]['text'].astype(np.float32)
40
+ self.args['feature_dims'][0] = self.text.shape[2]
41
+ if self.args['feature_A'] != "":
42
+ with open(self.args['feature_A'], 'rb') as f:
43
+ data_A = pickle.load(f)
44
+ self.audio = data_A[self.mode]['audio'].astype(np.float32)
45
+ self.args['feature_dims'][1] = self.audio.shape[2]
46
+ if self.args['feature_V'] != "":
47
+ with open(self.args['feature_V'], 'rb') as f:
48
+ data_V = pickle.load(f)
49
+ self.vision = data_V[self.mode]['vision'].astype(np.float32)
50
+ self.args['feature_dims'][2] = self.vision.shape[2]
51
+
52
+ self.labels = {
53
+ 'M': np.array(data[self.mode]['regression_labels']).astype(np.float32)
54
+ }
55
+
56
+ logger.info(f"{self.mode} samples: {self.labels['M'].shape}")
57
+
58
+
59
+ if not self.args['need_data_aligned']:
60
+ if self.args['feature_A'] != "":
61
+ self.audio_lengths = list(data_A[self.mode]['audio_lengths'])
62
+ else:
63
+ self.audio_lengths = data[self.mode]['audio_lengths']
64
+ if self.args['feature_V'] != "":
65
+ self.vision_lengths = list(data_V[self.mode]['vision_lengths'])
66
+ else:
67
+ self.vision_lengths = data[self.mode]['vision_lengths']
68
+ self.audio[self.audio == -np.inf] = 0
69
+
70
+ if 'need_normalized' in self.args and self.args['need_normalized']:
71
+ self.__normalize()
72
+
73
+ def __init_mosei(self):
74
+ return self.__init_mosi()
75
+
76
+ def __init_sims(self):
77
+ return self.__init_mosi()
78
+
79
+ def __truncate(self):
80
+ def do_truncate(modal_features, length):
81
+ if length == modal_features.shape[1]:
82
+ return modal_features
83
+ truncated_feature = []
84
+ padding = np.array([0 for i in range(modal_features.shape[2])])
85
+ for instance in modal_features:
86
+ for index in range(modal_features.shape[1]):
87
+ if((instance[index] == padding).all()):
88
+ if(index + length >= modal_features.shape[1]):
89
+ truncated_feature.append(instance[index:index+20])
90
+ break
91
+ else:
92
+ truncated_feature.append(instance[index:index+20])
93
+ break
94
+ truncated_feature = np.array(truncated_feature)
95
+ return truncated_feature
96
+
97
+ text_length, audio_length, video_length = self.args['seq_lens']
98
+ self.vision = do_truncate(self.vision, video_length)
99
+ self.text = do_truncate(self.text, text_length)
100
+ self.audio = do_truncate(self.audio, audio_length)
101
+
102
+ def __normalize(self):
103
+
104
+ self.vision = np.mean(self.vision, axis=1, keepdims=True)
105
+ self.audio = np.mean(self.audio, axis=1, keepdims=True)
106
+
107
+ self.vision[self.vision != self.vision] = 0
108
+ self.audio[self.audio != self.audio] = 0
109
+
110
+ def __len__(self):
111
+ return len(self.labels['M'])
112
+
113
+ def get_seq_len(self):
114
+ if 'use_bert' in self.args and self.args['use_bert']:
115
+ return (self.text.shape[2], self.audio.shape[1], self.vision.shape[1])
116
+ else:
117
+ return (self.text.shape[1], self.audio.shape[1], self.vision.shape[1])
118
+
119
+ def get_feature_dim(self):
120
+ return self.text.shape[2], self.audio.shape[2], self.vision.shape[2]
121
+
122
+ def __getitem__(self, index):
123
+ sample = {
124
+ 'raw_text': self.raw_text[index],
125
+ 'text': torch.Tensor(self.text[index]),
126
+ 'audio': torch.Tensor(self.audio[index]),
127
+ 'vision': torch.Tensor(self.vision[index]),
128
+ 'index': index,
129
+ 'id': self.ids[index],
130
+ 'labels': {k: torch.Tensor(v[index].reshape(-1)) for k, v in self.labels.items()}
131
+ }
132
+ if not self.args['need_data_aligned']:
133
+ sample['audio_lengths'] = self.audio_lengths[index]
134
+ sample['vision_lengths'] = self.vision_lengths[index]
135
+ return sample
136
+
137
+ def MMDataLoader(args, num_workers):
138
+
139
+ datasets = {
140
+ 'train': MMDataset(args, mode='train'),
141
+ 'valid': MMDataset(args, mode='valid'),
142
+ 'test': MMDataset(args, mode='test')
143
+ }
144
+
145
+ if 'seq_lens' in args:
146
+ args['seq_lens'] = datasets['train'].get_seq_len()
147
+
148
+ dataLoader = {
149
+ ds: DataLoader(datasets[ds],
150
+ batch_size=args['batch_size'],
151
+ num_workers=num_workers,
152
+ shuffle=True)
153
+ for ds in datasets.keys()
154
+ }
155
+
156
+ return dataLoader
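A rough sketch of building the loaders from the merged config (not part of the commit; it assumes the processed feature .pkl files exist under the configured dataset_root_dir and that config/config.json is passed explicitly, as in run.py):

from config import get_config_regression
from data_loader import MMDataLoader

args = get_config_regression("DLF", "mosi", config_file="config/config.json")
# run.py normally injects these keys; empty strings mean "use the features
# from featurePath" rather than custom feature files.
args["feature_T"] = args["feature_A"] = args["feature_V"] = ""
dataloader = MMDataLoader(args, num_workers=1)
for batch in dataloader["train"]:
    print(batch["text"].shape, batch["audio"].shape, batch["vision"].shape)
    break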
log/readme.md ADDED
@@ -0,0 +1 @@
+ Training logs are written here.
pt/readme.md ADDED
@@ -0,0 +1 @@
+ Trained models are saved here.
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers==4.33.1
+ huggingface-hub==0.17.1
+ numpy==1.21.5
+ scipy==1.9.1
+ scikit-learn==1.0.2
+ pandas==1.4.4
result/normal/mosei.csv ADDED
@@ -0,0 +1,11 @@
1
+ Time,Model,acc_7,acc_5,acc_2,F1_score,Corr,MAE,Loss
2
+ 2024/12/08 13:54:11 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
3
+ 2024/12/08 16:00:34 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
4
+ 2024/12/08 21:00:18 - ,DLF,"(53.87, 0.0)","(55.66, 0.0)","(84.31, 0.0)","(84.36, 0.0)","(75.75, 0.0)","(53.86, 0.0)","(53.82, 0.0)"
5
+ 2024/12/08 22:10:45 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
6
+ 2024/12/09 00:43:05 - ,DLF,"(53.14, 0.0)","(55.03, 0.0)","(84.92, 0.0)","(84.9, 0.0)","(76.46, 0.0)","(54.05, 0.0)","(54.03, 0.0)"
7
+ 2024/12/09 02:40:19 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
8
+ 2024/12/09 04:45:03 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
9
+ 2024/12/09 16:10:24 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
10
+ 2024/12/13 02:59:46 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
11
+ 2024/12/15 23:53:26 - ,DLF,"(53.9, 0.0)","(55.7, 0.0)","(85.42, 0.0)","(85.27, 0.0)","(76.36, 0.0)","(53.61, 0.0)","(53.66, 0.0)"
result/normal/mosi.csv ADDED
@@ -0,0 +1,21 @@
1
+ Time,Model,acc_7,acc_5,acc_2,F1_score,Corr,MAE,Loss
2
+ 2024/12/08 14:55:08 - ,DLF,"(45.63, 0.0)","(52.77, 0.0)","(84.45, 0.0)","(84.42, 0.0)","(79.43, 0.0)","(72.2, 0.0)","(72.2, 0.0)"
3
+ 2024/12/08 15:20:12 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
4
+ 2024/12/08 15:45:53 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
5
+ 2024/12/08 19:28:22 - ,DLF,"(45.63, 0.0)","(52.77, 0.0)","(84.45, 0.0)","(84.42, 0.0)","(79.43, 0.0)","(72.2, 0.0)","(72.2, 0.0)"
6
+ 2024/12/08 19:46:53 - ,DLF,"(45.63, 0.0)","(52.77, 0.0)","(84.45, 0.0)","(84.42, 0.0)","(79.43, 0.0)","(72.2, 0.0)","(72.2, 0.0)"
7
+ 2024/12/08 20:05:59 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
8
+ 2024/12/08 20:26:47 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
9
+ 2024/12/08 20:42:22 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
10
+ 2024/12/09 11:34:14 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
11
+ 2024/12/09 12:00:35 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
12
+ 2024/12/09 15:54:01 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
13
+ 2024/12/09 18:33:19 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
14
+ 2024/12/13 02:44:39 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
15
+ 2024/12/15 20:13:12 - ,DLF,"(46.36, 0.0)","(53.35, 0.0)","(83.38, 0.0)","(83.4, 0.0)","(78.83, 0.0)","(72.89, 0.0)","(72.94, 0.0)"
16
+ 2024/12/15 20:38:24 - ,DLF,"(44.75, 0.0)","(51.9, 0.0)","(83.84, 0.0)","(83.85, 0.0)","(78.2, 0.0)","(72.78, 0.0)","(72.78, 0.0)"
17
+ 2024/12/15 20:50:42 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
18
+ 2024/12/15 23:25:21 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
19
+ 2024/12/16 03:44:10 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
20
+ 2024/12/16 04:06:00 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
21
+ 2024/12/16 04:59:28 - ,DLF,"(47.08, 0.0)","(52.33, 0.0)","(85.06, 0.0)","(85.04, 0.0)","(78.14, 0.0)","(73.08, 0.0)","(73.07, 0.0)"
result/readme.md ADDED
@@ -0,0 +1 @@
+ Results are saved here as CSV files.
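Note on the CSV result files above: each metric cell is a "(mean, std)" pair produced by run.py, i.e. the mean and standard deviation of that metric across the seeds of one run, multiplied by 100 and rounded to two decimals. With a single seed, as in train.py and test.py, the std is 0.0.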
run.py ADDED
@@ -0,0 +1,201 @@
1
+ import gc
2
+ import logging
3
+ import os
4
+ import time
5
+ from pathlib import Path
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+ from config import get_config_regression
10
+ from data_loader import MMDataLoader
11
+ from trains import ATIO
12
+ from utils import assign_gpu, setup_seed
13
+ from trains.singleTask.model import DLF
14
+ from trains.singleTask.distillnets import get_distillation_kernel, get_distillation_kernel_homo
15
+ from trains.singleTask.misc import softmax
16
+ import sys
17
+
18
+ from datetime import datetime
19
+ now = datetime.now()
20
+ format = "%Y/%m/%d %H:%M:%S"
21
+ formatted_now = now.strftime(format)
22
+ formatted_now = str(formatted_now)+" - "
23
+
24
+ os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
25
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
26
+ logger = logging.getLogger('MMSA')
27
+
28
+ def _set_logger(log_dir, model_name, dataset_name, verbose_level):
29
+
30
+ # base logger
31
+ log_file_path = Path(log_dir) / f"{model_name}-{dataset_name}.log"
32
+ logger = logging.getLogger('MMSA')
33
+ logger.setLevel(logging.DEBUG)
34
+
35
+ # file handler
36
+ fh = logging.FileHandler(log_file_path)
37
+ fh_formatter = logging.Formatter('%(asctime)s - %(name)s [%(levelname)s] - %(message)s')
38
+ fh.setLevel(logging.DEBUG)
39
+ fh.setFormatter(fh_formatter)
40
+ logger.addHandler(fh)
41
+
42
+ # stream handler
43
+ stream_level = {0: logging.ERROR, 1: logging.INFO, 2: logging.DEBUG}
44
+ ch = logging.StreamHandler()
45
+ ch.setLevel(stream_level[verbose_level])
46
+ ch_formatter = logging.Formatter('%(name)s - %(message)s')
47
+ ch.setFormatter(ch_formatter)
48
+ logger.addHandler(ch)
49
+
50
+ return logger
51
+
52
+
53
+ def DLF_run(
54
+ model_name, dataset_name, config=None, config_file="", seeds=[], is_tune=False,
55
+ tune_times=500, feature_T="", feature_A="", feature_V="",
56
+ model_save_dir="", res_save_dir="", log_dir="",
57
+ gpu_ids=[0], num_workers=1, verbose_level=1, mode = '', is_training = False
58
+ ):
59
+ # Initialization
60
+ model_name = model_name.upper()
61
+ dataset_name = dataset_name.lower()
62
+
63
+ if config_file != "":
64
+ config_file = Path(config_file)
65
+ else: # use default config files
66
+ config_file = Path(__file__).parent / "config" / "config.json"
67
+ if not config_file.is_file():
68
+ raise ValueError(f"Config file {str(config_file)} not found.")
69
+ if model_save_dir == "":
70
+ model_save_dir = Path.home() / "MMSA" / "saved_models"
71
+ Path(model_save_dir).mkdir(parents=True, exist_ok=True)
72
+ if res_save_dir == "":
73
+ res_save_dir = Path.home() / "MMSA" / "results"
74
+ Path(res_save_dir).mkdir(parents=True, exist_ok=True)
75
+ if log_dir == "":
76
+ log_dir = Path.home() / "MMSA" / "logs"
77
+ Path(log_dir).mkdir(parents=True, exist_ok=True)
78
+ seeds = seeds if seeds != [] else [1111, 1112, 1113, 1114, 1115]
79
+ logger = _set_logger(log_dir, model_name, dataset_name, verbose_level)
80
+
81
+
82
+ args = get_config_regression(model_name, dataset_name, config_file)
83
+ args.is_training = is_training
84
+ args.mode = mode # train or test
85
+ args['model_save_path'] = Path(model_save_dir) / f"{args['model_name']}-{args['dataset_name']}.pth"
86
+ args['device'] = assign_gpu(gpu_ids)
87
+ args['train_mode'] = 'regression'
88
+ args['feature_T'] = feature_T
89
+ args['feature_A'] = feature_A
90
+ args['feature_V'] = feature_V
91
+ if config:
92
+ args.update(config)
93
+
94
+
95
+ res_save_dir = Path(res_save_dir) / "normal"
96
+ res_save_dir.mkdir(parents=True, exist_ok=True)
97
+ model_results = []
98
+ for i, seed in enumerate(seeds):
99
+ setup_seed(seed)
100
+ args['cur_seed'] = i + 1
101
+ result = _run(args, num_workers, is_tune)
102
+ model_results.append(result)
103
+ if args.is_training:
104
+ criterions = list(model_results[0].keys())
105
+ # save result to csv
106
+ csv_file = res_save_dir / f"{dataset_name}.csv"
107
+ if csv_file.is_file():
108
+ df = pd.read_csv(csv_file)
109
+ else:
110
+ df = pd.DataFrame(columns=["Time"]+["Model"] + criterions)
111
+ # save results
112
+ res = [model_name]
113
+ for c in criterions:
114
+ values = [r[c] for r in model_results]
115
+ mean = round(np.mean(values)*100, 2)
116
+ std = round(np.std(values)*100, 2)
117
+ res.append((mean, std))
118
+
119
+ res = [formatted_now]+res
120
+ df.loc[len(df)] = res
121
+ df.to_csv(csv_file, index=None)
122
+ logger.info(f"Results saved to {csv_file}.")
123
+
124
+
125
+ def _run(args, num_workers=4, is_tune=False, from_sena=False):
126
+
127
+ dataloader = MMDataLoader(args, num_workers)
128
+
129
+ if args.is_training:
130
+ print("training for DLF")
131
+
132
+
133
+ args.gd_size_low = 64
134
+ args.w_losses_low = [1, 10]
135
+ args.metric_low = 'l1'
136
+
137
+
138
+ args.gd_size_high = 32
139
+ args.w_losses_high = [1, 10]
140
+ args.metric_high = 'l1'
141
+
142
+ to_idx = [0, 1, 2]
143
+ from_idx = [0, 1, 2]
144
+ assert len(from_idx) >= 1
145
+
146
+ model = []
147
+ model_DLF = getattr(DLF, 'DLF')(args)
148
+
149
+ model_distill_homo = getattr(get_distillation_kernel_homo, 'DistillationKernel')(n_classes=1,
150
+ hidden_size=
151
+ args.dst_feature_dim_nheads[0],
152
+ gd_size=args.gd_size_low,
153
+ to_idx=to_idx, from_idx=from_idx,
154
+ gd_prior=softmax([0, 0, 1, 0, 1, 0], 0.25),
155
+ gd_reg=10,
156
+ w_losses=args.w_losses_low,
157
+ metric=args.metric_low,
158
+ alpha=1 / 8,
159
+ hyp_params=args)
160
+
161
+ model_distill_hetero = getattr(get_distillation_kernel, 'DistillationKernel')(n_classes=1,
162
+ hidden_size=
163
+ args.dst_feature_dim_nheads[0] * 2,
164
+ gd_size=args.gd_size_high,
165
+ to_idx=to_idx, from_idx=from_idx,
166
+ gd_prior=softmax([0, 0, 1, 0, 1, 1], 0.25),
167
+ gd_reg=10,
168
+ w_losses=args.w_losses_high,
169
+ metric=args.metric_high,
170
+ alpha=1 / 8,
171
+ hyp_params=args)
172
+
173
+ model_DLF = model_DLF.cuda()
174
+
175
+ model = [model_DLF]
176
+ else:
177
+ print("testing phase for DLF")
178
+ model = getattr(DLF, 'DLF')(args)
179
+ model = model.cuda()
180
+
181
+ trainer = ATIO().getTrain(args)
182
+
183
+
184
+ #test
185
+ if args.mode == 'test':
186
+ model.load_state_dict(torch.load('./pt/DLF'+str(args.dataset_name)+'.pth'),strict=False)
187
+ results = trainer.do_test(model, dataloader['test'], mode="TEST")
188
+ sys.stdout.flush()
189
+ input('[Press Any Key to start another run]')
190
+ #train
191
+ else:
192
+ epoch_results = trainer.do_train(model, dataloader, return_epoch_results=from_sena)
193
+ model[0].load_state_dict(torch.load('./pt/DLF'+str(args.dataset_name)+'.pth'))
194
+
195
+ results = trainer.do_test(model[0], dataloader['test'], mode="TEST")
196
+
197
+ del model
198
+ torch.cuda.empty_cache()
199
+ gc.collect()
200
+ time.sleep(1)
201
+ return results
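Besides the fixed calls in train.py and test.py, DLF_run also accepts a config dict that is merged over the values read from the JSON file (args.update(config)). A hedged sketch, with override keys chosen only for illustration:

from run import DLF_run

DLF_run(model_name='DLF', dataset_name='mosi', seeds=[1111, 1112],
        config={'batch_size': 32, 'learning_rate': 5e-5},
        model_save_dir="./pt", res_save_dir="./result", log_dir="./log",
        mode='train', is_training=True)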
test.py ADDED
@@ -0,0 +1,7 @@
+ """
+ Testing script for DLF
+ """
+ from run import DLF_run
+
+ DLF_run(model_name='DLF', dataset_name='mosei', is_tune=False, seeds=[1111], model_save_dir="./pt",
+         res_save_dir="./result", log_dir="./log", mode='test', is_training=False)
train.py ADDED
@@ -0,0 +1,7 @@
+ """
+ Training script for DLF
+ """
+ from run import DLF_run
+
+ DLF_run(model_name='DLF', dataset_name='mosi', is_tune=False, seeds=[1111], model_save_dir="./pt",
+         res_save_dir="./result", log_dir="./log", mode='train', is_training=True)
trains/ATIO.py ADDED
@@ -0,0 +1,15 @@
+ """
+ ATIO -- All Trains in One
+ """
+ from .singleTask import *
+
+ __all__ = ['ATIO']
+
+ class ATIO():
+     def __init__(self):
+         self.TRAIN_MAP = {
+             'DLF': DLF,
+         }
+
+     def getTrain(self, args):
+         return self.TRAIN_MAP[args['model_name']](args)
trains/__init__.py ADDED
@@ -0,0 +1 @@
+ from .ATIO import ATIO
trains/singleTask/DLF.py ADDED
@@ -0,0 +1,233 @@
1
+ import logging
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import optim
6
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
7
+ from tqdm import tqdm
8
+ from ..utils import MetricsTop, dict_to_str
9
+ from .HingeLoss import HingeLoss
10
+
11
+
12
+ logger = logging.getLogger('MMSA')
13
+
14
+ class MSE(nn.Module):
15
+ def __init__(self):
16
+ super(MSE, self).__init__()
17
+
18
+ def forward(self, pred, real):
19
+ diffs = torch.add(real, -pred)
20
+ n = torch.numel(diffs.data)
21
+ mse = torch.sum(diffs.pow(2)) / n
22
+ return mse
23
+
24
+ class DLF():
25
+ def __init__(self, args):
26
+ self.args = args
27
+ self.criterion = nn.L1Loss()
28
+ self.cosine = nn.CosineEmbeddingLoss()
29
+ self.metrics = MetricsTop(args.train_mode).getMetics(args.dataset_name)
30
+ self.MSE = MSE()
31
+ self.sim_loss = HingeLoss()
32
+
33
+ def do_train(self, model, dataloader, return_epoch_results=False):
34
+
35
+ # 0: DLF model
36
+ params = model[0].parameters()
37
+
38
+ optimizer = optim.Adam(params, lr=self.args.learning_rate)
39
+ scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, verbose=True, patience=self.args.patience)
40
+
41
+ epochs, best_epoch = 0, 0
42
+ if return_epoch_results:
43
+ epoch_results = {
44
+ 'train': [],
45
+ 'valid': [],
46
+ 'test': []
47
+ }
48
+ min_or_max = 'min' if self.args.KeyEval in ['Loss'] else 'max'
49
+ best_valid = 1e8 if min_or_max == 'min' else 0
50
+
51
+ net = []
52
+ net_DLF = model[0]
53
+ net.append(net_DLF)
54
+ model = net
55
+
56
+ while True:
57
+ epochs += 1
58
+ y_pred, y_true = [], []
59
+ for mod in model:
60
+ mod.train()
61
+
62
+
63
+ train_loss = 0.0
64
+ left_epochs = self.args.update_epochs
65
+ with tqdm(dataloader['train']) as td:
66
+ for batch_data in td:
67
+
68
+ if left_epochs == self.args.update_epochs:
69
+ optimizer.zero_grad()
70
+ left_epochs -= 1
71
+ vision = batch_data['vision'].to(self.args.device)
72
+ audio = batch_data['audio'].to(self.args.device)
73
+ text = batch_data['text'].to(self.args.device)
74
+ labels = batch_data['labels']['M'].to(self.args.device)
75
+ labels = labels.view(-1, 1)
76
+
77
+
78
+
79
+ output = model[0](text, audio, vision)
80
+
81
+ # task loss
82
+ loss_task_all = self.criterion(output['output_logit'], labels)
83
+
84
+ loss_task_l_hetero = self.criterion(output['logits_l_hetero'], labels)
85
+ loss_task_v_hetero = self.criterion(output['logits_v_hetero'], labels)
86
+ loss_task_a_hetero = self.criterion(output['logits_a_hetero'], labels)
87
+ loss_task_c = self.criterion(output['logits_c'], labels)
88
+
89
+ # total MSA loss L_msa
90
+ loss_task = 1* (1 * loss_task_all + 1*loss_task_c + 3 * loss_task_l_hetero + 1*loss_task_v_hetero + 1*loss_task_a_hetero)
91
+
92
+ # reconstruction loss L_r
93
+ loss_recon_l = self.MSE(output['recon_l'], output['origin_l'])
94
+ loss_recon_v = self.MSE(output['recon_v'], output['origin_v'])
95
+ loss_recon_a = self.MSE(output['recon_a'], output['origin_a'])
96
+ loss_recon = loss_recon_l + loss_recon_v + loss_recon_a
97
+
98
+ # specific loss L_s
99
+ loss_sl_slr = self.MSE(output['s_l'].permute(1, 2, 0), output['s_l_r'])
100
+ loss_sv_slv = self.MSE(output['s_v'].permute(1, 2, 0), output['s_v_r'])
101
+ loss_sa_sla = self.MSE(output['s_a'].permute(1, 2, 0), output['s_a_r'])
102
+ loss_s_sr = loss_sl_slr + loss_sv_slv + loss_sa_sla
103
+
104
+ # ort loss L_o
105
+ if self.args.dataset_name == 'mosi':
106
+ num = 50
107
+ elif self.args.dataset_name == 'mosei':
108
+ num = 10
109
+
110
+ cosine_similarity_s_c_l = self.cosine(output['s_l'].reshape(-1, num), output['c_l'].reshape(-1, num), torch.tensor([-1]).cuda())
111
+ cosine_similarity_s_c_v = self.cosine(output['s_v'].reshape(-1, num), output['c_v'].reshape(-1, num), torch.tensor([-1]).cuda())
112
+ cosine_similarity_s_c_a = self.cosine(output['s_a'].reshape(-1, num), output['c_a'].reshape(-1, num), torch.tensor([-1]).cuda())
113
+
114
+ loss_ort = cosine_similarity_s_c_l + cosine_similarity_s_c_v + cosine_similarity_s_c_a
115
+
116
+ # triplet margin loss L_m
117
+ c_l, c_v, c_a = output['c_l_sim'], output['c_v_sim'], output['c_a_sim']
118
+ ids, feats = [], []
119
+ for i in range(labels.size(0)):
120
+ feats.append(c_l[i].view(1, -1))
121
+ feats.append(c_v[i].view(1, -1))
122
+ feats.append(c_a[i].view(1, -1))
123
+ ids.append(labels[i].view(1, -1))
124
+ ids.append(labels[i].view(1, -1))
125
+ ids.append(labels[i].view(1, -1))
126
+ feats = torch.cat(feats, dim=0)
127
+ ids = torch.cat(ids, dim=0)
128
+ loss_sim = self.sim_loss(ids, feats)
129
+
130
+ #overall loss L_DLF
131
+ combined_loss = loss_task + (loss_s_sr + loss_recon + (loss_sim+loss_ort) * 0.1) * 0.1
132
+
133
+ combined_loss.backward()
134
+
135
+
136
+ if self.args.grad_clip != -1.0:
137
+ params = list(model[0].parameters())
138
+
139
+ nn.utils.clip_grad_value_(params, self.args.grad_clip)
140
+
141
+ train_loss += combined_loss.item()
142
+
143
+
144
+ y_pred.append(output['output_logit'].cpu())
145
+ y_true.append(labels.cpu())
146
+ if not left_epochs:
147
+ optimizer.step()
148
+ left_epochs = self.args.update_epochs
149
+ if not left_epochs:
150
+ # update
151
+ optimizer.step()
152
+
153
+
154
+ train_loss = train_loss / len(dataloader['train'])
155
+ pred, true = torch.cat(y_pred), torch.cat(y_true)
156
+ train_results = self.metrics(pred, true)
157
+ logger.info(
158
+ f">> Epoch: {epochs} "
159
+ f"TRAIN -({self.args.model_name}) [{epochs - best_epoch}/{epochs}/{self.args.cur_seed}] "
160
+ f">> total_loss: {round(train_loss, 4)} "
161
+ f"{dict_to_str(train_results)}"
162
+ )
163
+ # validation
164
+ val_results = self.do_test(model[0], dataloader['valid'], mode="VAL")
165
+ test_results = self.do_test(model[0], dataloader['test'], mode="TEST")
166
+ cur_valid = val_results[self.args.KeyEval]
167
+ scheduler.step(val_results['Loss'])
168
+ # save each epoch model
169
+ torch.save(model[0].state_dict(), './pt/' + str(self.args.dataset_name) + '_' + str(epochs) + '.pth')
170
+ # save best model
171
+ isBetter = cur_valid <= (best_valid - 1e-6) if min_or_max == 'min' else cur_valid >= (best_valid + 1e-6)
172
+ if isBetter:
173
+ best_valid, best_epoch = cur_valid, epochs
174
+ # save model
175
+ model_save_path = './pt/DLF' + str(self.args.dataset_name)+'.pth'
176
+ torch.save(model[0].state_dict(), model_save_path)
177
+
178
+ if return_epoch_results:
179
+ train_results["Loss"] = train_loss
180
+ epoch_results['train'].append(train_results)
181
+ epoch_results['valid'].append(val_results)
182
+ test_results = self.do_test(model, dataloader['test'], mode="TEST")
183
+ epoch_results['test'].append(test_results)
184
+ # early stop
185
+ if epochs - best_epoch >= self.args.early_stop:
186
+ return epoch_results if return_epoch_results else None
187
+
188
+ def do_test(self, model, dataloader, mode="VAL", return_sample_results=False):
189
+
190
+ model.eval()
191
+ y_pred, y_true = [], []
192
+
193
+ eval_loss = 0.0
194
+ if return_sample_results:
195
+ ids, sample_results = [], []
196
+ all_labels = []
197
+ features = {
198
+ "Feature_t": [],
199
+ "Feature_a": [],
200
+ "Feature_v": [],
201
+ "Feature_f": [],
202
+ }
203
+
204
+ with torch.no_grad():
205
+ with tqdm(dataloader) as td:
206
+ for batch_data in td:
207
+ vision = batch_data['vision'].to(self.args.device)
208
+ audio = batch_data['audio'].to(self.args.device)
209
+ text = batch_data['text'].to(self.args.device)
210
+ labels = batch_data['labels']['M'].to(self.args.device)
211
+ labels = labels.view(-1, 1)
212
+ output = model(text, audio, vision)
213
+ loss = self.criterion(output['output_logit'], labels)
214
+ eval_loss += loss.item()
215
+ y_pred.append(output['output_logit'].cpu())
216
+ y_true.append(labels.cpu())
217
+
218
+ eval_loss = eval_loss / len(dataloader)
219
+ pred, true = torch.cat(y_pred), torch.cat(y_true)
220
+
221
+ eval_results = self.metrics(pred, true)
222
+ eval_results["Loss"] = round(eval_loss, 4)
223
+ logger.info(f"{mode}-({self.args.model_name}) >> {dict_to_str(eval_results)}")
224
+
225
+ if return_sample_results:
226
+ eval_results["Ids"] = ids
227
+ eval_results["SResults"] = sample_results
228
+ for k in features.keys():
229
+ features[k] = np.concatenate(features[k], axis=0)
230
+ eval_results['Features'] = features
231
+ eval_results['Labels'] = all_labels
232
+
233
+ return eval_results
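For reference, the objective assembled in do_train above can be written out compactly using the code's own variable names: loss_task = loss_task_all + loss_task_c + 3*loss_task_l_hetero + loss_task_v_hetero + loss_task_a_hetero (the MSA task loss L_msa over the fused, shared and per-modality predictions), and combined_loss = loss_task + 0.1*(loss_s_sr + loss_recon + 0.1*(loss_sim + loss_ort)), where loss_recon is the reconstruction loss L_r, loss_s_sr the specific-feature consistency loss L_s, loss_ort the cosine (orthogonality) loss L_o, and loss_sim the triplet/hinge loss L_m.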
trains/singleTask/HingeLoss.py ADDED
@@ -0,0 +1,57 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class HingeLoss(nn.Module):
6
+ def __init__(self):
7
+ super(HingeLoss, self).__init__()
8
+
9
+ def compute_cosine(self, x, y):
10
+ # x = self.compute_compact_s(x)
11
+ # y = self.compute_compact_s(y)
12
+ x_norm = torch.sqrt(torch.sum(torch.pow(x, 2), 1)+1e-8)
13
+ x_norm = torch.max(x_norm, 1e-8*torch.ones_like(x_norm))
14
+ y_norm = torch.sqrt(torch.sum(torch.pow(y, 2), 1)+1e-8)
15
+ y_norm = torch.max(y_norm, 1e-8*torch.ones_like(y_norm))
16
+ cosine = torch.sum(x * y, 1) / (x_norm * y_norm)
17
+ return cosine
18
+
19
+ def forward(self, ids, feats, margin=0.1):
20
+ B, F = feats.shape
21
+
22
+ s = feats.repeat(1, B).view(-1, F) # B**2 X F
23
+ s_ids = ids.view(B, 1).repeat(1, B) # B X B
24
+
25
+ t = feats.repeat(B, 1) # B**2 X F
26
+ t_ids = ids.view(1, B).repeat(B, 1) # B X B
27
+
28
+ cosine = self.compute_cosine(s, t) # B**2
29
+ equal_mask = torch.eye(B, dtype=torch.bool) # B X B
30
+ s_ids = s_ids[~equal_mask].view(B, B-1) # B X (B-1)
31
+ t_ids = t_ids[~equal_mask].view(B, B-1) # B X (B-1)
32
+ cosine = cosine.view(B, B)[~equal_mask].view(B, B-1) # B X (B-1)
33
+
34
+ sim_mask = (s_ids == t_ids) # B X (B-1)
35
+ margin = 0.15 * abs(s_ids - t_ids)#[~sim_mask].view(B, B - 3)
36
+
37
+ loss = 0
38
+ loss_num = 0
39
+
40
+ for i in range(B):
41
+ sim_num = sum(sim_mask[i])
42
+ dif_num = B - 1 - sim_num
43
+ if not sim_num or not dif_num:
44
+ continue
45
+ sim_cos = cosine[i, sim_mask[i]].reshape(-1, 1).repeat(1, dif_num)
46
+ dif_cos = cosine[i, ~sim_mask[i]].reshape(-1, 1).repeat(1, sim_num).transpose(0, 1)
47
+ t_margin = margin[i, ~sim_mask[i]].reshape(-1, 1).repeat(1, sim_num).transpose(0, 1)
48
+
49
+ loss_i = torch.max(torch.zeros_like(sim_cos), t_margin - sim_cos + dif_cos).mean()
50
+ loss += loss_i
51
+ loss_num += 1
52
+
53
+ if loss_num == 0:
54
+ loss_num = 1
55
+
56
+ loss = loss / loss_num
57
+ return loss
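A small self-contained sketch of the hinge loss with dummy tensors (an illustration only; it assumes the rest of the trains package, e.g. trains/utils imported through trains/__init__, is importable):

import torch
from trains.singleTask.HingeLoss import HingeLoss

hinge = HingeLoss()
# Two samples, three modality views each: views of the same sample share a
# label and act as positives, the other sample's views act as negatives.
feats = torch.randn(6, 50)
ids = torch.tensor([0.5, 0.5, 0.5, -1.0, -1.0, -1.0]).view(-1, 1)
print(hinge(ids, feats).item())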
trains/singleTask/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .DLF import DLF
trains/singleTask/distillnets/get_distillation_kernel.py ADDED
@@ -0,0 +1,96 @@
1
+ """Graph distillation for hetero GD"""
2
+
3
+ import torch
4
+ from torch.autograd import Variable
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from ..utils import distance_metric, min_cosine
8
+
9
+ class DistillationKernel(nn.Module):
10
+ """Graph Distillation kernel.
11
+
12
+ Calculate the edge weights e_{j->k} for each j. Modality k is specified by
13
+ to_idx, and the other modalities are specified by from_idx.
14
+ """
15
+
16
+ def __init__(self, n_classes, hidden_size, gd_size, to_idx, from_idx,
17
+ gd_prior, gd_reg, w_losses, metric, alpha, hyp_params):
18
+ super(DistillationKernel, self).__init__()
19
+ self.W_logit = nn.Linear(n_classes, gd_size)
20
+ self.W_repr = nn.Linear(hidden_size, gd_size)
21
+ self.W_edge = nn.Linear(gd_size * 4, 1)
22
+
23
+ self.gd_size = gd_size
24
+ self.to_idx = to_idx
25
+ self.from_idx = from_idx
26
+ self.alpha = alpha
27
+ self.gd_prior = Variable(torch.FloatTensor(gd_prior).cuda())
28
+ self.gd_reg = gd_reg
29
+ self.w_losses = w_losses
30
+ self.metric = metric
31
+ self.hyp_params = hyp_params
32
+
33
+
34
+ def forward(self, logits, reprs):
35
+ """
36
+ Args:
37
+ logits: (n_modalities, batch_size, n_classes)
38
+ reprs: (n_modalities, batch_size, hidden_size)
39
+ Return:
40
+ edges: weights e_{j->k} (n_modalities_from, batch_size)
41
+ """
42
+ n_modalities, batch_size = logits.size()[:2]
43
+ z_logits = self.W_logit(logits.view(n_modalities * batch_size, -1))
44
+ z_reprs = self.W_repr(reprs.view(n_modalities * batch_size, -1))
45
+ z = torch.cat(
46
+ (z_logits, z_reprs), dim=1).view(n_modalities, batch_size,
47
+ self.gd_size * 2)
48
+ edges = []
49
+ for j in self.to_idx:
50
+ for i in self.from_idx:
51
+ if i == j:
52
+ continue
53
+ else:
54
+ # To calculate e_{j->k}, concatenate z^j, z^k
55
+ e = self.W_edge(torch.cat((z[j], z[i]), dim=1))
56
+ edges.append(e)
57
+ edges = torch.cat(edges, dim=1)
58
+ edges_origin = edges.sum(0).unsqueeze(0).transpose(0, 1)
59
+ edges = F.softmax(edges * self.alpha, dim=1).transpose(0, 1)
60
+ return edges, edges_origin
61
+
62
+
63
+ def distillation_loss(self, logits, reprs, edges):
64
+ """Calculate graph distillation losses, which include:
65
+ regularization loss, loss for logits, and loss for representation.
66
+ """
67
+ loss_reg = (edges.mean(1) - self.gd_prior).pow(2).sum() * self.gd_reg
68
+ loss_logit, loss_repr = 0, 0
69
+ x = 0
70
+ for j in self.to_idx:
71
+ for i, idx in enumerate(self.from_idx):
72
+ if i == j:
73
+ continue
74
+ else:
75
+ w_distill = edges[x] + self.gd_prior[x]
76
+ # print(edges.sum(1), w_distill.sum(0))
77
+ loss_logit += self.w_losses[0] * distance_metric(
78
+ logits[j], logits[idx], self.metric, w_distill)
79
+ loss_repr += self.w_losses[1] * min_cosine(
80
+ reprs[j], reprs[idx], self.metric, w_distill)
81
+ x = x + 1
82
+ return loss_reg, loss_logit, loss_repr
83
+
84
+
85
+ def get_distillation_kernel(n_classes,
+                             hidden_size,
+                             gd_size,
+                             to_idx,
+                             from_idx,
+                             gd_prior,
+                             gd_reg,
+                             w_losses,
+                             metric,
+                             alpha=1 / 8,
+                             hyp_params=None):
+     # DistillationKernel requires hyp_params, so the factory must pass it through.
+     return DistillationKernel(n_classes, hidden_size, gd_size, to_idx, from_idx,
+                               gd_prior, gd_reg, w_losses, metric, alpha, hyp_params)
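A construction sketch mirroring how run.py instantiates the hetero kernel (dummy tensors for illustration; a CUDA device is required because the prior is moved to the GPU in __init__, and hyp_params is only stored, so None suffices here):

import torch
from trains.singleTask.distillnets.get_distillation_kernel import DistillationKernel
from trains.singleTask.misc import softmax

kernel = DistillationKernel(n_classes=1, hidden_size=100, gd_size=32,
                            to_idx=[0, 1, 2], from_idx=[0, 1, 2],
                            gd_prior=softmax([0, 0, 1, 0, 1, 1], 0.25),
                            gd_reg=10, w_losses=[1, 10], metric='l1',
                            alpha=1 / 8, hyp_params=None).cuda()

logits = torch.randn(3, 16, 1).cuda()    # (n_modalities, batch_size, n_classes)
reprs = torch.randn(3, 16, 100).cuda()   # (n_modalities, batch_size, hidden_size)
edges, edges_origin = kernel(logits, reprs)   # normalized edge weights and per-edge raw sums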
trains/singleTask/distillnets/get_distillation_kernel_homo.py ADDED
@@ -0,0 +1,100 @@
1
+ """Graph distillation for homo GD"""
2
+
3
+ import torch
4
+ from torch.autograd import Variable
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from ..utils import distance_metric, min_cosine
8
+
9
+ class DistillationKernel(nn.Module):
10
+ """Graph Distillation kernel.
11
+
12
+ Calculate the edge weights e_{j->k} for each j. Modality k is specified by
13
+ to_idx, and the other modalities are specified by from_idx.
14
+ """
15
+
16
+ def __init__(self, n_classes, hidden_size, gd_size, to_idx, from_idx,
17
+ gd_prior, gd_reg, w_losses, metric, alpha, hyp_params):
18
+ super(DistillationKernel, self).__init__()
19
+ self.W_logit = nn.Linear(n_classes, gd_size)
20
+ self.W_repr = nn.Linear(hidden_size, gd_size)
21
+ self.W_edge = nn.Linear(gd_size * 4, 1)
22
+
23
+ self.gd_size = gd_size
24
+ self.to_idx = to_idx
25
+ self.from_idx = from_idx
26
+ self.alpha = alpha
27
+ self.gd_prior = Variable(torch.FloatTensor(gd_prior).cuda())
28
+ self.gd_reg = gd_reg
29
+ self.w_losses = w_losses
30
+ self.metric = metric
31
+ self.hyp_params = hyp_params
32
+
33
+
34
+ def forward(self, logits, reprs):
35
+ """
36
+ Args:
37
+ logits: (n_modalities, batch_size, n_classes)
38
+ reprs: (n_modalities, batch_size, hidden_size)
39
+ Return:
40
+ edges: weights e_{j->k} (n_modalities_from, batch_size)
41
+ """
42
+ n_modalities, batch_size = logits.size()[:2]
43
+ z_logits = self.W_logit(logits.view(n_modalities * batch_size, -1))
44
+ z_reprs = self.W_repr(reprs.view(n_modalities * batch_size, -1))
45
+ z = torch.cat(
46
+ (z_logits, z_reprs), dim=1).view(n_modalities, batch_size,
47
+ self.gd_size * 2)
48
+
49
+
50
+ edges = []
51
+ for j in self.to_idx:
52
+ for i in self.from_idx:
53
+ if i == j:
54
+ continue
55
+ else:
56
+ # To calculate e_{j->k}, concatenate z^j, z^k
57
+ e = self.W_edge(torch.cat((z[j], z[i]), dim=1))
58
+ edges.append(e)
59
+ edges = torch.cat(edges, dim=1)
60
+ edges_origin = edges.sum(0).unsqueeze(0).transpose(0, 1) # original value of edges
61
+ edges = F.softmax(edges * self.alpha, dim=1).transpose(0, 1) # normalized value of edges
62
+ return edges, edges_origin
63
+
64
+
65
+ def distillation_loss(self, logits, reprs, edges):
66
+ """Calculate graph distillation losses, which include:
67
+ regularization loss, loss for logits, and loss for representation.
68
+ """
69
+ loss_reg = (edges.mean(1) - self.gd_prior).pow(2).sum() * self.gd_reg
70
+
71
+
72
+ loss_logit, loss_repr = 0, 0
73
+ x = 0
74
+ for j in self.to_idx:
75
+ for i, idx in enumerate(self.from_idx):
76
+ if i == j:
77
+ continue
78
+ else:
79
+ w_distill = edges[x] + self.gd_prior[x]
80
+ # print(edges.sum(1), w_distill.sum(0))
81
+ loss_logit += self.w_losses[0] * distance_metric(
82
+ logits[j], logits[idx], self.metric, w_distill)
83
+ loss_repr += self.w_losses[1] * distance_metric(
84
+ reprs[j], reprs[idx], self.metric, w_distill)
85
+ x = x + 1
86
+ return loss_reg, loss_logit, loss_repr
87
+
88
+
89
+ def get_distillation_kernel(n_classes,
+                             hidden_size,
+                             gd_size,
+                             to_idx,
+                             from_idx,
+                             gd_prior,
+                             gd_reg,
+                             w_losses,
+                             metric,
+                             alpha=1 / 8,
+                             hyp_params=None):
+     # DistillationKernel requires hyp_params, so the factory must pass it through.
+     return DistillationKernel(n_classes, hidden_size, gd_size, to_idx, from_idx,
+                               gd_prior, gd_reg, w_losses, metric, alpha, hyp_params)
trains/singleTask/misc.py ADDED
@@ -0,0 +1,196 @@
1
+ import numpy as np
2
+ from sklearn.metrics import average_precision_score
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.data
6
+
7
+ def to_numpy(array):
8
+ if isinstance(array, np.ndarray):
9
+ return array
10
+ if isinstance(array, torch.autograd.Variable):
11
+ array = array.data
12
+ if array.is_cuda:
13
+ array = array.cpu()
14
+
15
+ return array.numpy()
16
+
17
+
18
+ def squeeze(array):
19
+ if not isinstance(array, list) or len(array) > 1:
20
+ return array
21
+ else: # len(array) == 1:
22
+ return array[0]
23
+
24
+
25
+ def unsqueeze(array):
26
+ if isinstance(array, list):
27
+ return array
28
+ else:
29
+ return [array]
30
+
31
+
32
+ def is_due(*args):
33
+ """Determines whether to perform an action or not, depending on the epoch.
34
+ Used for logging, saving, learning rate decay, etc.
35
+
36
+ Args:
37
+ *args: epoch, due_at (due at epoch due_at) epoch, num_epochs,
38
+ due_every (due every due_every epochs)
39
+ step, due_every (due every due_every steps)
40
+ Returns:
41
+ due: boolean: perform action or not
42
+ """
43
+ if len(args) == 2 and isinstance(args[1], list):
44
+ epoch, due_at = args
45
+ due = epoch in due_at
46
+ elif len(args) == 3:
47
+ epoch, num_epochs, due_every = args
48
+ due = (due_every >= 0) and (epoch % due_every == 0 or epoch == num_epochs)
49
+ else:
50
+ step, due_every = args
51
+ due = (due_every > 0) and (step % due_every == 0)
52
+
53
+ return due
54
+
55
+
56
+ def softmax(w, t=1.0, axis=None):
57
+ w = np.array(w) / t
58
+ e = np.exp(w - np.amax(w, axis=axis, keepdims=True))
59
+ dist = e / np.sum(e, axis=axis, keepdims=True)
60
+ return dist
61
+
62
+
63
+ def min_cosine(student, teacher, option, weights=None):
64
+ cosine = torch.nn.CosineEmbeddingLoss()
65
+ dists = cosine(student, teacher.detach(), torch.tensor([-1]).cuda())
66
+ if weights is None:
67
+ dist = dists.mean()
68
+ else:
69
+ dist = (dists * weights).mean()
70
+
71
+ return dist
72
+
73
+
74
+
75
+ def distance_metric(student, teacher, option, weights=None):
76
+ """Distance metric to calculate the imitation loss.
77
+
78
+ Args:
79
+ student: batch_size x n_classes
80
+ teacher: batch_size x n_classes
81
+ option: one of [cosine, l2, l1, kl]
82
+ weights: batch_size or float
83
+
84
+ Returns:
85
+ The computed distance metric.
86
+ """
87
+ if option == 'cosine':
88
+ dists = 1 - F.cosine_similarity(student, teacher.detach(), dim=1)
89
+ # dists = 1 - F.cosine_similarity(student, teacher, dim=1)
90
+ elif option == 'l2':
91
+ dists = (student-teacher.detach()).pow(2).sum(1)
92
+ elif option == 'l1':
93
+ dists = torch.abs(student-teacher.detach()).sum(1)
94
+ elif option == 'kl':
95
+ assert weights is None
96
+ T = 8
97
+ # averaged for each minibatch
98
+ dist = F.kl_div(
99
+ F.log_softmax(student / T), F.softmax(teacher.detach() / T)) * (
100
+ T * T)
101
+ return dist
102
+ else:
103
+ raise NotImplementedError
104
+
105
+ if weights is None:
106
+ dist = dists.mean()
107
+ else:
108
+ dist = (dists * weights).mean()
109
+
110
+ return dist
111
+
112
+
113
+ def get_segments(input, timestep):
114
+ """Split entire input into segments of length timestep.
115
+
116
+ Args:
117
+ input: 1 x total_length x n_frames x ...
118
+ timestep: length of each segment along the time axis.
119
+
120
+ Returns:
121
+ input: concatenated video segments
122
+ start_indices: indices of the segments
123
+ """
124
+ assert input.size(0) == 1, 'Test time, batch_size must be 1'
125
+
126
+ input.squeeze_(dim=0)
127
+ # Find overlapping segments
128
+ length = input.size()[0]
129
+ step = timestep // 2
130
+ num_segments = (length - timestep) // step + 1
131
+ start_indices = (np.arange(num_segments) * step).tolist()
132
+ if length % step > 0:
133
+ start_indices.append(length - timestep)
134
+
135
+ # Get the segments
136
+ segments = []
137
+ for s in start_indices:
138
+ segment = input[s: (s + timestep)].unsqueeze(0)
139
+ segments.append(segment)
140
+ input = torch.cat(segments, dim=0)
141
+ return input, start_indices
142
+
143
+ def get_stats(logit, label):
144
+ '''
145
+ Calculate the accuracy.
146
+ '''
147
+ logit = to_numpy(logit)
148
+ label = to_numpy(label)
149
+
150
+ pred = np.argmax(logit, 1)
151
+ acc = np.sum(pred == label)/label.shape[0]
152
+
153
+ return acc, pred, label
154
+
155
+
156
+ def get_stats_detection(logit, label, n_classes=52):
157
+ '''
158
+ Calculate the accuracy and average precisions.
159
+ '''
160
+ logit = to_numpy(logit)
161
+ label = to_numpy(label)
162
+ scores = softmax(logit, axis=1)
163
+
164
+ pred = np.argmax(logit, 1)
165
+ length = label.shape[0]
166
+ acc = np.sum(pred == label)/length
167
+
168
+ keep_bg = label == 0
169
+ acc_bg = np.sum(pred[keep_bg] == label[keep_bg])/label[keep_bg].shape[0]
170
+ ratio_bg = np.sum(keep_bg)/length
171
+
172
+ keep_action = label != 0
173
+ acc_action = np.sum(
174
+ pred[keep_action] == label[keep_action]) / label[keep_action].shape[0]
175
+
176
+ # Average precision
177
+ y_true = np.zeros((len(label), n_classes))
178
+ y_true[np.arange(len(label)), label] = 1
179
+ acc = np.sum(pred == label)/label.shape[0]
180
+ aps = average_precision_score(y_true, scores, average=None)
181
+ aps = list(filter(lambda x: not np.isnan(x), aps))
182
+ ap = np.mean(aps)
183
+
184
+ return ap, acc, acc_bg, acc_action, ratio_bg, pred, label
185
+
186
+
187
+ def info(text):
188
+ print('\033[94m' + text + '\033[0m')
189
+
190
+
191
+ def warn(text):
192
+ print('\033[93m' + text + '\033[0m')
193
+
194
+
195
+ def err(text):
196
+ print('\033[91m' + text + '\033[0m')
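Two of these helpers are used directly by run.py and the distillation kernels; a quick self-contained sketch with dummy tensors (CPU only, illustrative values):

import torch
from trains.singleTask.misc import softmax, distance_metric

# Temperature-scaled softmax used to build the graph-distillation priors.
print(softmax([0, 0, 1, 0, 1, 1], t=0.25))          # six weights summing to 1

# Weighted imitation distance between "student" and "teacher" predictions.
student, teacher = torch.randn(16, 1), torch.randn(16, 1)
print(distance_metric(student, teacher, option='l1', weights=torch.rand(16)))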
trains/singleTask/model/DLF.py ADDED
@@ -0,0 +1,345 @@
1
+ """
2
+ Here is the main backbone for DLF.
3
+ """
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from ...subNets import BertTextEncoder
8
+ from ...subNets.transformers_encoder.transformer import TransformerEncoder
9
+
10
+ class DLF(nn.Module):
11
+ def __init__(self, args):
12
+ super(DLF, self).__init__()
13
+ if args.use_bert:
14
+ self.text_model = BertTextEncoder(use_finetune=args.use_finetune, transformers=args.transformers,
15
+ pretrained=args.pretrained)
16
+ self.use_bert = args.use_bert
17
+ dst_feature_dims, nheads = args.dst_feature_dim_nheads
18
+ if args.dataset_name == 'mosi':
19
+ if args.need_data_aligned:
20
+ self.len_l, self.len_v, self.len_a = 50, 50, 50
21
+ else:
22
+ self.len_l, self.len_v, self.len_a = 50, 500, 375
23
+ if args.dataset_name == 'mosei':
24
+ if args.need_data_aligned:
25
+ self.len_l, self.len_v, self.len_a = 50, 50, 50
26
+ else:
27
+ self.len_l, self.len_v, self.len_a = 50, 500, 500
28
+ self.orig_d_l, self.orig_d_a, self.orig_d_v = args.feature_dims
29
+ self.d_l = self.d_a = self.d_v = dst_feature_dims
30
+ self.num_heads = nheads
31
+ self.layers = args.nlevels
32
+ self.attn_dropout = args.attn_dropout
33
+ self.attn_dropout_a = args.attn_dropout_a
34
+ self.attn_dropout_v = args.attn_dropout_v
35
+ self.relu_dropout = args.relu_dropout
36
+ self.embed_dropout = args.embed_dropout
37
+ self.res_dropout = args.res_dropout
38
+ self.output_dropout = args.output_dropout
39
+ self.text_dropout = args.text_dropout
40
+ self.attn_mask = args.attn_mask
41
+ combined_dim_low = self.d_a
42
+ combined_dim_high = self.d_a
43
+ combined_dim = (self.d_l + self.d_a + self.d_v ) + self.d_l * 3
44
+
45
+ output_dim = 1
46
+
47
+ # 1. Temporal convolutional layers for initial feature
48
+ self.proj_l = nn.Conv1d(self.orig_d_l, self.d_l, kernel_size=args.conv1d_kernel_size_l, padding=0, bias=False)
49
+ self.proj_a = nn.Conv1d(self.orig_d_a, self.d_a, kernel_size=args.conv1d_kernel_size_a, padding=0, bias=False)
50
+ self.proj_v = nn.Conv1d(self.orig_d_v, self.d_v, kernel_size=args.conv1d_kernel_size_v, padding=0, bias=False)
51
+
52
+ # 2. Modality-specific encoder
53
+ self.encoder_s_l = self.get_network(self_type='l', layers = self.layers)
54
+ self.encoder_s_v = self.get_network(self_type='v', layers = self.layers)
55
+ self.encoder_s_a = self.get_network(self_type='a', layers = self.layers)
56
+
57
+ # Modality-shared encoder
58
+ self.encoder_c = self.get_network(self_type='l', layers = self.layers)
59
+
60
+
61
+ # 3. Decoder for reconstruct three modalities
62
+ self.decoder_l = nn.Conv1d(self.d_l * 2, self.d_l, kernel_size=1, padding=0, bias=False)
63
+ self.decoder_v = nn.Conv1d(self.d_v * 2, self.d_v, kernel_size=1, padding=0, bias=False)
64
+ self.decoder_a = nn.Conv1d(self.d_a * 2, self.d_a, kernel_size=1, padding=0, bias=False)
65
+
66
+ # for calculate cosine sim between s_x
67
+ self.proj_cosine_l = nn.Linear(combined_dim_low * (self.len_l - args.conv1d_kernel_size_l + 1), combined_dim_low)
68
+ self.proj_cosine_v = nn.Linear(combined_dim_low * (self.len_v - args.conv1d_kernel_size_v + 1), combined_dim_low)
69
+ self.proj_cosine_a = nn.Linear(combined_dim_low * (self.len_a - args.conv1d_kernel_size_a + 1), combined_dim_low)
70
+
71
+ # for align c_l, c_v, c_a
72
+ self.align_c_l = nn.Linear(combined_dim_low * (self.len_l - args.conv1d_kernel_size_l + 1), combined_dim_low)
73
+ self.align_c_v = nn.Linear(combined_dim_low * (self.len_v - args.conv1d_kernel_size_v + 1), combined_dim_low)
74
+ self.align_c_a = nn.Linear(combined_dim_low * (self.len_a - args.conv1d_kernel_size_a + 1), combined_dim_low)
75
+
76
+ self.self_attentions_c_l = self.get_network(self_type='l')
77
+ self.self_attentions_c_v = self.get_network(self_type='v')
78
+ self.self_attentions_c_a = self.get_network(self_type='a')
79
+
80
+ self.proj1_c = nn.Linear(self.d_l * 3, self.d_l * 3)
81
+ self.proj2_c = nn.Linear(self.d_l * 3, self.d_l * 3)
82
+ self.out_layer_c = nn.Linear(self.d_l * 3, output_dim)
83
+
84
+
85
+ # 4 Multimodal Crossmodal Attentions
86
+ self.trans_l_with_a = self.get_network(self_type='la', layers = self.layers)
87
+ self.trans_l_with_v = self.get_network(self_type='lv', layers = self.layers)
88
+ self.trans_a_with_l = self.get_network(self_type='al')
89
+ self.trans_a_with_v = self.get_network(self_type='av')
90
+ self.trans_v_with_l = self.get_network(self_type='vl')
91
+ self.trans_v_with_a = self.get_network(self_type='va')
92
+ self.trans_l_mem = self.get_network(self_type='l_mem', layers=self.layers)
93
+ self.trans_a_mem = self.get_network(self_type='a_mem', layers=3)
94
+ self.trans_v_mem = self.get_network(self_type='v_mem', layers=3)
95
+
96
+
97
+ # 5. fc layers for shared features
98
+ self.proj1_l_low = nn.Linear(combined_dim_low * (self.len_l - args.conv1d_kernel_size_l + 1), combined_dim_low)
99
+ self.proj2_l_low = nn.Linear(combined_dim_low, combined_dim_low * (self.len_l - args.conv1d_kernel_size_l + 1))
100
+ self.out_layer_l_low = nn.Linear(combined_dim_low * (self.len_l - args.conv1d_kernel_size_l + 1), output_dim)
101
+ self.proj1_v_low = nn.Linear(combined_dim_low * (self.len_v - args.conv1d_kernel_size_v + 1), combined_dim_low)
102
+ self.proj2_v_low = nn.Linear(combined_dim_low, combined_dim_low * (self.len_v - args.conv1d_kernel_size_v + 1))
103
+ self.out_layer_v_low = nn.Linear(combined_dim_low * (self.len_v - args.conv1d_kernel_size_v + 1), output_dim)
104
+ self.proj1_a_low = nn.Linear(combined_dim_low * (self.len_a - args.conv1d_kernel_size_a + 1), combined_dim_low)
105
+ self.proj2_a_low = nn.Linear(combined_dim_low, combined_dim_low * (self.len_a - args.conv1d_kernel_size_a + 1))
106
+ self.out_layer_a_low = nn.Linear(combined_dim_low * (self.len_a - args.conv1d_kernel_size_a + 1), output_dim)
107
+
108
+
109
+ # 6. fc layers for specific features
110
+ self.proj1_l_high = nn.Linear(combined_dim_high, combined_dim_high)
111
+ self.proj2_l_high = nn.Linear(combined_dim_high, combined_dim_high)
112
+ self.out_layer_l_high = nn.Linear(combined_dim_high, output_dim)
113
+ self.proj1_v_high = nn.Linear(combined_dim_high, combined_dim_high)
114
+ self.proj2_v_high = nn.Linear(combined_dim_high, combined_dim_high)
115
+ self.out_layer_v_high = nn.Linear(combined_dim_high, output_dim)
116
+ self.proj1_a_high = nn.Linear(combined_dim_high, combined_dim_high)
117
+ self.proj2_a_high = nn.Linear(combined_dim_high, combined_dim_high)
118
+ self.out_layer_a_high = nn.Linear(combined_dim_high, output_dim)
119
+
120
+ # 7. project for fusion
121
+ self.projector_l = nn.Linear(self.d_l, self.d_l)
122
+ self.projector_v = nn.Linear(self.d_v, self.d_v)
123
+ self.projector_a = nn.Linear(self.d_a, self.d_a)
124
+ self.projector_c = nn.Linear(3 * self.d_l, 3 * self.d_l)
125
+
126
+ # 8. final project
127
+ self.proj1 = nn.Linear(combined_dim, combined_dim)
128
+ self.proj2 = nn.Linear(combined_dim, combined_dim)
129
+ self.out_layer = nn.Linear(combined_dim, output_dim)
130
+
131
+ def get_network(self, self_type='l', layers=-1):
132
+ if self_type in ['l', 'al', 'vl']:
133
+ embed_dim, attn_dropout = self.d_l, self.attn_dropout
134
+ elif self_type in ['a', 'la', 'va']:
135
+ embed_dim, attn_dropout = self.d_a, self.attn_dropout_a
136
+ elif self_type in ['v', 'lv', 'av']:
137
+ embed_dim, attn_dropout = self.d_v, self.attn_dropout_v
138
+ elif self_type == 'l_mem':
139
+ embed_dim, attn_dropout = self.d_l, self.attn_dropout
140
+ elif self_type == 'a_mem':
141
+ embed_dim, attn_dropout = self.d_a, self.attn_dropout
142
+ elif self_type == 'v_mem':
143
+ embed_dim, attn_dropout = self.d_v, self.attn_dropout
144
+ else:
145
+ raise ValueError("Unknown network type")
146
+
147
+ return TransformerEncoder(embed_dim=embed_dim,
148
+ num_heads=self.num_heads,
149
+ layers=max(self.layers, layers),
150
+ attn_dropout=attn_dropout,
151
+ relu_dropout=self.relu_dropout,
152
+ res_dropout=self.res_dropout,
153
+ embed_dropout=self.embed_dropout,
154
+ attn_mask=self.attn_mask)
155
+
156
+
157
+ def forward(self, text, audio, video):
158
+ #extraction
159
+ if self.use_bert:
160
+ text = self.text_model(text)
161
+ x_l = F.dropout(text.transpose(1, 2), p=self.text_dropout, training=self.training)
162
+ x_a = audio.transpose(1, 2)
163
+ x_v = video.transpose(1, 2)
164
+
165
+
166
+ proj_x_l = x_l if self.orig_d_l == self.d_l else self.proj_l(x_l)
167
+ proj_x_a = x_a if self.orig_d_a == self.d_a else self.proj_a(x_a)
168
+ proj_x_v = x_v if self.orig_d_v == self.d_v else self.proj_v(x_v)
169
+
170
+ proj_x_l = proj_x_l.permute(2, 0, 1)
171
+ proj_x_v = proj_x_v .permute(2, 0, 1)
172
+ proj_x_a = proj_x_a.permute(2, 0, 1)
173
+
174
+ #disentanglement
175
+ s_l = self.encoder_s_l(proj_x_l)
176
+ s_v = self.encoder_s_v(proj_x_v)
177
+ s_a = self.encoder_s_a(proj_x_a)
178
+
179
+ c_l = self.encoder_c(proj_x_l)
180
+ c_v = self.encoder_c(proj_x_v)
181
+ c_a = self.encoder_c(proj_x_a)
182
+
183
+
184
+ s_l = s_l.permute(1, 2, 0)
185
+ s_v = s_v.permute(1, 2, 0)
186
+ s_a = s_a.permute(1, 2, 0)
187
+
188
+ c_l = c_l.permute(1, 2, 0)
189
+ c_v = c_v.permute(1, 2, 0)
190
+ c_a = c_a.permute(1, 2, 0)
191
+ c_list = [c_l, c_v, c_a]
192
+
193
+
194
+ c_l_sim = self.align_c_l(c_l.contiguous().view(x_l.size(0), -1))
195
+ c_v_sim = self.align_c_v(c_v.contiguous().view(x_l.size(0), -1))
196
+ c_a_sim = self.align_c_a(c_a.contiguous().view(x_l.size(0), -1))
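+ # flattened common features projected by align_c_*; presumably consumed by a similarity/alignment loss in the trainer (assumption)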
197
+
198
+ recon_l = self.decoder_l(torch.cat([s_l, c_list[0]], dim=1))
199
+ recon_v = self.decoder_v(torch.cat([s_v, c_list[1]], dim=1))
200
+ recon_a = self.decoder_a(torch.cat([s_a, c_list[2]], dim=1))
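+ # each decoder reconstructs its modality from [specific; common] features; the reconstructions are re-encoded below into s_*_r, presumably for a reconstruction/consistency objective (assumption)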
201
+
202
+ recon_l = recon_l.permute(2, 0, 1)
203
+ recon_v = recon_v.permute(2, 0, 1)
204
+ recon_a = recon_a.permute(2, 0, 1)
205
+
206
+ s_l_r = self.encoder_s_l(recon_l).permute(1, 2, 0)
207
+ s_v_r = self.encoder_s_v(recon_v).permute(1, 2, 0)
208
+ s_a_r = self.encoder_s_a(recon_a).permute(1, 2, 0)
209
+
210
+ s_l = s_l.permute(2, 0, 1)
211
+ s_v = s_v.permute(2, 0, 1)
212
+ s_a = s_a.permute(2, 0, 1)
213
+
214
+ c_l = c_l.permute(2, 0, 1)
215
+ c_v = c_v.permute(2, 0, 1)
216
+ c_a = c_a.permute(2, 0, 1)
217
+
218
+ #enhancement
219
+ hs_l_low = c_l.transpose(0, 1).contiguous().view(x_l.size(0), -1)
220
+ repr_l_low = self.proj1_l_low(hs_l_low)
221
+ hs_proj_l_low = self.proj2_l_low(
222
+ F.dropout(F.relu(repr_l_low, inplace=True), p=self.output_dropout, training=self.training))
223
+ hs_proj_l_low += hs_l_low
224
+ logits_l_low = self.out_layer_l_low(hs_proj_l_low)
225
+
226
+ hs_v_low = c_v.transpose(0, 1).contiguous().view(x_v.size(0), -1)
227
+ repr_v_low = self.proj1_v_low(hs_v_low)
228
+ hs_proj_v_low = self.proj2_v_low(
229
+ F.dropout(F.relu(repr_v_low, inplace=True), p=self.output_dropout, training=self.training))
230
+ hs_proj_v_low += hs_v_low
231
+ logits_v_low = self.out_layer_v_low(hs_proj_v_low)
232
+
233
+ hs_a_low = c_a.transpose(0, 1).contiguous().view(x_a.size(0), -1)
234
+ repr_a_low = self.proj1_a_low(hs_a_low)
235
+ hs_proj_a_low = self.proj2_a_low(
236
+ F.dropout(F.relu(repr_a_low, inplace=True), p=self.output_dropout, training=self.training))
237
+ hs_proj_a_low += hs_a_low
238
+ logits_a_low = self.out_layer_a_low(hs_proj_a_low)
239
+
240
+
241
+ c_l_att = self.self_attentions_c_l(c_l)
242
+ if type(c_l_att) == tuple:
243
+ c_l_att = c_l_att[0]
244
+ c_l_att = c_l_att[-1]
245
+
246
+ c_v_att = self.self_attentions_c_v(c_v)
247
+ if type(c_v_att) == tuple:
248
+ c_v_att = c_v_att[0]
249
+ c_v_att = c_v_att[-1]
250
+
251
+ c_a_att = self.self_attentions_c_a(c_a)
252
+ if type(c_a_att) == tuple:
253
+ c_a_att = c_a_att[0]
254
+ c_a_att = c_a_att[-1]
255
+
256
+ c_fusion = torch.cat([c_l_att, c_v_att, c_a_att], dim=1)
257
+
258
+ c_proj = self.proj2_c(
259
+ F.dropout(F.relu(self.proj1_c(c_fusion), inplace=True), p=self.output_dropout,
260
+ training=self.training))
261
+ c_proj += c_fusion
262
+ logits_c = self.out_layer_c(c_proj)
263
+
264
+ # LFA: cross-modal attention that uses the language-specific features (s_l) as queries
265
+ # L --> L
266
+ h_ls = s_l
267
+ h_ls = self.trans_l_mem(h_ls)
268
+ if type(h_ls) == tuple:
269
+ h_ls = h_ls[0]
270
+ last_h_l = last_hs = h_ls[-1]
271
+
272
+ # A --> L
273
+ h_l_with_as = self.trans_l_with_a(s_l, s_a, s_a)
274
+ h_as = h_l_with_as
275
+ h_as = self.trans_a_mem(h_as)
276
+ if type(h_as) == tuple:
277
+ h_as = h_as[0]
278
+ last_h_a = last_hs = h_as[-1]
279
+
280
+ # V --> L
281
+ h_l_with_vs = self.trans_l_with_v(s_l, s_v, s_v)
282
+ h_vs = h_l_with_vs
283
+ h_vs = self.trans_v_mem(h_vs)
284
+ if type(h_vs) == tuple:
285
+ h_vs = h_vs[0]
286
+ last_h_v = last_hs = h_vs[-1]
287
+
288
+
289
+ hs_proj_l_high = self.proj2_l_high(
290
+ F.dropout(F.relu(self.proj1_l_high(last_h_l), inplace=True), p=self.output_dropout, training=self.training))
291
+ hs_proj_l_high += last_h_l
292
+ logits_l_high = self.out_layer_l_high(hs_proj_l_high)
293
+
294
+ hs_proj_v_high = self.proj2_v_high(
295
+ F.dropout(F.relu(self.proj1_v_high(last_h_v), inplace=True), p=self.output_dropout, training=self.training))
296
+ hs_proj_v_high += last_h_v
297
+ logits_v_high = self.out_layer_v_high(hs_proj_v_high)
298
+
299
+ hs_proj_a_high = self.proj2_a_high(
300
+ F.dropout(F.relu(self.proj1_a_high(last_h_a), inplace=True), p=self.output_dropout,
301
+ training=self.training))
302
+ hs_proj_a_high += last_h_a
303
+ logits_a_high = self.out_layer_a_high(hs_proj_a_high)
304
+
305
+ #fusion
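+ # sigmoid-gate each high-level representation and the common fusion vector before concatenating them for the final prediction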
306
+ last_h_l = torch.sigmoid(self.projector_l(hs_proj_l_high))
307
+ last_h_v = torch.sigmoid(self.projector_v(hs_proj_v_high))
308
+ last_h_a = torch.sigmoid(self.projector_a(hs_proj_a_high))
309
+ c_fusion = torch.sigmoid(self.projector_c(c_fusion))
310
+
311
+ last_hs = torch.cat([last_h_l, last_h_v, last_h_a, c_fusion], dim=1)
312
+
313
+ #prediction
314
+ last_hs_proj = self.proj2(
315
+ F.dropout(F.relu(self.proj1(last_hs), inplace=True), p=self.output_dropout, training=self.training))
316
+ last_hs_proj += last_hs
317
+
318
+ output = self.out_layer(last_hs_proj)
319
+
320
+ res = {
321
+ 'origin_l': proj_x_l,
322
+ 'origin_v': proj_x_v,
323
+ 'origin_a': proj_x_a,
324
+ 's_l': s_l,
325
+ 's_v': s_v,
326
+ 's_a': s_a,
327
+ 'c_l': c_l,
328
+ 'c_v': c_v,
329
+ 'c_a': c_a,
330
+ 's_l_r': s_l_r,
331
+ 's_v_r': s_v_r,
332
+ 's_a_r': s_a_r,
333
+ 'recon_l': recon_l,
334
+ 'recon_v': recon_v,
335
+ 'recon_a': recon_a,
336
+ 'c_l_sim': c_l_sim,
337
+ 'c_v_sim': c_v_sim,
338
+ 'c_a_sim': c_a_sim,
339
+ 'logits_l_hetero': logits_l_high,
340
+ 'logits_v_hetero': logits_v_high,
341
+ 'logits_a_hetero': logits_a_high,
342
+ 'logits_c': logits_c,
343
+ 'output_logit': output
344
+ }
345
+ return res
trains/singleTask/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .misc import *
trains/singleTask/utils/misc.py ADDED
@@ -0,0 +1,196 @@
1
+ import numpy as np
2
+ from sklearn.metrics import average_precision_score
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.data
6
+
7
+ def to_numpy(array):
8
+ if isinstance(array, np.ndarray):
9
+ return array
10
+ if isinstance(array, torch.autograd.Variable):
11
+ array = array.data
12
+ if array.is_cuda:
13
+ array = array.cpu()
14
+
15
+ return array.numpy()
16
+
17
+
18
+ def squeeze(array):
19
+ if not isinstance(array, list) or len(array) > 1:
20
+ return array
21
+ else: # len(array) == 1:
22
+ return array[0]
23
+
24
+
25
+ def unsqueeze(array):
26
+ if isinstance(array, list):
27
+ return array
28
+ else:
29
+ return [array]
30
+
31
+
32
+ def is_due(*args):
33
+ """Determines whether to perform an action or not, depending on the epoch.
34
+ Used for logging, saving, learning rate decay, etc.
35
+
36
+ Args:
37
+ *args: one of three call signatures:
38
+ (epoch, due_at): due when epoch is in the list due_at
+ (epoch, num_epochs, due_every): due every due_every epochs, or at the final epoch
39
+ (step, due_every): due every due_every steps
40
+ Returns:
41
+ due: boolean: perform action or not
42
+ """
43
+ if len(args) == 2 and isinstance(args[1], list):
44
+ epoch, due_at = args
45
+ due = epoch in due_at
46
+ elif len(args) == 3:
47
+ epoch, num_epochs, due_every = args
48
+ due = (due_every >= 0) and (epoch % due_every == 0 or epoch == num_epochs)
49
+ else:
50
+ step, due_every = args
51
+ due = (due_every > 0) and (step % due_every == 0)
52
+
53
+ return due
54
+
55
+
56
+ def softmax(w, t=1.0, axis=None):
57
+ w = np.array(w) / t
58
+ e = np.exp(w - np.amax(w, axis=axis, keepdims=True))
59
+ dist = e / np.sum(e, axis=axis, keepdims=True)
60
+ return dist
61
+
62
+ def min_cosine(student, teacher, option, weights=None):
63
+ cosine = torch.nn.CosineEmbeddingLoss()
64
+ dists = cosine(student, teacher.detach(), torch.tensor([-1]).cuda())
65
+ if weights is None:
66
+ dist = dists.mean()
67
+ else:
68
+ dist = (dists * weights).mean()
69
+
70
+ return dist
71
+
72
+
73
+ def distance_metric(student, teacher, option, weights=None):
74
+ """Distance metric to calculate the imitation loss.
75
+
76
+ Args:
77
+ student: batch_size x n_classes
78
+ teacher: batch_size x n_classes
79
+ option: one of [cosine, l2, l1, kl]
80
+ weights: batch_size or float
81
+
82
+ Returns:
83
+ The computed distance metric.
84
+ """
85
+ if option == 'cosine':
86
+ dists = 1 - F.cosine_similarity(student, teacher.detach(), dim=1)
87
+ # dists = 1 - F.cosine_similarity(student, teacher, dim=1)
88
+ elif option == 'l2':
89
+ dists = (student-teacher.detach()).pow(2).sum(1)
90
+ elif option == 'l1':
91
+ dists = torch.abs(student-teacher.detach()).sum(1)
92
+ elif option == 'kl':
93
+ # assert weights is None
94
+ T = 8
95
+ # averaged for each minibatch
96
+ dist = F.kl_div(
97
+ F.log_softmax(student / T, dim=1), F.softmax(teacher.detach() / T, dim=1)) * (
98
+ T * T)
99
+ return dist
100
+ else:
101
+ raise NotImplementedError
102
+
103
+ if weights is None:
104
+ dist = dists.mean()
105
+ else:
106
+ dist = (dists * weights).mean()
107
+
108
+ return dist
109
+
110
+
111
+ def get_segments(input, timestep):
112
+ """Split entire input into segments of length timestep.
113
+
114
+ Args:
115
+ input: 1 x total_length x n_frames x ...
116
+ timestep: the length (number of frames) of each segment.
117
+
118
+ Returns:
119
+ input: concatenated video segments
120
+ start_indices: indices of the segments
121
+ """
122
+ assert input.size(0) == 1, 'Test time, batch_size must be 1'
123
+
124
+ input.squeeze_(dim=0)
125
+ # Find overlapping segments
126
+ length = input.size()[0]
127
+ step = timestep // 2
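+ # consecutive segments overlap by 50% (stride = timestep // 2)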
128
+ num_segments = (length - timestep) // step + 1
129
+ start_indices = (np.arange(num_segments) * step).tolist()
130
+ if length % step > 0:
131
+ start_indices.append(length - timestep)
132
+
133
+ # Get the segments
134
+ segments = []
135
+ for s in start_indices:
136
+ segment = input[s: (s + timestep)].unsqueeze(0)
137
+ segments.append(segment)
138
+ input = torch.cat(segments, dim=0)
139
+ return input, start_indices
140
+
141
+ def get_stats(logit, label):
142
+ '''
143
+ Calculate the accuracy.
144
+ '''
145
+ logit = to_numpy(logit)
146
+ label = to_numpy(label)
147
+
148
+ pred = np.argmax(logit, 1)
149
+ acc = np.sum(pred == label)/label.shape[0]
150
+
151
+ return acc, pred, label
152
+
153
+
154
+ def get_stats_detection(logit, label, n_classes=52):
155
+ '''
156
+ Calculate the accuracy and average precisions.
157
+ '''
158
+ logit = to_numpy(logit)
159
+ label = to_numpy(label)
160
+ scores = softmax(logit, axis=1)
161
+
162
+ pred = np.argmax(logit, 1)
163
+ length = label.shape[0]
164
+ acc = np.sum(pred == label)/length
165
+
166
+ keep_bg = label == 0
167
+ acc_bg = np.sum(pred[keep_bg] == label[keep_bg])/label[keep_bg].shape[0]
168
+ ratio_bg = np.sum(keep_bg)/length
169
+
170
+ keep_action = label != 0
171
+ acc_action = np.sum(
172
+ pred[keep_action] == label[keep_action]) / label[keep_action].shape[0]
173
+
174
+ # Average precision
175
+ y_true = np.zeros((len(label), n_classes))
176
+ y_true[np.arange(len(label)), label] = 1
177
+ acc = np.sum(pred == label)/label.shape[0]
178
+ aps = average_precision_score(y_true, scores, average=None)
179
+ aps = list(filter(lambda x: not np.isnan(x), aps))
180
+ ap = np.mean(aps)
181
+
182
+ return ap, acc, acc_bg, acc_action, ratio_bg, pred, label
183
+
184
+
185
+ def info(text):
186
+ print('\033[94m' + text + '\033[0m')
187
+
188
+
189
+ def warn(text):
190
+ print('\033[93m' + text + '\033[0m')
191
+
192
+
193
+ def err(text):
194
+ print('\033[91m' + text + '\033[0m')
195
+
196
+
trains/subNets/AlignNets.py ADDED
@@ -0,0 +1,106 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ __all__ = ['AlignSubNet']
5
+
6
+ class CTCModule(nn.Module):
7
+ def __init__(self, in_dim, out_seq_len):
8
+ '''
9
+ This module is performing alignment from A (e.g., audio) to B (e.g., text).
10
+ :param in_dim: Dimension for input modality A
11
+ :param out_seq_len: Sequence length for output modality B
12
+ From: https://github.com/yaohungt/Multimodal-Transformer
13
+ '''
14
+ super(CTCModule, self).__init__()
15
+ # Use LSTM for predicting the position from A to B
16
+ self.pred_output_position_inclu_blank = nn.LSTM(in_dim, out_seq_len+1, num_layers=2, batch_first=True) # 1 denoting blank
17
+
18
+ self.out_seq_len = out_seq_len
19
+
20
+ self.softmax = nn.Softmax(dim=2)
21
+
22
+ def forward(self, x):
23
+ '''
24
+ :input x: Input with shape [batch_size x in_seq_len x in_dim]
25
+ '''
26
+ # NOTE that the index 0 refers to blank.
27
+ pred_output_position_inclu_blank, _ = self.pred_output_position_inclu_blank(x)
28
+
29
+ prob_pred_output_position_inclu_blank = self.softmax(pred_output_position_inclu_blank) # batch_size x in_seq_len x out_seq_len+1
30
+ prob_pred_output_position = prob_pred_output_position_inclu_blank[:, :, 1:] # batch_size x in_seq_len x out_seq_len
31
+ prob_pred_output_position = prob_pred_output_position.transpose(1,2) # batch_size x out_seq_len x in_seq_len
32
+ pseudo_aligned_out = torch.bmm(prob_pred_output_position, x) # batch_size x out_seq_len x in_dim
33
+
34
+ # pseudo_aligned_out is regarded as the aligned A (w.r.t B)
35
+ # return pseudo_aligned_out, (pred_output_position_inclu_blank)
36
+ return pseudo_aligned_out
37
+
38
+ class AlignSubNet(nn.Module):
39
+ def __init__(self, args, mode):
40
+ """
41
+ mode: the way of aligning
42
+ avg_pool, ctc, conv1d
43
+ """
44
+ super(AlignSubNet, self).__init__()
45
+ assert mode in ['avg_pool', 'ctc', 'conv1d']
46
+
47
+ in_dim_t, in_dim_a, in_dim_v = args.feature_dims
48
+ seq_len_t, seq_len_a, seq_len_v = args.seq_lens
49
+ self.dst_len = seq_len_t
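+ # all modalities are aligned to the text sequence length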
50
+ self.mode = mode
51
+
52
+ self.ALIGN_WAY = {
53
+ 'avg_pool': self.__avg_pool,
54
+ 'ctc': self.__ctc,
55
+ 'conv1d': self.__conv1d
56
+ }
57
+
58
+ if mode == 'conv1d':
59
+ self.conv1d_T = nn.Conv1d(seq_len_t, self.dst_len, kernel_size=1, bias=False)
60
+ self.conv1d_A = nn.Conv1d(seq_len_a, self.dst_len, kernel_size=1, bias=False)
61
+ self.conv1d_V = nn.Conv1d(seq_len_v, self.dst_len, kernel_size=1, bias=False)
62
+ elif mode == 'ctc':
63
+ self.ctc_t = CTCModule(in_dim_t, self.dst_len)
64
+ self.ctc_a = CTCModule(in_dim_a, self.dst_len)
65
+ self.ctc_v = CTCModule(in_dim_v, self.dst_len)
66
+
67
+ def get_seq_len(self):
68
+ return self.dst_len
69
+
70
+ def __ctc(self, text_x, audio_x, video_x):
71
+ text_x = self.ctc_t(text_x) if text_x.size(1) != self.dst_len else text_x
72
+ audio_x = self.ctc_a(audio_x) if audio_x.size(1) != self.dst_len else audio_x
73
+ video_x = self.ctc_v(video_x) if video_x.size(1) != self.dst_len else video_x
74
+ return text_x, audio_x, video_x
75
+
76
+ def __avg_pool(self, text_x, audio_x, video_x):
77
+ def align(x):
78
+ raw_seq_len = x.size(1)
79
+ if raw_seq_len == self.dst_len:
80
+ return x
81
+ if raw_seq_len // self.dst_len == raw_seq_len / self.dst_len:
82
+ pad_len = 0
83
+ pool_size = raw_seq_len // self.dst_len
84
+ else:
85
+ pad_len = self.dst_len - raw_seq_len % self.dst_len
86
+ pool_size = raw_seq_len // self.dst_len + 1
87
+ pad_x = x[:, -1, :].unsqueeze(1).expand([x.size(0), pad_len, x.size(-1)])
88
+ x = torch.cat([x, pad_x], dim=1).view(x.size(0), pool_size, self.dst_len, -1)
89
+ x = x.mean(dim=1)
90
+ return x
91
+ text_x = align(text_x)
92
+ audio_x = align(audio_x)
93
+ video_x = align(video_x)
94
+ return text_x, audio_x, video_x
95
+
96
+ def __conv1d(self, text_x, audio_x, video_x):
97
+ text_x = self.conv1d_T(text_x) if text_x.size(1) != self.dst_len else text_x
98
+ audio_x = self.conv1d_A(audio_x) if audio_x.size(1) != self.dst_len else audio_x
99
+ video_x = self.conv1d_V(video_x) if video_x.size(1) != self.dst_len else video_x
100
+ return text_x, audio_x, video_x
101
+
102
+ def forward(self, text_x, audio_x, video_x):
103
+ # already aligned
104
+ if text_x.size(1) == audio_x.size(1) == video_x.size(1):
105
+ return text_x, audio_x, video_x
106
+ return self.ALIGN_WAY[self.mode](text_x, audio_x, video_x)
trains/subNets/BertTextEncoder.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer
4
+
5
+ __all__ = ['BertTextEncoder']
6
+
7
+ TRANSFORMERS_MAP = {
8
+ 'bert': (BertModel, BertTokenizer),
9
+ 'roberta': (RobertaModel, RobertaTokenizer),
10
+ }
11
+
12
+ class BertTextEncoder(nn.Module):
13
+ def __init__(self, use_finetune=False, transformers='bert', pretrained='bert-base-uncased'):
14
+ super().__init__()
15
+
16
+ tokenizer_class = TRANSFORMERS_MAP[transformers][1]
17
+ model_class = TRANSFORMERS_MAP[transformers][0]
18
+ self.tokenizer = tokenizer_class.from_pretrained(pretrained)
19
+ self.model = model_class.from_pretrained(pretrained)
20
+ self.use_finetune = use_finetune
21
+
22
+ def get_tokenizer(self):
23
+ return self.tokenizer
24
+
25
+ # def from_text(self, text):
26
+ # """
27
+ # text: raw data
28
+ # """
29
+ # input_ids = self.get_id(text)
30
+ # with torch.no_grad():
31
+ # last_hidden_states = self.model(input_ids)[0] # Models outputs are now tuples
32
+ # return last_hidden_states.squeeze()
33
+
34
+ def forward(self, text):
35
+ """
36
+ text: (batch_size, 3, seq_len)
37
+ 3: input_ids, input_mask, segment_ids
38
+ input_ids: input_ids,
39
+ input_mask: attention_mask,
40
+ segment_ids: token_type_ids
41
+ """
42
+ input_ids, input_mask, segment_ids = text[:,0,:].long(), text[:,1,:].float(), text[:,2,:].long()
43
+ if self.use_finetune:
44
+ last_hidden_states = self.model(input_ids=input_ids,
45
+ attention_mask=input_mask,
46
+ token_type_ids=segment_ids)[0] # Models outputs are now tuples
47
+ else:
48
+ with torch.no_grad():
49
+ last_hidden_states = self.model(input_ids=input_ids,
50
+ attention_mask=input_mask,
51
+ token_type_ids=segment_ids)[0] # Models outputs are now tuples
52
+ return last_hidden_states
trains/subNets/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .BertTextEncoder import BertTextEncoder
2
+ from .AlignNets import AlignSubNet
trains/subNets/transformers_encoder/multihead_attention.py ADDED
@@ -0,0 +1,154 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+ from torch.nn import Parameter
5
+
6
+ class MultiheadAttention(nn.Module):
7
+ """Multi-headed attention.
8
+ See "Attention Is All You Need" for more details.
9
+ """
10
+
11
+ def __init__(self, embed_dim, num_heads, attn_dropout=0.,
12
+ bias=True, add_bias_kv=False, add_zero_attn=False):
13
+ super().__init__()
14
+ self.embed_dim = embed_dim
15
+ self.num_heads = num_heads
16
+ self.attn_dropout = attn_dropout
17
+ self.head_dim = embed_dim // num_heads
18
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
19
+ self.scaling = self.head_dim ** -0.5
20
+
21
+ self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
22
+ self.register_parameter('in_proj_bias', None)
23
+ if bias:
24
+ self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
25
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
26
+
27
+ if add_bias_kv:
28
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
29
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
30
+ else:
31
+ self.bias_k = self.bias_v = None
32
+
33
+ self.add_zero_attn = add_zero_attn
34
+
35
+ self.reset_parameters()
36
+
37
+ def reset_parameters(self):
38
+ nn.init.xavier_uniform_(self.in_proj_weight)
39
+ nn.init.xavier_uniform_(self.out_proj.weight)
40
+ if self.in_proj_bias is not None:
41
+ nn.init.constant_(self.in_proj_bias, 0.)
42
+ nn.init.constant_(self.out_proj.bias, 0.)
43
+ if self.bias_k is not None:
44
+ nn.init.xavier_normal_(self.bias_k)
45
+ if self.bias_v is not None:
46
+ nn.init.xavier_normal_(self.bias_v)
47
+
48
+ def forward(self, query, key, value, attn_mask=None):
49
+ """Input shape: Time x Batch x Channel
50
+ Self-attention can be implemented by passing in the same arguments for
51
+ query, key and value. Timesteps can be masked by supplying a T x T mask in the
52
+ `attn_mask` argument. Padding elements can be excluded from
53
+ the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
54
+ batch x src_len, where padding elements are indicated by 1s.
55
+ """
56
+ qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
57
+ kv_same = key.data_ptr() == value.data_ptr()
58
+
59
+ tgt_len, bsz, embed_dim = query.size()
60
+ assert embed_dim == self.embed_dim
61
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
62
+ assert key.size() == value.size()
63
+
64
+ aved_state = None
65
+
66
+ if qkv_same:
67
+ # self-attention
68
+ q, k, v = self.in_proj_qkv(query)
69
+ elif kv_same:
70
+ # encoder-decoder attention
71
+ q = self.in_proj_q(query)
72
+
73
+ if key is None:
74
+ assert value is None
75
+ k = v = None
76
+ else:
77
+ k, v = self.in_proj_kv(key)
78
+ else:
79
+ q = self.in_proj_q(query)
80
+ k = self.in_proj_k(key)
81
+ v = self.in_proj_v(value)
82
+ q = q * self.scaling
83
+
84
+ if self.bias_k is not None:
85
+ assert self.bias_v is not None
86
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
87
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
88
+ if attn_mask is not None:
89
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
90
+
91
+ q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
92
+ if k is not None:
93
+ k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
94
+ if v is not None:
95
+ v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
96
+
97
+ src_len = k.size(1)
98
+
99
+ if self.add_zero_attn:
100
+ src_len += 1
101
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
102
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
103
+ if attn_mask is not None:
104
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
105
+
106
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
107
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
108
+
109
+ if attn_mask is not None:
110
+ try:
111
+ attn_weights += attn_mask.unsqueeze(0)
112
+ except:
113
+ print(attn_weights.shape)
114
+ print(attn_mask.unsqueeze(0).shape)
115
+ assert False
116
+
117
+ attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
118
+ # attn_weights = F.relu(attn_weights)
119
+ # attn_weights = attn_weights / torch.max(attn_weights)
120
+ attn_weights = F.dropout(attn_weights, p=self.attn_dropout, training=self.training)
121
+
122
+ attn = torch.bmm(attn_weights, v)
123
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
124
+
125
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
126
+ attn = self.out_proj(attn)
127
+
128
+ # average attention weights over heads
129
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
130
+ attn_weights = attn_weights.sum(dim=1) / self.num_heads
131
+ return attn, attn_weights
132
+
133
+ def in_proj_qkv(self, query):
134
+ return self._in_proj(query).chunk(3, dim=-1)
135
+
136
+ def in_proj_kv(self, key):
137
+ return self._in_proj(key, start=self.embed_dim).chunk(2, dim=-1)
138
+
139
+ def in_proj_q(self, query, **kwargs):
140
+ return self._in_proj(query, end=self.embed_dim, **kwargs)
141
+
142
+ def in_proj_k(self, key):
143
+ return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
144
+
145
+ def in_proj_v(self, value):
146
+ return self._in_proj(value, start=2 * self.embed_dim)
147
+
148
+ def _in_proj(self, input, start=0, end=None, **kwargs):
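+ # slices the packed QKV projection: Q = [0, embed_dim), K = [embed_dim, 2*embed_dim), V = [2*embed_dim, 3*embed_dim)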
149
+ weight = kwargs.get('weight', self.in_proj_weight)
150
+ bias = kwargs.get('bias', self.in_proj_bias)
151
+ weight = weight[start:end, :]
152
+ if bias is not None:
153
+ bias = bias[start:end]
154
+ return F.linear(input, weight, bias)
trains/subNets/transformers_encoder/position_embedding.py ADDED
@@ -0,0 +1,77 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ def make_positions(tensor, padding_idx, left_pad):
6
+ """Replace non-padding symbols with their position numbers.
7
+ Position numbers begin at padding_idx+1.
8
+ Padding symbols are ignored, but it is necessary to specify whether padding
9
+ is added on the left side (left_pad=True) or right side (left_pad=False).
10
+ """
11
+ max_pos = padding_idx + 1 + tensor.size(1)
12
+ device = tensor.get_device()
13
+ buf_name = f'range_buf_{device}'
14
+ if not hasattr(make_positions, buf_name):
15
+ setattr(make_positions, buf_name, tensor.new())
16
+ setattr(make_positions, buf_name, getattr(make_positions, buf_name).type_as(tensor))
17
+ if getattr(make_positions, buf_name).numel() < max_pos:
18
+ torch.arange(padding_idx + 1, max_pos, out=getattr(make_positions, buf_name))
19
+ mask = tensor.ne(padding_idx)
20
+ positions = getattr(make_positions, buf_name)[:tensor.size(1)].expand_as(tensor)
21
+ if left_pad:
22
+ positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1)
23
+ new_tensor = tensor.clone()
24
+ return new_tensor.masked_scatter_(mask, positions[mask]).long()
25
+
26
+
27
+ class SinusoidalPositionalEmbedding(nn.Module):
28
+ """This module produces sinusoidal positional embeddings of any length.
29
+ Padding symbols are ignored, but it is necessary to specify whether padding
30
+ is added on the left side (left_pad=True) or right side (left_pad=False).
31
+ """
32
+
33
+ def __init__(self, embedding_dim, padding_idx=0, left_pad=0, init_size=128):
34
+ super().__init__()
35
+ self.embedding_dim = embedding_dim
36
+ self.padding_idx = padding_idx
37
+ self.left_pad = left_pad
38
+ self.weights = dict() # device --> actual weight; due to nn.DataParallel :-(
39
+ self.register_buffer('_float_tensor', torch.FloatTensor(1))
40
+
41
+ @staticmethod
42
+ def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
43
+ """Build sinusoidal embeddings.
44
+ This matches the implementation in tensor2tensor, but differs slightly
45
+ from the description in Section 3.5 of "Attention Is All You Need".
46
+ """
47
+ half_dim = embedding_dim // 2
48
+ emb = math.log(10000) / (half_dim - 1)
49
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
50
+ emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
51
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
52
+ if embedding_dim % 2 == 1:
53
+ # zero pad
54
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
55
+ if padding_idx is not None:
56
+ emb[padding_idx, :] = 0
57
+ return emb
58
+
59
+ def forward(self, input):
60
+ """Input is expected to be of size [bsz x seqlen]."""
61
+ bsz, seq_len = input.size()
62
+ max_pos = self.padding_idx + 1 + seq_len
63
+ device = input.get_device()
64
+ if device not in self.weights or max_pos > self.weights[device].size(0):
65
+ # recompute/expand embeddings if needed
66
+ self.weights[device] = SinusoidalPositionalEmbedding.get_embedding(
67
+ max_pos,
68
+ self.embedding_dim,
69
+ self.padding_idx,
70
+ )
71
+ self.weights[device] = self.weights[device].type_as(self._float_tensor).to(input.device)
72
+ positions = make_positions(input, self.padding_idx, self.left_pad)
73
+ return self.weights[device].index_select(0, positions.contiguous().view(-1)).view(bsz, seq_len, -1).detach()
74
+
75
+ def max_positions(self):
76
+ """Maximum number of supported positions."""
77
+ return int(1e5) # an arbitrary large number
trains/subNets/transformers_encoder/transformer.py ADDED
@@ -0,0 +1,205 @@
1
+ import math
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from .multihead_attention import MultiheadAttention
6
+ from .position_embedding import SinusoidalPositionalEmbedding
7
+
8
+ class TransformerEncoder(nn.Module):
9
+ """
10
+ Transformer encoder consisting of *args.encoder_layers* layers. Each layer
11
+ is a :class:`TransformerEncoderLayer`.
12
+ Args:
13
+ embed_tokens (torch.nn.Embedding): input embedding
14
+ num_heads (int): number of heads
15
+ layers (int): number of layers
16
+ attn_dropout (float): dropout applied on the attention weights
17
+ relu_dropout (float): dropout applied on the first layer of the residual block
18
+ res_dropout (float): dropout applied on the residual block
19
+ attn_mask (bool): whether to apply mask on the attention weights
20
+ """
21
+
22
+ def __init__(self, embed_dim, num_heads, layers, attn_dropout=0.0, relu_dropout=0.0, res_dropout=0.0,
23
+ embed_dropout=0.0, attn_mask=False):
24
+ super().__init__()
25
+ self.dropout = embed_dropout # Embedding dropout
26
+ self.attn_dropout = attn_dropout
27
+ self.embed_dim = embed_dim
28
+ self.embed_scale = math.sqrt(embed_dim)
29
+ self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
30
+
31
+ self.attn_mask = attn_mask
32
+
33
+ self.layers = nn.ModuleList([]) #define multiple transformer layers
34
+ for layer in range(layers):
35
+ new_layer = TransformerEncoderLayer(embed_dim,
36
+ num_heads=num_heads,
37
+ attn_dropout=attn_dropout,
38
+ relu_dropout=relu_dropout,
39
+ res_dropout=res_dropout,
40
+ attn_mask=attn_mask)
41
+ self.layers.append(new_layer)
42
+
43
+ self.register_buffer('version', torch.Tensor([2]))
44
+ self.normalize = True
45
+ if self.normalize:
46
+ self.layer_norm = LayerNorm(embed_dim)
47
+
48
+ def forward(self, x_in, x_in_k = None, x_in_v = None):
49
+ """
50
+ Args:
51
+ x_in (FloatTensor): embedded input of shape `(src_len, batch, embed_dim)`
52
+ x_in_k (FloatTensor): embedded input of shape `(src_len, batch, embed_dim)`
53
+ x_in_v (FloatTensor): embedded input of shape `(src_len, batch, embed_dim)`
54
+ Returns:
55
+ dict:
56
+ - **encoder_out** (Tensor): the last encoder layer's output of
57
+ shape `(src_len, batch, embed_dim)`
58
+ - **encoder_padding_mask** (ByteTensor): the positions of
59
+ padding elements of shape `(batch, src_len)`
60
+ """
61
+ # embed tokens and positions
62
+ x = self.embed_scale * x_in
63
+ #breakpoint()
64
+ if self.embed_positions is not None:
65
+ x += self.embed_positions(x_in.transpose(0, 1)[:, :, 0]).transpose(0, 1) # Add positional embedding
66
+ x = F.dropout(x, p=self.dropout, training=self.training)
67
+
68
+ if x_in_k is not None and x_in_v is not None:
69
+ # embed tokens and positions
70
+ x_k = self.embed_scale * x_in_k
71
+ x_v = self.embed_scale * x_in_v
72
+ if self.embed_positions is not None:
73
+ x_k += self.embed_positions(x_in_k.transpose(0, 1)[:, :, 0]).transpose(0, 1) # Add positional embedding
74
+ x_v += self.embed_positions(x_in_v.transpose(0, 1)[:, :, 0]).transpose(0, 1) # Add positional embedding
75
+ x_k = F.dropout(x_k, p=self.dropout, training=self.training)
76
+ x_v = F.dropout(x_v, p=self.dropout, training=self.training)
77
+
78
+ # encoder layers
79
+ intermediates = [x]
80
+ for layer in self.layers:
81
+ if x_in_k is not None and x_in_v is not None:
82
+ x = layer(x, x_k, x_v)
83
+ else:
84
+ x = layer(x)
85
+ intermediates.append(x)
86
+
87
+ if self.normalize:
88
+ x = self.layer_norm(x)
89
+
90
+ return x
91
+
92
+ def max_positions(self):
93
+ """Maximum input length supported by the encoder."""
94
+ if self.embed_positions is None:
95
+ return self.max_source_positions
96
+ return min(self.max_source_positions, self.embed_positions.max_positions())
97
+
98
+
99
+ class TransformerEncoderLayer(nn.Module):
100
+ """Encoder layer block.
101
+ In the original paper each operation (multi-head attention or FFN) is
102
+ postprocessed with: `dropout -> add residual -> layernorm`. In the
103
+ tensor2tensor code they suggest that learning is more robust when
104
+ preprocessing each layer with layernorm and postprocessing with:
105
+ `dropout -> add residual`. We default to the approach in the paper, but the
106
+ tensor2tensor approach can be enabled by setting
107
+ *args.encoder_normalize_before* to ``True``.
108
+ Args:
109
+ embed_dim: Embedding dimension
110
+ """
111
+
112
+ def __init__(self, embed_dim, num_heads=4, attn_dropout=0.1, relu_dropout=0.1, res_dropout=0.1,
113
+ attn_mask=False):
114
+ super().__init__()
115
+ self.embed_dim = embed_dim
116
+ self.num_heads = num_heads
117
+
118
+ self.self_attn = MultiheadAttention(
119
+ embed_dim=self.embed_dim,
120
+ num_heads=self.num_heads,
121
+ attn_dropout=attn_dropout
122
+ )
123
+ self.attn_mask = attn_mask
124
+
125
+ self.relu_dropout = relu_dropout
126
+ self.res_dropout = res_dropout
127
+ self.normalize_before = True # True applies LayerNorm before each sub-layer (tensor2tensor style)
128
+
129
+ self.fc1 = Linear(self.embed_dim, 4*self.embed_dim) # position-wise feed-forward sub-layer (expansion)
130
+ self.fc2 = Linear(4*self.embed_dim, self.embed_dim) # position-wise feed-forward sub-layer (projection back)
131
+ self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)]) # two LayerNorm layers: one for the attention sub-block, one for the FFN sub-block
132
+
133
+ def forward(self, x, x_k=None, x_v=None): # one encoder layer: attention sub-block followed by a feed-forward sub-block
134
+ """
135
+ Args:
136
+ x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
137
139
+ x_k (Tensor): same as x
140
+ x_v (Tensor): same as x
141
+ Returns:
142
+ encoded output of shape `(seq_len, batch, embed_dim)`
143
+ """
144
+ residual = x
145
+ x = self.maybe_layer_norm(0, x, before=True)
146
+ mask = buffered_future_mask(x, x_k) if self.attn_mask else None
147
+ if x_k is None and x_v is None:
148
+ x, _ = self.self_attn(query=x, key=x, value=x, attn_mask=mask)
149
+ else:
150
+ x_k = self.maybe_layer_norm(0, x_k, before=True)
151
+ x_v = self.maybe_layer_norm(0, x_v, before=True)
152
+ x, _ = self.self_attn(query=x, key=x_k, value=x_v, attn_mask=mask)
153
+ x = F.dropout(x, p=self.res_dropout, training=self.training)
154
+ x = residual + x
155
+ x = self.maybe_layer_norm(0, x, after=True) # end of the attention sub-block
156
+
157
+ residual = x
158
+ x = self.maybe_layer_norm(1, x, before=True)
159
+ x = F.relu(self.fc1(x))
160
+ x = F.dropout(x, p=self.relu_dropout, training=self.training)
161
+ x = self.fc2(x)
162
+ x = F.dropout(x, p=self.res_dropout, training=self.training)
163
+ x = residual + x
164
+ x = self.maybe_layer_norm(1, x, after=True)
165
+ return x # end of the feed-forward sub-block
166
+
167
+ def maybe_layer_norm(self, i, x, before=False, after=False):
168
+ assert before ^ after # before XOR after: exactly one must be True
169
+ if after ^ self.normalize_before:
170
+ return self.layer_norms[i](x)
171
+ else:
172
+ return x
173
+
174
+ def fill_with_neg_inf(t):
175
+ """FP16-compatible function that fills a tensor with -inf."""
176
+ return t.float().fill_(float('-inf')).type_as(t)
177
+
178
+
179
+ def buffered_future_mask(tensor, tensor2=None):
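+ # upper-triangular -inf mask: position i may only attend to positions <= i (offset by the length difference when query and key lengths differ)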
180
+ dim1 = dim2 = tensor.size(0)
181
+ if tensor2 is not None:
182
+ dim2 = tensor2.size(0)
183
+ future_mask = torch.triu(fill_with_neg_inf(torch.ones(dim1, dim2)), 1+abs(dim2-dim1))
184
+ if tensor.is_cuda:
185
+ future_mask = future_mask.to(tensor.device)
186
+ return future_mask[:dim1, :dim2]
187
+
188
+
189
+ def Linear(in_features, out_features, bias=True):
190
+ m = nn.Linear(in_features, out_features, bias)
191
+ nn.init.xavier_uniform_(m.weight)
192
+ if bias:
193
+ nn.init.constant_(m.bias, 0.)
194
+ return m
195
+
196
+
197
+ def LayerNorm(embedding_dim):
198
+ m = nn.LayerNorm(embedding_dim)
199
+ return m
200
+
201
+
202
+ if __name__ == '__main__':
203
+ encoder = TransformerEncoder(300, 4, 2) #embed_dim, num_heads, layers
204
+ x = torch.rand(20, 2, 300) # wrapping an existing tensor in torch.tensor() is unnecessary and triggers a warning
205
+ print(encoder(x).shape)
trains/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .functions import dict_to_str, setup_seed, assign_gpu, count_parameters
2
+ from .metricsTop import MetricsTop
trains/utils/functions.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import numpy as np
3
+ import random
4
+ import pynvml
5
+ import logging
6
+
7
+
8
+ logger = logging.getLogger('MMSA')
9
+
10
+
11
+ def dict_to_str(src_dict):
12
+ dst_str = ""
13
+ for key in src_dict.keys():
14
+ dst_str += " %s: %.4f " %(key, src_dict[key])
15
+ return dst_str
16
+
17
+ def setup_seed(seed):
18
+ torch.manual_seed(seed)
19
+ np.random.seed(seed)
20
+ random.seed(seed)
21
+ torch.backends.cudnn.benchmark = False
22
+ torch.backends.cudnn.deterministic = True
23
+
24
+ def assign_gpu(gpu_ids, memory_limit=1e16):
25
+ if len(gpu_ids) == 0 and torch.cuda.is_available():
26
+ # find most free gpu
27
+ pynvml.nvmlInit()
28
+ n_gpus = pynvml.nvmlDeviceGetCount()
29
+ dst_gpu_id, min_mem_used = 0, memory_limit
30
+ for g_id in range(n_gpus):
31
+ handle = pynvml.nvmlDeviceGetHandleByIndex(g_id)
32
+ meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
33
+ mem_used = meminfo.used
34
+ if mem_used < min_mem_used:
35
+ min_mem_used = mem_used
36
+ dst_gpu_id = g_id
37
+ logger.info(f'Found gpu {dst_gpu_id}, used memory {min_mem_used}.')
38
+ gpu_ids.append(dst_gpu_id)
39
+ # device
40
+ using_cuda = len(gpu_ids) > 0 and torch.cuda.is_available()
41
+ # logger.info("Let's use %d GPUs!" % len(gpu_ids))
42
+ device = torch.device('cuda:%d' % int(gpu_ids[0]) if using_cuda else 'cpu')
43
+ return device
44
+
45
+ def count_parameters(model):
46
+ res = 0
47
+ for p in model.parameters():
48
+ if p.requires_grad:
49
+ res += p.numel()
50
+ # print(p)
51
+ return res
trains/utils/metricsTop.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import torch
3
+ from sklearn.metrics import accuracy_score, f1_score
4
+ from sklearn.metrics import mutual_info_score
5
+
6
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
7
+
8
+ __all__ = ['MetricsTop']
9
+
10
+ class MetricsTop():
11
+ def __init__(self, train_mode):
12
+ if train_mode == "regression":
13
+ self.metrics_dict = {
14
+ 'MOSI': self.__eval_mosi_regression,
15
+ 'MOSEI': self.__eval_mosei_regression,
16
+ }
17
+ else:
18
+ self.metrics_dict = {
19
+ 'MOSI': self.__eval_mosi_classification,
20
+ 'MOSEI': self.__eval_mosei_classification,
21
+ }
22
+
23
+ def __eval_mosi_classification(self, y_pred, y_true):
24
+ """
25
+ {
26
+ "Negative": 0,
27
+ "Neutral": 1,
28
+ "Positive": 2
29
+ }
30
+ """
31
+ y_pred = y_pred.cpu().detach().numpy()
32
+ y_true = y_true.cpu().detach().numpy()
33
+ # three classes
34
+ y_pred_3 = np.argmax(y_pred, axis=1)
35
+ Mult_acc_3 = accuracy_score(y_pred_3, y_true)
36
+ F1_score_3 = f1_score(y_true, y_pred_3, average='weighted')
37
+ # two classes
38
+ y_pred = np.array([[v[0], v[2]] for v in y_pred])
39
+ # with 0 (<= 0 or > 0)
40
+ y_pred_2 = np.argmax(y_pred, axis=1)
41
+ y_true_2 = []
42
+ for v in y_true:
43
+ y_true_2.append(0 if v <= 1 else 1)
44
+ y_true_2 = np.array(y_true_2)
45
+ Has0_acc_2 = accuracy_score(y_pred_2, y_true_2)
46
+ Has0_F1_score = f1_score(y_true_2, y_pred_2, average='weighted')
47
+ # without 0 (< 0 or > 0)
48
+ non_zeros = np.array([i for i, e in enumerate(y_true) if e != 1])
49
+ y_pred_2 = y_pred[non_zeros]
50
+ y_pred_2 = np.argmax(y_pred_2, axis=1)
51
+ y_true_2 = y_true[non_zeros]
52
+ Non0_acc_2 = accuracy_score(y_pred_2, y_true_2)
53
+ Non0_F1_score = f1_score(y_true_2, y_pred_2, average='weighted')
54
+
55
+ eval_results = {
56
+ "Has0_acc_2": round(Has0_acc_2, 4),
57
+ "Has0_F1_score": round(Has0_F1_score, 4),
58
+ "Non0_acc_2": round(Non0_acc_2, 4),
59
+ "Non0_F1_score": round(Non0_F1_score, 4),
60
+ "Acc_3": round(Mult_acc_3, 4),
61
+ "F1_score_3": round(F1_score_3, 4)
62
+ }
63
+ return eval_results
64
+
65
+ def __eval_mosei_classification(self, y_pred, y_true):
66
+ return self.__eval_mosi_classification(y_pred, y_true)
67
+
68
+ def __multiclass_acc(self, y_pred, y_true):
69
+ """
70
+ Compute the multiclass accuracy w.r.t. groundtruth
71
+
72
+ :param preds: Float array representing the predictions, dimension (N,)
73
+ :param truths: Float/int array representing the groundtruth classes, dimension (N,)
74
+ :return: Classification accuracy
75
+ """
76
+ return np.sum(np.round(y_pred) == np.round(y_true)) / float(len(y_true))
77
+
78
+ def __eval_mosei_regression(self, y_pred, y_true, exclude_zero=False):
79
+ test_preds = y_pred.view(-1).cpu().detach().numpy()
80
+ test_truth = y_true.view(-1).cpu().detach().numpy()
81
+
82
+ test_preds_a7 = np.clip(test_preds, a_min=-3., a_max=3.)
83
+ test_truth_a7 = np.clip(test_truth, a_min=-3., a_max=3.)
84
+ test_preds_a5 = np.clip(test_preds, a_min=-2., a_max=2.)
85
+ test_truth_a5 = np.clip(test_truth, a_min=-2., a_max=2.)
86
+ test_preds_a3 = np.clip(test_preds, a_min=-1., a_max=1.)
87
+ test_truth_a3 = np.clip(test_truth, a_min=-1., a_max=1.)
88
+
89
+
90
+ mae = np.mean(np.absolute(test_preds - test_truth)).astype(np.float64)
91
+ corr = np.corrcoef(test_preds, test_truth)[0][1]
92
+ mult_a7 = self.__multiclass_acc(test_preds_a7, test_truth_a7)
93
+ mult_a5 = self.__multiclass_acc(test_preds_a5, test_truth_a5)
94
+ mult_a3 = self.__multiclass_acc(test_preds_a3, test_truth_a3)
95
+
96
+ non_zeros = np.array([i for i, e in enumerate(test_truth) if e != 0])
97
+ non_zeros_binary_truth = (test_truth[non_zeros] > 0)
98
+ non_zeros_binary_preds = (test_preds[non_zeros] > 0)
99
+
100
+ non_zeros_acc2 = accuracy_score(non_zeros_binary_preds, non_zeros_binary_truth)
101
+ non_zeros_f1_score = f1_score(non_zeros_binary_truth, non_zeros_binary_preds, average='weighted')
102
+
103
+ binary_truth = (test_truth >= 0)
104
+ binary_preds = (test_preds >= 0)
105
+ acc2 = accuracy_score(binary_preds, binary_truth)
106
+ f_score = f1_score(binary_truth, binary_preds, average='weighted')
107
+
108
+ eval_results = {
109
+ "acc_7": round(mult_a7, 4),
110
+ "acc_5": round(mult_a5, 4),
111
+ "acc_2": round(non_zeros_acc2, 4),
112
+ "F1_score": round(non_zeros_f1_score, 4),
113
+ "Corr": round(corr, 4),
114
+ "MAE": round(mae, 4)
115
+ }
116
+
117
+
118
+
119
+ return eval_results
120
+
121
+ def __eval_mosi_regression(self, y_pred, y_true):
122
+ return self.__eval_mosei_regression(y_pred, y_true)
123
+
124
+ def getMetics(self, datasetName):
125
+ return self.metrics_dict[datasetName.upper()]
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .functions import dict_to_str, setup_seed, assign_gpu, count_parameters
2
+ from .metricsTop import MetricsTop
utils/functions.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import numpy as np
3
+ import random
4
+ import pynvml
5
+ import logging
6
+
7
+
8
+ logger = logging.getLogger('MMSA')
9
+
10
+
11
+ def dict_to_str(src_dict):
12
+ dst_str = ""
13
+ for key in src_dict.keys():
14
+ dst_str += " %s: %.4f " %(key, src_dict[key])
15
+ return dst_str
16
+
17
+ def setup_seed(seed):
18
+ torch.manual_seed(seed)
19
+ np.random.seed(seed)
20
+ random.seed(seed)
21
+ torch.backends.cudnn.benchmark = False
22
+ torch.backends.cudnn.deterministic = True
23
+
24
+ def assign_gpu(gpu_ids, memory_limit=1e16):
25
+ if len(gpu_ids) == 0 and torch.cuda.is_available():
26
+ # find most free gpu
27
+ pynvml.nvmlInit()
28
+ n_gpus = pynvml.nvmlDeviceGetCount()
29
+ dst_gpu_id, min_mem_used = 0, memory_limit
30
+ for g_id in range(n_gpus):
31
+ handle = pynvml.nvmlDeviceGetHandleByIndex(g_id)
32
+ meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
33
+ mem_used = meminfo.used
34
+ if mem_used < min_mem_used:
35
+ min_mem_used = mem_used
36
+ dst_gpu_id = g_id
37
+ logger.info(f'Found gpu {dst_gpu_id}, used memory {min_mem_used}.')
38
+ gpu_ids.append(dst_gpu_id)
39
+ # device
40
+ using_cuda = len(gpu_ids) > 0 and torch.cuda.is_available()
41
+ # logger.info("Let's use %d GPUs!" % len(gpu_ids))
42
+ device = torch.device('cuda:%d' % int(gpu_ids[0]) if using_cuda else 'cpu')
43
+ return device
44
+
45
+ def count_parameters(model):
46
+ res = 0
47
+ for p in model.parameters():
48
+ if p.requires_grad:
49
+ res += p.numel()
50
+ # print(p)
51
+ return res
utils/metricsTop.py ADDED
@@ -0,0 +1,111 @@
1
+ import numpy as np
2
+ from sklearn.metrics import accuracy_score, f1_score
3
+
4
+ __all__ = ['MetricsTop']
5
+
6
+ class MetricsTop():
7
+ def __init__(self, train_mode):
8
+ if train_mode == "regression":
9
+ self.metrics_dict = {
10
+ 'MOSI': self.__eval_mosi_regression,
11
+ 'MOSEI': self.__eval_mosei_regression,
12
+ }
13
+ else:
14
+ self.metrics_dict = {
15
+ 'MOSI': self.__eval_mosi_classification,
16
+ 'MOSEI': self.__eval_mosei_classification,
17
+ }
18
+
19
+ def __eval_mosi_classification(self, y_pred, y_true):
20
+ y_pred = y_pred.cpu().detach().numpy()
21
+ y_true = y_true.cpu().detach().numpy()
22
+ # three classes
23
+ y_pred_3 = np.argmax(y_pred, axis=1)
24
+ Mult_acc_3 = accuracy_score(y_pred_3, y_true)
25
+ F1_score_3 = f1_score(y_true, y_pred_3, average='weighted')
26
+ # two classes
27
+ y_pred = np.array([[v[0], v[2]] for v in y_pred])
28
+ # with 0 (<= 0 or > 0)
29
+ y_pred_2 = np.argmax(y_pred, axis=1)
30
+ y_true_2 = []
31
+ for v in y_true:
32
+ y_true_2.append(0 if v <= 1 else 1)
33
+ y_true_2 = np.array(y_true_2)
34
+ Has0_acc_2 = accuracy_score(y_pred_2, y_true_2)
35
+ Has0_F1_score = f1_score(y_true_2, y_pred_2, average='weighted')
36
+ # without 0 (< 0 or > 0)
37
+ non_zeros = np.array([i for i, e in enumerate(y_true) if e != 1])
38
+ y_pred_2 = y_pred[non_zeros]
39
+ y_pred_2 = np.argmax(y_pred_2, axis=1)
40
+ y_true_2 = y_true[non_zeros]
41
+ Non0_acc_2 = accuracy_score(y_pred_2, y_true_2)
42
+ Non0_F1_score = f1_score(y_true_2, y_pred_2, average='weighted')
43
+
44
+ eval_results = {
45
+ "Has0_acc_2": round(Has0_acc_2, 4),
46
+ "Has0_F1_score": round(Has0_F1_score, 4),
47
+ "Non0_acc_2": round(Non0_acc_2, 4),
48
+ "Non0_F1_score": round(Non0_F1_score, 4),
49
+ "Acc_3": round(Mult_acc_3, 4),
50
+ "F1_score_3": round(F1_score_3, 4)
51
+ }
52
+ return eval_results
53
+
54
+ def __eval_mosei_classification(self, y_pred, y_true):
55
+ return self.__eval_mosi_classification(y_pred, y_true)
56
+
57
+
58
+ def __multiclass_acc(self, y_pred, y_true):
59
+ """
60
+ Compute the multiclass accuracy w.r.t. groundtruth
61
+
62
+ :param preds: Float array representing the predictions, dimension (N,)
63
+ :param truths: Float/int array representing the groundtruth classes, dimension (N,)
64
+ :return: Classification accuracy
65
+ """
66
+ return np.sum(np.round(y_pred) == np.round(y_true)) / float(len(y_true))
67
+
68
+ def __eval_mosei_regression(self, y_pred, y_true, exclude_zero=False):
69
+ test_preds = y_pred.view(-1).cpu().detach().numpy()
70
+ test_truth = y_true.view(-1).cpu().detach().numpy()
71
+
72
+ test_preds_a7 = np.clip(test_preds, a_min=-3., a_max=3.)
73
+ test_truth_a7 = np.clip(test_truth, a_min=-3., a_max=3.)
74
+ test_preds_a5 = np.clip(test_preds, a_min=-2., a_max=2.)
75
+ test_truth_a5 = np.clip(test_truth, a_min=-2., a_max=2.)
76
+ test_preds_a3 = np.clip(test_preds, a_min=-1., a_max=1.)
77
+ test_truth_a3 = np.clip(test_truth, a_min=-1., a_max=1.)
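+ # a7/a5/a3: predictions and labels clipped to [-3,3] / [-2,2] / [-1,1], then rounded, giving 7/5/3-class accuracies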
78
+
79
+
80
+ mae = np.mean(np.absolute(test_preds - test_truth)).astype(np.float64) # Average L1 distance between preds and truths
81
+ corr = np.corrcoef(test_preds, test_truth)[0][1]
82
+ mult_a7 = self.__multiclass_acc(test_preds_a7, test_truth_a7)
83
+ mult_a5 = self.__multiclass_acc(test_preds_a5, test_truth_a5)
84
+ mult_a3 = self.__multiclass_acc(test_preds_a3, test_truth_a3)
85
+
86
+ non_zeros = np.array([i for i, e in enumerate(test_truth) if e != 0])
87
+ non_zeros_binary_truth = (test_truth[non_zeros] > 0)
88
+ non_zeros_binary_preds = (test_preds[non_zeros] > 0)
89
+
90
+ non_zeros_acc2 = accuracy_score(non_zeros_binary_preds, non_zeros_binary_truth)
91
+ non_zeros_f1_score = f1_score(non_zeros_binary_truth, non_zeros_binary_preds, average='weighted')
92
+
93
+ binary_truth = (test_truth >= 0)
94
+ binary_preds = (test_preds >= 0)
95
+ acc2 = accuracy_score(binary_preds, binary_truth)
96
+ f_score = f1_score(binary_truth, binary_preds, average='weighted')
97
+
98
+ eval_results = {
99
+ "Acc_2": round(non_zeros_acc2, 4),
100
+ "F1_score": round(non_zeros_f1_score, 4),
101
+ "Acc_7": round(mult_a7, 4),
102
+ "MAE": round(mae, 4),
103
+ }
104
+ return eval_results
105
+
106
+
107
+ def __eval_mosi_regression(self, y_pred, y_true):
108
+ return self.__eval_mosei_regression(y_pred, y_true)
109
+
110
+ def getMetics(self, datasetName):
111
+ return self.metrics_dict[datasetName.upper()]