---
# Experiment configuration: data processing, fairseq trainer options,
# model classes and loss class.
# NOTE(review): the nesting below was reconstructed from a single-line
# dump of this file; confirm the hierarchy against the consuming loader.

# Data pipeline: processor/aligner class names, MSR-VTT input paths and
# sequence-length limits.
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  train_path: data/msrvtt/MSRVTT_train.csv
  dup: 20
  val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  json_path: data/msrvtt/MSRVTT_data.json
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96

# Trainer options, grouped by section.
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true

  # Data-loading options (distinct from the top-level `dataset` section).
  dataset:
    num_workers: 4
    batch_size: 128

  optimization:
    lr:
      - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    # Quoted so the parenthesised pair is unambiguously a string.
    # NOTE(review): presumably converted to a tuple by the consumer - confirm.
    adam_betas: '(0.9, 0.98)'
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 5

  # Starts from an existing checkpoint file; optimizer, dataloader and
  # meter state are all flagged for reset, and outputs go to a separate
  # save directory.
  checkpoint:
    restore_file: runs/mtm/vlm/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/mtm/vlm/vttqa

task_type: sweep_small

# Model and encoder class names.
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true

# Loss class name.
loss:
  loss_cls: V2TContraLoss