slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/mtm/vlm/vtt/checkpoint_last.pt
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
eval:
  save_path: runs/mtm/vlm/vtt/eval
metric: RetrievalMetric
predictor: RetrievalPredictor