---
# Project configuration for the `mfmmlm` project.
# Declares the task files to run and the settings applied to each task
# group (pretrain / finetune / test).
project_dir: mfmmlm

# Task files to run. The second entry is itself a list of task files.
# NOTE(review): vttcap.yaml is listed here but not in
# task_group.finetune.task_list below; confirm this is intentional.
run_task:
  - how2.yaml
  - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml,
     youcookcap.yaml, crosstask.yaml, coin.yaml]

# Directory the task file names are given relative to
# (presumably; verify against the config loader).
base_dir: task

task_group:
  pretrain:
    task_list:
      - how2.yaml
    dataset:
      subsampling: 32
      sampled_min_len: 10
      sampled_max_len: 64
      max_video_len: 32
      max_len: 96
      aligner: MFMMLMAligner
      lazy_vfeat_mask: true
      mfm_probability: 0.15
      mlm_probability: 0.15
      mm_prob: 0.5
    model:
      model_cls: MMFusionMFMMLM
      mm_encoder_cls: MMFusionForMFMMLM
    loss:
      loss_cls: MFMMLM
    # Settings nested under `fairseq`; `dataset` here is distinct from
    # the group-level `dataset` mapping above.
    fairseq:
      common:
        fp16: true
      dataset:
        batch_size: 256
      optimization:
        max_epoch: 15

  finetune:
    task_list:
      - vtt.yaml
      - vttqa.yaml
      - youcook.yaml
      - youcookcap.yaml
      - crosstask.yaml
      - coin.yaml
    dataset:
      max_video_len: 32
      max_len: 96
    fairseq:
      common:
        fp16: true
    # Do not write any model or loss here (they are expected to be fixed
    # in mmfusion).

  test:
    task_list:
      - test_vtt.yaml
      - test_vttqa.yaml
      - test_youcook.yaml
      - test_youcookcap.yaml
      - test_crosstask.yaml
      - test_crosstask_zs.yaml
      - test_coin.yaml
    dataset:
      max_video_len: 32
      max_len: 96