# @package _group_
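
# fairseq/Hydra config for audio-only data2vec 2.0 pretraining
# (task: audio_pretraining, model: data2vec_multi).
# Mixed-precision (fp16) training with JSON logs every 200 updates;
# user_dir points Hydra at the data2vec example code.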
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb
  min_loss_scale: 1e-6
  fp16_no_flatten_grads: true
  user_dir: ${env:PWD}/examples/data2vec
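
# Checkpointing: save every 25k updates, keep only the most recent
# update checkpoint, and skip per-epoch checkpoints.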
checkpoint:
  save_interval: 1
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true
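
# Raw-waveform pretraining; sample sizes are in audio samples,
# i.e. crops of 2 s to 20 s assuming 16 kHz input.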
task:
  _name: audio_pretraining
  data: /fsx-wav2vec/abaevski/data/librivox/no_silence
  max_sample_size: 320000
  min_sample_size: 32000
  normalize: true
  precompute_mask_config: {}
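
# Batches are capped in waveform samples: 320k tokens is one
# max-length (20 s) crop per GPU. Validation is disabled outright,
# so validate_interval has no effect here.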
dataset:
  num_workers: 8
  max_tokens: 320000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  required_batch_size_multiple: 1
  disable_validation: true
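
# 48 GPUs with the standard PyTorch DDP (c10d) backend.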
distributed_training:
  distributed_world_size: 48
  ddp_backend: c10d
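
# The "model" criterion delegates loss computation to the model itself;
# log_keys lists extra model outputs to log (EMA decay, target/prediction
# variance, parameter norms, fraction of masked time-steps).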
criterion:
  _name: model
  log_keys:
    - ema_decay
    - target_var
    - pred_var
    - model_norm
    - ema_norm
    - masked_pct
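
# Train for 600k updates with gradient-norm clipping at 1.0;
# debug_param_names is presumably a debugging aid that surfaces
# parameter names (useful when assigning composite-optimizer groups).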
optimization:
  max_update: 600000
  debug_param_names: true
  clip_norm: 1
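
# Composite optimizer with a single "default" group: Adam at a flat
# 4e-4 learning rate under a cosine schedule with 10k warmup updates.
# The top-level lr_scheduler is set to pass_through so that each
# composite group's own scheduler takes effect.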
optimizer:
  _name: composite
  dynamic_groups: true
  groups:
    default:
      lr_float: 0.0004
      optimizer:
        _name: adam
        adam_betas: [0.9,0.98]
        adam_eps: 1e-06
        weight_decay: 0.01
      lr_scheduler:
        _name: cosine
        warmup_updates: 10000

lr_scheduler: pass_through
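
# data2vec 2.0 student/teacher setup: a 16-block, 1024-dim, 16-head
# transformer (large-sized); the teacher is an EMA of the student with
# decay annealed from 0.9997 to 1.0 over 300k updates. Targets average
# all 16 teacher layers with instance norm; clone_batch: 12 reuses each
# sample 12 times with different masks (multi-mask training), and
# loss_beta: 0 selects a plain L2 loss.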
model:
  _name: data2vec_multi

  loss_beta: 0
  loss_scale: null

  depth: 16
  embed_dim: 1024
  num_heads: 16

  clone_batch: 12

  ema_decay: 0.9997
  ema_end_decay: 1
  ema_anneal_end_step: 300000
  ema_encoder_only: false

  average_top_k_layers: 16
  instance_norm_target_layer: true
  layer_norm_target_layer: false
  layer_norm_targets: false

  layerdrop: 0
  norm_eps: 1e-5

  supported_modality: AUDIO
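
  # Audio-specific front end: a wav2vec 2.0-style conv feature encoder
  # specified as [(dim, kernel, stride), ...] with total stride 320
  # (~50 Hz frames from 16 kHz audio), convolutional relative positional
  # encodings, ALiBi attention biases with learned per-head scales, and
  # span masking of ~55% of time-steps in spans of length 5.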
  modalities:
    audio:
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_depth: 5
      conv_pos_width: 95
      conv_pos_groups: 16
      prenet_depth: 8
      mask_prob: 0.55
      mask_prob_adjust: 0.1
      inverse_mask: false
      mask_length: 5
      mask_noise_std: 0.01
      mask_dropout: 0
      add_masks: false
      ema_local_encoder: false
      use_alibi_encoder: true
      prenet_layerdrop: 0
      prenet_dropout: 0.1
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
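      # Shallow convolutional decoder (4 grouped conv layers, dim 768)
      # that reconstructs teacher targets for masked time-steps; per the
      # data2vec 2.0 design, this lets the student encoder skip masked
      # positions entirely.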
      decoder:
        input_dropout: 0.1
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4