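# OLMo pretraining config for run OLMO-250-40GB-700M-10-epoch.
# Launch sketch (assumes the standard OLMo trainer entry point; paths are placeholders):
#   torchrun --nproc_per_node=8 scripts/train.py path/to/this_config.yaml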
run_name: OLMO-250-40GB-700M-10-epoch
seed: 6198
epoch: null
dry_run: false
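# Model architecture: 8-layer decoder, d_model 512, 8 attention heads, RoPE positions,
# SwiGLU MLPs (ratio 8), RMSNorm, no weight tying; sequence length 250, vocab 4096.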
model:
  d_model: 512
  n_heads: 8
  n_kv_heads: null
  clip_qkv: null
  n_layers: 8
  mlp_ratio: 8
  mlp_hidden_size: null
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  alibi: false
  alibi_bias_max: 8.0
  rope: true
  rope_full_precision: true
  flash_attention: false
  attention_dropout: 0.0
  multi_query_attention: false
  attention_layer_norm: false
  residual_dropout: 0.0
  embedding_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  attention_layer_norm_with_affine: false
  max_sequence_length: 250
  include_bias: false
  bias_for_layer_norm: false
  scale_logits: false
  vocab_size: 4096
  embedding_size: 4096
  weight_tying: false
  eos_token_id: 3
  pad_token_id: 3
  init_device: meta
  init_fn: mitchell
  init_std: 0.02
  init_cutoff_factor: null
  precision: amp_bf16
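# Optimizer: AdamW at peak LR 6e-4; weight decay (0.1) skips norms, biases, and embeddings.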
optimizer:
  name: adamw
  learning_rate: 0.0006
  weight_decay: 0.1
  betas:
  - 0.9
  - 0.95
  no_decay_norm_and_bias: null
  decay_norm_and_bias: false
  decay_embeddings: false
  metrics_log_interval: 10
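# Scheduler: cosine decay after a 5,000-step linear warmup, ending at alpha_f (10%) of peak LR.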
scheduler:
  name: cosine_with_warmup
  units: steps
  t_warmup: 5000
  t_max: null
  alpha_f: 0.1
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: null
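# Training data: pre-tokenized token-ID shards (.npy). Note that val_input_ids_2-4 appear
# under the training paths; only val_input_ids_1 is held out for the evaluator below.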
data:
  paths:
  - /mnt/data/tokenized_data/train_input_ids_1.npy
  - /mnt/data/tokenized_data/train_input_ids_2.npy
  - /mnt/data/tokenized_data/train_input_ids_3.npy
  - /mnt/data/tokenized_data/train_input_ids_4.npy
  - /mnt/data/tokenized_data/train_input_ids_5.npy
  - /mnt/data/tokenized_data/train_input_ids_6.npy
  - /mnt/data/tokenized_data/train_input_ids_7.npy
  - /mnt/data/tokenized_data/train_input_ids_8.npy
  - /mnt/data/tokenized_data/train_input_ids_9.npy
  - /mnt/data/tokenized_data/train_input_ids_10.npy
  - /mnt/data/tokenized_data/train_input_ids_11.npy
  - /mnt/data/tokenized_data/train_input_ids_12.npy
  - /mnt/data/tokenized_data/train_input_ids_13.npy
  - /mnt/data/tokenized_data/train_input_ids_14.npy
  - /mnt/data/tokenized_data/train_input_ids_15.npy
  - /mnt/data/tokenized_data/train_input_ids_16.npy
  - /mnt/data/tokenized_data/train_input_ids_17.npy
  - /mnt/data/tokenized_data/train_input_ids_18.npy
  - /mnt/data/tokenized_data/train_input_ids_19.npy
  - /mnt/data/tokenized_data/train_input_ids_20.npy
  - /mnt/data/tokenized_data/train_input_ids_21.npy
  - /mnt/data/tokenized_data/train_input_ids_22.npy
  - /mnt/data/tokenized_data/train_input_ids_23.npy
  - /mnt/data/tokenized_data/train_input_ids_24.npy
  - /mnt/data/tokenized_data/train_input_ids_25.npy
  - /mnt/data/tokenized_data/train_input_ids_26.npy
  - /mnt/data/tokenized_data/train_input_ids_27.npy
  - /mnt/data/tokenized_data/train_input_ids_28.npy
  - /mnt/data/tokenized_data/train_input_ids_29.npy
  - /mnt/data/tokenized_data/train_input_ids_30.npy
  - /mnt/data/tokenized_data/train_input_ids_31.npy
  - /mnt/data/tokenized_data/train_input_ids_32.npy
  - /mnt/data/tokenized_data/train_input_ids_33.npy
  - /mnt/data/tokenized_data/train_input_ids_34.npy
  - /mnt/data/tokenized_data/train_input_ids_35.npy
  - /mnt/data/tokenized_data/train_input_ids_36.npy
  - /mnt/data/tokenized_data/train_input_ids_37.npy
  - /mnt/data/tokenized_data/train_input_ids_38.npy
  - /mnt/data/tokenized_data/train_input_ids_39.npy
  - /mnt/data/tokenized_data/train_input_ids_40.npy
  - /mnt/data/tokenized_data/train_input_ids_41.npy
  - /mnt/data/tokenized_data/train_input_ids_42.npy
  - /mnt/data/tokenized_data/train_input_ids_43.npy
  - /mnt/data/tokenized_data/train_input_ids_44.npy
  - /mnt/data/tokenized_data/train_input_ids_45.npy
  - /mnt/data/tokenized_data/train_input_ids_46.npy
  - /mnt/data/tokenized_data/train_input_ids_47.npy
  - /mnt/data/tokenized_data/train_input_ids_48.npy
  - /mnt/data/tokenized_data/train_input_ids_49.npy
  - /mnt/data/tokenized_data/train_input_ids_50.npy
  - /mnt/data/tokenized_data/train_input_ids_51.npy
  - /mnt/data/tokenized_data/train_input_ids_52.npy
  - /mnt/data/tokenized_data/train_input_ids_53.npy
  - /mnt/data/tokenized_data/train_input_ids_54.npy
  - /mnt/data/tokenized_data/train_input_ids_55.npy
  - /mnt/data/tokenized_data/train_input_ids_56.npy
  - /mnt/data/tokenized_data/train_input_ids_57.npy
  - /mnt/data/tokenized_data/train_input_ids_58.npy
  - /mnt/data/tokenized_data/train_input_ids_59.npy
  - /mnt/data/tokenized_data/train_input_ids_60.npy
  - /mnt/data/tokenized_data/train_input_ids_61.npy
  - /mnt/data/tokenized_data/train_input_ids_62.npy
  - /mnt/data/tokenized_data/train_input_ids_63.npy
  - /mnt/data/tokenized_data/train_input_ids_64.npy
  - /mnt/data/tokenized_data/train_input_ids_65.npy
  - /mnt/data/tokenized_data/train_input_ids_66.npy
  - /mnt/data/tokenized_data/train_input_ids_67.npy
  - /mnt/data/tokenized_data/train_input_ids_68.npy
  - /mnt/data/tokenized_data/train_input_ids_69.npy
  - /mnt/data/tokenized_data/train_input_ids_70.npy
  - /mnt/data/tokenized_data/train_input_ids_71.npy
  - /mnt/data/tokenized_data/train_input_ids_72.npy
  - /mnt/data/tokenized_data/train_input_ids_73.npy
  - /mnt/data/tokenized_data/train_input_ids_74.npy
  - /mnt/data/tokenized_data/val_input_ids_2.npy
  - /mnt/data/tokenized_data/val_input_ids_3.npy
  - /mnt/data/tokenized_data/val_input_ids_4.npy
  datasets: null
  label_mask_paths: null
  pad_direction: right
  generate_attention_mask: false
  num_workers: 16
  drop_last: true
  pin_memory: true
  prefetch_factor: 16
  persistent_workers: true
  timeout: 0
  seed: null
restore_dataloader: true
fast_forward_batches: null
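# In-loop LM evaluation on the held-out val_input_ids_1 shard, run every eval_interval steps.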
evaluators:
- label: human-chunk
  type: lm
  data:
    paths: null
    datasets:
      dna-bert2-eval:
      - /mnt/data/tokenized_data/val_input_ids_1.npy
    label_mask_paths: null
    pad_direction: right
    generate_attention_mask: false
    num_workers: 16
    drop_last: true
    pin_memory: false
    prefetch_factor: null
    persistent_workers: false
    timeout: 0
    seed: null
  device_eval_batch_size: null
  subset_num_batches: null
eval_interval: 10000
tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right
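# Checkpointing: sharded checkpoints every 10k steps (keep 1), unsharded every 20k (keep 10).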
save_folder: /mnt/data/pretrain_formal_60M
remote_save_folder: null
canceled_check_interval: 50
save_interval: 10000
save_interval_unsharded: 20000
save_interval_ephemeral: null
save_num_checkpoints_to_keep: 1
save_num_unsharded_checkpoints_to_keep: 10
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: false
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
sharded_checkpointer: torch_legacy
new_style_checkpoints: null
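# Duration and batching: 10 epochs; global batch 384 sequences = 48 per device across
# 8 data-parallel ranks (implied by 384 / 48 with grad accumulation of 1).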
max_duration: 10ep
global_train_batch_size: 384
device_train_batch_size: 48
device_train_microbatch_size: 48
device_eval_batch_size: 48
eval_subset_num_batches: -1
eval_on_load: false
device_train_grad_accum: 1
max_grad_norm: 1.0
max_grad_norm_ratio: null
precision: amp_bf16
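# Weights & Biases logging, rank 0 only, every step.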
wandb:
  project: olmo-dna
  entity: zehui127-imperial-college-london
  group: null
  name: OLMO-250-40GB-700M-10-epoch
  tags:
  - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 1
speed_monitor:
  window_size: 3
  gpu_flops_available: null
console_log_interval: 1
gen1_gc_interval: 1
compile: null
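# FSDP: full parameter sharding with mixed-precision training (matching amp_bf16 above).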
fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: null
  precision: mixed
softmax_auxiliary_loss: false
time_limit: 964000.0
extra_steps_after_cancel: 10
early_stopping_factor: null
save_data_indices: true
python_profiling: false
torch_profiling: false
stop_at: null
stop_after: null
activation_checkpointing: null
fused_loss: null