kguo2 committed on
Commit
72ba05e
·
verified ·
1 Parent(s): 228a9d6

Model save

Browse files
Files changed (4) hide show
  1. README.md +3 -5
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +1678 -1022
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
- base_model: Qwen/Qwen2.5-3B-Instruct
3
- datasets: kguo2/scaffold_finetune
4
  library_name: transformers
5
  model_name: finetune_demo
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for finetune_demo
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) on the [kguo2/scaffold_finetune](https://huggingface.co/datasets/kguo2/scaffold_finetune) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/nd/huggingface/runs/mvkr71e8)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
 
3
  library_name: transformers
4
  model_name: finetune_demo
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for finetune_demo
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/nd/huggingface/runs/ijld17d7)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 2.7740847281799168e+17,
3
- "train_loss": 0.0704414379602571,
4
- "train_runtime": 3418.2006,
5
- "train_samples": 29047,
6
- "train_samples_per_second": 18.999,
7
- "train_steps_per_second": 0.298
8
  }
 
1
  {
2
+ "total_flos": 7.444201440207176e+17,
3
+ "train_loss": 0.06586769079210378,
4
+ "train_runtime": 7485.0094,
5
+ "train_samples": 28548,
6
+ "train_samples_per_second": 9.147,
7
+ "train_steps_per_second": 0.191
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 2.7740847281799168e+17,
3
- "train_loss": 0.0704414379602571,
4
- "train_runtime": 3418.2006,
5
- "train_samples": 29047,
6
- "train_samples_per_second": 18.999,
7
- "train_steps_per_second": 0.298
8
  }
 
1
  {
2
+ "total_flos": 7.444201440207176e+17,
3
+ "train_loss": 0.06586769079210378,
4
+ "train_runtime": 7485.0094,
5
+ "train_samples": 28548,
6
+ "train_samples_per_second": 9.147,
7
+ "train_steps_per_second": 0.191
8
  }
trainer_state.json CHANGED
@@ -4,1648 +4,2304 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 1017,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.014749262536873156,
14
- "grad_norm": 28.681703567504883,
15
- "learning_rate": 4.901960784313726e-06,
16
- "loss": 1.7339,
17
- "num_tokens": 81920.0,
18
  "step": 5
19
  },
20
  {
21
- "epoch": 0.029498525073746312,
22
- "grad_norm": 6.434526443481445,
23
- "learning_rate": 9.803921568627451e-06,
24
- "loss": 0.985,
25
- "num_tokens": 163840.0,
26
  "step": 10
27
  },
28
  {
29
- "epoch": 0.04424778761061947,
30
- "grad_norm": 2.7200655937194824,
31
- "learning_rate": 1.4705882352941177e-05,
32
- "loss": 0.2836,
33
- "num_tokens": 245760.0,
34
  "step": 15
35
  },
36
  {
37
- "epoch": 0.058997050147492625,
38
- "grad_norm": 1.4244611263275146,
39
- "learning_rate": 1.9607843137254903e-05,
40
- "loss": 0.1205,
41
- "num_tokens": 327680.0,
42
  "step": 20
43
  },
44
  {
45
- "epoch": 0.07374631268436578,
46
- "grad_norm": 1.7670924663543701,
47
- "learning_rate": 2.4509803921568626e-05,
48
- "loss": 0.0935,
49
- "num_tokens": 409600.0,
50
  "step": 25
51
  },
52
  {
53
- "epoch": 0.08849557522123894,
54
- "grad_norm": 1.4682564735412598,
55
- "learning_rate": 2.9411764705882354e-05,
56
- "loss": 0.083,
57
- "num_tokens": 491479.0,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.10324483775811209,
62
- "grad_norm": 1.1769920587539673,
63
- "learning_rate": 3.431372549019608e-05,
64
- "loss": 0.0815,
65
- "num_tokens": 573399.0,
66
  "step": 35
67
  },
68
  {
69
- "epoch": 0.11799410029498525,
70
- "grad_norm": 1.2460798025131226,
71
- "learning_rate": 3.9215686274509805e-05,
72
- "loss": 0.0752,
73
- "num_tokens": 655143.0,
74
  "step": 40
75
  },
76
  {
77
- "epoch": 0.13274336283185842,
78
- "grad_norm": 0.7001373171806335,
79
- "learning_rate": 4.411764705882353e-05,
80
- "loss": 0.0739,
81
- "num_tokens": 737063.0,
82
  "step": 45
83
  },
84
  {
85
- "epoch": 0.14749262536873156,
86
- "grad_norm": 0.7255822420120239,
87
- "learning_rate": 4.901960784313725e-05,
88
- "loss": 0.0708,
89
- "num_tokens": 818902.0,
90
  "step": 50
91
  },
92
  {
93
- "epoch": 0.16224188790560473,
94
- "grad_norm": 1.1189676523208618,
95
- "learning_rate": 4.999809624142209e-05,
96
- "loss": 0.0902,
97
- "num_tokens": 900822.0,
98
  "step": 55
99
  },
100
  {
101
- "epoch": 0.17699115044247787,
102
- "grad_norm": 20.670774459838867,
103
- "learning_rate": 4.9990362774334994e-05,
104
- "loss": 0.1074,
105
- "num_tokens": 982742.0,
106
  "step": 60
107
  },
108
  {
109
- "epoch": 0.19174041297935104,
110
- "grad_norm": 0.5863003730773926,
111
- "learning_rate": 4.997668265705137e-05,
112
- "loss": 0.0683,
113
- "num_tokens": 1064662.0,
114
  "step": 65
115
  },
116
  {
117
- "epoch": 0.20648967551622419,
118
- "grad_norm": 0.6090629696846008,
119
- "learning_rate": 4.9957059506714846e-05,
120
- "loss": 0.0693,
121
- "num_tokens": 1146582.0,
122
  "step": 70
123
  },
124
  {
125
- "epoch": 0.22123893805309736,
126
- "grad_norm": 0.4677298069000244,
127
- "learning_rate": 4.9931498511859377e-05,
128
- "loss": 0.0673,
129
- "num_tokens": 1228502.0,
130
  "step": 75
131
  },
132
  {
133
- "epoch": 0.2359882005899705,
134
- "grad_norm": 0.5659052729606628,
135
- "learning_rate": 4.990000643103734e-05,
136
- "loss": 0.0685,
137
- "num_tokens": 1310422.0,
138
  "step": 80
139
  },
140
  {
141
- "epoch": 0.25073746312684364,
142
- "grad_norm": 0.39706626534461975,
143
- "learning_rate": 4.986259159103256e-05,
144
- "loss": 0.0669,
145
- "num_tokens": 1392342.0,
146
  "step": 85
147
  },
148
  {
149
- "epoch": 0.26548672566371684,
150
- "grad_norm": 0.38469579815864563,
151
- "learning_rate": 4.981926388465857e-05,
152
- "loss": 0.0653,
153
- "num_tokens": 1474262.0,
154
  "step": 90
155
  },
156
  {
157
- "epoch": 0.28023598820059,
158
- "grad_norm": 0.3758561313152313,
159
- "learning_rate": 4.9770034768142934e-05,
160
- "loss": 0.0635,
161
- "num_tokens": 1555918.0,
162
  "step": 95
163
  },
164
  {
165
- "epoch": 0.2949852507374631,
166
- "grad_norm": 0.439568430185318,
167
- "learning_rate": 4.971491725809807e-05,
168
  "loss": 0.0638,
169
- "num_tokens": 1637838.0,
170
  "step": 100
171
  },
172
  {
173
- "epoch": 0.30973451327433627,
174
- "grad_norm": 0.3522689938545227,
175
- "learning_rate": 4.965392592807956e-05,
176
- "loss": 0.0631,
177
- "num_tokens": 1719758.0,
178
  "step": 105
179
  },
180
  {
181
- "epoch": 0.32448377581120946,
182
- "grad_norm": 0.3989611864089966,
183
- "learning_rate": 4.9587076904732756e-05,
184
- "loss": 0.0605,
185
- "num_tokens": 1801678.0,
186
  "step": 110
187
  },
188
  {
189
- "epoch": 0.3392330383480826,
190
- "grad_norm": 0.34883007407188416,
191
- "learning_rate": 4.951438786352881e-05,
192
- "loss": 0.0619,
193
- "num_tokens": 1883598.0,
194
  "step": 115
195
  },
196
  {
197
- "epoch": 0.35398230088495575,
198
- "grad_norm": 0.29905784130096436,
199
- "learning_rate": 4.943587802409103e-05,
200
- "loss": 0.063,
201
- "num_tokens": 1965518.0,
202
  "step": 120
203
  },
204
  {
205
- "epoch": 0.3687315634218289,
206
- "grad_norm": 0.3990210294723511,
207
- "learning_rate": 4.935156814511314e-05,
208
- "loss": 0.0617,
209
- "num_tokens": 2047438.0,
210
  "step": 125
211
  },
212
  {
213
- "epoch": 0.3834808259587021,
214
- "grad_norm": 0.23905648291110992,
215
- "learning_rate": 4.926148051887042e-05,
216
- "loss": 0.0595,
217
- "num_tokens": 2129358.0,
218
  "step": 130
219
  },
220
  {
221
- "epoch": 0.39823008849557523,
222
- "grad_norm": 0.2639841139316559,
223
- "learning_rate": 4.916563896532549e-05,
224
- "loss": 0.059,
225
- "num_tokens": 2211230.0,
226
  "step": 135
227
  },
228
  {
229
- "epoch": 0.41297935103244837,
230
- "grad_norm": 0.29038557410240173,
231
- "learning_rate": 4.906406882583004e-05,
232
- "loss": 0.0568,
233
- "num_tokens": 2293150.0,
234
  "step": 140
235
  },
236
  {
237
- "epoch": 0.4277286135693215,
238
- "grad_norm": 0.30302760004997253,
239
- "learning_rate": 4.895679695642444e-05,
240
- "loss": 0.0591,
241
- "num_tokens": 2374790.0,
242
  "step": 145
243
  },
244
  {
245
- "epoch": 0.4424778761061947,
246
- "grad_norm": 0.2980372905731201,
247
- "learning_rate": 4.884385172073666e-05,
248
- "loss": 0.0608,
249
- "num_tokens": 2456682.0,
250
  "step": 150
251
  },
252
  {
253
- "epoch": 0.45722713864306785,
254
- "grad_norm": 0.3176118731498718,
255
- "learning_rate": 4.8725262982482794e-05,
256
- "loss": 0.058,
257
- "num_tokens": 2538602.0,
258
  "step": 155
259
  },
260
  {
261
- "epoch": 0.471976401179941,
262
- "grad_norm": 0.2424178570508957,
263
- "learning_rate": 4.860106209757071e-05,
264
- "loss": 0.0578,
265
- "num_tokens": 2620522.0,
266
  "step": 160
267
  },
268
  {
269
- "epoch": 0.48672566371681414,
270
- "grad_norm": 0.4045540988445282,
271
- "learning_rate": 4.847128190580936e-05,
272
- "loss": 0.0577,
273
- "num_tokens": 2702326.0,
274
  "step": 165
275
  },
276
  {
277
- "epoch": 0.5014749262536873,
278
- "grad_norm": 0.248455211520195,
279
- "learning_rate": 4.8335956722225616e-05,
280
- "loss": 0.0566,
281
- "num_tokens": 2784246.0,
282
  "step": 170
283
  },
284
  {
285
- "epoch": 0.5162241887905604,
286
- "grad_norm": 0.2201586663722992,
287
- "learning_rate": 4.819512232799107e-05,
288
- "loss": 0.0575,
289
- "num_tokens": 2866166.0,
290
  "step": 175
291
  },
292
  {
293
- "epoch": 0.5309734513274337,
294
- "grad_norm": 0.23909515142440796,
295
- "learning_rate": 4.804881596096118e-05,
296
- "loss": 0.0553,
297
- "num_tokens": 2948086.0,
298
  "step": 180
299
  },
300
  {
301
- "epoch": 0.5457227138643068,
302
- "grad_norm": 0.23102901875972748,
303
- "learning_rate": 4.789707630582923e-05,
304
- "loss": 0.0591,
305
- "num_tokens": 3029739.0,
306
  "step": 185
307
  },
308
  {
309
- "epoch": 0.56047197640118,
310
- "grad_norm": 0.2302148938179016,
311
- "learning_rate": 4.773994348389782e-05,
312
- "loss": 0.0587,
313
- "num_tokens": 3111659.0,
314
  "step": 190
315
  },
316
  {
317
- "epoch": 0.5752212389380531,
318
- "grad_norm": 0.20451895892620087,
319
- "learning_rate": 4.757745904247038e-05,
320
- "loss": 0.0601,
321
- "num_tokens": 3193453.0,
322
  "step": 195
323
  },
324
  {
325
- "epoch": 0.5899705014749262,
326
- "grad_norm": 0.19253046810626984,
327
- "learning_rate": 4.7409665943865705e-05,
328
- "loss": 0.0579,
329
- "num_tokens": 3275373.0,
330
  "step": 200
331
  },
332
  {
333
- "epoch": 0.6047197640117994,
334
- "grad_norm": 0.2857654094696045,
335
- "learning_rate": 4.7236608554058375e-05,
336
- "loss": 0.0577,
337
- "num_tokens": 3357228.0,
338
  "step": 205
339
  },
340
  {
341
- "epoch": 0.6194690265486725,
342
- "grad_norm": 0.21651217341423035,
343
- "learning_rate": 4.7058332630947935e-05,
344
- "loss": 0.0602,
345
- "num_tokens": 3439124.0,
346
  "step": 210
347
  },
348
  {
349
- "epoch": 0.6342182890855457,
350
- "grad_norm": 0.22674554586410522,
351
- "learning_rate": 4.6874885312260186e-05,
352
- "loss": 0.0588,
353
- "num_tokens": 3520967.0,
354
  "step": 215
355
  },
356
  {
357
- "epoch": 0.6489675516224189,
358
- "grad_norm": 0.22327347099781036,
359
- "learning_rate": 4.668631510308349e-05,
360
- "loss": 0.0585,
361
- "num_tokens": 3602887.0,
362
  "step": 220
363
  },
364
  {
365
- "epoch": 0.6637168141592921,
366
- "grad_norm": 0.19307062029838562,
367
- "learning_rate": 4.649267186304362e-05,
368
- "loss": 0.0574,
369
- "num_tokens": 3684556.0,
370
  "step": 225
371
  },
372
  {
373
- "epoch": 0.6784660766961652,
374
- "grad_norm": 0.2287718951702118,
375
- "learning_rate": 4.6294006793120436e-05,
376
- "loss": 0.0576,
377
- "num_tokens": 3766476.0,
378
  "step": 230
379
  },
380
  {
381
- "epoch": 0.6932153392330384,
382
- "grad_norm": 0.16587583720684052,
383
- "learning_rate": 4.609037242210989e-05,
384
- "loss": 0.056,
385
- "num_tokens": 3848396.0,
386
  "step": 235
387
  },
388
  {
389
- "epoch": 0.7079646017699115,
390
- "grad_norm": 0.2796880900859833,
391
- "learning_rate": 4.5881822592734946e-05,
392
- "loss": 0.0572,
393
- "num_tokens": 3930227.0,
394
  "step": 240
395
  },
396
  {
397
- "epoch": 0.7227138643067846,
398
- "grad_norm": 0.2181931585073471,
399
- "learning_rate": 4.5668412447409116e-05,
400
- "loss": 0.057,
401
- "num_tokens": 4011953.0,
402
  "step": 245
403
  },
404
  {
405
- "epoch": 0.7374631268436578,
406
- "grad_norm": 0.31474748253822327,
407
- "learning_rate": 4.545019841365628e-05,
408
- "loss": 0.0555,
409
- "num_tokens": 4093782.0,
410
  "step": 250
411
  },
412
  {
413
- "epoch": 0.7522123893805309,
414
- "grad_norm": 0.19964337348937988,
415
- "learning_rate": 4.5227238189190775e-05,
416
- "loss": 0.0559,
417
- "num_tokens": 4175702.0,
418
  "step": 255
419
  },
420
  {
421
- "epoch": 0.7669616519174042,
422
- "grad_norm": 0.17513608932495117,
423
- "learning_rate": 4.4999590726661605e-05,
424
- "loss": 0.0555,
425
- "num_tokens": 4257622.0,
426
  "step": 260
427
  },
428
  {
429
- "epoch": 0.7817109144542773,
430
- "grad_norm": 0.18871212005615234,
431
- "learning_rate": 4.476731621806485e-05,
432
- "loss": 0.0572,
433
- "num_tokens": 4339542.0,
434
  "step": 265
435
  },
436
  {
437
- "epoch": 0.7964601769911505,
438
- "grad_norm": 0.18672113120555878,
439
- "learning_rate": 4.453047607882834e-05,
440
- "loss": 0.0579,
441
- "num_tokens": 4421462.0,
442
  "step": 270
443
  },
444
  {
445
- "epoch": 0.8112094395280236,
446
- "grad_norm": 0.20873971283435822,
447
- "learning_rate": 4.428913293157293e-05,
448
- "loss": 0.0552,
449
- "num_tokens": 4503098.0,
450
  "step": 275
451
  },
452
  {
453
- "epoch": 0.8259587020648967,
454
- "grad_norm": 0.17426809668540955,
455
- "learning_rate": 4.404335058955446e-05,
456
- "loss": 0.0554,
457
- "num_tokens": 4585018.0,
458
  "step": 280
459
  },
460
  {
461
- "epoch": 0.8407079646017699,
462
- "grad_norm": 0.18556445837020874,
463
- "learning_rate": 4.379319403979104e-05,
464
- "loss": 0.0565,
465
- "num_tokens": 4666938.0,
466
  "step": 285
467
  },
468
  {
469
- "epoch": 0.855457227138643,
470
- "grad_norm": 0.1989375203847885,
471
- "learning_rate": 4.353872942587985e-05,
472
- "loss": 0.0559,
473
- "num_tokens": 4748858.0,
474
  "step": 290
475
  },
476
  {
477
- "epoch": 0.8702064896755162,
478
- "grad_norm": 0.2044542133808136,
479
- "learning_rate": 4.32800240305082e-05,
480
- "loss": 0.0548,
481
- "num_tokens": 4830778.0,
482
  "step": 295
483
  },
484
  {
485
- "epoch": 0.8849557522123894,
486
- "grad_norm": 0.17457793653011322,
487
- "learning_rate": 4.301714625766342e-05,
488
- "loss": 0.0563,
489
- "num_tokens": 4912698.0,
490
  "step": 300
491
  },
492
  {
493
- "epoch": 0.8997050147492626,
494
- "grad_norm": 0.1791624128818512,
495
- "learning_rate": 4.275016561454622e-05,
496
- "loss": 0.0566,
497
- "num_tokens": 4994553.0,
498
  "step": 305
499
  },
500
  {
501
- "epoch": 0.9144542772861357,
502
- "grad_norm": 0.16299232840538025,
503
- "learning_rate": 4.247915269319241e-05,
504
- "loss": 0.0553,
505
- "num_tokens": 5076473.0,
506
  "step": 310
507
  },
508
  {
509
- "epoch": 0.9292035398230089,
510
- "grad_norm": 0.1811797171831131,
511
- "learning_rate": 4.2204179151807685e-05,
512
- "loss": 0.0554,
513
- "num_tokens": 5158254.0,
514
  "step": 315
515
  },
516
  {
517
- "epoch": 0.943952802359882,
518
- "grad_norm": 0.1605992168188095,
519
- "learning_rate": 4.19253176958206e-05,
520
- "loss": 0.0558,
521
- "num_tokens": 5240174.0,
522
  "step": 320
523
  },
524
  {
525
- "epoch": 0.9587020648967551,
526
- "grad_norm": 0.18501946330070496,
527
- "learning_rate": 4.1642642058658605e-05,
528
- "loss": 0.0554,
529
- "num_tokens": 5322094.0,
530
  "step": 325
531
  },
532
  {
533
- "epoch": 0.9734513274336283,
534
- "grad_norm": 0.18083244562149048,
535
- "learning_rate": 4.135622698225229e-05,
536
- "loss": 0.0562,
537
- "num_tokens": 5403956.0,
538
  "step": 330
539
  },
540
  {
541
- "epoch": 0.9882005899705014,
542
- "grad_norm": 0.21333028376102448,
543
- "learning_rate": 4.1066148197272944e-05,
544
- "loss": 0.0566,
545
- "num_tokens": 5485712.0,
546
  "step": 335
547
  },
548
  {
549
- "epoch": 1.0029498525073746,
550
- "grad_norm": 0.19363561272621155,
551
- "learning_rate": 4.077248240310868e-05,
552
- "loss": 0.0553,
553
- "num_tokens": 5567632.0,
554
  "step": 340
555
  },
556
  {
557
- "epoch": 1.0176991150442478,
558
- "grad_norm": 0.1972503960132599,
559
- "learning_rate": 4.047530724758451e-05,
560
- "loss": 0.0551,
561
- "num_tokens": 5649552.0,
562
  "step": 345
563
  },
564
  {
565
- "epoch": 1.0324483775811208,
566
- "grad_norm": 0.16727674007415771,
567
- "learning_rate": 4.017470130643149e-05,
568
- "loss": 0.0566,
569
- "num_tokens": 5731214.0,
570
  "step": 350
571
  },
572
  {
573
- "epoch": 1.047197640117994,
574
- "grad_norm": 0.14662833511829376,
575
- "learning_rate": 3.987074406251067e-05,
576
- "loss": 0.0567,
577
- "num_tokens": 5813045.0,
578
  "step": 355
579
  },
580
  {
581
- "epoch": 1.0619469026548674,
582
- "grad_norm": 0.16213928163051605,
583
- "learning_rate": 3.9563515884797074e-05,
584
- "loss": 0.057,
585
- "num_tokens": 5894884.0,
586
  "step": 360
587
  },
588
  {
589
- "epoch": 1.0766961651917404,
590
- "grad_norm": 0.13559935986995697,
591
- "learning_rate": 3.925309800712945e-05,
592
- "loss": 0.0545,
593
- "num_tokens": 5976776.0,
594
  "step": 365
595
  },
596
  {
597
- "epoch": 1.0914454277286136,
598
- "grad_norm": 0.16717351973056793,
599
- "learning_rate": 3.8939572506731375e-05,
600
- "loss": 0.0558,
601
- "num_tokens": 6058573.0,
602
  "step": 370
603
  },
604
  {
605
- "epoch": 1.1061946902654867,
606
- "grad_norm": 0.11658819019794464,
607
- "learning_rate": 3.8623022282509245e-05,
608
- "loss": 0.0555,
609
- "num_tokens": 6140493.0,
610
  "step": 375
611
  },
612
  {
613
- "epoch": 1.12094395280236,
614
- "grad_norm": 0.14316953718662262,
615
- "learning_rate": 3.83035310331331e-05,
616
- "loss": 0.0553,
617
- "num_tokens": 6222413.0,
618
  "step": 380
619
  },
620
  {
621
- "epoch": 1.135693215339233,
622
- "grad_norm": 0.15932655334472656,
623
- "learning_rate": 3.798118323490597e-05,
624
- "loss": 0.0542,
625
- "num_tokens": 6304333.0,
626
  "step": 385
627
  },
628
  {
629
- "epoch": 1.1504424778761062,
630
- "grad_norm": 0.11980193108320236,
631
- "learning_rate": 3.765606411942759e-05,
632
- "loss": 0.0535,
633
- "num_tokens": 6386253.0,
634
  "step": 390
635
  },
636
  {
637
- "epoch": 1.1651917404129795,
638
- "grad_norm": 0.1464000791311264,
639
- "learning_rate": 3.7328259651058384e-05,
640
- "loss": 0.0547,
641
- "num_tokens": 6468173.0,
642
  "step": 395
643
  },
644
  {
645
- "epoch": 1.1799410029498525,
646
- "grad_norm": 0.12491417676210403,
647
- "learning_rate": 3.699785650418977e-05,
648
- "loss": 0.0536,
649
- "num_tokens": 6550093.0,
650
  "step": 400
651
  },
652
  {
653
- "epoch": 1.1946902654867257,
654
- "grad_norm": 0.15376126766204834,
655
- "learning_rate": 3.666494204032668e-05,
656
- "loss": 0.0555,
657
- "num_tokens": 6631948.0,
658
  "step": 405
659
  },
660
  {
661
- "epoch": 1.2094395280235988,
662
- "grad_norm": 0.16010604798793793,
663
- "learning_rate": 3.63296042849884e-05,
664
- "loss": 0.0544,
665
- "num_tokens": 6713791.0,
666
  "step": 410
667
  },
668
  {
669
- "epoch": 1.224188790560472,
670
- "grad_norm": 0.15632019937038422,
671
- "learning_rate": 3.5991931904433824e-05,
672
- "loss": 0.0557,
673
- "num_tokens": 6795632.0,
674
  "step": 415
675
  },
676
  {
677
- "epoch": 1.238938053097345,
678
- "grad_norm": 0.14752759039402008,
679
- "learning_rate": 3.5652014182217355e-05,
680
- "loss": 0.0543,
681
- "num_tokens": 6877552.0,
682
  "step": 420
683
  },
684
  {
685
- "epoch": 1.2536873156342183,
686
- "grad_norm": 0.21005742251873016,
687
- "learning_rate": 3.530994099558153e-05,
688
- "loss": 0.0533,
689
- "num_tokens": 6959472.0,
690
  "step": 425
691
  },
692
  {
693
- "epoch": 1.2684365781710913,
694
- "grad_norm": 0.13375329971313477,
695
- "learning_rate": 3.496580279169265e-05,
696
- "loss": 0.0553,
697
- "num_tokens": 7041392.0,
698
  "step": 430
699
  },
700
  {
701
- "epoch": 1.2831858407079646,
702
- "grad_norm": 0.16405825316905975,
703
- "learning_rate": 3.461969056372582e-05,
704
- "loss": 0.0545,
705
- "num_tokens": 7123312.0,
706
  "step": 435
707
  },
708
  {
709
- "epoch": 1.2979351032448379,
710
- "grad_norm": 0.1644970029592514,
711
- "learning_rate": 3.427169582680551e-05,
712
- "loss": 0.0551,
713
- "num_tokens": 7205232.0,
714
  "step": 440
715
  },
716
  {
717
- "epoch": 1.3126843657817109,
718
- "grad_norm": 0.1362873613834381,
719
- "learning_rate": 3.392191059380812e-05,
720
- "loss": 0.0544,
721
- "num_tokens": 7287046.0,
722
  "step": 445
723
  },
724
  {
725
- "epoch": 1.3274336283185841,
726
- "grad_norm": 0.148273766040802,
727
- "learning_rate": 3.3570427351033046e-05,
728
- "loss": 0.0546,
729
- "num_tokens": 7368966.0,
730
  "step": 450
731
  },
732
  {
733
- "epoch": 1.3421828908554572,
734
- "grad_norm": 0.14546248316764832,
735
- "learning_rate": 3.321733903374841e-05,
736
- "loss": 0.0539,
737
- "num_tokens": 7450886.0,
738
  "step": 455
739
  },
740
  {
741
- "epoch": 1.3569321533923304,
742
- "grad_norm": 0.14057567715644836,
743
- "learning_rate": 3.286273900161818e-05,
744
- "loss": 0.0553,
745
- "num_tokens": 7532806.0,
746
  "step": 460
747
  },
748
  {
749
- "epoch": 1.3716814159292037,
750
- "grad_norm": 0.1333959847688675,
751
- "learning_rate": 3.250672101401707e-05,
752
- "loss": 0.0546,
753
- "num_tokens": 7614726.0,
754
  "step": 465
755
  },
756
  {
757
- "epoch": 1.3864306784660767,
758
- "grad_norm": 0.1370707005262375,
759
- "learning_rate": 3.214937920523974e-05,
760
- "loss": 0.0561,
761
- "num_tokens": 7696402.0,
762
  "step": 470
763
  },
764
  {
765
- "epoch": 1.4011799410029497,
766
- "grad_norm": 0.1322726011276245,
767
- "learning_rate": 3.1790808059610786e-05,
768
- "loss": 0.0536,
769
- "num_tokens": 7778322.0,
770
  "step": 475
771
  },
772
  {
773
- "epoch": 1.415929203539823,
774
- "grad_norm": 0.14645648002624512,
775
- "learning_rate": 3.143110238650236e-05,
776
- "loss": 0.0543,
777
- "num_tokens": 7860242.0,
778
  "step": 480
779
  },
780
  {
781
- "epoch": 1.4306784660766962,
782
- "grad_norm": 0.1392751932144165,
783
- "learning_rate": 3.107035729526566e-05,
784
- "loss": 0.0555,
785
- "num_tokens": 7942073.0,
786
  "step": 485
787
  },
788
  {
789
- "epoch": 1.4454277286135693,
790
- "grad_norm": 0.15789316594600677,
791
- "learning_rate": 3.070866817008319e-05,
792
- "loss": 0.0565,
793
- "num_tokens": 8023993.0,
794
  "step": 490
795
  },
796
  {
797
- "epoch": 1.4601769911504425,
798
- "grad_norm": 0.11416062712669373,
799
- "learning_rate": 3.0346130644748367e-05,
800
- "loss": 0.0557,
801
- "num_tokens": 8105787.0,
802
  "step": 495
803
  },
804
  {
805
- "epoch": 1.4749262536873156,
806
- "grad_norm": 0.14937061071395874,
807
- "learning_rate": 2.998284057737909e-05,
808
- "loss": 0.0532,
809
- "num_tokens": 8187707.0,
810
  "step": 500
811
  },
812
  {
813
- "epoch": 1.4896755162241888,
814
- "grad_norm": 0.11278703063726425,
815
- "learning_rate": 2.9618894025071984e-05,
816
- "loss": 0.0534,
817
- "num_tokens": 8269536.0,
818
  "step": 505
819
  },
820
  {
821
- "epoch": 1.504424778761062,
822
- "grad_norm": 0.1776408553123474,
823
- "learning_rate": 2.925438721850412e-05,
824
- "loss": 0.0545,
825
- "num_tokens": 8351456.0,
826
  "step": 510
827
  },
828
  {
829
- "epoch": 1.519174041297935,
830
- "grad_norm": 0.1571996957063675,
831
- "learning_rate": 2.88894165364887e-05,
832
- "loss": 0.054,
833
- "num_tokens": 8433240.0,
834
  "step": 515
835
  },
836
  {
837
- "epoch": 1.5339233038348081,
838
- "grad_norm": 0.15761998295783997,
839
- "learning_rate": 2.8524078480491684e-05,
840
- "loss": 0.0534,
841
- "num_tokens": 8515160.0,
842
  "step": 520
843
  },
844
  {
845
- "epoch": 1.5486725663716814,
846
- "grad_norm": 0.12947538495063782,
847
- "learning_rate": 2.8158469649115978e-05,
848
- "loss": 0.0539,
849
- "num_tokens": 8597080.0,
850
  "step": 525
851
  },
852
  {
853
- "epoch": 1.5634218289085546,
854
- "grad_norm": 0.15504850447177887,
855
- "learning_rate": 2.779268671255985e-05,
856
- "loss": 0.0539,
857
- "num_tokens": 8678806.0,
858
  "step": 530
859
  },
860
  {
861
- "epoch": 1.5781710914454279,
862
- "grad_norm": 0.11758331209421158,
863
- "learning_rate": 2.7426826387056555e-05,
864
- "loss": 0.055,
865
- "num_tokens": 8760726.0,
866
  "step": 535
867
  },
868
  {
869
- "epoch": 1.592920353982301,
870
- "grad_norm": 0.14461639523506165,
871
- "learning_rate": 2.7060985409301627e-05,
872
- "loss": 0.0557,
873
- "num_tokens": 8842646.0,
874
  "step": 540
875
  },
876
  {
877
- "epoch": 1.607669616519174,
878
- "grad_norm": 0.15372057259082794,
879
- "learning_rate": 2.6695260510874914e-05,
880
- "loss": 0.054,
881
- "num_tokens": 8924450.0,
882
  "step": 545
883
  },
884
  {
885
- "epoch": 1.6224188790560472,
886
- "grad_norm": 0.14337101578712463,
887
- "learning_rate": 2.632974839266385e-05,
888
- "loss": 0.0531,
889
- "num_tokens": 9006370.0,
890
  "step": 550
891
  },
892
  {
893
- "epoch": 1.6371681415929205,
894
- "grad_norm": 0.12814313173294067,
895
- "learning_rate": 2.5964545699294906e-05,
896
- "loss": 0.0549,
897
- "num_tokens": 9088290.0,
898
  "step": 555
899
  },
900
  {
901
- "epoch": 1.6519174041297935,
902
- "grad_norm": 0.1454104632139206,
903
- "learning_rate": 2.559974899357991e-05,
904
- "loss": 0.0541,
905
- "num_tokens": 9170162.0,
906
  "step": 560
907
  },
908
  {
909
- "epoch": 1.6666666666666665,
910
- "grad_norm": 0.11160258203744888,
911
- "learning_rate": 2.5235454730983955e-05,
912
- "loss": 0.0535,
913
- "num_tokens": 9252082.0,
914
  "step": 565
915
  },
916
  {
917
- "epoch": 1.6814159292035398,
918
- "grad_norm": 0.14262907207012177,
919
- "learning_rate": 2.487175923412175e-05,
920
- "loss": 0.0538,
921
- "num_tokens": 9334002.0,
922
  "step": 570
923
  },
924
  {
925
- "epoch": 1.696165191740413,
926
- "grad_norm": 0.12927588820457458,
927
- "learning_rate": 2.4508758667289076e-05,
928
- "loss": 0.054,
929
- "num_tokens": 9415922.0,
930
  "step": 575
931
  },
932
  {
933
- "epoch": 1.7109144542772863,
934
- "grad_norm": 0.1411578357219696,
935
- "learning_rate": 2.4146549011036074e-05,
936
- "loss": 0.0519,
937
- "num_tokens": 9497842.0,
938
  "step": 580
939
  },
940
  {
941
- "epoch": 1.7256637168141593,
942
- "grad_norm": 0.14883753657341003,
943
- "learning_rate": 2.378522603678917e-05,
944
- "loss": 0.0524,
945
- "num_tokens": 9579762.0,
946
  "step": 585
947
  },
948
  {
949
- "epoch": 1.7404129793510323,
950
- "grad_norm": 0.13972261548042297,
951
- "learning_rate": 2.3424885281528248e-05,
952
- "loss": 0.0546,
953
- "num_tokens": 9661480.0,
954
  "step": 590
955
  },
956
  {
957
- "epoch": 1.7551622418879056,
958
- "grad_norm": 0.15459661185741425,
959
- "learning_rate": 2.3065622022525813e-05,
960
- "loss": 0.0528,
961
- "num_tokens": 9743400.0,
962
  "step": 595
963
  },
964
  {
965
- "epoch": 1.7699115044247788,
966
- "grad_norm": 0.12449204921722412,
967
- "learning_rate": 2.2707531252154868e-05,
968
- "loss": 0.0539,
969
- "num_tokens": 9825283.0,
970
  "step": 600
971
  },
972
  {
973
- "epoch": 1.7846607669616519,
974
- "grad_norm": 0.14224207401275635,
975
- "learning_rate": 2.2350707652772102e-05,
976
- "loss": 0.0552,
977
- "num_tokens": 9906978.0,
978
  "step": 605
979
  },
980
  {
981
- "epoch": 1.799410029498525,
982
- "grad_norm": 0.11525611579418182,
983
- "learning_rate": 2.1995245571683016e-05,
984
- "loss": 0.0555,
985
- "num_tokens": 9988898.0,
986
  "step": 610
987
  },
988
  {
989
- "epoch": 1.8141592920353982,
990
- "grad_norm": 0.12278515100479126,
991
- "learning_rate": 2.1641238996195645e-05,
992
- "loss": 0.0532,
993
- "num_tokens": 10070818.0,
994
  "step": 615
995
  },
996
  {
997
- "epoch": 1.8289085545722714,
998
- "grad_norm": 0.14640717208385468,
999
- "learning_rate": 2.1288781528769553e-05,
1000
- "loss": 0.0545,
1001
- "num_tokens": 10152738.0,
1002
  "step": 620
1003
  },
1004
  {
1005
- "epoch": 1.8436578171091447,
1006
- "grad_norm": 0.11209894716739655,
1007
- "learning_rate": 2.0937966362266443e-05,
1008
- "loss": 0.0522,
1009
- "num_tokens": 10234658.0,
1010
  "step": 625
1011
  },
1012
  {
1013
- "epoch": 1.8584070796460177,
1014
- "grad_norm": 0.13037630915641785,
1015
- "learning_rate": 2.0588886255309192e-05,
1016
- "loss": 0.0533,
1017
- "num_tokens": 10316562.0,
1018
  "step": 630
1019
  },
1020
  {
1021
- "epoch": 1.8731563421828907,
1022
- "grad_norm": 0.12001070380210876,
1023
- "learning_rate": 2.0241633507755625e-05,
1024
- "loss": 0.0559,
1025
- "num_tokens": 10398482.0,
1026
  "step": 635
1027
  },
1028
  {
1029
- "epoch": 1.887905604719764,
1030
- "grad_norm": 0.13977284729480743,
1031
- "learning_rate": 1.989629993629364e-05,
1032
- "loss": 0.0526,
1033
- "num_tokens": 10480361.0,
1034
  "step": 640
1035
  },
1036
  {
1037
- "epoch": 1.9026548672566372,
1038
- "grad_norm": 0.13830554485321045,
1039
- "learning_rate": 1.9552976850164047e-05,
1040
- "loss": 0.0538,
1041
- "num_tokens": 10562281.0,
1042
  "step": 645
1043
  },
1044
  {
1045
- "epoch": 1.9174041297935103,
1046
- "grad_norm": 0.11689659208059311,
1047
- "learning_rate": 1.9211755027017625e-05,
1048
- "loss": 0.0521,
1049
- "num_tokens": 10644201.0,
1050
  "step": 650
1051
  },
1052
  {
1053
- "epoch": 1.9321533923303835,
1054
- "grad_norm": 0.1302964985370636,
1055
- "learning_rate": 1.8872724688912684e-05,
1056
- "loss": 0.0544,
1057
- "num_tokens": 10726121.0,
1058
  "step": 655
1059
  },
1060
  {
1061
- "epoch": 1.9469026548672566,
1062
- "grad_norm": 0.11920026689767838,
1063
- "learning_rate": 1.8535975478459566e-05,
1064
- "loss": 0.0539,
1065
- "num_tokens": 10807790.0,
1066
  "step": 660
1067
  },
1068
  {
1069
- "epoch": 1.9616519174041298,
1070
- "grad_norm": 0.14449501037597656,
1071
- "learning_rate": 1.8201596435118356e-05,
1072
- "loss": 0.052,
1073
- "num_tokens": 10889546.0,
1074
  "step": 665
1075
  },
1076
  {
1077
- "epoch": 1.976401179941003,
1078
- "grad_norm": 0.13730932772159576,
1079
- "learning_rate": 1.7869675971656062e-05,
1080
- "loss": 0.0545,
1081
- "num_tokens": 10971466.0,
1082
  "step": 670
1083
  },
1084
  {
1085
- "epoch": 1.991150442477876,
1086
- "grad_norm": 0.14631158113479614,
1087
- "learning_rate": 1.7540301850769482e-05,
1088
- "loss": 0.051,
1089
- "num_tokens": 11053344.0,
1090
  "step": 675
1091
  },
1092
  {
1093
- "epoch": 2.005899705014749,
1094
- "grad_norm": 0.14203102886676788,
1095
- "learning_rate": 1.721356116188001e-05,
1096
- "loss": 0.0551,
1097
- "num_tokens": 11135264.0,
1098
  "step": 680
1099
  },
1100
  {
1101
- "epoch": 2.0206489675516224,
1102
- "grad_norm": 0.14633382856845856,
1103
- "learning_rate": 1.688954029810639e-05,
1104
- "loss": 0.0528,
1105
- "num_tokens": 11217184.0,
1106
  "step": 685
1107
  },
1108
  {
1109
- "epoch": 2.0353982300884956,
1110
- "grad_norm": 0.12269877642393112,
1111
- "learning_rate": 1.6568324933421605e-05,
1112
- "loss": 0.0524,
1113
- "num_tokens": 11299104.0,
1114
  "step": 690
1115
  },
1116
  {
1117
- "epoch": 2.050147492625369,
1118
- "grad_norm": 0.13254043459892273,
1119
- "learning_rate": 1.6250000000000005e-05,
1120
- "loss": 0.0532,
1121
- "num_tokens": 11381024.0,
1122
  "step": 695
1123
  },
1124
  {
1125
- "epoch": 2.0648967551622417,
1126
- "grad_norm": 0.13670207560062408,
1127
- "learning_rate": 1.5934649665760377e-05,
1128
- "loss": 0.0538,
1129
- "num_tokens": 11462944.0,
1130
  "step": 700
1131
  },
1132
  {
1133
- "epoch": 2.079646017699115,
1134
- "grad_norm": 0.13071012496948242,
1135
- "learning_rate": 1.5622357312111275e-05,
1136
- "loss": 0.054,
1137
- "num_tokens": 11544864.0,
1138
  "step": 705
1139
  },
1140
  {
1141
- "epoch": 2.094395280235988,
1142
- "grad_norm": 0.11913175135850906,
1143
- "learning_rate": 1.5313205511904228e-05,
1144
- "loss": 0.052,
1145
- "num_tokens": 11626784.0,
1146
  "step": 710
1147
  },
1148
  {
1149
- "epoch": 2.1091445427728615,
1150
- "grad_norm": 0.11887428164482117,
1151
- "learning_rate": 1.5007276007600757e-05,
1152
- "loss": 0.0526,
1153
- "num_tokens": 11708704.0,
1154
  "step": 715
1155
  },
1156
  {
1157
- "epoch": 2.1238938053097347,
1158
- "grad_norm": 0.12120334059000015,
1159
- "learning_rate": 1.4704649689658917e-05,
1160
- "loss": 0.0521,
1161
- "num_tokens": 11790559.0,
1162
  "step": 720
1163
  },
1164
  {
1165
- "epoch": 2.1386430678466075,
1166
- "grad_norm": 0.09578295052051544,
1167
- "learning_rate": 1.4405406575145198e-05,
1168
- "loss": 0.0549,
1169
- "num_tokens": 11872285.0,
1170
  "step": 725
1171
  },
1172
  {
1173
- "epoch": 2.1533923303834808,
1174
- "grad_norm": 0.13386505842208862,
1175
- "learning_rate": 1.4109625786577236e-05,
1176
- "loss": 0.0542,
1177
- "num_tokens": 11954205.0,
1178
  "step": 730
1179
  },
1180
  {
1181
- "epoch": 2.168141592920354,
1182
- "grad_norm": 0.1328478455543518,
1183
- "learning_rate": 1.3817385531003186e-05,
1184
- "loss": 0.0518,
1185
- "num_tokens": 12036002.0,
1186
  "step": 735
1187
  },
1188
  {
1189
- "epoch": 2.1828908554572273,
1190
- "grad_norm": 0.13573037087917328,
1191
- "learning_rate": 1.3528763079323076e-05,
1192
- "loss": 0.053,
1193
- "num_tokens": 12117880.0,
1194
  "step": 740
1195
  },
1196
  {
1197
- "epoch": 2.1976401179941005,
1198
- "grad_norm": 0.11889643222093582,
1199
- "learning_rate": 1.3243834745857667e-05,
1200
- "loss": 0.0518,
1201
- "num_tokens": 12199800.0,
1202
  "step": 745
1203
  },
1204
  {
1205
- "epoch": 2.2123893805309733,
1206
- "grad_norm": 0.13203181326389313,
1207
- "learning_rate": 1.29626758681703e-05,
1208
- "loss": 0.0538,
1209
- "num_tokens": 12281720.0,
1210
  "step": 750
1211
  },
1212
  {
1213
- "epoch": 2.2271386430678466,
1214
- "grad_norm": 0.15322619676589966,
1215
- "learning_rate": 1.2685360787146994e-05,
1216
- "loss": 0.0521,
1217
- "num_tokens": 12363640.0,
1218
  "step": 755
1219
  },
1220
  {
1221
- "epoch": 2.24188790560472,
1222
- "grad_norm": 0.1316722333431244,
1223
- "learning_rate": 1.2411962827340023e-05,
1224
- "loss": 0.0509,
1225
- "num_tokens": 12445316.0,
1226
  "step": 760
1227
  },
1228
  {
1229
- "epoch": 2.256637168141593,
1230
- "grad_norm": 0.12419988960027695,
1231
- "learning_rate": 1.2142554277580288e-05,
1232
- "loss": 0.0523,
1233
- "num_tokens": 12527159.0,
1234
  "step": 765
1235
  },
1236
  {
1237
- "epoch": 2.271386430678466,
1238
- "grad_norm": 0.16085049510002136,
1239
- "learning_rate": 1.187720637186349e-05,
1240
- "loss": 0.0535,
1241
- "num_tokens": 12609079.0,
1242
  "step": 770
1243
  },
1244
  {
1245
- "epoch": 2.286135693215339,
1246
- "grad_norm": 0.1237749382853508,
1247
- "learning_rate": 1.1615989270515268e-05,
1248
- "loss": 0.0537,
1249
- "num_tokens": 12690873.0,
1250
  "step": 775
1251
  },
1252
  {
1253
- "epoch": 2.3008849557522124,
1254
- "grad_norm": 0.10973517596721649,
1255
- "learning_rate": 1.1358972041640139e-05,
1256
- "loss": 0.0528,
1257
- "num_tokens": 12772682.0,
1258
  "step": 780
1259
  },
1260
  {
1261
- "epoch": 2.3156342182890857,
1262
- "grad_norm": 0.12612909078598022,
1263
- "learning_rate": 1.110622264285934e-05,
1264
- "loss": 0.0527,
1265
- "num_tokens": 12854602.0,
1266
  "step": 785
1267
  },
1268
  {
1269
- "epoch": 2.330383480825959,
1270
- "grad_norm": 0.14214757084846497,
1271
- "learning_rate": 1.085780790334219e-05,
1272
- "loss": 0.0513,
1273
- "num_tokens": 12936522.0,
1274
  "step": 790
1275
  },
1276
  {
1277
- "epoch": 2.3451327433628317,
1278
- "grad_norm": 0.12092974781990051,
1279
- "learning_rate": 1.0613793506135872e-05,
1280
- "loss": 0.05,
1281
- "num_tokens": 13018442.0,
1282
  "step": 795
1283
  },
1284
  {
1285
- "epoch": 2.359882005899705,
1286
- "grad_norm": 0.11506114155054092,
1287
- "learning_rate": 1.0374243970798297e-05,
1288
- "loss": 0.052,
1289
- "num_tokens": 13100341.0,
1290
  "step": 800
1291
  },
1292
  {
1293
- "epoch": 2.3746312684365782,
1294
- "grad_norm": 0.13261866569519043,
1295
- "learning_rate": 1.0139222636338505e-05,
1296
- "loss": 0.0521,
1297
- "num_tokens": 13182261.0,
1298
  "step": 805
1299
  },
1300
  {
1301
- "epoch": 2.3893805309734515,
1302
- "grad_norm": 0.13615797460079193,
1303
- "learning_rate": 9.90879164446933e-06,
1304
- "loss": 0.0531,
1305
- "num_tokens": 13264181.0,
1306
  "step": 810
1307
  },
1308
  {
1309
- "epoch": 2.4041297935103243,
1310
- "grad_norm": 0.1656993329524994,
1311
- "learning_rate": 9.683011923176537e-06,
1312
- "loss": 0.0534,
1313
- "num_tokens": 13346101.0,
1314
  "step": 815
1315
  },
1316
  {
1317
- "epoch": 2.4188790560471976,
1318
- "grad_norm": 0.12165559083223343,
1319
- "learning_rate": 9.461943170608942e-06,
1320
- "loss": 0.0523,
1321
- "num_tokens": 13427963.0,
1322
  "step": 820
1323
  },
1324
  {
1325
- "epoch": 2.433628318584071,
1326
- "grad_norm": 0.12782715260982513,
1327
- "learning_rate": 9.245643839293718e-06,
1328
- "loss": 0.0517,
1329
- "num_tokens": 13509883.0,
1330
  "step": 825
1331
  },
1332
  {
1333
- "epoch": 2.448377581120944,
1334
- "grad_norm": 0.11893630027770996,
1335
- "learning_rate": 9.034171120680993e-06,
1336
- "loss": 0.0543,
1337
- "num_tokens": 13591803.0,
1338
  "step": 830
1339
  },
1340
  {
1341
- "epoch": 2.4631268436578173,
1342
- "grad_norm": 0.1121756061911583,
1343
- "learning_rate": 8.827580930021936e-06,
1344
- "loss": 0.0508,
1345
- "num_tokens": 13673723.0,
1346
  "step": 835
1347
  },
1348
  {
1349
- "epoch": 2.47787610619469,
1350
- "grad_norm": 0.14923910796642303,
1351
- "learning_rate": 8.625927891584307e-06,
1352
- "loss": 0.052,
1353
- "num_tokens": 13755595.0,
1354
  "step": 840
1355
  },
1356
  {
1357
- "epoch": 2.4926253687315634,
1358
- "grad_norm": 0.12945568561553955,
1359
- "learning_rate": 8.429265324209275e-06,
1360
- "loss": 0.0541,
1361
- "num_tokens": 13837426.0,
1362
  "step": 845
1363
  },
1364
  {
1365
- "epoch": 2.5073746312684366,
1366
- "grad_norm": 0.11035799235105515,
1367
- "learning_rate": 8.237645227213465e-06,
1368
- "loss": 0.0535,
1369
- "num_tokens": 13919095.0,
1370
  "step": 850
1371
  },
1372
  {
1373
- "epoch": 2.52212389380531,
1374
- "grad_norm": 0.12474622577428818,
1375
- "learning_rate": 8.051118266639879e-06,
1376
- "loss": 0.0527,
1377
- "num_tokens": 14001015.0,
1378
  "step": 855
1379
  },
1380
  {
1381
- "epoch": 2.5368731563421827,
1382
- "grad_norm": 0.12021201103925705,
1383
- "learning_rate": 7.869733761861347e-06,
1384
- "loss": 0.0519,
1385
- "num_tokens": 14082733.0,
1386
  "step": 860
1387
  },
1388
  {
1389
- "epoch": 2.551622418879056,
1390
- "grad_norm": 0.15574879944324493,
1391
- "learning_rate": 7.693539672540045e-06,
1392
- "loss": 0.0524,
1393
- "num_tokens": 14164653.0,
1394
  "step": 865
1395
  },
1396
  {
1397
- "epoch": 2.566371681415929,
1398
- "grad_norm": 0.12805156409740448,
1399
- "learning_rate": 7.522582585946558e-06,
1400
- "loss": 0.0527,
1401
- "num_tokens": 14246457.0,
1402
  "step": 870
1403
  },
1404
  {
1405
- "epoch": 2.5811209439528024,
1406
- "grad_norm": 0.13377341628074646,
1407
- "learning_rate": 7.356907704641764e-06,
1408
- "loss": 0.0521,
1409
- "num_tokens": 14328377.0,
1410
  "step": 875
1411
  },
1412
  {
1413
- "epoch": 2.5958702064896757,
1414
- "grad_norm": 0.1164890006184578,
1415
- "learning_rate": 7.196558834524891e-06,
1416
- "loss": 0.0531,
1417
- "num_tokens": 14410297.0,
1418
  "step": 880
1419
  },
1420
  {
1421
- "epoch": 2.6106194690265485,
1422
- "grad_norm": 0.1588173806667328,
1423
- "learning_rate": 7.041578373250875e-06,
1424
- "loss": 0.0513,
1425
- "num_tokens": 14491946.0,
1426
  "step": 885
1427
  },
1428
  {
1429
- "epoch": 2.6253687315634218,
1430
- "grad_norm": 0.12378757447004318,
1431
- "learning_rate": 6.892007299020003e-06,
1432
- "loss": 0.0523,
1433
- "num_tokens": 14573866.0,
1434
  "step": 890
1435
  },
1436
  {
1437
- "epoch": 2.640117994100295,
1438
- "grad_norm": 0.12086950987577438,
1439
- "learning_rate": 6.747885159742945e-06,
1440
- "loss": 0.0528,
1441
- "num_tokens": 14655786.0,
1442
  "step": 895
1443
  },
1444
  {
1445
- "epoch": 2.6548672566371683,
1446
- "grad_norm": 0.13173659145832062,
1447
- "learning_rate": 6.609250062583937e-06,
1448
- "loss": 0.0513,
1449
- "num_tokens": 14737706.0,
1450
  "step": 900
1451
  },
1452
  {
1453
- "epoch": 2.669616519174041,
1454
- "grad_norm": 0.11908340454101562,
1455
- "learning_rate": 6.476138663884902e-06,
1456
- "loss": 0.0508,
1457
- "num_tokens": 14819610.0,
1458
  "step": 905
1459
  },
1460
  {
1461
- "epoch": 2.6843657817109143,
1462
- "grad_norm": 0.16239605844020844,
1463
- "learning_rate": 6.34858615947318e-06,
1464
- "loss": 0.0536,
1465
- "num_tokens": 14901366.0,
1466
  "step": 910
1467
  },
1468
  {
1469
- "epoch": 2.6991150442477876,
1470
- "grad_norm": 0.14603886008262634,
1471
- "learning_rate": 6.226626275355474e-06,
1472
  "loss": 0.0506,
1473
- "num_tokens": 14983286.0,
1474
  "step": 915
1475
  },
1476
  {
1477
- "epoch": 2.713864306784661,
1478
- "grad_norm": 0.13718026876449585,
1479
- "learning_rate": 6.110291258800356e-06,
1480
- "loss": 0.0519,
1481
- "num_tokens": 15065206.0,
1482
  "step": 920
1483
  },
1484
  {
1485
- "epoch": 2.728613569321534,
1486
- "grad_norm": 0.12413609027862549,
1487
- "learning_rate": 5.9996118698118335e-06,
1488
- "loss": 0.052,
1489
- "num_tokens": 15147126.0,
1490
  "step": 925
1491
  },
1492
  {
1493
- "epoch": 2.7433628318584073,
1494
- "grad_norm": 0.14036346971988678,
1495
- "learning_rate": 5.89461737299613e-06,
1496
- "loss": 0.0515,
1497
- "num_tokens": 15229046.0,
1498
  "step": 930
1499
  },
1500
  {
1501
- "epoch": 2.75811209439528,
1502
- "grad_norm": 0.1299838423728943,
1503
- "learning_rate": 5.795335529823848e-06,
1504
  "loss": 0.0523,
1505
- "num_tokens": 15310966.0,
1506
  "step": 935
1507
  },
1508
  {
1509
- "epoch": 2.7728613569321534,
1510
- "grad_norm": 0.12340506166219711,
1511
- "learning_rate": 5.701792591289609e-06,
1512
- "loss": 0.0524,
1513
- "num_tokens": 15392621.0,
1514
  "step": 940
1515
  },
1516
  {
1517
- "epoch": 2.7876106194690267,
1518
- "grad_norm": 0.11809273809194565,
1519
- "learning_rate": 5.614013290971055e-06,
1520
- "loss": 0.0512,
1521
- "num_tokens": 15474541.0,
1522
  "step": 945
1523
  },
1524
  {
1525
- "epoch": 2.8023598820058995,
1526
- "grad_norm": 0.11352474987506866,
1527
- "learning_rate": 5.532020838489065e-06,
1528
- "loss": 0.052,
1529
- "num_tokens": 15556461.0,
1530
  "step": 950
1531
  },
1532
  {
1533
- "epoch": 2.8171091445427727,
1534
- "grad_norm": 0.12270710617303848,
1535
- "learning_rate": 5.455836913370934e-06,
1536
- "loss": 0.0533,
1537
- "num_tokens": 15638316.0,
1538
  "step": 955
1539
  },
1540
  {
1541
- "epoch": 2.831858407079646,
1542
- "grad_norm": 0.1239713653922081,
1543
- "learning_rate": 5.38548165931812e-06,
1544
- "loss": 0.0523,
1545
- "num_tokens": 15720236.0,
1546
  "step": 960
1547
  },
1548
  {
1549
- "epoch": 2.8466076696165192,
1550
- "grad_norm": 0.13654908537864685,
1551
- "learning_rate": 5.3209736788800545e-06,
1552
- "loss": 0.0505,
1553
- "num_tokens": 15802115.0,
1554
  "step": 965
1555
  },
1556
  {
1557
- "epoch": 2.8613569321533925,
1558
- "grad_norm": 0.12271752953529358,
1559
- "learning_rate": 5.262330028535478e-06,
1560
- "loss": 0.0527,
1561
- "num_tokens": 15884035.0,
1562
  "step": 970
1563
  },
1564
  {
1565
- "epoch": 2.8761061946902657,
1566
- "grad_norm": 0.11097793281078339,
1567
- "learning_rate": 5.209566214182558e-06,
1568
- "loss": 0.0503,
1569
- "num_tokens": 15965955.0,
1570
  "step": 975
1571
  },
1572
  {
1573
- "epoch": 2.8908554572271385,
1574
- "grad_norm": 0.12811797857284546,
1575
- "learning_rate": 5.162696187038983e-06,
1576
- "loss": 0.0521,
1577
- "num_tokens": 16047875.0,
1578
  "step": 980
1579
  },
1580
  {
1581
- "epoch": 2.905604719764012,
1582
- "grad_norm": 0.12047919631004333,
1583
- "learning_rate": 5.121732339953144e-06,
1584
- "loss": 0.0526,
1585
- "num_tokens": 16129795.0,
1586
  "step": 985
1587
  },
1588
  {
1589
- "epoch": 2.920353982300885,
1590
- "grad_norm": 0.1275618076324463,
1591
- "learning_rate": 5.086685504127355e-06,
1592
- "loss": 0.0525,
1593
- "num_tokens": 16211715.0,
1594
  "step": 990
1595
  },
1596
  {
1597
- "epoch": 2.935103244837758,
1598
- "grad_norm": 0.13480406999588013,
1599
- "learning_rate": 5.057564946253982e-06,
1600
- "loss": 0.054,
1601
- "num_tokens": 16293598.0,
1602
  "step": 995
1603
  },
1604
  {
1605
- "epoch": 2.949852507374631,
1606
- "grad_norm": 0.17191572487354279,
1607
- "learning_rate": 5.03437836606527e-06,
1608
- "loss": 0.0514,
1609
- "num_tokens": 16375518.0,
1610
  "step": 1000
1611
  },
1612
  {
1613
- "epoch": 2.9646017699115044,
1614
- "grad_norm": 0.1515762060880661,
1615
- "learning_rate": 5.0171318942974285e-06,
1616
- "loss": 0.054,
1617
- "num_tokens": 16457438.0,
1618
  "step": 1005
1619
  },
1620
  {
1621
- "epoch": 2.9793510324483776,
1622
- "grad_norm": 0.15460149943828583,
1623
- "learning_rate": 5.005830091069644e-06,
1624
- "loss": 0.053,
1625
- "num_tokens": 16539277.0,
1626
  "step": 1010
1627
  },
1628
  {
1629
- "epoch": 2.994100294985251,
1630
- "grad_norm": 0.11925122141838074,
1631
- "learning_rate": 5.000475944678329e-06,
1632
- "loss": 0.0522,
1633
- "num_tokens": 16620976.0,
1634
  "step": 1015
1635
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1636
  {
1637
  "epoch": 3.0,
1638
- "num_tokens": 16653744.0,
1639
- "step": 1017,
1640
- "total_flos": 2.7740847281799168e+17,
1641
- "train_loss": 0.0704414379602571,
1642
- "train_runtime": 3418.2006,
1643
- "train_samples_per_second": 18.999,
1644
- "train_steps_per_second": 0.298
1645
  }
1646
  ],
1647
  "logging_steps": 5,
1648
- "max_steps": 1017,
1649
  "num_input_tokens_seen": 0,
1650
  "num_train_epochs": 3,
1651
  "save_steps": 100,
@@ -1661,7 +2317,7 @@
1661
  "attributes": {}
1662
  }
1663
  },
1664
- "total_flos": 2.7740847281799168e+17,
1665
  "train_batch_size": 16,
1666
  "trial_name": null,
1667
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 1428,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.01050420168067227,
14
+ "grad_norm": 21.68925666809082,
15
+ "learning_rate": 3.4722222222222224e-06,
16
+ "loss": 1.7424,
17
+ "num_tokens": 61440.0,
18
  "step": 5
19
  },
20
  {
21
+ "epoch": 0.02100840336134454,
22
+ "grad_norm": 6.517210483551025,
23
+ "learning_rate": 6.944444444444445e-06,
24
+ "loss": 0.8531,
25
+ "num_tokens": 122880.0,
26
  "step": 10
27
  },
28
  {
29
+ "epoch": 0.031512605042016806,
30
+ "grad_norm": 1.3076003789901733,
31
+ "learning_rate": 1.0416666666666668e-05,
32
+ "loss": 0.2121,
33
+ "num_tokens": 184320.0,
34
  "step": 15
35
  },
36
  {
37
+ "epoch": 0.04201680672268908,
38
+ "grad_norm": 1.2455157041549683,
39
+ "learning_rate": 1.388888888888889e-05,
40
+ "loss": 0.113,
41
+ "num_tokens": 245760.0,
42
  "step": 20
43
  },
44
  {
45
+ "epoch": 0.052521008403361345,
46
+ "grad_norm": 0.854950487613678,
47
+ "learning_rate": 1.736111111111111e-05,
48
+ "loss": 0.0885,
49
+ "num_tokens": 307200.0,
50
  "step": 25
51
  },
52
  {
53
+ "epoch": 0.06302521008403361,
54
+ "grad_norm": 0.9929510951042175,
55
+ "learning_rate": 2.0833333333333336e-05,
56
+ "loss": 0.0771,
57
+ "num_tokens": 368640.0,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.07352941176470588,
62
+ "grad_norm": 1.0237308740615845,
63
+ "learning_rate": 2.4305555555555558e-05,
64
+ "loss": 0.0805,
65
+ "num_tokens": 430080.0,
66
  "step": 35
67
  },
68
  {
69
+ "epoch": 0.08403361344537816,
70
+ "grad_norm": 1.4065616130828857,
71
+ "learning_rate": 2.777777777777778e-05,
72
+ "loss": 0.0822,
73
+ "num_tokens": 491520.0,
74
  "step": 40
75
  },
76
  {
77
+ "epoch": 0.09453781512605042,
78
+ "grad_norm": 0.7961967587471008,
79
+ "learning_rate": 3.125e-05,
80
+ "loss": 0.0721,
81
+ "num_tokens": 552960.0,
82
  "step": 45
83
  },
84
  {
85
+ "epoch": 0.10504201680672269,
86
+ "grad_norm": 0.7718358039855957,
87
+ "learning_rate": 3.472222222222222e-05,
88
+ "loss": 0.0683,
89
+ "num_tokens": 614297.0,
90
  "step": 50
91
  },
92
  {
93
+ "epoch": 0.11554621848739496,
94
+ "grad_norm": 0.6594786047935486,
95
+ "learning_rate": 3.8194444444444444e-05,
96
+ "loss": 0.0665,
97
+ "num_tokens": 675536.0,
98
  "step": 55
99
  },
100
  {
101
+ "epoch": 0.12605042016806722,
102
+ "grad_norm": 0.637739896774292,
103
+ "learning_rate": 4.166666666666667e-05,
104
+ "loss": 0.0718,
105
+ "num_tokens": 736976.0,
106
  "step": 60
107
  },
108
  {
109
+ "epoch": 0.13655462184873948,
110
+ "grad_norm": 0.4128863215446472,
111
+ "learning_rate": 4.5138888888888894e-05,
112
+ "loss": 0.069,
113
+ "num_tokens": 798416.0,
114
  "step": 65
115
  },
116
  {
117
+ "epoch": 0.14705882352941177,
118
+ "grad_norm": 0.7012873888015747,
119
+ "learning_rate": 4.8611111111111115e-05,
120
+ "loss": 0.0651,
121
+ "num_tokens": 859856.0,
122
  "step": 70
123
  },
124
  {
125
+ "epoch": 0.15756302521008403,
126
+ "grad_norm": 0.6367799639701843,
127
+ "learning_rate": 4.9999456532409905e-05,
128
+ "loss": 0.0777,
129
+ "num_tokens": 921296.0,
130
  "step": 75
131
  },
132
  {
133
+ "epoch": 0.16806722689075632,
134
+ "grad_norm": 0.49970555305480957,
135
+ "learning_rate": 4.999613543665713e-05,
136
+ "loss": 0.0642,
137
+ "num_tokens": 982736.0,
138
  "step": 80
139
  },
140
  {
141
+ "epoch": 0.17857142857142858,
142
+ "grad_norm": 0.49360784888267517,
143
+ "learning_rate": 4.998979561670338e-05,
144
+ "loss": 0.0651,
145
+ "num_tokens": 1044176.0,
146
  "step": 85
147
  },
148
  {
149
+ "epoch": 0.18907563025210083,
150
+ "grad_norm": 0.432689905166626,
151
+ "learning_rate": 4.9980437923280036e-05,
152
+ "loss": 0.063,
153
+ "num_tokens": 1105611.0,
154
  "step": 90
155
  },
156
  {
157
+ "epoch": 0.19957983193277312,
158
+ "grad_norm": 0.6416879296302795,
159
+ "learning_rate": 4.996806361208257e-05,
160
+ "loss": 0.0659,
161
+ "num_tokens": 1167051.0,
162
  "step": 95
163
  },
164
  {
165
+ "epoch": 0.21008403361344538,
166
+ "grad_norm": 0.517444908618927,
167
+ "learning_rate": 4.995267434360207e-05,
168
  "loss": 0.0638,
169
+ "num_tokens": 1228314.0,
170
  "step": 100
171
  },
172
  {
173
+ "epoch": 0.22058823529411764,
174
+ "grad_norm": 107.99620819091797,
175
+ "learning_rate": 4.993427218290246e-05,
176
+ "loss": 0.6636,
177
+ "num_tokens": 1289602.0,
178
  "step": 105
179
  },
180
  {
181
+ "epoch": 0.23109243697478993,
182
+ "grad_norm": 0.5925813317298889,
183
+ "learning_rate": 4.991285959934332e-05,
184
+ "loss": 0.1472,
185
+ "num_tokens": 1351042.0,
186
  "step": 110
187
  },
188
  {
189
+ "epoch": 0.2415966386554622,
190
+ "grad_norm": 0.3515715003013611,
191
+ "learning_rate": 4.988843946624858e-05,
192
+ "loss": 0.0645,
193
+ "num_tokens": 1412482.0,
194
  "step": 115
195
  },
196
  {
197
+ "epoch": 0.25210084033613445,
198
+ "grad_norm": 0.4023377001285553,
199
+ "learning_rate": 4.9861015060520935e-05,
200
+ "loss": 0.0632,
201
+ "num_tokens": 1473922.0,
202
  "step": 120
203
  },
204
  {
205
+ "epoch": 0.26260504201680673,
206
+ "grad_norm": 0.37604016065597534,
207
+ "learning_rate": 4.9830590062202105e-05,
208
+ "loss": 0.0635,
209
+ "num_tokens": 1535362.0,
210
  "step": 125
211
  },
212
  {
213
+ "epoch": 0.27310924369747897,
214
+ "grad_norm": 3.824808359146118,
215
+ "learning_rate": 4.9797168553979054e-05,
216
+ "loss": 0.0704,
217
+ "num_tokens": 1596802.0,
218
  "step": 130
219
  },
220
  {
221
+ "epoch": 0.28361344537815125,
222
+ "grad_norm": 4.908388137817383,
223
+ "learning_rate": 4.976075502063613e-05,
224
+ "loss": 0.128,
225
+ "num_tokens": 1658080.0,
226
  "step": 135
227
  },
228
  {
229
+ "epoch": 0.29411764705882354,
230
+ "grad_norm": 0.3574487566947937,
231
+ "learning_rate": 4.97213543484532e-05,
232
+ "loss": 0.0682,
233
+ "num_tokens": 1719520.0,
234
  "step": 140
235
  },
236
  {
237
+ "epoch": 0.30462184873949577,
238
+ "grad_norm": 0.35017549991607666,
239
+ "learning_rate": 4.9678971824550074e-05,
240
+ "loss": 0.0593,
241
+ "num_tokens": 1780960.0,
242
  "step": 145
243
  },
244
  {
245
+ "epoch": 0.31512605042016806,
246
+ "grad_norm": 0.36095139384269714,
247
+ "learning_rate": 4.9633613136176925e-05,
248
+ "loss": 0.0605,
249
+ "num_tokens": 1842400.0,
250
  "step": 150
251
  },
252
  {
253
+ "epoch": 0.32563025210084034,
254
+ "grad_norm": 0.3857307434082031,
255
+ "learning_rate": 4.95852843699512e-05,
256
+ "loss": 0.059,
257
+ "num_tokens": 1903840.0,
258
  "step": 155
259
  },
260
  {
261
+ "epoch": 0.33613445378151263,
262
+ "grad_norm": 0.27103978395462036,
263
+ "learning_rate": 4.953399201104084e-05,
264
+ "loss": 0.0581,
265
+ "num_tokens": 1965280.0,
266
  "step": 160
267
  },
268
  {
269
+ "epoch": 0.34663865546218486,
270
+ "grad_norm": 0.29975804686546326,
271
+ "learning_rate": 4.9479742942294035e-05,
272
+ "loss": 0.0578,
273
+ "num_tokens": 2026720.0,
274
  "step": 165
275
  },
276
  {
277
+ "epoch": 0.35714285714285715,
278
+ "grad_norm": 0.3278215825557709,
279
+ "learning_rate": 4.9422544443315635e-05,
280
+ "loss": 0.056,
281
+ "num_tokens": 2088160.0,
282
  "step": 170
283
  },
284
  {
285
+ "epoch": 0.36764705882352944,
286
+ "grad_norm": 0.28858262300491333,
287
+ "learning_rate": 4.936240418949032e-05,
288
+ "loss": 0.0582,
289
+ "num_tokens": 2149600.0,
290
  "step": 175
291
  },
292
  {
293
+ "epoch": 0.37815126050420167,
294
+ "grad_norm": 0.36514896154403687,
295
+ "learning_rate": 4.929933025095261e-05,
296
+ "loss": 0.0577,
297
+ "num_tokens": 2211007.0,
298
  "step": 180
299
  },
300
  {
301
+ "epoch": 0.38865546218487396,
302
+ "grad_norm": 0.37031087279319763,
303
+ "learning_rate": 4.9233331091504034e-05,
304
+ "loss": 0.0554,
305
+ "num_tokens": 2272203.0,
306
  "step": 185
307
  },
308
  {
309
+ "epoch": 0.39915966386554624,
310
+ "grad_norm": 0.28869447112083435,
311
+ "learning_rate": 4.916441556747727e-05,
312
+ "loss": 0.0575,
313
+ "num_tokens": 2333494.0,
314
  "step": 190
315
  },
316
  {
317
+ "epoch": 0.4096638655462185,
318
+ "grad_norm": 0.29715102910995483,
319
+ "learning_rate": 4.909259292654782e-05,
320
+ "loss": 0.0573,
321
+ "num_tokens": 2394934.0,
322
  "step": 195
323
  },
324
  {
325
+ "epoch": 0.42016806722689076,
326
+ "grad_norm": 0.40769094228744507,
327
+ "learning_rate": 4.9017872806492995e-05,
328
+ "loss": 0.0583,
329
+ "num_tokens": 2456374.0,
330
  "step": 200
331
  },
332
  {
333
+ "epoch": 0.43067226890756305,
334
+ "grad_norm": 0.3187924325466156,
335
+ "learning_rate": 4.8940265233898744e-05,
336
+ "loss": 0.0546,
337
+ "num_tokens": 2517814.0,
338
  "step": 205
339
  },
340
  {
341
+ "epoch": 0.4411764705882353,
342
+ "grad_norm": 0.25656041502952576,
343
+ "learning_rate": 4.885978062281408e-05,
344
+ "loss": 0.057,
345
+ "num_tokens": 2579248.0,
346
  "step": 210
347
  },
348
  {
349
+ "epoch": 0.45168067226890757,
350
+ "grad_norm": 0.2591699957847595,
351
+ "learning_rate": 4.877642977335371e-05,
352
+ "loss": 0.0573,
353
+ "num_tokens": 2640688.0,
354
  "step": 215
355
  },
356
  {
357
+ "epoch": 0.46218487394957986,
358
+ "grad_norm": 0.3359907567501068,
359
+ "learning_rate": 4.869022387024879e-05,
360
+ "loss": 0.0538,
361
+ "num_tokens": 2702128.0,
362
  "step": 220
363
  },
364
  {
365
+ "epoch": 0.4726890756302521,
366
+ "grad_norm": 0.3090253174304962,
367
+ "learning_rate": 4.8601174481346015e-05,
368
+ "loss": 0.057,
369
+ "num_tokens": 2763412.0,
370
  "step": 225
371
  },
372
  {
373
+ "epoch": 0.4831932773109244,
374
+ "grad_norm": 0.2006455659866333,
375
+ "learning_rate": 4.8509293556055345e-05,
376
+ "loss": 0.0554,
377
+ "num_tokens": 2824852.0,
378
  "step": 230
379
  },
380
  {
381
+ "epoch": 0.49369747899159666,
382
+ "grad_norm": 0.255356103181839,
383
+ "learning_rate": 4.84145934237466e-05,
384
+ "loss": 0.0557,
385
+ "num_tokens": 2886292.0,
386
  "step": 235
387
  },
388
  {
389
+ "epoch": 0.5042016806722689,
390
+ "grad_norm": 0.21021538972854614,
391
+ "learning_rate": 4.8317086792094906e-05,
392
+ "loss": 0.0527,
393
+ "num_tokens": 2947732.0,
394
  "step": 240
395
  },
396
  {
397
+ "epoch": 0.5147058823529411,
398
+ "grad_norm": 0.24062202870845795,
399
+ "learning_rate": 4.821678674537557e-05,
400
+ "loss": 0.0545,
401
+ "num_tokens": 3009172.0,
402
  "step": 245
403
  },
404
  {
405
+ "epoch": 0.5252100840336135,
406
+ "grad_norm": 0.30908721685409546,
407
+ "learning_rate": 4.811370674270821e-05,
408
+ "loss": 0.0538,
409
+ "num_tokens": 3070612.0,
410
  "step": 250
411
  },
412
  {
413
+ "epoch": 0.5357142857142857,
414
+ "grad_norm": 0.2805303931236267,
415
+ "learning_rate": 4.800786061625078e-05,
416
+ "loss": 0.0528,
417
+ "num_tokens": 3132052.0,
418
  "step": 255
419
  },
420
  {
421
+ "epoch": 0.5462184873949579,
422
+ "grad_norm": 0.245064839720726,
423
+ "learning_rate": 4.789926256934345e-05,
424
+ "loss": 0.0566,
425
+ "num_tokens": 3193416.0,
426
  "step": 260
427
  },
428
  {
429
+ "epoch": 0.5567226890756303,
430
+ "grad_norm": 0.2865758538246155,
431
+ "learning_rate": 4.778792717460259e-05,
432
+ "loss": 0.0542,
433
+ "num_tokens": 3254856.0,
434
  "step": 265
435
  },
436
  {
437
+ "epoch": 0.5672268907563025,
438
+ "grad_norm": 0.2498483657836914,
439
+ "learning_rate": 4.7673869371965425e-05,
440
+ "loss": 0.0544,
441
+ "num_tokens": 3316296.0,
442
  "step": 270
443
  },
444
  {
445
+ "epoch": 0.5777310924369747,
446
+ "grad_norm": 0.24384859204292297,
447
+ "learning_rate": 4.755710446668515e-05,
448
+ "loss": 0.0564,
449
+ "num_tokens": 3377345.0,
450
  "step": 275
451
  },
452
  {
453
+ "epoch": 0.5882352941176471,
454
+ "grad_norm": 0.22256287932395935,
455
+ "learning_rate": 4.7437648127277216e-05,
456
+ "loss": 0.0543,
457
+ "num_tokens": 3438785.0,
458
  "step": 280
459
  },
460
  {
461
+ "epoch": 0.5987394957983193,
462
+ "grad_norm": 0.7542266845703125,
463
+ "learning_rate": 4.7315516383416736e-05,
464
+ "loss": 0.0595,
465
+ "num_tokens": 3500225.0,
466
  "step": 285
467
  },
468
  {
469
+ "epoch": 0.6092436974789915,
470
+ "grad_norm": 0.24494485557079315,
471
+ "learning_rate": 4.7190725623787545e-05,
472
+ "loss": 0.0565,
473
+ "num_tokens": 3561466.0,
474
  "step": 290
475
  },
476
  {
477
+ "epoch": 0.6197478991596639,
478
+ "grad_norm": 0.26762306690216064,
479
+ "learning_rate": 4.706329259388298e-05,
480
+ "loss": 0.0557,
481
+ "num_tokens": 3622906.0,
482
  "step": 295
483
  },
484
  {
485
+ "epoch": 0.6302521008403361,
486
+ "grad_norm": 0.285203754901886,
487
+ "learning_rate": 4.6933234393758844e-05,
488
+ "loss": 0.0537,
489
+ "num_tokens": 3684184.0,
490
  "step": 300
491
  },
492
  {
493
+ "epoch": 0.6407563025210085,
494
+ "grad_norm": 0.5487973690032959,
495
+ "learning_rate": 4.680056847573878e-05,
496
+ "loss": 0.0551,
497
+ "num_tokens": 3745624.0,
498
  "step": 305
499
  },
500
  {
501
+ "epoch": 0.6512605042016807,
502
+ "grad_norm": 0.2598506808280945,
503
+ "learning_rate": 4.666531264207235e-05,
504
+ "loss": 0.0542,
505
+ "num_tokens": 3806907.0,
506
  "step": 310
507
  },
508
  {
509
+ "epoch": 0.6617647058823529,
510
+ "grad_norm": 0.30089858174324036,
511
+ "learning_rate": 4.6527485042546204e-05,
512
+ "loss": 0.0576,
513
+ "num_tokens": 3868347.0,
514
  "step": 315
515
  },
516
  {
517
+ "epoch": 0.6722689075630253,
518
+ "grad_norm": 0.24752771854400635,
519
+ "learning_rate": 4.638710417204855e-05,
520
+ "loss": 0.0538,
521
+ "num_tokens": 3929787.0,
522
  "step": 320
523
  },
524
  {
525
+ "epoch": 0.6827731092436975,
526
+ "grad_norm": 0.2561896741390228,
527
+ "learning_rate": 4.6244188868087395e-05,
528
+ "loss": 0.0556,
529
+ "num_tokens": 3991227.0,
530
  "step": 325
531
  },
532
  {
533
+ "epoch": 0.6932773109243697,
534
+ "grad_norm": 0.3156558871269226,
535
+ "learning_rate": 4.609875830826272e-05,
536
+ "loss": 0.0564,
537
+ "num_tokens": 4052667.0,
538
  "step": 330
539
  },
540
  {
541
+ "epoch": 0.7037815126050421,
542
+ "grad_norm": 0.291674941778183,
543
+ "learning_rate": 4.59508320076931e-05,
544
+ "loss": 0.0558,
545
+ "num_tokens": 4114107.0,
546
  "step": 335
547
  },
548
  {
549
+ "epoch": 0.7142857142857143,
550
+ "grad_norm": 0.29663559794425964,
551
+ "learning_rate": 4.580042981639698e-05,
552
+ "loss": 0.0545,
553
+ "num_tokens": 4175485.0,
554
  "step": 340
555
  },
556
  {
557
+ "epoch": 0.7247899159663865,
558
+ "grad_norm": 0.2254744917154312,
559
+ "learning_rate": 4.5647571916629064e-05,
560
+ "loss": 0.0544,
561
+ "num_tokens": 4236925.0,
562
  "step": 345
563
  },
564
  {
565
+ "epoch": 0.7352941176470589,
566
+ "grad_norm": 0.327118843793869,
567
+ "learning_rate": 4.549227882017202e-05,
568
+ "loss": 0.0556,
569
+ "num_tokens": 4298365.0,
570
  "step": 350
571
  },
572
  {
573
+ "epoch": 0.7457983193277311,
574
+ "grad_norm": 0.24738788604736328,
575
+ "learning_rate": 4.533457136558408e-05,
576
+ "loss": 0.0533,
577
+ "num_tokens": 4359805.0,
578
  "step": 355
579
  },
580
  {
581
+ "epoch": 0.7563025210084033,
582
+ "grad_norm": 0.17776817083358765,
583
+ "learning_rate": 4.5174470715402764e-05,
584
+ "loss": 0.0559,
585
+ "num_tokens": 4421245.0,
586
  "step": 360
587
  },
588
  {
589
+ "epoch": 0.7668067226890757,
590
+ "grad_norm": 0.2214263677597046,
591
+ "learning_rate": 4.501199835330507e-05,
592
+ "loss": 0.0533,
593
+ "num_tokens": 4482685.0,
594
  "step": 365
595
  },
596
  {
597
+ "epoch": 0.7773109243697479,
598
+ "grad_norm": 0.2129214107990265,
599
+ "learning_rate": 4.484717608122459e-05,
600
+ "loss": 0.0542,
601
+ "num_tokens": 4544125.0,
602
  "step": 370
603
  },
604
  {
605
+ "epoch": 0.7878151260504201,
606
+ "grad_norm": 0.2668643295764923,
607
+ "learning_rate": 4.468002601642603e-05,
608
+ "loss": 0.052,
609
+ "num_tokens": 4605565.0,
610
  "step": 375
611
  },
612
  {
613
+ "epoch": 0.7983193277310925,
614
+ "grad_norm": 0.4849870502948761,
615
+ "learning_rate": 4.4510570588537206e-05,
616
+ "loss": 0.057,
617
+ "num_tokens": 4666849.0,
618
  "step": 380
619
  },
620
  {
621
+ "epoch": 0.8088235294117647,
622
+ "grad_norm": 0.2671234607696533,
623
+ "learning_rate": 4.433883253653936e-05,
624
+ "loss": 0.0533,
625
+ "num_tokens": 4728021.0,
626
  "step": 385
627
  },
628
  {
629
+ "epoch": 0.819327731092437,
630
+ "grad_norm": 0.8898406028747559,
631
+ "learning_rate": 4.416483490571574e-05,
632
+ "loss": 0.0551,
633
+ "num_tokens": 4789461.0,
634
  "step": 390
635
  },
636
  {
637
+ "epoch": 0.8298319327731093,
638
+ "grad_norm": 0.1947408765554428,
639
+ "learning_rate": 4.39886010445593e-05,
640
+ "loss": 0.0555,
641
+ "num_tokens": 4850862.0,
642
  "step": 395
643
  },
644
  {
645
+ "epoch": 0.8403361344537815,
646
+ "grad_norm": 0.3551996648311615,
647
+ "learning_rate": 4.381015460163949e-05,
648
+ "loss": 0.0559,
649
+ "num_tokens": 4912302.0,
650
  "step": 400
651
  },
652
  {
653
+ "epoch": 0.8508403361344538,
654
+ "grad_norm": 0.24865303933620453,
655
+ "learning_rate": 4.362951952242898e-05,
656
+ "loss": 0.0554,
657
+ "num_tokens": 4973742.0,
658
  "step": 405
659
  },
660
  {
661
+ "epoch": 0.8613445378151261,
662
+ "grad_norm": 0.21480585634708405,
663
+ "learning_rate": 4.344672004609037e-05,
664
+ "loss": 0.0536,
665
+ "num_tokens": 5035005.0,
666
  "step": 410
667
  },
668
  {
669
+ "epoch": 0.8718487394957983,
670
+ "grad_norm": 0.2362818419933319,
671
+ "learning_rate": 4.326178070222364e-05,
672
+ "loss": 0.0552,
673
+ "num_tokens": 5096371.0,
674
  "step": 415
675
  },
676
  {
677
+ "epoch": 0.8823529411764706,
678
+ "grad_norm": 0.3188863694667816,
679
+ "learning_rate": 4.3074726307574516e-05,
680
+ "loss": 0.0579,
681
+ "num_tokens": 5157811.0,
682
  "step": 420
683
  },
684
  {
685
+ "epoch": 0.8928571428571429,
686
+ "grad_norm": 0.2944329082965851,
687
+ "learning_rate": 4.2885581962704366e-05,
688
+ "loss": 0.0555,
689
+ "num_tokens": 5219251.0,
690
  "step": 425
691
  },
692
  {
693
+ "epoch": 0.9033613445378151,
694
+ "grad_norm": 0.35381075739860535,
695
+ "learning_rate": 4.2694373048622e-05,
696
+ "loss": 0.0548,
697
+ "num_tokens": 5280691.0,
698
  "step": 430
699
  },
700
  {
701
+ "epoch": 0.9138655462184874,
702
+ "grad_norm": 0.2967956066131592,
703
+ "learning_rate": 4.2501125223377754e-05,
704
+ "loss": 0.0542,
705
+ "num_tokens": 5342131.0,
706
  "step": 435
707
  },
708
  {
709
+ "epoch": 0.9243697478991597,
710
+ "grad_norm": 1.3979747295379639,
711
+ "learning_rate": 4.230586441862062e-05,
712
+ "loss": 0.0529,
713
+ "num_tokens": 5403410.0,
714
  "step": 440
715
  },
716
  {
717
+ "epoch": 0.9348739495798319,
718
+ "grad_norm": 0.2130478024482727,
719
+ "learning_rate": 4.210861683611837e-05,
720
+ "loss": 0.0546,
721
+ "num_tokens": 5464723.0,
722
  "step": 445
723
  },
724
  {
725
+ "epoch": 0.9453781512605042,
726
+ "grad_norm": 0.23148652911186218,
727
+ "learning_rate": 4.1909408944241644e-05,
728
+ "loss": 0.0543,
729
+ "num_tokens": 5526163.0,
730
  "step": 450
731
  },
732
  {
733
+ "epoch": 0.9558823529411765,
734
+ "grad_norm": 0.17318475246429443,
735
+ "learning_rate": 4.1708267474412215e-05,
736
+ "loss": 0.0543,
737
+ "num_tokens": 5587603.0,
738
  "step": 455
739
  },
740
  {
741
+ "epoch": 0.9663865546218487,
742
+ "grad_norm": 0.28641510009765625,
743
+ "learning_rate": 4.1505219417515884e-05,
744
+ "loss": 0.0549,
745
+ "num_tokens": 5649043.0,
746
  "step": 460
747
  },
748
  {
749
+ "epoch": 0.976890756302521,
750
+ "grad_norm": 0.35609665513038635,
751
+ "learning_rate": 4.1300292020280645e-05,
752
+ "loss": 0.056,
753
+ "num_tokens": 5710483.0,
754
  "step": 465
755
  },
756
  {
757
+ "epoch": 0.9873949579831933,
758
+ "grad_norm": 0.2831043601036072,
759
+ "learning_rate": 4.10935127816205e-05,
760
+ "loss": 0.0574,
761
+ "num_tokens": 5771914.0,
762
  "step": 470
763
  },
764
  {
765
+ "epoch": 0.9978991596638656,
766
+ "grad_norm": 0.2836240530014038,
767
+ "learning_rate": 4.088490944894539e-05,
768
+ "loss": 0.0515,
769
+ "num_tokens": 5833354.0,
770
  "step": 475
771
  },
772
  {
773
+ "epoch": 1.0084033613445378,
774
+ "grad_norm": 2.6926634311676025,
775
+ "learning_rate": 4.06745100144378e-05,
776
+ "loss": 0.0558,
777
+ "num_tokens": 5894794.0,
778
  "step": 480
779
  },
780
  {
781
+ "epoch": 1.01890756302521,
782
+ "grad_norm": 0.17681336402893066,
783
+ "learning_rate": 4.0462342711296584e-05,
784
+ "loss": 0.0523,
785
+ "num_tokens": 5956077.0,
786
  "step": 485
787
  },
788
  {
789
+ "epoch": 1.0294117647058822,
790
+ "grad_norm": 0.2838696539402008,
791
+ "learning_rate": 4.024843600994833e-05,
792
+ "loss": 0.0537,
793
+ "num_tokens": 6017517.0,
794
  "step": 490
795
  },
796
  {
797
+ "epoch": 1.0399159663865547,
798
+ "grad_norm": 0.2431504875421524,
799
+ "learning_rate": 4.003281861422699e-05,
800
+ "loss": 0.0537,
801
+ "num_tokens": 6078801.0,
802
  "step": 495
803
  },
804
  {
805
+ "epoch": 1.050420168067227,
806
+ "grad_norm": 0.2204369157552719,
807
+ "learning_rate": 3.981551945752215e-05,
808
+ "loss": 0.0538,
809
+ "num_tokens": 6140232.0,
810
  "step": 500
811
  },
812
  {
813
+ "epoch": 1.0609243697478992,
814
+ "grad_norm": 0.2458706945180893,
815
+ "learning_rate": 3.959656769889646e-05,
816
+ "loss": 0.0545,
817
+ "num_tokens": 6201672.0,
818
  "step": 505
819
  },
820
  {
821
+ "epoch": 1.0714285714285714,
822
+ "grad_norm": 0.21258144080638885,
823
+ "learning_rate": 3.937599271917292e-05,
824
+ "loss": 0.056,
825
+ "num_tokens": 6263112.0,
826
  "step": 510
827
  },
828
  {
829
+ "epoch": 1.0819327731092436,
830
+ "grad_norm": 0.2708013355731964,
831
+ "learning_rate": 3.915382411699218e-05,
832
+ "loss": 0.0547,
833
+ "num_tokens": 6324552.0,
834
  "step": 515
835
  },
836
  {
837
+ "epoch": 1.092436974789916,
838
+ "grad_norm": 2.9274137020111084,
839
+ "learning_rate": 3.893009170484085e-05,
840
+ "loss": 0.0524,
841
+ "num_tokens": 6385992.0,
842
  "step": 520
843
  },
844
  {
845
+ "epoch": 1.1029411764705883,
846
+ "grad_norm": 0.3301822543144226,
847
+ "learning_rate": 3.870482550505094e-05,
848
+ "loss": 0.0554,
849
+ "num_tokens": 6447432.0,
850
  "step": 525
851
  },
852
  {
853
+ "epoch": 1.1134453781512605,
854
+ "grad_norm": 0.4145117700099945,
855
+ "learning_rate": 3.847805574577123e-05,
856
+ "loss": 0.0551,
857
+ "num_tokens": 6508872.0,
858
  "step": 530
859
  },
860
  {
861
+ "epoch": 1.1239495798319328,
862
+ "grad_norm": 0.2403404861688614,
863
+ "learning_rate": 3.8249812856910985e-05,
864
+ "loss": 0.0576,
865
+ "num_tokens": 6570312.0,
866
  "step": 535
867
  },
868
  {
869
+ "epoch": 1.134453781512605,
870
+ "grad_norm": 0.2703566551208496,
871
+ "learning_rate": 3.8020127466056636e-05,
872
+ "loss": 0.0526,
873
+ "num_tokens": 6631553.0,
874
  "step": 540
875
  },
876
  {
877
+ "epoch": 1.1449579831932772,
878
+ "grad_norm": 0.23629266023635864,
879
+ "learning_rate": 3.778903039436189e-05,
880
+ "loss": 0.053,
881
+ "num_tokens": 6692993.0,
882
  "step": 545
883
  },
884
  {
885
+ "epoch": 1.1554621848739495,
886
+ "grad_norm": 31.853532791137695,
887
+ "learning_rate": 3.755655265241187e-05,
888
+ "loss": 0.0551,
889
+ "num_tokens": 6754394.0,
890
  "step": 550
891
  },
892
  {
893
+ "epoch": 1.165966386554622,
894
+ "grad_norm": 0.274700790643692,
895
+ "learning_rate": 3.7322725436061875e-05,
896
+ "loss": 0.0534,
897
+ "num_tokens": 6815834.0,
898
  "step": 555
899
  },
900
  {
901
+ "epoch": 1.1764705882352942,
902
+ "grad_norm": 0.26774168014526367,
903
+ "learning_rate": 3.708758012225125e-05,
904
+ "loss": 0.0528,
905
+ "num_tokens": 6877269.0,
906
  "step": 560
907
  },
908
  {
909
+ "epoch": 1.1869747899159664,
910
+ "grad_norm": 0.2192794382572174,
911
+ "learning_rate": 3.685114826479292e-05,
912
+ "loss": 0.0543,
913
+ "num_tokens": 6938555.0,
914
  "step": 565
915
  },
916
  {
917
+ "epoch": 1.1974789915966386,
918
+ "grad_norm": 0.4298264980316162,
919
+ "learning_rate": 3.661346159013929e-05,
920
+ "loss": 0.0536,
921
+ "num_tokens": 6999704.0,
922
  "step": 570
923
  },
924
  {
925
+ "epoch": 1.2079831932773109,
926
+ "grad_norm": 0.19733546674251556,
927
+ "learning_rate": 3.637455199312488e-05,
928
+ "loss": 0.053,
929
+ "num_tokens": 7061144.0,
930
  "step": 575
931
  },
932
  {
933
+ "epoch": 1.2184873949579833,
934
+ "grad_norm": 0.22298553586006165,
935
+ "learning_rate": 3.61344515326864e-05,
936
+ "loss": 0.0532,
937
+ "num_tokens": 7122407.0,
938
  "step": 580
939
  },
940
  {
941
+ "epoch": 1.2289915966386555,
942
+ "grad_norm": 0.19753730297088623,
943
+ "learning_rate": 3.5893192427560834e-05,
944
+ "loss": 0.0536,
945
+ "num_tokens": 7183847.0,
946
  "step": 585
947
  },
948
  {
949
+ "epoch": 1.2394957983193278,
950
+ "grad_norm": 0.20278260111808777,
951
+ "learning_rate": 3.565080705196202e-05,
952
+ "loss": 0.0525,
953
+ "num_tokens": 7245125.0,
954
  "step": 590
955
  },
956
  {
957
+ "epoch": 1.25,
958
+ "grad_norm": 0.8037598133087158,
959
+ "learning_rate": 3.5407327931236434e-05,
960
+ "loss": 0.0536,
961
+ "num_tokens": 7306565.0,
962
  "step": 595
963
  },
964
  {
965
+ "epoch": 1.2605042016806722,
966
+ "grad_norm": 0.25609511137008667,
967
+ "learning_rate": 3.516278773749863e-05,
968
+ "loss": 0.0534,
969
+ "num_tokens": 7368005.0,
970
  "step": 600
971
  },
972
  {
973
+ "epoch": 1.2710084033613445,
974
+ "grad_norm": 0.18984173238277435,
975
+ "learning_rate": 3.4917219285247036e-05,
976
+ "loss": 0.0517,
977
+ "num_tokens": 7429445.0,
978
  "step": 605
979
  },
980
  {
981
+ "epoch": 1.2815126050420167,
982
+ "grad_norm": 0.17902691662311554,
983
+ "learning_rate": 3.4670655526960627e-05,
984
+ "loss": 0.0538,
985
+ "num_tokens": 7490885.0,
986
  "step": 610
987
  },
988
  {
989
+ "epoch": 1.2920168067226891,
990
+ "grad_norm": 0.19289465248584747,
991
+ "learning_rate": 3.4423129548677055e-05,
992
+ "loss": 0.0526,
993
+ "num_tokens": 7552325.0,
994
  "step": 615
995
  },
996
  {
997
+ "epoch": 1.3025210084033614,
998
+ "grad_norm": 0.21860499680042267,
999
+ "learning_rate": 3.41746745655529e-05,
1000
+ "loss": 0.0546,
1001
+ "num_tokens": 7613765.0,
1002
  "step": 620
1003
  },
1004
  {
1005
+ "epoch": 1.3130252100840336,
1006
+ "grad_norm": 0.19712497293949127,
1007
+ "learning_rate": 3.3925323917406574e-05,
1008
+ "loss": 0.0538,
1009
+ "num_tokens": 7675205.0,
1010
  "step": 625
1011
  },
1012
  {
1013
+ "epoch": 1.3235294117647058,
1014
+ "grad_norm": 0.22082890570163727,
1015
+ "learning_rate": 3.3675111064244504e-05,
1016
+ "loss": 0.0537,
1017
+ "num_tokens": 7736645.0,
1018
  "step": 630
1019
  },
1020
  {
1021
+ "epoch": 1.334033613445378,
1022
+ "grad_norm": 0.20152664184570312,
1023
+ "learning_rate": 3.3424069581771155e-05,
1024
+ "loss": 0.0529,
1025
+ "num_tokens": 7798085.0,
1026
  "step": 635
1027
  },
1028
  {
1029
+ "epoch": 1.3445378151260505,
1030
+ "grad_norm": 0.22678756713867188,
1031
+ "learning_rate": 3.317223315688358e-05,
1032
+ "loss": 0.0539,
1033
+ "num_tokens": 7859525.0,
1034
  "step": 640
1035
  },
1036
  {
1037
+ "epoch": 1.3550420168067228,
1038
+ "grad_norm": 0.2191995084285736,
1039
+ "learning_rate": 3.2919635583151025e-05,
1040
+ "loss": 0.0529,
1041
+ "num_tokens": 7920965.0,
1042
  "step": 645
1043
  },
1044
  {
1045
+ "epoch": 1.365546218487395,
1046
+ "grad_norm": 0.18456892669200897,
1047
+ "learning_rate": 3.2666310756280194e-05,
1048
+ "loss": 0.0544,
1049
+ "num_tokens": 7982405.0,
1050
  "step": 650
1051
  },
1052
  {
1053
+ "epoch": 1.3760504201680672,
1054
+ "grad_norm": 0.1721736490726471,
1055
+ "learning_rate": 3.241229266956687e-05,
1056
+ "loss": 0.054,
1057
+ "num_tokens": 8043845.0,
1058
  "step": 655
1059
  },
1060
  {
1061
+ "epoch": 1.3865546218487395,
1062
+ "grad_norm": 0.22381868958473206,
1063
+ "learning_rate": 3.215761540933436e-05,
1064
+ "loss": 0.0525,
1065
+ "num_tokens": 8105285.0,
1066
  "step": 660
1067
  },
1068
  {
1069
+ "epoch": 1.3970588235294117,
1070
+ "grad_norm": 0.19951923191547394,
1071
+ "learning_rate": 3.190231315035954e-05,
1072
+ "loss": 0.0514,
1073
+ "num_tokens": 8166725.0,
1074
  "step": 665
1075
  },
1076
  {
1077
+ "epoch": 1.407563025210084,
1078
+ "grad_norm": 0.21700704097747803,
1079
+ "learning_rate": 3.164642015128694e-05,
1080
+ "loss": 0.0531,
1081
+ "num_tokens": 8228159.0,
1082
  "step": 670
1083
  },
1084
  {
1085
+ "epoch": 1.4180672268907564,
1086
+ "grad_norm": 0.13238979876041412,
1087
+ "learning_rate": 3.13899707500317e-05,
1088
+ "loss": 0.0503,
1089
+ "num_tokens": 8289370.0,
1090
  "step": 675
1091
  },
1092
  {
1093
+ "epoch": 1.4285714285714286,
1094
+ "grad_norm": 0.16383114457130432,
1095
+ "learning_rate": 3.1132999359171737e-05,
1096
+ "loss": 0.0513,
1097
+ "num_tokens": 8350810.0,
1098
  "step": 680
1099
  },
1100
  {
1101
+ "epoch": 1.4390756302521008,
1102
+ "grad_norm": 0.18902327120304108,
1103
+ "learning_rate": 3.087554046133004e-05,
1104
+ "loss": 0.052,
1105
+ "num_tokens": 8412174.0,
1106
  "step": 685
1107
  },
1108
  {
1109
+ "epoch": 1.449579831932773,
1110
+ "grad_norm": 0.16599516570568085,
1111
+ "learning_rate": 3.0617628604547424e-05,
1112
+ "loss": 0.0533,
1113
+ "num_tokens": 8473614.0,
1114
  "step": 690
1115
  },
1116
  {
1117
+ "epoch": 1.4600840336134453,
1118
+ "grad_norm": 0.20513266324996948,
1119
+ "learning_rate": 3.035929839764665e-05,
1120
+ "loss": 0.0507,
1121
+ "num_tokens": 8535054.0,
1122
  "step": 695
1123
  },
1124
  {
1125
+ "epoch": 1.4705882352941178,
1126
+ "grad_norm": 0.22719748318195343,
1127
+ "learning_rate": 3.0100584505588275e-05,
1128
+ "loss": 0.052,
1129
+ "num_tokens": 8596494.0,
1130
  "step": 700
1131
  },
1132
  {
1133
+ "epoch": 1.48109243697479,
1134
+ "grad_norm": 0.18906480073928833,
1135
+ "learning_rate": 2.9841521644818976e-05,
1136
+ "loss": 0.0516,
1137
+ "num_tokens": 8657934.0,
1138
  "step": 705
1139
  },
1140
  {
1141
+ "epoch": 1.4915966386554622,
1142
+ "grad_norm": 0.17335395514965057,
1143
+ "learning_rate": 2.9582144578613102e-05,
1144
+ "loss": 0.0496,
1145
+ "num_tokens": 8719374.0,
1146
  "step": 710
1147
  },
1148
  {
1149
+ "epoch": 1.5021008403361344,
1150
+ "grad_norm": 0.20907028019428253,
1151
+ "learning_rate": 2.9322488112407743e-05,
1152
+ "loss": 0.0523,
1153
+ "num_tokens": 8780740.0,
1154
  "step": 715
1155
  },
1156
  {
1157
+ "epoch": 1.5126050420168067,
1158
+ "grad_norm": 0.21596895158290863,
1159
+ "learning_rate": 2.906258708913228e-05,
1160
+ "loss": 0.053,
1161
+ "num_tokens": 8842180.0,
1162
  "step": 720
1163
  },
1164
  {
1165
+ "epoch": 1.523109243697479,
1166
+ "grad_norm": 0.21814534068107605,
1167
+ "learning_rate": 2.880247638453288e-05,
1168
+ "loss": 0.0535,
1169
+ "num_tokens": 8903620.0,
1170
  "step": 725
1171
  },
1172
  {
1173
+ "epoch": 1.5336134453781511,
1174
+ "grad_norm": 0.17172180116176605,
1175
+ "learning_rate": 2.854219090249251e-05,
1176
+ "loss": 0.0511,
1177
+ "num_tokens": 8965060.0,
1178
  "step": 730
1179
  },
1180
  {
1181
+ "epoch": 1.5441176470588234,
1182
+ "grad_norm": 0.144153892993927,
1183
+ "learning_rate": 2.8281765570347306e-05,
1184
+ "loss": 0.0509,
1185
+ "num_tokens": 9026344.0,
1186
  "step": 735
1187
  },
1188
  {
1189
+ "epoch": 1.5546218487394958,
1190
+ "grad_norm": 0.1880050003528595,
1191
+ "learning_rate": 2.802123533419966e-05,
1192
+ "loss": 0.0546,
1193
+ "num_tokens": 9087784.0,
1194
  "step": 740
1195
  },
1196
  {
1197
+ "epoch": 1.565126050420168,
1198
+ "grad_norm": 0.15667995810508728,
1199
+ "learning_rate": 2.7760635154228896e-05,
1200
+ "loss": 0.051,
1201
+ "num_tokens": 9149063.0,
1202
  "step": 745
1203
  },
1204
  {
1205
+ "epoch": 1.5756302521008403,
1206
+ "grad_norm": 0.2708027958869934,
1207
+ "learning_rate": 2.7500000000000004e-05,
1208
+ "loss": 0.0544,
1209
+ "num_tokens": 9210503.0,
1210
  "step": 750
1211
  },
1212
  {
1213
+ "epoch": 1.5861344537815127,
1214
+ "grad_norm": 0.1797020584344864,
1215
+ "learning_rate": 2.723936484577111e-05,
1216
+ "loss": 0.0528,
1217
+ "num_tokens": 9271881.0,
1218
  "step": 755
1219
  },
1220
  {
1221
+ "epoch": 1.596638655462185,
1222
+ "grad_norm": 0.21367360651493073,
1223
+ "learning_rate": 2.6978764665800343e-05,
1224
+ "loss": 0.0535,
1225
+ "num_tokens": 9333321.0,
1226
  "step": 760
1227
  },
1228
  {
1229
+ "epoch": 1.6071428571428572,
1230
+ "grad_norm": 0.21819233894348145,
1231
+ "learning_rate": 2.67182344296527e-05,
1232
+ "loss": 0.0522,
1233
+ "num_tokens": 9394761.0,
1234
  "step": 765
1235
  },
1236
  {
1237
+ "epoch": 1.6176470588235294,
1238
+ "grad_norm": 0.1992005854845047,
1239
+ "learning_rate": 2.6457809097507496e-05,
1240
+ "loss": 0.0506,
1241
+ "num_tokens": 9456201.0,
1242
  "step": 770
1243
  },
1244
  {
1245
+ "epoch": 1.6281512605042017,
1246
+ "grad_norm": 0.19866126775741577,
1247
+ "learning_rate": 2.619752361546713e-05,
1248
+ "loss": 0.0518,
1249
+ "num_tokens": 9517492.0,
1250
  "step": 775
1251
  },
1252
  {
1253
+ "epoch": 1.638655462184874,
1254
+ "grad_norm": 0.174868643283844,
1255
+ "learning_rate": 2.593741291086772e-05,
1256
+ "loss": 0.0532,
1257
+ "num_tokens": 9578932.0,
1258
  "step": 780
1259
  },
1260
  {
1261
+ "epoch": 1.6491596638655461,
1262
+ "grad_norm": 0.22887223958969116,
1263
+ "learning_rate": 2.567751188759227e-05,
1264
+ "loss": 0.0523,
1265
+ "num_tokens": 9640372.0,
1266
  "step": 785
1267
  },
1268
  {
1269
+ "epoch": 1.6596638655462184,
1270
+ "grad_norm": 0.17208142578601837,
1271
+ "learning_rate": 2.541785542138691e-05,
1272
+ "loss": 0.0502,
1273
+ "num_tokens": 9701812.0,
1274
  "step": 790
1275
  },
1276
  {
1277
+ "epoch": 1.6701680672268906,
1278
+ "grad_norm": 0.21313603222370148,
1279
+ "learning_rate": 2.515847835518103e-05,
1280
+ "loss": 0.0526,
1281
+ "num_tokens": 9763075.0,
1282
  "step": 795
1283
  },
1284
  {
1285
+ "epoch": 1.680672268907563,
1286
+ "grad_norm": 0.15035264194011688,
1287
+ "learning_rate": 2.4899415494411737e-05,
1288
+ "loss": 0.0507,
1289
+ "num_tokens": 9824515.0,
1290
  "step": 800
1291
  },
1292
  {
1293
+ "epoch": 1.6911764705882353,
1294
+ "grad_norm": 0.2601998746395111,
1295
+ "learning_rate": 2.464070160235335e-05,
1296
+ "loss": 0.0526,
1297
+ "num_tokens": 9885955.0,
1298
  "step": 805
1299
  },
1300
  {
1301
+ "epoch": 1.7016806722689075,
1302
+ "grad_norm": 0.17123112082481384,
1303
+ "learning_rate": 2.438237139545258e-05,
1304
+ "loss": 0.0521,
1305
+ "num_tokens": 9947395.0,
1306
  "step": 810
1307
  },
1308
  {
1309
+ "epoch": 1.71218487394958,
1310
+ "grad_norm": 0.17243990302085876,
1311
+ "learning_rate": 2.412445953866997e-05,
1312
+ "loss": 0.0502,
1313
+ "num_tokens": 10008835.0,
1314
  "step": 815
1315
  },
1316
  {
1317
+ "epoch": 1.7226890756302522,
1318
+ "grad_norm": 0.21723228693008423,
1319
+ "learning_rate": 2.386700064082827e-05,
1320
+ "loss": 0.0517,
1321
+ "num_tokens": 10070123.0,
1322
  "step": 820
1323
  },
1324
  {
1325
+ "epoch": 1.7331932773109244,
1326
+ "grad_norm": 0.13738787174224854,
1327
+ "learning_rate": 2.361002924996831e-05,
1328
+ "loss": 0.051,
1329
+ "num_tokens": 10131563.0,
1330
  "step": 825
1331
  },
1332
  {
1333
+ "epoch": 1.7436974789915967,
1334
+ "grad_norm": 0.21257147192955017,
1335
+ "learning_rate": 2.3353579848713063e-05,
1336
+ "loss": 0.0522,
1337
+ "num_tokens": 10192967.0,
1338
  "step": 830
1339
  },
1340
  {
1341
+ "epoch": 1.754201680672269,
1342
+ "grad_norm": 0.20029078423976898,
1343
+ "learning_rate": 2.3097686849640476e-05,
1344
+ "loss": 0.0543,
1345
+ "num_tokens": 10254407.0,
1346
  "step": 835
1347
  },
1348
  {
1349
+ "epoch": 1.7647058823529411,
1350
+ "grad_norm": 0.20497262477874756,
1351
+ "learning_rate": 2.2842384590665645e-05,
1352
+ "loss": 0.0526,
1353
+ "num_tokens": 10315847.0,
1354
  "step": 840
1355
  },
1356
  {
1357
+ "epoch": 1.7752100840336134,
1358
+ "grad_norm": 0.19529034197330475,
1359
+ "learning_rate": 2.2587707330433133e-05,
1360
+ "loss": 0.052,
1361
+ "num_tokens": 10377287.0,
1362
  "step": 845
1363
  },
1364
  {
1365
+ "epoch": 1.7857142857142856,
1366
+ "grad_norm": 0.19968290627002716,
1367
+ "learning_rate": 2.23336892437198e-05,
1368
+ "loss": 0.0511,
1369
+ "num_tokens": 10438565.0,
1370
  "step": 850
1371
  },
1372
  {
1373
+ "epoch": 1.7962184873949578,
1374
+ "grad_norm": 0.20312048494815826,
1375
+ "learning_rate": 2.2080364416848987e-05,
1376
+ "loss": 0.0508,
1377
+ "num_tokens": 10500005.0,
1378
  "step": 855
1379
  },
1380
  {
1381
+ "epoch": 1.8067226890756303,
1382
+ "grad_norm": 0.2170594483613968,
1383
+ "learning_rate": 2.1827766843116428e-05,
1384
+ "loss": 0.052,
1385
+ "num_tokens": 10561445.0,
1386
  "step": 860
1387
  },
1388
  {
1389
+ "epoch": 1.8172268907563025,
1390
+ "grad_norm": 0.20793762803077698,
1391
+ "learning_rate": 2.157593041822885e-05,
1392
+ "loss": 0.0507,
1393
+ "num_tokens": 10622885.0,
1394
  "step": 865
1395
  },
1396
  {
1397
+ "epoch": 1.8277310924369747,
1398
+ "grad_norm": 0.18194827437400818,
1399
+ "learning_rate": 2.1324888935755498e-05,
1400
+ "loss": 0.0512,
1401
+ "num_tokens": 10684325.0,
1402
  "step": 870
1403
  },
1404
  {
1405
+ "epoch": 1.8382352941176472,
1406
+ "grad_norm": 0.14043785631656647,
1407
+ "learning_rate": 2.1074676082593425e-05,
1408
+ "loss": 0.0507,
1409
+ "num_tokens": 10745533.0,
1410
  "step": 875
1411
  },
1412
  {
1413
+ "epoch": 1.8487394957983194,
1414
+ "grad_norm": 0.17620113492012024,
1415
+ "learning_rate": 2.0825325434447106e-05,
1416
+ "loss": 0.0526,
1417
+ "num_tokens": 10806971.0,
1418
  "step": 880
1419
  },
1420
  {
1421
+ "epoch": 1.8592436974789917,
1422
+ "grad_norm": 0.17084655165672302,
1423
+ "learning_rate": 2.0576870451322953e-05,
1424
+ "loss": 0.05,
1425
+ "num_tokens": 10868411.0,
1426
  "step": 885
1427
  },
1428
  {
1429
+ "epoch": 1.8697478991596639,
1430
+ "grad_norm": 0.17954443395137787,
1431
+ "learning_rate": 2.032934447303938e-05,
1432
+ "loss": 0.0479,
1433
+ "num_tokens": 10929851.0,
1434
  "step": 890
1435
  },
1436
  {
1437
+ "epoch": 1.8802521008403361,
1438
+ "grad_norm": 0.19004443287849426,
1439
+ "learning_rate": 2.0082780714752963e-05,
1440
+ "loss": 0.0516,
1441
+ "num_tokens": 10991291.0,
1442
  "step": 895
1443
  },
1444
  {
1445
+ "epoch": 1.8907563025210083,
1446
+ "grad_norm": 0.1933521330356598,
1447
+ "learning_rate": 1.9837212262501382e-05,
1448
+ "loss": 0.0526,
1449
+ "num_tokens": 11052731.0,
1450
  "step": 900
1451
  },
1452
  {
1453
+ "epoch": 1.9012605042016806,
1454
+ "grad_norm": 0.1794319450855255,
1455
+ "learning_rate": 1.9592672068763574e-05,
1456
+ "loss": 0.052,
1457
+ "num_tokens": 11114068.0,
1458
  "step": 905
1459
  },
1460
  {
1461
+ "epoch": 1.9117647058823528,
1462
+ "grad_norm": 0.16216090321540833,
1463
+ "learning_rate": 1.934919294803798e-05,
1464
+ "loss": 0.0519,
1465
+ "num_tokens": 11175508.0,
1466
  "step": 910
1467
  },
1468
  {
1469
+ "epoch": 1.9222689075630253,
1470
+ "grad_norm": 0.19467321038246155,
1471
+ "learning_rate": 1.9106807572439168e-05,
1472
  "loss": 0.0506,
1473
+ "num_tokens": 11236948.0,
1474
  "step": 915
1475
  },
1476
  {
1477
+ "epoch": 1.9327731092436975,
1478
+ "grad_norm": 0.13739857077598572,
1479
+ "learning_rate": 1.88655484673136e-05,
1480
+ "loss": 0.0516,
1481
+ "num_tokens": 11298388.0,
1482
  "step": 920
1483
  },
1484
  {
1485
+ "epoch": 1.9432773109243697,
1486
+ "grad_norm": 0.15686306357383728,
1487
+ "learning_rate": 1.8625448006875123e-05,
1488
+ "loss": 0.0505,
1489
+ "num_tokens": 11359828.0,
1490
  "step": 925
1491
  },
1492
  {
1493
+ "epoch": 1.9537815126050422,
1494
+ "grad_norm": 0.12999138236045837,
1495
+ "learning_rate": 1.8386538409860708e-05,
1496
+ "loss": 0.051,
1497
+ "num_tokens": 11421268.0,
1498
  "step": 930
1499
  },
1500
  {
1501
+ "epoch": 1.9642857142857144,
1502
+ "grad_norm": 0.18375808000564575,
1503
+ "learning_rate": 1.8148851735207083e-05,
1504
  "loss": 0.0523,
1505
+ "num_tokens": 11482548.0,
1506
  "step": 935
1507
  },
1508
  {
1509
+ "epoch": 1.9747899159663866,
1510
+ "grad_norm": 0.19671285152435303,
1511
+ "learning_rate": 1.791241987774876e-05,
1512
+ "loss": 0.0509,
1513
+ "num_tokens": 11543988.0,
1514
  "step": 940
1515
  },
1516
  {
1517
+ "epoch": 1.9852941176470589,
1518
+ "grad_norm": 0.1805330216884613,
1519
+ "learning_rate": 1.7677274563938134e-05,
1520
+ "loss": 0.0503,
1521
+ "num_tokens": 11605268.0,
1522
  "step": 945
1523
  },
1524
  {
1525
+ "epoch": 1.995798319327731,
1526
+ "grad_norm": 0.19455303251743317,
1527
+ "learning_rate": 1.744344734758814e-05,
1528
+ "loss": 0.0517,
1529
+ "num_tokens": 11666708.0,
1530
  "step": 950
1531
  },
1532
  {
1533
+ "epoch": 2.0063025210084033,
1534
+ "grad_norm": 0.17816315591335297,
1535
+ "learning_rate": 1.721096960563812e-05,
1536
+ "loss": 0.0507,
1537
+ "num_tokens": 11728148.0,
1538
  "step": 955
1539
  },
1540
  {
1541
+ "epoch": 2.0168067226890756,
1542
+ "grad_norm": 0.12756673991680145,
1543
+ "learning_rate": 1.697987253394337e-05,
1544
+ "loss": 0.0491,
1545
+ "num_tokens": 11789273.0,
1546
  "step": 960
1547
  },
1548
  {
1549
+ "epoch": 2.027310924369748,
1550
+ "grad_norm": 0.19657427072525024,
1551
+ "learning_rate": 1.675018714308902e-05,
1552
+ "loss": 0.0504,
1553
+ "num_tokens": 11850713.0,
1554
  "step": 965
1555
  },
1556
  {
1557
+ "epoch": 2.03781512605042,
1558
+ "grad_norm": 0.1950300633907318,
1559
+ "learning_rate": 1.652194425422878e-05,
1560
+ "loss": 0.0505,
1561
+ "num_tokens": 11912153.0,
1562
  "step": 970
1563
  },
1564
  {
1565
+ "epoch": 2.0483193277310923,
1566
+ "grad_norm": 0.16631367802619934,
1567
+ "learning_rate": 1.629517449494906e-05,
1568
+ "loss": 0.0502,
1569
+ "num_tokens": 11973593.0,
1570
  "step": 975
1571
  },
1572
  {
1573
+ "epoch": 2.0588235294117645,
1574
+ "grad_norm": 0.17350395023822784,
1575
+ "learning_rate": 1.6069908295159146e-05,
1576
+ "loss": 0.0526,
1577
+ "num_tokens": 12035033.0,
1578
  "step": 980
1579
  },
1580
  {
1581
+ "epoch": 2.069327731092437,
1582
+ "grad_norm": 0.18997882306575775,
1583
+ "learning_rate": 1.5846175883007815e-05,
1584
+ "loss": 0.0493,
1585
+ "num_tokens": 12096473.0,
1586
  "step": 985
1587
  },
1588
  {
1589
+ "epoch": 2.0798319327731094,
1590
+ "grad_norm": 0.1386975198984146,
1591
+ "learning_rate": 1.562400728082709e-05,
1592
+ "loss": 0.0497,
1593
+ "num_tokens": 12157913.0,
1594
  "step": 990
1595
  },
1596
  {
1597
+ "epoch": 2.0903361344537816,
1598
+ "grad_norm": 0.1656985878944397,
1599
+ "learning_rate": 1.540343230110354e-05,
1600
+ "loss": 0.0509,
1601
+ "num_tokens": 12219353.0,
1602
  "step": 995
1603
  },
1604
  {
1605
+ "epoch": 2.100840336134454,
1606
+ "grad_norm": 0.19251607358455658,
1607
+ "learning_rate": 1.5184480542477869e-05,
1608
+ "loss": 0.0503,
1609
+ "num_tokens": 12280793.0,
1610
  "step": 1000
1611
  },
1612
  {
1613
+ "epoch": 2.111344537815126,
1614
+ "grad_norm": 0.17274506390094757,
1615
+ "learning_rate": 1.4967181385773022e-05,
1616
+ "loss": 0.0491,
1617
+ "num_tokens": 12342004.0,
1618
  "step": 1005
1619
  },
1620
  {
1621
+ "epoch": 2.1218487394957983,
1622
+ "grad_norm": 0.20883677899837494,
1623
+ "learning_rate": 1.4751563990051675e-05,
1624
+ "loss": 0.0495,
1625
+ "num_tokens": 12403444.0,
1626
  "step": 1010
1627
  },
1628
  {
1629
+ "epoch": 2.1323529411764706,
1630
+ "grad_norm": 0.20437228679656982,
1631
+ "learning_rate": 1.453765728870343e-05,
1632
+ "loss": 0.0514,
1633
+ "num_tokens": 12464884.0,
1634
  "step": 1015
1635
  },
1636
+ {
1637
+ "epoch": 2.142857142857143,
1638
+ "grad_norm": 0.20462237298488617,
1639
+ "learning_rate": 1.432548998556221e-05,
1640
+ "loss": 0.051,
1641
+ "num_tokens": 12526175.0,
1642
+ "step": 1020
1643
+ },
1644
+ {
1645
+ "epoch": 2.153361344537815,
1646
+ "grad_norm": 0.2599621117115021,
1647
+ "learning_rate": 1.4115090551054622e-05,
1648
+ "loss": 0.0517,
1649
+ "num_tokens": 12587615.0,
1650
+ "step": 1025
1651
+ },
1652
+ {
1653
+ "epoch": 2.1638655462184873,
1654
+ "grad_norm": 0.1801358163356781,
1655
+ "learning_rate": 1.3906487218379504e-05,
1656
+ "loss": 0.0499,
1657
+ "num_tokens": 12649055.0,
1658
+ "step": 1030
1659
+ },
1660
+ {
1661
+ "epoch": 2.1743697478991595,
1662
+ "grad_norm": 0.1843215674161911,
1663
+ "learning_rate": 1.3699707979719357e-05,
1664
+ "loss": 0.0513,
1665
+ "num_tokens": 12710459.0,
1666
+ "step": 1035
1667
+ },
1668
+ {
1669
+ "epoch": 2.184873949579832,
1670
+ "grad_norm": 0.19053132832050323,
1671
+ "learning_rate": 1.3494780582484126e-05,
1672
+ "loss": 0.0496,
1673
+ "num_tokens": 12771899.0,
1674
+ "step": 1040
1675
+ },
1676
+ {
1677
+ "epoch": 2.1953781512605044,
1678
+ "grad_norm": 0.15285778045654297,
1679
+ "learning_rate": 1.329173252558779e-05,
1680
+ "loss": 0.0497,
1681
+ "num_tokens": 12833339.0,
1682
+ "step": 1045
1683
+ },
1684
+ {
1685
+ "epoch": 2.2058823529411766,
1686
+ "grad_norm": 0.14396464824676514,
1687
+ "learning_rate": 1.3090591055758356e-05,
1688
+ "loss": 0.0507,
1689
+ "num_tokens": 12894779.0,
1690
+ "step": 1050
1691
+ },
1692
+ {
1693
+ "epoch": 2.216386554621849,
1694
+ "grad_norm": 0.14991876482963562,
1695
+ "learning_rate": 1.2891383163881633e-05,
1696
+ "loss": 0.05,
1697
+ "num_tokens": 12956219.0,
1698
+ "step": 1055
1699
+ },
1700
+ {
1701
+ "epoch": 2.226890756302521,
1702
+ "grad_norm": 0.14839011430740356,
1703
+ "learning_rate": 1.2694135581379383e-05,
1704
+ "loss": 0.0499,
1705
+ "num_tokens": 13017659.0,
1706
+ "step": 1060
1707
+ },
1708
+ {
1709
+ "epoch": 2.2373949579831933,
1710
+ "grad_norm": 0.12264993786811829,
1711
+ "learning_rate": 1.2498874776622246e-05,
1712
+ "loss": 0.0462,
1713
+ "num_tokens": 13079099.0,
1714
+ "step": 1065
1715
+ },
1716
+ {
1717
+ "epoch": 2.2478991596638656,
1718
+ "grad_norm": 0.1659439504146576,
1719
+ "learning_rate": 1.2305626951378019e-05,
1720
+ "loss": 0.0492,
1721
+ "num_tokens": 13140539.0,
1722
+ "step": 1070
1723
+ },
1724
+ {
1725
+ "epoch": 2.258403361344538,
1726
+ "grad_norm": 0.16605842113494873,
1727
+ "learning_rate": 1.2114418037295636e-05,
1728
+ "loss": 0.0502,
1729
+ "num_tokens": 13201979.0,
1730
+ "step": 1075
1731
+ },
1732
+ {
1733
+ "epoch": 2.26890756302521,
1734
+ "grad_norm": 0.16009701788425446,
1735
+ "learning_rate": 1.1925273692425487e-05,
1736
+ "loss": 0.0496,
1737
+ "num_tokens": 13263419.0,
1738
+ "step": 1080
1739
+ },
1740
+ {
1741
+ "epoch": 2.2794117647058822,
1742
+ "grad_norm": 0.1512678861618042,
1743
+ "learning_rate": 1.1738219297776371e-05,
1744
+ "loss": 0.0497,
1745
+ "num_tokens": 13324859.0,
1746
+ "step": 1085
1747
+ },
1748
+ {
1749
+ "epoch": 2.2899159663865545,
1750
+ "grad_norm": 0.18362964689731598,
1751
+ "learning_rate": 1.1553279953909641e-05,
1752
+ "loss": 0.0485,
1753
+ "num_tokens": 13386299.0,
1754
+ "step": 1090
1755
+ },
1756
+ {
1757
+ "epoch": 2.3004201680672267,
1758
+ "grad_norm": 0.15746116638183594,
1759
+ "learning_rate": 1.1370480477571029e-05,
1760
+ "loss": 0.0503,
1761
+ "num_tokens": 13447730.0,
1762
+ "step": 1095
1763
+ },
1764
+ {
1765
+ "epoch": 2.310924369747899,
1766
+ "grad_norm": 0.2701464891433716,
1767
+ "learning_rate": 1.118984539836051e-05,
1768
+ "loss": 0.0521,
1769
+ "num_tokens": 13509170.0,
1770
+ "step": 1100
1771
+ },
1772
+ {
1773
+ "epoch": 2.3214285714285716,
1774
+ "grad_norm": 0.18647603690624237,
1775
+ "learning_rate": 1.1011398955440702e-05,
1776
+ "loss": 0.0498,
1777
+ "num_tokens": 13570409.0,
1778
+ "step": 1105
1779
+ },
1780
+ {
1781
+ "epoch": 2.331932773109244,
1782
+ "grad_norm": 0.12975195050239563,
1783
+ "learning_rate": 1.0835165094284264e-05,
1784
+ "loss": 0.0507,
1785
+ "num_tokens": 13631849.0,
1786
+ "step": 1110
1787
+ },
1788
+ {
1789
+ "epoch": 2.342436974789916,
1790
+ "grad_norm": 0.15623484551906586,
1791
+ "learning_rate": 1.066116746346065e-05,
1792
+ "loss": 0.0499,
1793
+ "num_tokens": 13693289.0,
1794
+ "step": 1115
1795
+ },
1796
+ {
1797
+ "epoch": 2.3529411764705883,
1798
+ "grad_norm": 0.1415032297372818,
1799
+ "learning_rate": 1.0489429411462794e-05,
1800
+ "loss": 0.05,
1801
+ "num_tokens": 13754729.0,
1802
+ "step": 1120
1803
+ },
1804
+ {
1805
+ "epoch": 2.3634453781512605,
1806
+ "grad_norm": 0.188720241189003,
1807
+ "learning_rate": 1.0319973983573971e-05,
1808
+ "loss": 0.053,
1809
+ "num_tokens": 13816169.0,
1810
+ "step": 1125
1811
+ },
1812
+ {
1813
+ "epoch": 2.3739495798319328,
1814
+ "grad_norm": 0.19719360768795013,
1815
+ "learning_rate": 1.0152823918775408e-05,
1816
+ "loss": 0.0503,
1817
+ "num_tokens": 13877609.0,
1818
+ "step": 1130
1819
+ },
1820
+ {
1821
+ "epoch": 2.384453781512605,
1822
+ "grad_norm": 0.1669566035270691,
1823
+ "learning_rate": 9.988001646694935e-06,
1824
+ "loss": 0.0499,
1825
+ "num_tokens": 13939049.0,
1826
+ "step": 1135
1827
+ },
1828
+ {
1829
+ "epoch": 2.3949579831932772,
1830
+ "grad_norm": 0.22400623559951782,
1831
+ "learning_rate": 9.825529284597238e-06,
1832
+ "loss": 0.0534,
1833
+ "num_tokens": 14000489.0,
1834
+ "step": 1140
1835
+ },
1836
+ {
1837
+ "epoch": 2.4054621848739495,
1838
+ "grad_norm": 0.15708568692207336,
1839
+ "learning_rate": 9.665428634415923e-06,
1840
+ "loss": 0.0499,
1841
+ "num_tokens": 14061697.0,
1842
+ "step": 1145
1843
+ },
1844
+ {
1845
+ "epoch": 2.4159663865546217,
1846
+ "grad_norm": 0.16055414080619812,
1847
+ "learning_rate": 9.50772117982799e-06,
1848
+ "loss": 0.0506,
1849
+ "num_tokens": 14123137.0,
1850
+ "step": 1150
1851
+ },
1852
+ {
1853
+ "epoch": 2.426470588235294,
1854
+ "grad_norm": 0.14245997369289398,
1855
+ "learning_rate": 9.352428083370946e-06,
1856
+ "loss": 0.0497,
1857
+ "num_tokens": 14184577.0,
1858
+ "step": 1155
1859
+ },
1860
+ {
1861
+ "epoch": 2.4369747899159666,
1862
+ "grad_norm": 0.14547857642173767,
1863
+ "learning_rate": 9.199570183603021e-06,
1864
+ "loss": 0.0501,
1865
+ "num_tokens": 14246017.0,
1866
+ "step": 1160
1867
+ },
1868
+ {
1869
+ "epoch": 2.447478991596639,
1870
+ "grad_norm": 0.17205478250980377,
1871
+ "learning_rate": 9.049167992306908e-06,
1872
+ "loss": 0.0501,
1873
+ "num_tokens": 14307457.0,
1874
+ "step": 1165
1875
+ },
1876
+ {
1877
+ "epoch": 2.457983193277311,
1878
+ "grad_norm": 0.16485774517059326,
1879
+ "learning_rate": 8.901241691737286e-06,
1880
+ "loss": 0.0499,
1881
+ "num_tokens": 14368897.0,
1882
+ "step": 1170
1883
+ },
1884
+ {
1885
+ "epoch": 2.4684873949579833,
1886
+ "grad_norm": 0.1967056393623352,
1887
+ "learning_rate": 8.755811131912612e-06,
1888
+ "loss": 0.051,
1889
+ "num_tokens": 14430337.0,
1890
+ "step": 1175
1891
+ },
1892
+ {
1893
+ "epoch": 2.4789915966386555,
1894
+ "grad_norm": 0.14919425547122955,
1895
+ "learning_rate": 8.612895827951451e-06,
1896
+ "loss": 0.0495,
1897
+ "num_tokens": 14491744.0,
1898
+ "step": 1180
1899
+ },
1900
+ {
1901
+ "epoch": 2.4894957983193278,
1902
+ "grad_norm": 0.1448267251253128,
1903
+ "learning_rate": 8.472514957453801e-06,
1904
+ "loss": 0.0512,
1905
+ "num_tokens": 14553007.0,
1906
+ "step": 1185
1907
+ },
1908
+ {
1909
+ "epoch": 2.5,
1910
+ "grad_norm": 0.17768217623233795,
1911
+ "learning_rate": 8.33468735792765e-06,
1912
+ "loss": 0.0501,
1913
+ "num_tokens": 14614447.0,
1914
+ "step": 1190
1915
+ },
1916
+ {
1917
+ "epoch": 2.5105042016806722,
1918
+ "grad_norm": 0.1507992148399353,
1919
+ "learning_rate": 8.199431524261223e-06,
1920
+ "loss": 0.0503,
1921
+ "num_tokens": 14675727.0,
1922
+ "step": 1195
1923
+ },
1924
+ {
1925
+ "epoch": 2.5210084033613445,
1926
+ "grad_norm": 0.16350729763507843,
1927
+ "learning_rate": 8.066765606241163e-06,
1928
+ "loss": 0.0496,
1929
+ "num_tokens": 14737165.0,
1930
+ "step": 1200
1931
+ },
1932
+ {
1933
+ "epoch": 2.5315126050420167,
1934
+ "grad_norm": 0.16035616397857666,
1935
+ "learning_rate": 7.936707406117028e-06,
1936
+ "loss": 0.0488,
1937
+ "num_tokens": 14798605.0,
1938
+ "step": 1205
1939
+ },
1940
+ {
1941
+ "epoch": 2.542016806722689,
1942
+ "grad_norm": 0.1894913911819458,
1943
+ "learning_rate": 7.809274376212464e-06,
1944
+ "loss": 0.0508,
1945
+ "num_tokens": 14859883.0,
1946
+ "step": 1210
1947
+ },
1948
+ {
1949
+ "epoch": 2.552521008403361,
1950
+ "grad_norm": 0.1903340071439743,
1951
+ "learning_rate": 7.68448361658327e-06,
1952
+ "loss": 0.0488,
1953
+ "num_tokens": 14921105.0,
1954
+ "step": 1215
1955
+ },
1956
+ {
1957
+ "epoch": 2.5630252100840334,
1958
+ "grad_norm": 0.14671629667282104,
1959
+ "learning_rate": 7.5623518727227975e-06,
1960
+ "loss": 0.0495,
1961
+ "num_tokens": 14982545.0,
1962
+ "step": 1220
1963
+ },
1964
+ {
1965
+ "epoch": 2.5735294117647056,
1966
+ "grad_norm": 0.16081440448760986,
1967
+ "learning_rate": 7.442895533314856e-06,
1968
+ "loss": 0.0473,
1969
+ "num_tokens": 15043985.0,
1970
+ "step": 1225
1971
+ },
1972
+ {
1973
+ "epoch": 2.5840336134453783,
1974
+ "grad_norm": 0.1555902659893036,
1975
+ "learning_rate": 7.326130628034581e-06,
1976
+ "loss": 0.0492,
1977
+ "num_tokens": 15105425.0,
1978
+ "step": 1230
1979
+ },
1980
+ {
1981
+ "epoch": 2.5945378151260505,
1982
+ "grad_norm": 0.1796170324087143,
1983
+ "learning_rate": 7.212072825397413e-06,
1984
+ "loss": 0.0497,
1985
+ "num_tokens": 15166865.0,
1986
+ "step": 1235
1987
+ },
1988
+ {
1989
+ "epoch": 2.6050420168067228,
1990
+ "grad_norm": 0.13445882499217987,
1991
+ "learning_rate": 7.100737430656561e-06,
1992
+ "loss": 0.0494,
1993
+ "num_tokens": 15228139.0,
1994
+ "step": 1240
1995
+ },
1996
+ {
1997
+ "epoch": 2.615546218487395,
1998
+ "grad_norm": 0.18797667324543,
1999
+ "learning_rate": 6.992139383749224e-06,
2000
+ "loss": 0.0499,
2001
+ "num_tokens": 15289579.0,
2002
+ "step": 1245
2003
+ },
2004
+ {
2005
+ "epoch": 2.6260504201680672,
2006
+ "grad_norm": 0.1478380262851715,
2007
+ "learning_rate": 6.886293257291801e-06,
2008
+ "loss": 0.0503,
2009
+ "num_tokens": 15351019.0,
2010
+ "step": 1250
2011
+ },
2012
+ {
2013
+ "epoch": 2.6365546218487395,
2014
+ "grad_norm": 0.19320227205753326,
2015
+ "learning_rate": 6.78321325462444e-06,
2016
+ "loss": 0.0486,
2017
+ "num_tokens": 15412459.0,
2018
+ "step": 1255
2019
+ },
2020
+ {
2021
+ "epoch": 2.6470588235294117,
2022
+ "grad_norm": 0.18944524228572845,
2023
+ "learning_rate": 6.682913207905095e-06,
2024
+ "loss": 0.0496,
2025
+ "num_tokens": 15473796.0,
2026
+ "step": 1260
2027
+ },
2028
+ {
2029
+ "epoch": 2.657563025210084,
2030
+ "grad_norm": 0.17592737078666687,
2031
+ "learning_rate": 6.585406576253404e-06,
2032
+ "loss": 0.0501,
2033
+ "num_tokens": 15535236.0,
2034
+ "step": 1265
2035
+ },
2036
+ {
2037
+ "epoch": 2.668067226890756,
2038
+ "grad_norm": 0.18211396038532257,
2039
+ "learning_rate": 6.490706443944656e-06,
2040
+ "loss": 0.0491,
2041
+ "num_tokens": 15596676.0,
2042
+ "step": 1270
2043
+ },
2044
+ {
2045
+ "epoch": 2.678571428571429,
2046
+ "grad_norm": 0.1536846160888672,
2047
+ "learning_rate": 6.398825518653992e-06,
2048
+ "loss": 0.05,
2049
+ "num_tokens": 15658116.0,
2050
+ "step": 1275
2051
+ },
2052
+ {
2053
+ "epoch": 2.689075630252101,
2054
+ "grad_norm": 0.18677380681037903,
2055
+ "learning_rate": 6.30977612975121e-06,
2056
+ "loss": 0.0493,
2057
+ "num_tokens": 15719399.0,
2058
+ "step": 1280
2059
+ },
2060
+ {
2061
+ "epoch": 2.6995798319327733,
2062
+ "grad_norm": 0.1490916907787323,
2063
+ "learning_rate": 6.223570226646291e-06,
2064
+ "loss": 0.0514,
2065
+ "num_tokens": 15780839.0,
2066
+ "step": 1285
2067
+ },
2068
+ {
2069
+ "epoch": 2.7100840336134455,
2070
+ "grad_norm": 0.15238384902477264,
2071
+ "learning_rate": 6.140219377185933e-06,
2072
+ "loss": 0.05,
2073
+ "num_tokens": 15842274.0,
2074
+ "step": 1290
2075
+ },
2076
+ {
2077
+ "epoch": 2.7205882352941178,
2078
+ "grad_norm": 0.15011648833751678,
2079
+ "learning_rate": 6.0597347661012635e-06,
2080
+ "loss": 0.0493,
2081
+ "num_tokens": 15903714.0,
2082
+ "step": 1295
2083
+ },
2084
+ {
2085
+ "epoch": 2.73109243697479,
2086
+ "grad_norm": 0.1596149504184723,
2087
+ "learning_rate": 5.982127193507003e-06,
2088
+ "loss": 0.0494,
2089
+ "num_tokens": 15965148.0,
2090
+ "step": 1300
2091
+ },
2092
+ {
2093
+ "epoch": 2.741596638655462,
2094
+ "grad_norm": 0.16487446427345276,
2095
+ "learning_rate": 5.907407073452186e-06,
2096
+ "loss": 0.0506,
2097
+ "num_tokens": 16026588.0,
2098
+ "step": 1305
2099
+ },
2100
+ {
2101
+ "epoch": 2.7521008403361344,
2102
+ "grad_norm": 0.1454056352376938,
2103
+ "learning_rate": 5.835584432522727e-06,
2104
+ "loss": 0.0492,
2105
+ "num_tokens": 16088028.0,
2106
+ "step": 1310
2107
+ },
2108
+ {
2109
+ "epoch": 2.7626050420168067,
2110
+ "grad_norm": 0.16204313933849335,
2111
+ "learning_rate": 5.766668908495966e-06,
2112
+ "loss": 0.0509,
2113
+ "num_tokens": 16149468.0,
2114
+ "step": 1315
2115
+ },
2116
+ {
2117
+ "epoch": 2.773109243697479,
2118
+ "grad_norm": 0.18910805881023407,
2119
+ "learning_rate": 5.700669749047387e-06,
2120
+ "loss": 0.0489,
2121
+ "num_tokens": 16210908.0,
2122
+ "step": 1320
2123
+ },
2124
+ {
2125
+ "epoch": 2.783613445378151,
2126
+ "grad_norm": 0.17493724822998047,
2127
+ "learning_rate": 5.637595810509689e-06,
2128
+ "loss": 0.05,
2129
+ "num_tokens": 16272348.0,
2130
+ "step": 1325
2131
+ },
2132
+ {
2133
+ "epoch": 2.7941176470588234,
2134
+ "grad_norm": 0.14875848591327667,
2135
+ "learning_rate": 5.577455556684369e-06,
2136
+ "loss": 0.049,
2137
+ "num_tokens": 16333788.0,
2138
+ "step": 1330
2139
+ },
2140
+ {
2141
+ "epoch": 2.8046218487394956,
2142
+ "grad_norm": 0.1500275731086731,
2143
+ "learning_rate": 5.520257057705971e-06,
2144
+ "loss": 0.0498,
2145
+ "num_tokens": 16395228.0,
2146
+ "step": 1335
2147
+ },
2148
+ {
2149
+ "epoch": 2.815126050420168,
2150
+ "grad_norm": 0.1598060131072998,
2151
+ "learning_rate": 5.466007988959163e-06,
2152
+ "loss": 0.0507,
2153
+ "num_tokens": 16456417.0,
2154
+ "step": 1340
2155
+ },
2156
+ {
2157
+ "epoch": 2.82563025210084,
2158
+ "grad_norm": 0.1572778970003128,
2159
+ "learning_rate": 5.414715630048797e-06,
2160
+ "loss": 0.051,
2161
+ "num_tokens": 16517857.0,
2162
+ "step": 1345
2163
+ },
2164
+ {
2165
+ "epoch": 2.8361344537815127,
2166
+ "grad_norm": 0.15581144392490387,
2167
+ "learning_rate": 5.366386863823077e-06,
2168
+ "loss": 0.0499,
2169
+ "num_tokens": 16579297.0,
2170
+ "step": 1350
2171
+ },
2172
+ {
2173
+ "epoch": 2.846638655462185,
2174
+ "grad_norm": 0.18151573836803436,
2175
+ "learning_rate": 5.3210281754499284e-06,
2176
+ "loss": 0.0496,
2177
+ "num_tokens": 16640737.0,
2178
+ "step": 1355
2179
+ },
2180
+ {
2181
+ "epoch": 2.857142857142857,
2182
+ "grad_norm": 0.1601044237613678,
2183
+ "learning_rate": 5.278645651546797e-06,
2184
+ "loss": 0.0487,
2185
+ "num_tokens": 16702177.0,
2186
+ "step": 1360
2187
+ },
2188
+ {
2189
+ "epoch": 2.8676470588235294,
2190
+ "grad_norm": 0.1497681736946106,
2191
+ "learning_rate": 5.239244979363877e-06,
2192
+ "loss": 0.0492,
2193
+ "num_tokens": 16763617.0,
2194
+ "step": 1365
2195
+ },
2196
+ {
2197
+ "epoch": 2.8781512605042017,
2198
+ "grad_norm": 0.15907803177833557,
2199
+ "learning_rate": 5.202831446020945e-06,
2200
+ "loss": 0.0502,
2201
+ "num_tokens": 16824905.0,
2202
+ "step": 1370
2203
+ },
2204
+ {
2205
+ "epoch": 2.888655462184874,
2206
+ "grad_norm": 0.17641125619411469,
2207
+ "learning_rate": 5.169409937797901e-06,
2208
+ "loss": 0.0502,
2209
+ "num_tokens": 16886146.0,
2210
+ "step": 1375
2211
+ },
2212
+ {
2213
+ "epoch": 2.899159663865546,
2214
+ "grad_norm": 0.12964367866516113,
2215
+ "learning_rate": 5.138984939479077e-06,
2216
+ "loss": 0.0487,
2217
+ "num_tokens": 16947586.0,
2218
+ "step": 1380
2219
+ },
2220
+ {
2221
+ "epoch": 2.9096638655462184,
2222
+ "grad_norm": 0.14473247528076172,
2223
+ "learning_rate": 5.111560533751426e-06,
2224
+ "loss": 0.0491,
2225
+ "num_tokens": 17009026.0,
2226
+ "step": 1385
2227
+ },
2228
+ {
2229
+ "epoch": 2.9201680672268906,
2230
+ "grad_norm": 0.18652838468551636,
2231
+ "learning_rate": 5.087140400656684e-06,
2232
+ "loss": 0.0506,
2233
+ "num_tokens": 17070466.0,
2234
+ "step": 1390
2235
+ },
2236
+ {
2237
+ "epoch": 2.9306722689075633,
2238
+ "grad_norm": 0.18603888154029846,
2239
+ "learning_rate": 5.065727817097544e-06,
2240
+ "loss": 0.0492,
2241
+ "num_tokens": 17131779.0,
2242
+ "step": 1395
2243
+ },
2244
+ {
2245
+ "epoch": 2.9411764705882355,
2246
+ "grad_norm": 0.15723615884780884,
2247
+ "learning_rate": 5.047325656397932e-06,
2248
+ "loss": 0.0494,
2249
+ "num_tokens": 17193063.0,
2250
+ "step": 1400
2251
+ },
2252
+ {
2253
+ "epoch": 2.9516806722689077,
2254
+ "grad_norm": 0.14798587560653687,
2255
+ "learning_rate": 5.031936387917442e-06,
2256
+ "loss": 0.049,
2257
+ "num_tokens": 17254503.0,
2258
+ "step": 1405
2259
+ },
2260
+ {
2261
+ "epoch": 2.96218487394958,
2262
+ "grad_norm": 0.19435246288776398,
2263
+ "learning_rate": 5.019562076719972e-06,
2264
+ "loss": 0.0494,
2265
+ "num_tokens": 17315742.0,
2266
+ "step": 1410
2267
+ },
2268
+ {
2269
+ "epoch": 2.972689075630252,
2270
+ "grad_norm": 0.17056235671043396,
2271
+ "learning_rate": 5.0102043832966236e-06,
2272
+ "loss": 0.0493,
2273
+ "num_tokens": 17377182.0,
2274
+ "step": 1415
2275
+ },
2276
+ {
2277
+ "epoch": 2.9831932773109244,
2278
+ "grad_norm": 0.12487131357192993,
2279
+ "learning_rate": 5.003864563342878e-06,
2280
+ "loss": 0.0477,
2281
+ "num_tokens": 17438622.0,
2282
+ "step": 1420
2283
+ },
2284
+ {
2285
+ "epoch": 2.9936974789915967,
2286
+ "grad_norm": 0.13770359754562378,
2287
+ "learning_rate": 5.0005434675900966e-06,
2288
+ "loss": 0.0477,
2289
+ "num_tokens": 17500062.0,
2290
+ "step": 1425
2291
+ },
2292
  {
2293
  "epoch": 3.0,
2294
+ "num_tokens": 17536926.0,
2295
+ "step": 1428,
2296
+ "total_flos": 7.444201440207176e+17,
2297
+ "train_loss": 0.06586769079210378,
2298
+ "train_runtime": 7485.0094,
2299
+ "train_samples_per_second": 9.147,
2300
+ "train_steps_per_second": 0.191
2301
  }
2302
  ],
2303
  "logging_steps": 5,
2304
+ "max_steps": 1428,
2305
  "num_input_tokens_seen": 0,
2306
  "num_train_epochs": 3,
2307
  "save_steps": 100,
 
2317
  "attributes": {}
2318
  }
2319
  },
2320
+ "total_flos": 7.444201440207176e+17,
2321
  "train_batch_size": 16,
2322
  "trial_name": null,
2323
  "trial_params": null