hlillemark commited on
Commit
214abf1
·
verified ·
1 Parent(s): 1acac7f

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3
4
  base_model: meta-llama/Meta-Llama-3-8B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: all_tasks_combined_8b_sft
@@ -15,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # all_tasks_combined_8b_sft
17
 
18
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.4941
21
 
22
  ## Model description
23
 
 
4
  base_model: meta-llama/Meta-Llama-3-8B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: all_tasks_combined_8b_sft
 
16
 
17
  # all_tasks_combined_8b_sft
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the identity and the data_mc_filtered datasets.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4943
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_loss": 1.065657615661621,
4
- "eval_runtime": 1.1497,
5
- "eval_samples_per_second": 19.135,
6
- "eval_steps_per_second": 2.609,
7
- "total_flos": 52476849684480.0,
8
- "train_loss": 0.2212562888971905,
9
- "train_runtime": 2292.2808,
10
- "train_samples_per_second": 4.663,
11
- "train_steps_per_second": 0.292
12
  }
 
1
  {
2
+ "epoch": 2.9953574744661093,
3
+ "eval_loss": 0.4942765235900879,
4
+ "eval_runtime": 19.3607,
5
+ "eval_samples_per_second": 18.181,
6
+ "eval_steps_per_second": 2.273,
7
+ "total_flos": 292826854195200.0,
8
+ "train_loss": 0.2790517607955389,
9
+ "train_runtime": 12257.2359,
10
+ "train_samples_per_second": 4.217,
11
+ "train_steps_per_second": 0.132
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_loss": 1.065657615661621,
4
- "eval_runtime": 1.1497,
5
- "eval_samples_per_second": 19.135,
6
- "eval_steps_per_second": 2.609
7
  }
 
1
  {
2
+ "epoch": 2.9953574744661093,
3
+ "eval_loss": 0.4942765235900879,
4
+ "eval_runtime": 19.3607,
5
+ "eval_samples_per_second": 18.181,
6
+ "eval_steps_per_second": 2.273
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "total_flos": 52476849684480.0,
4
- "train_loss": 0.2212562888971905,
5
- "train_runtime": 2292.2808,
6
- "train_samples_per_second": 4.663,
7
- "train_steps_per_second": 0.292
8
  }
 
1
  {
2
+ "epoch": 2.9953574744661093,
3
+ "total_flos": 292826854195200.0,
4
+ "train_loss": 0.2790517607955389,
5
+ "train_runtime": 12257.2359,
6
+ "train_samples_per_second": 4.217,
7
+ "train_steps_per_second": 0.132
8
  }
trainer_state.json CHANGED
@@ -1,600 +1,1410 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 50,
6
- "global_step": 670,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.14925373134328357,
13
- "grad_norm": 62.46964209905432,
14
- "learning_rate": 1.4925373134328358e-06,
15
- "loss": 1.5237,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.29850746268656714,
20
- "grad_norm": 18.23641099206786,
21
- "learning_rate": 2.9850746268656716e-06,
22
- "loss": 0.8967,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.44776119402985076,
27
- "grad_norm": 17.700391850985287,
28
- "learning_rate": 4.477611940298508e-06,
29
- "loss": 0.7064,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.5970149253731343,
34
- "grad_norm": 15.47240679468768,
35
- "learning_rate": 5.970149253731343e-06,
36
- "loss": 0.7563,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.746268656716418,
41
- "grad_norm": 13.336317250535666,
42
- "learning_rate": 7.46268656716418e-06,
43
- "loss": 0.7504,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.746268656716418,
48
- "eval_loss": 0.7051475048065186,
49
- "eval_runtime": 1.1419,
50
- "eval_samples_per_second": 19.266,
51
- "eval_steps_per_second": 2.627,
52
  "step": 50
53
  },
54
  {
55
- "epoch": 0.8955223880597015,
56
- "grad_norm": 13.252454175450078,
57
- "learning_rate": 8.955223880597016e-06,
58
- "loss": 0.6893,
59
  "step": 60
60
  },
61
  {
62
- "epoch": 1.044776119402985,
63
- "grad_norm": 11.80189996714652,
64
- "learning_rate": 9.999389284703265e-06,
65
- "loss": 0.7673,
66
  "step": 70
67
  },
68
  {
69
- "epoch": 1.1940298507462686,
70
- "grad_norm": 8.996097585606723,
71
- "learning_rate": 9.988536273658876e-06,
72
- "loss": 0.4853,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 1.3432835820895521,
77
- "grad_norm": 10.21049769891817,
78
- "learning_rate": 9.964145714351633e-06,
79
- "loss": 0.4875,
80
  "step": 90
81
  },
82
  {
83
- "epoch": 1.4925373134328357,
84
- "grad_norm": 15.200908102543476,
85
- "learning_rate": 9.926283796211796e-06,
86
- "loss": 0.6328,
87
  "step": 100
88
  },
89
  {
90
- "epoch": 1.4925373134328357,
91
- "eval_loss": 0.6948466897010803,
92
- "eval_runtime": 1.1346,
93
- "eval_samples_per_second": 19.391,
94
- "eval_steps_per_second": 2.644,
95
  "step": 100
96
  },
97
  {
98
- "epoch": 1.6417910447761193,
99
- "grad_norm": 11.348256417642952,
100
- "learning_rate": 9.87505326632108e-06,
101
- "loss": 0.5766,
102
  "step": 110
103
  },
104
  {
105
- "epoch": 1.7910447761194028,
106
- "grad_norm": 9.974009872915042,
107
- "learning_rate": 9.810593150584658e-06,
108
- "loss": 0.5581,
109
  "step": 120
110
  },
111
  {
112
- "epoch": 1.9402985074626866,
113
- "grad_norm": 9.043873474237678,
114
- "learning_rate": 9.733078376452172e-06,
115
- "loss": 0.5858,
116
  "step": 130
117
  },
118
  {
119
- "epoch": 2.08955223880597,
120
- "grad_norm": 6.359601801482128,
121
- "learning_rate": 9.642719298211602e-06,
122
- "loss": 0.3937,
123
  "step": 140
124
  },
125
  {
126
- "epoch": 2.2388059701492535,
127
- "grad_norm": 7.850358378677195,
128
- "learning_rate": 9.539761126144193e-06,
129
- "loss": 0.2823,
130
  "step": 150
131
  },
132
  {
133
- "epoch": 2.2388059701492535,
134
- "eval_loss": 0.8329303860664368,
135
- "eval_runtime": 1.1443,
136
- "eval_samples_per_second": 19.226,
137
- "eval_steps_per_second": 2.622,
138
  "step": 150
139
  },
140
  {
141
- "epoch": 2.388059701492537,
142
- "grad_norm": 7.403616207329922,
143
- "learning_rate": 9.424483261089584e-06,
144
- "loss": 0.3042,
145
  "step": 160
146
  },
147
  {
148
- "epoch": 2.5373134328358207,
149
- "grad_norm": 9.903824789117488,
150
- "learning_rate": 9.297198536226927e-06,
151
- "loss": 0.3432,
152
  "step": 170
153
  },
154
  {
155
- "epoch": 2.6865671641791042,
156
- "grad_norm": 7.276007975249782,
157
- "learning_rate": 9.158252368129628e-06,
158
- "loss": 0.3385,
159
  "step": 180
160
  },
161
  {
162
- "epoch": 2.835820895522388,
163
- "grad_norm": 6.961512240006563,
164
- "learning_rate": 9.008021819397488e-06,
165
- "loss": 0.3322,
166
  "step": 190
167
  },
168
  {
169
- "epoch": 2.9850746268656714,
170
- "grad_norm": 9.214663023889504,
171
- "learning_rate": 8.846914575410035e-06,
172
- "loss": 0.3349,
173
  "step": 200
174
  },
175
  {
176
- "epoch": 2.9850746268656714,
177
- "eval_loss": 0.8281504511833191,
178
- "eval_runtime": 1.138,
179
- "eval_samples_per_second": 19.332,
180
- "eval_steps_per_second": 2.636,
181
  "step": 200
182
  },
183
  {
184
- "epoch": 3.1343283582089554,
185
- "grad_norm": 6.775051665842526,
186
- "learning_rate": 8.675367837977848e-06,
187
- "loss": 0.2317,
188
  "step": 210
189
  },
190
  {
191
- "epoch": 3.283582089552239,
192
- "grad_norm": 10.639831698910774,
193
- "learning_rate": 8.49384713889421e-06,
194
- "loss": 0.219,
195
  "step": 220
196
  },
197
  {
198
- "epoch": 3.4328358208955225,
199
- "grad_norm": 5.513116180089331,
200
- "learning_rate": 8.302845076606786e-06,
201
- "loss": 0.2004,
202
  "step": 230
203
  },
204
  {
205
- "epoch": 3.582089552238806,
206
- "grad_norm": 4.675709172799224,
207
- "learning_rate": 8.10287997943769e-06,
208
- "loss": 0.2106,
209
  "step": 240
210
  },
211
  {
212
- "epoch": 3.7313432835820897,
213
- "grad_norm": 7.218127118287365,
214
- "learning_rate": 7.894494498979558e-06,
215
- "loss": 0.2069,
216
  "step": 250
217
  },
218
  {
219
- "epoch": 3.7313432835820897,
220
- "eval_loss": 0.7403361797332764,
221
- "eval_runtime": 1.1421,
222
- "eval_samples_per_second": 19.263,
223
- "eval_steps_per_second": 2.627,
224
  "step": 250
225
  },
226
  {
227
- "epoch": 3.8805970149253732,
228
- "grad_norm": 5.413632092673544,
229
- "learning_rate": 7.678254137484797e-06,
230
- "loss": 0.2099,
231
  "step": 260
232
  },
233
  {
234
- "epoch": 4.029850746268656,
235
- "grad_norm": 4.514679371933533,
236
- "learning_rate": 7.4547457132442895e-06,
237
- "loss": 0.1866,
238
  "step": 270
239
  },
240
  {
241
- "epoch": 4.17910447761194,
242
- "grad_norm": 5.397504900004704,
243
- "learning_rate": 7.2245757681200835e-06,
244
- "loss": 0.0992,
245
  "step": 280
246
  },
247
  {
248
- "epoch": 4.3283582089552235,
249
- "grad_norm": 6.311754836671643,
250
- "learning_rate": 6.988368921553601e-06,
251
- "loss": 0.1249,
252
  "step": 290
253
  },
254
  {
255
- "epoch": 4.477611940298507,
256
- "grad_norm": 5.288945692952636,
257
- "learning_rate": 6.746766175516159e-06,
258
- "loss": 0.0942,
259
  "step": 300
260
  },
261
  {
262
- "epoch": 4.477611940298507,
263
- "eval_loss": 0.8359870910644531,
264
- "eval_runtime": 1.1349,
265
- "eval_samples_per_second": 19.385,
266
- "eval_steps_per_second": 2.643,
267
  "step": 300
268
  },
269
  {
270
- "epoch": 4.6268656716417915,
271
- "grad_norm": 8.659651343283201,
272
- "learning_rate": 6.500423175001705e-06,
273
- "loss": 0.1187,
274
  "step": 310
275
  },
276
  {
277
- "epoch": 4.776119402985074,
278
- "grad_norm": 5.073570072467742,
279
- "learning_rate": 6.2500084287822925e-06,
280
- "loss": 0.131,
281
  "step": 320
282
  },
283
  {
284
- "epoch": 4.925373134328359,
285
- "grad_norm": 5.5550702226053605,
286
- "learning_rate": 5.996201495254757e-06,
287
- "loss": 0.1343,
288
  "step": 330
289
  },
290
  {
291
- "epoch": 5.074626865671641,
292
- "grad_norm": 3.9447083325808108,
293
- "learning_rate": 5.73969113830165e-06,
294
- "loss": 0.1011,
295
  "step": 340
296
  },
297
  {
298
- "epoch": 5.223880597014926,
299
- "grad_norm": 3.2472003531286338,
300
- "learning_rate": 5.481173458170952e-06,
301
- "loss": 0.0553,
302
  "step": 350
303
  },
304
  {
305
- "epoch": 5.223880597014926,
306
- "eval_loss": 0.8742682933807373,
307
- "eval_runtime": 1.1401,
308
- "eval_samples_per_second": 19.296,
309
- "eval_steps_per_second": 2.631,
310
  "step": 350
311
  },
312
  {
313
- "epoch": 5.373134328358209,
314
- "grad_norm": 3.628965369391052,
315
- "learning_rate": 5.221350002446882e-06,
316
- "loss": 0.061,
317
  "step": 360
318
  },
319
  {
320
- "epoch": 5.522388059701493,
321
- "grad_norm": 2.5871034929884313,
322
- "learning_rate": 4.96092586223808e-06,
323
- "loss": 0.0554,
324
  "step": 370
325
  },
326
  {
327
- "epoch": 5.6716417910447765,
328
- "grad_norm": 3.7166476236947528,
329
- "learning_rate": 4.700607758749626e-06,
330
- "loss": 0.0627,
331
  "step": 380
332
  },
333
  {
334
- "epoch": 5.82089552238806,
335
- "grad_norm": 5.238055742685194,
336
- "learning_rate": 4.441102125431398e-06,
337
- "loss": 0.0642,
338
  "step": 390
339
  },
340
  {
341
- "epoch": 5.970149253731344,
342
- "grad_norm": 4.718083316839321,
343
- "learning_rate": 4.183113190907349e-06,
344
- "loss": 0.0774,
345
  "step": 400
346
  },
347
  {
348
- "epoch": 5.970149253731344,
349
- "eval_loss": 0.834354817867279,
350
- "eval_runtime": 1.1389,
351
- "eval_samples_per_second": 19.317,
352
- "eval_steps_per_second": 2.634,
353
  "step": 400
354
  },
355
  {
356
- "epoch": 6.119402985074627,
357
- "grad_norm": 3.1655302149163553,
358
- "learning_rate": 3.927341067888065e-06,
359
- "loss": 0.0478,
360
  "step": 410
361
  },
362
  {
363
- "epoch": 6.268656716417911,
364
- "grad_norm": 2.6103089340590206,
365
- "learning_rate": 3.6744798532528137e-06,
366
- "loss": 0.0356,
367
  "step": 420
368
  },
369
  {
370
- "epoch": 6.417910447761194,
371
- "grad_norm": 2.1582814059469797,
372
- "learning_rate": 3.4252157444569478e-06,
373
- "loss": 0.0264,
374
  "step": 430
375
  },
376
  {
377
- "epoch": 6.567164179104478,
378
- "grad_norm": 5.753417833105347,
379
- "learning_rate": 3.1802251773762294e-06,
380
- "loss": 0.039,
381
  "step": 440
382
  },
383
  {
384
- "epoch": 6.7164179104477615,
385
- "grad_norm": 2.948747643898508,
386
- "learning_rate": 2.9401729906414385e-06,
387
- "loss": 0.0352,
388
  "step": 450
389
  },
390
  {
391
- "epoch": 6.7164179104477615,
392
- "eval_loss": 0.9773427844047546,
393
- "eval_runtime": 1.1359,
394
- "eval_samples_per_second": 19.368,
395
- "eval_steps_per_second": 2.641,
396
  "step": 450
397
  },
398
  {
399
- "epoch": 6.865671641791045,
400
- "grad_norm": 4.989745569465322,
401
- "learning_rate": 2.7057106214448216e-06,
402
- "loss": 0.043,
403
  "step": 460
404
  },
405
  {
406
- "epoch": 7.014925373134329,
407
- "grad_norm": 0.42314469761534634,
408
- "learning_rate": 2.4774743377144265e-06,
409
- "loss": 0.0236,
410
  "step": 470
411
  },
412
  {
413
- "epoch": 7.164179104477612,
414
- "grad_norm": 0.5481091739655988,
415
- "learning_rate": 2.256083511453747e-06,
416
- "loss": 0.0173,
417
  "step": 480
418
  },
419
  {
420
- "epoch": 7.313432835820896,
421
- "grad_norm": 1.00555080244342,
422
- "learning_rate": 2.042138937932388e-06,
423
- "loss": 0.0167,
424
  "step": 490
425
  },
426
  {
427
- "epoch": 7.462686567164179,
428
- "grad_norm": 0.9232480963513995,
429
- "learning_rate": 1.8362212052889827e-06,
430
- "loss": 0.0176,
431
  "step": 500
432
  },
433
  {
434
- "epoch": 7.462686567164179,
435
- "eval_loss": 0.9960648417472839,
436
- "eval_runtime": 1.136,
437
- "eval_samples_per_second": 19.366,
438
- "eval_steps_per_second": 2.641,
439
  "step": 500
440
  },
441
  {
442
- "epoch": 7.611940298507463,
443
- "grad_norm": 2.6590690516807345,
444
- "learning_rate": 1.63888911897084e-06,
445
- "loss": 0.0175,
446
  "step": 510
447
  },
448
  {
449
- "epoch": 7.7611940298507465,
450
- "grad_norm": 1.6242460024315573,
451
- "learning_rate": 1.4506781852859836e-06,
452
- "loss": 0.0171,
453
  "step": 520
454
  },
455
  {
456
- "epoch": 7.91044776119403,
457
- "grad_norm": 1.0094779557317752,
458
- "learning_rate": 1.2720991581827852e-06,
459
- "loss": 0.0123,
460
  "step": 530
461
  },
462
  {
463
- "epoch": 8.059701492537313,
464
- "grad_norm": 0.05439658756015428,
465
- "learning_rate": 1.1036366532008552e-06,
466
- "loss": 0.0061,
467
  "step": 540
468
  },
469
  {
470
- "epoch": 8.208955223880597,
471
- "grad_norm": 1.4613912695331988,
472
- "learning_rate": 9.457478323545749e-07,
473
- "loss": 0.0083,
474
  "step": 550
475
  },
476
  {
477
- "epoch": 8.208955223880597,
478
- "eval_loss": 1.016642689704895,
479
- "eval_runtime": 1.135,
480
- "eval_samples_per_second": 19.383,
481
- "eval_steps_per_second": 2.643,
482
  "step": 550
483
  },
484
  {
485
- "epoch": 8.35820895522388,
486
- "grad_norm": 0.8842473436486248,
487
- "learning_rate": 7.988611635181099e-07,
488
- "loss": 0.007,
489
  "step": 560
490
  },
491
  {
492
- "epoch": 8.507462686567164,
493
- "grad_norm": 1.1014869055273346,
494
- "learning_rate": 6.633752576786251e-07,
495
- "loss": 0.0119,
496
  "step": 570
497
  },
498
  {
499
- "epoch": 8.656716417910447,
500
- "grad_norm": 0.10980572243131498,
501
- "learning_rate": 5.396577872130676e-07,
502
- "loss": 0.0107,
503
  "step": 580
504
  },
505
  {
506
- "epoch": 8.805970149253731,
507
- "grad_norm": 0.5442600253450464,
508
- "learning_rate": 4.2804448812404754e-07,
509
- "loss": 0.0041,
510
  "step": 590
511
  },
512
  {
513
- "epoch": 8.955223880597014,
514
- "grad_norm": 1.4383902839461293,
515
- "learning_rate": 3.288382489424502e-07,
516
- "loss": 0.0075,
517
  "step": 600
518
  },
519
  {
520
- "epoch": 8.955223880597014,
521
- "eval_loss": 1.043831467628479,
522
- "eval_runtime": 1.1419,
523
- "eval_samples_per_second": 19.266,
524
- "eval_steps_per_second": 2.627,
525
  "step": 600
526
  },
527
  {
528
- "epoch": 9.104477611940299,
529
- "grad_norm": 0.04597308958713943,
530
- "learning_rate": 2.4230828876927293e-07,
531
- "loss": 0.0047,
532
  "step": 610
533
  },
534
  {
535
- "epoch": 9.253731343283581,
536
- "grad_norm": 0.11644667495675552,
537
- "learning_rate": 1.6868942668726408e-07,
538
- "loss": 0.004,
539
  "step": 620
540
  },
541
  {
542
- "epoch": 9.402985074626866,
543
- "grad_norm": 0.632174719460539,
544
- "learning_rate": 1.0818144452496293e-07,
545
- "loss": 0.005,
546
  "step": 630
547
  },
548
  {
549
- "epoch": 9.552238805970148,
550
- "grad_norm": 0.41757906274589185,
551
- "learning_rate": 6.094854470245326e-08,
552
- "loss": 0.003,
553
  "step": 640
554
  },
555
  {
556
- "epoch": 9.701492537313433,
557
- "grad_norm": 0.623129850332705,
558
- "learning_rate": 2.711890463007405e-08,
559
- "loss": 0.0028,
560
  "step": 650
561
  },
562
  {
563
- "epoch": 9.701492537313433,
564
- "eval_loss": 1.0667099952697754,
565
- "eval_runtime": 1.1414,
566
- "eval_samples_per_second": 19.275,
567
- "eval_steps_per_second": 2.628,
568
  "step": 650
569
  },
570
  {
571
- "epoch": 9.850746268656717,
572
- "grad_norm": 0.1499912028824655,
573
- "learning_rate": 6.784328869339218e-09,
574
- "loss": 0.0046,
575
  "step": 660
576
  },
577
  {
578
- "epoch": 10.0,
579
- "grad_norm": 0.3933267850861081,
580
- "learning_rate": 0.0,
581
- "loss": 0.0054,
582
  "step": 670
583
  },
584
  {
585
- "epoch": 10.0,
586
- "step": 670,
587
- "total_flos": 52476849684480.0,
588
- "train_loss": 0.2212562888971905,
589
- "train_runtime": 2292.2808,
590
- "train_samples_per_second": 4.663,
591
- "train_steps_per_second": 0.292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
  }
593
  ],
594
  "logging_steps": 10,
595
- "max_steps": 670,
596
  "num_input_tokens_seen": 0,
597
- "num_train_epochs": 10,
598
  "save_steps": 5000,
599
  "stateful_callbacks": {
600
  "TrainerControl": {
@@ -608,7 +1418,7 @@
608
  "attributes": {}
609
  }
610
  },
611
- "total_flos": 52476849684480.0,
612
  "train_batch_size": 2,
613
  "trial_name": null,
614
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9953574744661093,
5
  "eval_steps": 50,
6
+ "global_step": 1614,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.018570102135561744,
13
+ "grad_norm": 33.58969944080453,
14
+ "learning_rate": 6.17283950617284e-07,
15
+ "loss": 0.8947,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.03714020427112349,
20
+ "grad_norm": 16.744611450678853,
21
+ "learning_rate": 1.234567901234568e-06,
22
+ "loss": 0.6994,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.055710306406685235,
27
+ "grad_norm": 13.439601262151244,
28
+ "learning_rate": 1.8518518518518519e-06,
29
+ "loss": 0.624,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.07428040854224698,
34
+ "grad_norm": 13.907357312219297,
35
+ "learning_rate": 2.469135802469136e-06,
36
+ "loss": 0.4977,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.09285051067780872,
41
+ "grad_norm": 8.492973049121323,
42
+ "learning_rate": 3.08641975308642e-06,
43
+ "loss": 0.4639,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.09285051067780872,
48
+ "eval_loss": 0.5398163199424744,
49
+ "eval_runtime": 19.4371,
50
+ "eval_samples_per_second": 18.11,
51
+ "eval_steps_per_second": 2.264,
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 0.11142061281337047,
56
+ "grad_norm": 8.651091209611899,
57
+ "learning_rate": 3.7037037037037037e-06,
58
+ "loss": 0.4468,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 0.12999071494893222,
63
+ "grad_norm": 5.085215797862169,
64
+ "learning_rate": 4.3209876543209875e-06,
65
+ "loss": 0.3832,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 0.14856081708449395,
70
+ "grad_norm": 7.069622973657299,
71
+ "learning_rate": 4.938271604938272e-06,
72
+ "loss": 0.4276,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.1671309192200557,
77
+ "grad_norm": 99.76457865321862,
78
+ "learning_rate": 5.555555555555557e-06,
79
+ "loss": 0.5276,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 0.18570102135561745,
84
+ "grad_norm": 6.1883235099602905,
85
+ "learning_rate": 6.17283950617284e-06,
86
+ "loss": 0.4939,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 0.18570102135561745,
91
+ "eval_loss": 0.5121593475341797,
92
+ "eval_runtime": 19.3824,
93
+ "eval_samples_per_second": 18.161,
94
+ "eval_steps_per_second": 2.27,
95
  "step": 100
96
  },
97
  {
98
+ "epoch": 0.2042711234911792,
99
+ "grad_norm": 7.381670488617432,
100
+ "learning_rate": 6.790123456790124e-06,
101
+ "loss": 0.4866,
102
  "step": 110
103
  },
104
  {
105
+ "epoch": 0.22284122562674094,
106
+ "grad_norm": 5.554268011959653,
107
+ "learning_rate": 7.4074074074074075e-06,
108
+ "loss": 0.4282,
109
  "step": 120
110
  },
111
  {
112
+ "epoch": 0.2414113277623027,
113
+ "grad_norm": 5.807686582470137,
114
+ "learning_rate": 8.024691358024692e-06,
115
+ "loss": 0.4918,
116
  "step": 130
117
  },
118
  {
119
+ "epoch": 0.25998142989786444,
120
+ "grad_norm": 6.799528459280429,
121
+ "learning_rate": 8.641975308641975e-06,
122
+ "loss": 0.4841,
123
  "step": 140
124
  },
125
  {
126
+ "epoch": 0.2785515320334262,
127
+ "grad_norm": 4.94296183453013,
128
+ "learning_rate": 9.25925925925926e-06,
129
+ "loss": 0.4822,
130
  "step": 150
131
  },
132
  {
133
+ "epoch": 0.2785515320334262,
134
+ "eval_loss": 0.5242471694946289,
135
+ "eval_runtime": 19.4304,
136
+ "eval_samples_per_second": 18.116,
137
+ "eval_steps_per_second": 2.264,
138
  "step": 150
139
  },
140
  {
141
+ "epoch": 0.2971216341689879,
142
+ "grad_norm": 5.6102713178528205,
143
+ "learning_rate": 9.876543209876543e-06,
144
+ "loss": 0.4886,
145
  "step": 160
146
  },
147
  {
148
+ "epoch": 0.31569173630454966,
149
+ "grad_norm": 4.845388135628488,
150
+ "learning_rate": 9.99925101063302e-06,
151
+ "loss": 0.4529,
152
  "step": 170
153
  },
154
  {
155
+ "epoch": 0.3342618384401114,
156
+ "grad_norm": 5.39966471118419,
157
+ "learning_rate": 9.99620862590714e-06,
158
+ "loss": 0.4463,
159
  "step": 180
160
  },
161
  {
162
+ "epoch": 0.3528319405756732,
163
+ "grad_norm": 5.489503360186311,
164
+ "learning_rate": 9.990827457067342e-06,
165
+ "loss": 0.4885,
166
  "step": 190
167
  },
168
  {
169
+ "epoch": 0.3714020427112349,
170
+ "grad_norm": 4.684590902011352,
171
+ "learning_rate": 9.983110023102148e-06,
172
+ "loss": 0.4701,
173
  "step": 200
174
  },
175
  {
176
+ "epoch": 0.3714020427112349,
177
+ "eval_loss": 0.5521253943443298,
178
+ "eval_runtime": 19.3667,
179
+ "eval_samples_per_second": 18.176,
180
+ "eval_steps_per_second": 2.272,
181
  "step": 200
182
  },
183
  {
184
+ "epoch": 0.38997214484679665,
185
+ "grad_norm": 6.186929243018788,
186
+ "learning_rate": 9.973059936633308e-06,
187
+ "loss": 0.4983,
188
  "step": 210
189
  },
190
  {
191
+ "epoch": 0.4085422469823584,
192
+ "grad_norm": 7.186427473045426,
193
+ "learning_rate": 9.960681902224692e-06,
194
+ "loss": 0.4983,
195
  "step": 220
196
  },
197
  {
198
+ "epoch": 0.4271123491179202,
199
+ "grad_norm": 4.948600368895616,
200
+ "learning_rate": 9.945981714180021e-06,
201
+ "loss": 0.493,
202
  "step": 230
203
  },
204
  {
205
+ "epoch": 0.4456824512534819,
206
+ "grad_norm": 4.131993099439563,
207
+ "learning_rate": 9.928966253830492e-06,
208
+ "loss": 0.4753,
209
  "step": 240
210
  },
211
  {
212
+ "epoch": 0.46425255338904364,
213
+ "grad_norm": 4.007998907864266,
214
+ "learning_rate": 9.909643486313533e-06,
215
+ "loss": 0.4216,
216
  "step": 250
217
  },
218
  {
219
+ "epoch": 0.46425255338904364,
220
+ "eval_loss": 0.5373674631118774,
221
+ "eval_runtime": 19.3954,
222
+ "eval_samples_per_second": 18.149,
223
+ "eval_steps_per_second": 2.269,
224
  "step": 250
225
  },
226
  {
227
+ "epoch": 0.4828226555246054,
228
+ "grad_norm": 4.355924678117399,
229
+ "learning_rate": 9.888022456844251e-06,
230
+ "loss": 0.4874,
231
  "step": 260
232
  },
233
  {
234
+ "epoch": 0.5013927576601671,
235
+ "grad_norm": 4.3475514584684385,
236
+ "learning_rate": 9.864113286481237e-06,
237
+ "loss": 0.4499,
238
  "step": 270
239
  },
240
  {
241
+ "epoch": 0.5199628597957289,
242
+ "grad_norm": 4.925328524759934,
243
+ "learning_rate": 9.837927167388793e-06,
244
+ "loss": 0.4604,
245
  "step": 280
246
  },
247
  {
248
+ "epoch": 0.5385329619312906,
249
+ "grad_norm": 4.580602008041337,
250
+ "learning_rate": 9.809476357597738e-06,
251
+ "loss": 0.4697,
252
  "step": 290
253
  },
254
  {
255
+ "epoch": 0.5571030640668524,
256
+ "grad_norm": 3.9055769337575335,
257
+ "learning_rate": 9.778774175267294e-06,
258
+ "loss": 0.4159,
259
  "step": 300
260
  },
261
  {
262
+ "epoch": 0.5571030640668524,
263
+ "eval_loss": 0.5146207213401794,
264
+ "eval_runtime": 19.4216,
265
+ "eval_samples_per_second": 18.124,
266
+ "eval_steps_per_second": 2.266,
267
  "step": 300
268
  },
269
  {
270
+ "epoch": 0.5756731662024142,
271
+ "grad_norm": 4.477623029269123,
272
+ "learning_rate": 9.745834992450688e-06,
273
+ "loss": 0.487,
274
  "step": 310
275
  },
276
  {
277
+ "epoch": 0.5942432683379758,
278
+ "grad_norm": 3.729028613174465,
279
+ "learning_rate": 9.710674228367422e-06,
280
+ "loss": 0.4751,
281
  "step": 320
282
  },
283
  {
284
+ "epoch": 0.6128133704735376,
285
+ "grad_norm": 4.298167407385586,
286
+ "learning_rate": 9.673308342185366e-06,
287
+ "loss": 0.4289,
288
  "step": 330
289
  },
290
  {
291
+ "epoch": 0.6313834726090993,
292
+ "grad_norm": 4.984754626750181,
293
+ "learning_rate": 9.633754825316015e-06,
294
+ "loss": 0.4783,
295
  "step": 340
296
  },
297
  {
298
+ "epoch": 0.6499535747446611,
299
+ "grad_norm": 3.9844077010010444,
300
+ "learning_rate": 9.592032193226564e-06,
301
+ "loss": 0.4502,
302
  "step": 350
303
  },
304
  {
305
+ "epoch": 0.6499535747446611,
306
+ "eval_loss": 0.5022189617156982,
307
+ "eval_runtime": 19.4166,
308
+ "eval_samples_per_second": 18.129,
309
+ "eval_steps_per_second": 2.266,
310
  "step": 350
311
  },
312
  {
313
+ "epoch": 0.6685236768802229,
314
+ "grad_norm": 4.474043738957621,
315
+ "learning_rate": 9.548159976772593e-06,
316
+ "loss": 0.4341,
317
  "step": 360
318
  },
319
  {
320
+ "epoch": 0.6870937790157846,
321
+ "grad_norm": 12.385083857047688,
322
+ "learning_rate": 9.502158713055444e-06,
323
+ "loss": 0.4612,
324
  "step": 370
325
  },
326
  {
327
+ "epoch": 0.7056638811513464,
328
+ "grad_norm": 3.8491200153587535,
329
+ "learning_rate": 9.454049935808568e-06,
330
+ "loss": 0.4352,
331
  "step": 380
332
  },
333
  {
334
+ "epoch": 0.724233983286908,
335
+ "grad_norm": 4.149978752910591,
336
+ "learning_rate": 9.403856165317322e-06,
337
+ "loss": 0.3996,
338
  "step": 390
339
  },
340
  {
341
+ "epoch": 0.7428040854224698,
342
+ "grad_norm": 4.740837408740574,
343
+ "learning_rate": 9.351600897876964e-06,
344
+ "loss": 0.4625,
345
  "step": 400
346
  },
347
  {
348
+ "epoch": 0.7428040854224698,
349
+ "eval_loss": 0.49847906827926636,
350
+ "eval_runtime": 19.3879,
351
+ "eval_samples_per_second": 18.156,
352
+ "eval_steps_per_second": 2.269,
353
  "step": 400
354
  },
355
  {
356
+ "epoch": 0.7613741875580315,
357
+ "grad_norm": 4.734999881046304,
358
+ "learning_rate": 9.297308594793757e-06,
359
+ "loss": 0.3962,
360
  "step": 410
361
  },
362
  {
363
+ "epoch": 0.7799442896935933,
364
+ "grad_norm": 3.769866725347126,
365
+ "learning_rate": 9.241004670934348e-06,
366
+ "loss": 0.458,
367
  "step": 420
368
  },
369
  {
370
+ "epoch": 0.7985143918291551,
371
+ "grad_norm": 3.6685753005427832,
372
+ "learning_rate": 9.182715482828764e-06,
373
+ "loss": 0.3995,
374
  "step": 430
375
  },
376
  {
377
+ "epoch": 0.8170844939647168,
378
+ "grad_norm": 3.876272765109621,
379
+ "learning_rate": 9.122468316332611e-06,
380
+ "loss": 0.5064,
381
  "step": 440
382
  },
383
  {
384
+ "epoch": 0.8356545961002786,
385
+ "grad_norm": 4.30859512157268,
386
+ "learning_rate": 9.060291373854252e-06,
387
+ "loss": 0.4313,
388
  "step": 450
389
  },
390
  {
391
+ "epoch": 0.8356545961002786,
392
+ "eval_loss": 0.47159674763679504,
393
+ "eval_runtime": 19.4024,
394
+ "eval_samples_per_second": 18.142,
395
+ "eval_steps_per_second": 2.268,
396
  "step": 450
397
  },
398
  {
399
+ "epoch": 0.8542246982358404,
400
+ "grad_norm": 4.898314330177169,
401
+ "learning_rate": 8.99621376115291e-06,
402
+ "loss": 0.3925,
403
  "step": 460
404
  },
405
  {
406
+ "epoch": 0.872794800371402,
407
+ "grad_norm": 3.1139075537025684,
408
+ "learning_rate": 8.930265473713939e-06,
409
+ "loss": 0.4499,
410
  "step": 470
411
  },
412
  {
413
+ "epoch": 0.8913649025069638,
414
+ "grad_norm": 3.627205546071805,
415
+ "learning_rate": 8.862477382707569e-06,
416
+ "loss": 0.3699,
417
  "step": 480
418
  },
419
  {
420
+ "epoch": 0.9099350046425255,
421
+ "grad_norm": 4.789522180281674,
422
+ "learning_rate": 8.792881220537752e-06,
423
+ "loss": 0.4436,
424
  "step": 490
425
  },
426
  {
427
+ "epoch": 0.9285051067780873,
428
+ "grad_norm": 2.96048981245909,
429
+ "learning_rate": 8.721509565987858e-06,
430
+ "loss": 0.4472,
431
  "step": 500
432
  },
433
  {
434
+ "epoch": 0.9285051067780873,
435
+ "eval_loss": 0.47707709670066833,
436
+ "eval_runtime": 19.4043,
437
+ "eval_samples_per_second": 18.14,
438
+ "eval_steps_per_second": 2.268,
439
  "step": 500
440
  },
441
  {
442
+ "epoch": 0.947075208913649,
443
+ "grad_norm": 3.043235000042675,
444
+ "learning_rate": 8.64839582897015e-06,
445
+ "loss": 0.4165,
446
  "step": 510
447
  },
448
  {
449
+ "epoch": 0.9656453110492108,
450
+ "grad_norm": 4.594551610015674,
451
+ "learning_rate": 8.573574234886217e-06,
452
+ "loss": 0.4319,
453
  "step": 520
454
  },
455
  {
456
+ "epoch": 0.9842154131847726,
457
+ "grad_norm": 3.3821933340348482,
458
+ "learning_rate": 8.497079808605659e-06,
459
+ "loss": 0.4254,
460
  "step": 530
461
  },
462
  {
463
+ "epoch": 1.0018570102135562,
464
+ "grad_norm": 6.485913778240125,
465
+ "learning_rate": 8.418948358070535e-06,
466
+ "loss": 0.4035,
467
  "step": 540
468
  },
469
  {
470
+ "epoch": 1.020427112349118,
471
+ "grad_norm": 3.6050686984381373,
472
+ "learning_rate": 8.339216457533244e-06,
473
+ "loss": 0.2753,
474
  "step": 550
475
  },
476
  {
477
+ "epoch": 1.020427112349118,
478
+ "eval_loss": 0.5026086568832397,
479
+ "eval_runtime": 19.4196,
480
+ "eval_samples_per_second": 18.126,
481
+ "eval_steps_per_second": 2.266,
482
  "step": 550
483
  },
484
  {
485
+ "epoch": 1.0389972144846797,
486
+ "grad_norm": 3.2597416531200416,
487
+ "learning_rate": 8.257921430435678e-06,
488
+ "loss": 0.2879,
489
  "step": 560
490
  },
491
  {
492
+ "epoch": 1.0575673166202415,
493
+ "grad_norm": 3.347364037440774,
494
+ "learning_rate": 8.175101331937692e-06,
495
+ "loss": 0.2567,
496
  "step": 570
497
  },
498
  {
499
+ "epoch": 1.076137418755803,
500
+ "grad_norm": 2.8678452979996316,
501
+ "learning_rate": 8.090794931103026e-06,
502
+ "loss": 0.2597,
503
  "step": 580
504
  },
505
  {
506
+ "epoch": 1.0947075208913648,
507
+ "grad_norm": 3.714937118745614,
508
+ "learning_rate": 8.005041692751055e-06,
509
+ "loss": 0.2831,
510
  "step": 590
511
  },
512
  {
513
+ "epoch": 1.1132776230269266,
514
+ "grad_norm": 4.1369787389179224,
515
+ "learning_rate": 7.917881758982838e-06,
516
+ "loss": 0.2877,
517
  "step": 600
518
  },
519
  {
520
+ "epoch": 1.1132776230269266,
521
+ "eval_loss": 0.4783521890640259,
522
+ "eval_runtime": 19.3969,
523
+ "eval_samples_per_second": 18.147,
524
+ "eval_steps_per_second": 2.268,
525
  "step": 600
526
  },
527
  {
528
+ "epoch": 1.1318477251624883,
529
+ "grad_norm": 3.8629861075408964,
530
+ "learning_rate": 7.829355930390126e-06,
531
+ "loss": 0.2965,
532
  "step": 610
533
  },
534
  {
535
+ "epoch": 1.15041782729805,
536
+ "grad_norm": 3.8801891490338334,
537
+ "learning_rate": 7.739505646956136e-06,
538
+ "loss": 0.2889,
539
  "step": 620
540
  },
541
  {
542
+ "epoch": 1.1689879294336118,
543
+ "grad_norm": 3.1120305459072584,
544
+ "learning_rate": 7.648372968656995e-06,
545
+ "loss": 0.2891,
546
  "step": 630
547
  },
548
  {
549
+ "epoch": 1.1875580315691736,
550
+ "grad_norm": 2.7200004915782636,
551
+ "learning_rate": 7.5560005557729664e-06,
552
+ "loss": 0.2774,
553
  "step": 640
554
  },
555
  {
556
+ "epoch": 1.2061281337047354,
557
+ "grad_norm": 3.7661310879129415,
558
+ "learning_rate": 7.462431648918689e-06,
559
+ "loss": 0.3038,
560
  "step": 650
561
  },
562
  {
563
+ "epoch": 1.2061281337047354,
564
+ "eval_loss": 0.4794943630695343,
565
+ "eval_runtime": 19.4005,
566
+ "eval_samples_per_second": 18.144,
567
+ "eval_steps_per_second": 2.268,
568
  "step": 650
569
  },
570
  {
571
+ "epoch": 1.2246982358402971,
572
+ "grad_norm": 3.4542640506421876,
573
+ "learning_rate": 7.367710048801715e-06,
574
+ "loss": 0.292,
575
  "step": 660
576
  },
577
  {
578
+ "epoch": 1.243268337975859,
579
+ "grad_norm": 2.949849968659152,
580
+ "learning_rate": 7.271880095718895e-06,
581
+ "loss": 0.2753,
582
  "step": 670
583
  },
584
  {
585
+ "epoch": 1.2618384401114207,
586
+ "grad_norm": 3.7882283436275834,
587
+ "learning_rate": 7.1749866488001604e-06,
588
+ "loss": 0.2802,
589
+ "step": 680
590
+ },
591
+ {
592
+ "epoch": 1.2804085422469824,
593
+ "grad_norm": 2.7991103167976403,
594
+ "learning_rate": 7.0770750650094335e-06,
595
+ "loss": 0.284,
596
+ "step": 690
597
+ },
598
+ {
599
+ "epoch": 1.2989786443825442,
600
+ "grad_norm": 3.1581810974702322,
601
+ "learning_rate": 6.978191177912499e-06,
602
+ "loss": 0.2944,
603
+ "step": 700
604
+ },
605
+ {
606
+ "epoch": 1.2989786443825442,
607
+ "eval_loss": 0.4681582748889923,
608
+ "eval_runtime": 19.4083,
609
+ "eval_samples_per_second": 18.137,
610
+ "eval_steps_per_second": 2.267,
611
+ "step": 700
612
+ },
613
+ {
614
+ "epoch": 1.317548746518106,
615
+ "grad_norm": 3.1190982947461943,
616
+ "learning_rate": 6.878381276221777e-06,
617
+ "loss": 0.2786,
618
+ "step": 710
619
+ },
620
+ {
621
+ "epoch": 1.3361188486536677,
622
+ "grad_norm": 3.3739780549267757,
623
+ "learning_rate": 6.777692082128024e-06,
624
+ "loss": 0.2748,
625
+ "step": 720
626
+ },
627
+ {
628
+ "epoch": 1.3546889507892295,
629
+ "grad_norm": 3.2357722923246315,
630
+ "learning_rate": 6.676170729429132e-06,
631
+ "loss": 0.2635,
632
+ "step": 730
633
+ },
634
+ {
635
+ "epoch": 1.3732590529247912,
636
+ "grad_norm": 3.603099890429247,
637
+ "learning_rate": 6.573864741466236e-06,
638
+ "loss": 0.2978,
639
+ "step": 740
640
+ },
641
+ {
642
+ "epoch": 1.3918291550603528,
643
+ "grad_norm": 3.3730203042394873,
644
+ "learning_rate": 6.470822008877482e-06,
645
+ "loss": 0.2722,
646
+ "step": 750
647
+ },
648
+ {
649
+ "epoch": 1.3918291550603528,
650
+ "eval_loss": 0.4681358337402344,
651
+ "eval_runtime": 19.3821,
652
+ "eval_samples_per_second": 18.161,
653
+ "eval_steps_per_second": 2.27,
654
+ "step": 750
655
+ },
656
+ {
657
+ "epoch": 1.4103992571959145,
658
+ "grad_norm": 3.608817378094592,
659
+ "learning_rate": 6.367090767179855e-06,
660
+ "loss": 0.2691,
661
+ "step": 760
662
+ },
663
+ {
664
+ "epoch": 1.4289693593314763,
665
+ "grad_norm": 4.122346770691036,
666
+ "learning_rate": 6.262719574189564e-06,
667
+ "loss": 0.2974,
668
+ "step": 770
669
+ },
670
+ {
671
+ "epoch": 1.447539461467038,
672
+ "grad_norm": 3.795562564382414,
673
+ "learning_rate": 6.157757287291557e-06,
674
+ "loss": 0.272,
675
+ "step": 780
676
+ },
677
+ {
678
+ "epoch": 1.4661095636025998,
679
+ "grad_norm": 3.4474808363843903,
680
+ "learning_rate": 6.052253040568804e-06,
681
+ "loss": 0.2924,
682
+ "step": 790
683
+ },
684
+ {
685
+ "epoch": 1.4846796657381616,
686
+ "grad_norm": 3.5395096255499885,
687
+ "learning_rate": 5.946256221802052e-06,
688
+ "loss": 0.2734,
689
+ "step": 800
690
+ },
691
+ {
692
+ "epoch": 1.4846796657381616,
693
+ "eval_loss": 0.4480016827583313,
694
+ "eval_runtime": 19.4053,
695
+ "eval_samples_per_second": 18.139,
696
+ "eval_steps_per_second": 2.267,
697
+ "step": 800
698
+ },
699
+ {
700
+ "epoch": 1.5032497678737233,
701
+ "grad_norm": 2.890746658579431,
702
+ "learning_rate": 5.839816449350824e-06,
703
+ "loss": 0.2277,
704
+ "step": 810
705
+ },
706
+ {
707
+ "epoch": 1.521819870009285,
708
+ "grad_norm": 3.8258223876451884,
709
+ "learning_rate": 5.7329835489264855e-06,
710
+ "loss": 0.296,
711
+ "step": 820
712
+ },
713
+ {
714
+ "epoch": 1.5403899721448466,
715
+ "grad_norm": 3.528517987679271,
716
+ "learning_rate": 5.62580753026823e-06,
717
+ "loss": 0.2521,
718
+ "step": 830
719
+ },
720
+ {
721
+ "epoch": 1.5589600742804084,
722
+ "grad_norm": 2.677594926231651,
723
+ "learning_rate": 5.518338563732945e-06,
724
+ "loss": 0.2523,
725
+ "step": 840
726
+ },
727
+ {
728
+ "epoch": 1.5775301764159702,
729
+ "grad_norm": 3.4048411120623876,
730
+ "learning_rate": 5.410626956809864e-06,
731
+ "loss": 0.2826,
732
+ "step": 850
733
+ },
734
+ {
735
+ "epoch": 1.5775301764159702,
736
+ "eval_loss": 0.44835272431373596,
737
+ "eval_runtime": 19.3933,
738
+ "eval_samples_per_second": 18.151,
739
+ "eval_steps_per_second": 2.269,
740
+ "step": 850
741
+ },
742
+ {
743
+ "epoch": 1.596100278551532,
744
+ "grad_norm": 2.8957625744349014,
745
+ "learning_rate": 5.30272313057105e-06,
746
+ "loss": 0.2436,
747
+ "step": 860
748
+ },
749
+ {
750
+ "epoch": 1.6146703806870937,
751
+ "grad_norm": 2.9749580565630933,
752
+ "learning_rate": 5.194677596068689e-06,
753
+ "loss": 0.2633,
754
+ "step": 870
755
+ },
756
+ {
757
+ "epoch": 1.6332404828226554,
758
+ "grad_norm": 3.485048080470531,
759
+ "learning_rate": 5.0865409306902755e-06,
760
+ "loss": 0.2525,
761
+ "step": 880
762
+ },
763
+ {
764
+ "epoch": 1.6518105849582172,
765
+ "grad_norm": 3.517296425834879,
766
+ "learning_rate": 4.978363754482741e-06,
767
+ "loss": 0.2776,
768
+ "step": 890
769
+ },
770
+ {
771
+ "epoch": 1.670380687093779,
772
+ "grad_norm": 2.773820634987393,
773
+ "learning_rate": 4.870196706456609e-06,
774
+ "loss": 0.2344,
775
+ "step": 900
776
+ },
777
+ {
778
+ "epoch": 1.670380687093779,
779
+ "eval_loss": 0.43884018063545227,
780
+ "eval_runtime": 19.3629,
781
+ "eval_samples_per_second": 18.179,
782
+ "eval_steps_per_second": 2.272,
783
+ "step": 900
784
+ },
785
+ {
786
+ "epoch": 1.6889507892293407,
787
+ "grad_norm": 4.004058405011273,
788
+ "learning_rate": 4.762090420881289e-06,
789
+ "loss": 0.247,
790
+ "step": 910
791
+ },
792
+ {
793
+ "epoch": 1.7075208913649025,
794
+ "grad_norm": 4.098064484974279,
795
+ "learning_rate": 4.654095503582568e-06,
796
+ "loss": 0.2864,
797
+ "step": 920
798
+ },
799
+ {
800
+ "epoch": 1.7260909935004642,
801
+ "grad_norm": 3.5973991483602945,
802
+ "learning_rate": 4.546262508253429e-06,
803
+ "loss": 0.248,
804
+ "step": 930
805
+ },
806
+ {
807
+ "epoch": 1.744661095636026,
808
+ "grad_norm": 4.143958197553495,
809
+ "learning_rate": 4.438641912789277e-06,
810
+ "loss": 0.265,
811
+ "step": 940
812
+ },
813
+ {
814
+ "epoch": 1.7632311977715878,
815
+ "grad_norm": 2.8268071610510126,
816
+ "learning_rate": 4.331284095658637e-06,
817
+ "loss": 0.2437,
818
+ "step": 950
819
+ },
820
+ {
821
+ "epoch": 1.7632311977715878,
822
+ "eval_loss": 0.42715513706207275,
823
+ "eval_runtime": 19.3915,
824
+ "eval_samples_per_second": 18.152,
825
+ "eval_steps_per_second": 2.269,
826
+ "step": 950
827
+ },
828
+ {
829
+ "epoch": 1.7818012999071495,
830
+ "grad_norm": 3.815603949375226,
831
+ "learning_rate": 4.224239312320399e-06,
832
+ "loss": 0.2493,
833
+ "step": 960
834
+ },
835
+ {
836
+ "epoch": 1.8003714020427113,
837
+ "grad_norm": 3.7789513906241248,
838
+ "learning_rate": 4.117557671698648e-06,
839
+ "loss": 0.2371,
840
+ "step": 970
841
+ },
842
+ {
843
+ "epoch": 1.818941504178273,
844
+ "grad_norm": 2.8941597604671903,
845
+ "learning_rate": 4.011289112726085e-06,
846
+ "loss": 0.2605,
847
+ "step": 980
848
+ },
849
+ {
850
+ "epoch": 1.8375116063138348,
851
+ "grad_norm": 3.5121780352826337,
852
+ "learning_rate": 3.905483380967027e-06,
853
+ "loss": 0.2457,
854
+ "step": 990
855
+ },
856
+ {
857
+ "epoch": 1.8560817084493966,
858
+ "grad_norm": 3.129527995601254,
859
+ "learning_rate": 3.800190005330918e-06,
860
+ "loss": 0.2113,
861
+ "step": 1000
862
+ },
863
+ {
864
+ "epoch": 1.8560817084493966,
865
+ "eval_loss": 0.4232879877090454,
866
+ "eval_runtime": 19.3985,
867
+ "eval_samples_per_second": 18.146,
868
+ "eval_steps_per_second": 2.268,
869
+ "step": 1000
870
+ },
871
+ {
872
+ "epoch": 1.8746518105849583,
873
+ "grad_norm": 3.7626616094158876,
874
+ "learning_rate": 3.695458274887268e-06,
875
+ "loss": 0.2709,
876
+ "step": 1010
877
+ },
878
+ {
879
+ "epoch": 1.89322191272052,
880
+ "grad_norm": 3.5799401116393743,
881
+ "learning_rate": 3.5913372157928515e-06,
882
+ "loss": 0.2678,
883
+ "step": 1020
884
+ },
885
+ {
886
+ "epoch": 1.9117920148560819,
887
+ "grad_norm": 3.304004098929529,
888
+ "learning_rate": 3.487875568341995e-06,
889
+ "loss": 0.2439,
890
+ "step": 1030
891
+ },
892
+ {
893
+ "epoch": 1.9303621169916436,
894
+ "grad_norm": 2.842908582122805,
895
+ "learning_rate": 3.3851217641506657e-06,
896
+ "loss": 0.2691,
897
+ "step": 1040
898
+ },
899
+ {
900
+ "epoch": 1.9489322191272052,
901
+ "grad_norm": 2.4953468111237402,
902
+ "learning_rate": 3.2831239034850593e-06,
903
+ "loss": 0.2548,
904
+ "step": 1050
905
+ },
906
+ {
907
+ "epoch": 1.9489322191272052,
908
+ "eval_loss": 0.4117203652858734,
909
+ "eval_runtime": 19.4229,
910
+ "eval_samples_per_second": 18.123,
911
+ "eval_steps_per_second": 2.265,
912
+ "step": 1050
913
+ },
914
+ {
915
+ "epoch": 1.967502321262767,
916
+ "grad_norm": 3.2206038323962836,
917
+ "learning_rate": 3.1819297327453045e-06,
918
+ "loss": 0.2381,
919
+ "step": 1060
920
+ },
921
+ {
922
+ "epoch": 1.9860724233983287,
923
+ "grad_norm": 3.5882038303694372,
924
+ "learning_rate": 3.081586622114809e-06,
925
+ "loss": 0.219,
926
+ "step": 1070
927
+ },
928
+ {
929
+ "epoch": 2.0037140204271124,
930
+ "grad_norm": 2.4519833657637693,
931
+ "learning_rate": 2.9821415433857174e-06,
932
+ "loss": 0.245,
933
+ "step": 1080
934
+ },
935
+ {
936
+ "epoch": 2.022284122562674,
937
+ "grad_norm": 2.5708061741401345,
938
+ "learning_rate": 2.8836410479708625e-06,
939
+ "loss": 0.1082,
940
+ "step": 1090
941
+ },
942
+ {
943
+ "epoch": 2.040854224698236,
944
+ "grad_norm": 2.758692661666751,
945
+ "learning_rate": 2.786131245112495e-06,
946
+ "loss": 0.1126,
947
+ "step": 1100
948
+ },
949
+ {
950
+ "epoch": 2.040854224698236,
951
+ "eval_loss": 0.5031464695930481,
952
+ "eval_runtime": 19.3667,
953
+ "eval_samples_per_second": 18.176,
954
+ "eval_steps_per_second": 2.272,
955
+ "step": 1100
956
+ },
957
+ {
958
+ "epoch": 2.0594243268337977,
959
+ "grad_norm": 2.9605517752760933,
960
+ "learning_rate": 2.689657780298019e-06,
961
+ "loss": 0.1195,
962
+ "step": 1110
963
+ },
964
+ {
965
+ "epoch": 2.0779944289693595,
966
+ "grad_norm": 2.55505057652882,
967
+ "learning_rate": 2.5942658138927866e-06,
968
+ "loss": 0.1006,
969
+ "step": 1120
970
+ },
971
+ {
972
+ "epoch": 2.0965645311049212,
973
+ "grad_norm": 3.246793514684397,
974
+ "learning_rate": 2.5000000000000015e-06,
975
+ "loss": 0.1069,
976
+ "step": 1130
977
+ },
978
+ {
979
+ "epoch": 2.115134633240483,
980
+ "grad_norm": 3.2377032295876105,
981
+ "learning_rate": 2.406904465557614e-06,
982
+ "loss": 0.0936,
983
+ "step": 1140
984
+ },
985
+ {
986
+ "epoch": 2.1337047353760448,
987
+ "grad_norm": 2.5395933544539218,
988
+ "learning_rate": 2.3150227896819782e-06,
989
+ "loss": 0.1128,
990
+ "step": 1150
991
+ },
992
+ {
993
+ "epoch": 2.1337047353760448,
994
+ "eval_loss": 0.48206087946891785,
995
+ "eval_runtime": 19.455,
996
+ "eval_samples_per_second": 18.093,
997
+ "eval_steps_per_second": 2.262,
998
+ "step": 1150
999
+ },
1000
+ {
1001
+ "epoch": 2.152274837511606,
1002
+ "grad_norm": 2.5999204788587007,
1003
+ "learning_rate": 2.2243979832679515e-06,
1004
+ "loss": 0.1172,
1005
+ "step": 1160
1006
+ },
1007
+ {
1008
+ "epoch": 2.170844939647168,
1009
+ "grad_norm": 2.9958486705632446,
1010
+ "learning_rate": 2.1350724688549906e-06,
1011
+ "loss": 0.1098,
1012
+ "step": 1170
1013
+ },
1014
+ {
1015
+ "epoch": 2.1894150417827296,
1016
+ "grad_norm": 2.2061344645528633,
1017
+ "learning_rate": 2.0470880607686605e-06,
1018
+ "loss": 0.0953,
1019
+ "step": 1180
1020
+ },
1021
+ {
1022
+ "epoch": 2.2079851439182914,
1023
+ "grad_norm": 1.406675633603301,
1024
+ "learning_rate": 1.9604859455468587e-06,
1025
+ "loss": 0.1081,
1026
+ "step": 1190
1027
+ },
1028
+ {
1029
+ "epoch": 2.226555246053853,
1030
+ "grad_norm": 2.9325208784441203,
1031
+ "learning_rate": 1.8753066626599086e-06,
1032
+ "loss": 0.0993,
1033
+ "step": 1200
1034
+ },
1035
+ {
1036
+ "epoch": 2.226555246053853,
1037
+ "eval_loss": 0.49974343180656433,
1038
+ "eval_runtime": 19.4104,
1039
+ "eval_samples_per_second": 18.135,
1040
+ "eval_steps_per_second": 2.267,
1041
+ "step": 1200
1042
+ },
1043
+ {
1044
+ "epoch": 2.245125348189415,
1045
+ "grad_norm": 3.1642149256556356,
1046
+ "learning_rate": 1.7915900855335506e-06,
1047
+ "loss": 0.1187,
1048
+ "step": 1210
1049
+ },
1050
+ {
1051
+ "epoch": 2.2636954503249767,
1052
+ "grad_norm": 3.0092943445643003,
1053
+ "learning_rate": 1.7093754028837345e-06,
1054
+ "loss": 0.1,
1055
+ "step": 1220
1056
+ },
1057
+ {
1058
+ "epoch": 2.2822655524605384,
1059
+ "grad_norm": 2.553339861021684,
1060
+ "learning_rate": 1.6287011003719105e-06,
1061
+ "loss": 0.1044,
1062
+ "step": 1230
1063
+ },
1064
+ {
1065
+ "epoch": 2.3008356545961,
1066
+ "grad_norm": 2.021123673041649,
1067
+ "learning_rate": 1.549604942589441e-06,
1068
+ "loss": 0.1012,
1069
+ "step": 1240
1070
+ },
1071
+ {
1072
+ "epoch": 2.319405756731662,
1073
+ "grad_norm": 2.1421187938993724,
1074
+ "learning_rate": 1.4721239553795485e-06,
1075
+ "loss": 0.0978,
1076
+ "step": 1250
1077
+ },
1078
+ {
1079
+ "epoch": 2.319405756731662,
1080
+ "eval_loss": 0.4895870089530945,
1081
+ "eval_runtime": 19.3723,
1082
+ "eval_samples_per_second": 18.17,
1083
+ "eval_steps_per_second": 2.271,
1084
+ "step": 1250
1085
+ },
1086
+ {
1087
+ "epoch": 2.3379758588672237,
1088
+ "grad_norm": 3.298579283457447,
1089
+ "learning_rate": 1.3962944085050833e-06,
1090
+ "loss": 0.1094,
1091
+ "step": 1260
1092
+ },
1093
+ {
1094
+ "epoch": 2.3565459610027855,
1095
+ "grad_norm": 3.3116222885011846,
1096
+ "learning_rate": 1.3221517986702249e-06,
1097
+ "loss": 0.1023,
1098
+ "step": 1270
1099
+ },
1100
+ {
1101
+ "epoch": 2.3751160631383472,
1102
+ "grad_norm": 2.3986802183164824,
1103
+ "learning_rate": 1.2497308329040475e-06,
1104
+ "loss": 0.0953,
1105
+ "step": 1280
1106
+ },
1107
+ {
1108
+ "epoch": 2.393686165273909,
1109
+ "grad_norm": 2.6963628811320035,
1110
+ "learning_rate": 1.1790654123137552e-06,
1111
+ "loss": 0.1014,
1112
+ "step": 1290
1113
+ },
1114
+ {
1115
+ "epoch": 2.4122562674094707,
1116
+ "grad_norm": 3.541306025884895,
1117
+ "learning_rate": 1.1101886162151764e-06,
1118
+ "loss": 0.1056,
1119
+ "step": 1300
1120
+ },
1121
+ {
1122
+ "epoch": 2.4122562674094707,
1123
+ "eval_loss": 0.4979850947856903,
1124
+ "eval_runtime": 19.4616,
1125
+ "eval_samples_per_second": 18.087,
1126
+ "eval_steps_per_second": 2.261,
1127
+ "step": 1300
1128
+ },
1129
+ {
1130
+ "epoch": 2.4308263695450325,
1131
+ "grad_norm": 1.8964890409301638,
1132
+ "learning_rate": 1.0431326866479457e-06,
1133
+ "loss": 0.0855,
1134
+ "step": 1310
1135
+ },
1136
+ {
1137
+ "epoch": 2.4493964716805943,
1138
+ "grad_norm": 2.08584902180452,
1139
+ "learning_rate": 9.779290132826224e-07,
1140
+ "loss": 0.1018,
1141
+ "step": 1320
1142
+ },
1143
+ {
1144
+ "epoch": 2.467966573816156,
1145
+ "grad_norm": 2.1509863412780357,
1146
+ "learning_rate": 9.146081187268185e-07,
1147
+ "loss": 0.1057,
1148
+ "step": 1330
1149
+ },
1150
+ {
1151
+ "epoch": 2.486536675951718,
1152
+ "grad_norm": 2.337018038983776,
1153
+ "learning_rate": 8.531996442372048e-07,
1154
+ "loss": 0.0969,
1155
+ "step": 1340
1156
+ },
1157
+ {
1158
+ "epoch": 2.5051067780872796,
1159
+ "grad_norm": 2.1781847876681604,
1160
+ "learning_rate": 7.937323358440935e-07,
1161
+ "loss": 0.0897,
1162
+ "step": 1350
1163
+ },
1164
+ {
1165
+ "epoch": 2.5051067780872796,
1166
+ "eval_loss": 0.4882669448852539,
1167
+ "eval_runtime": 19.5254,
1168
+ "eval_samples_per_second": 18.028,
1169
+ "eval_steps_per_second": 2.253,
1170
+ "step": 1350
1171
+ },
1172
+ {
1173
+ "epoch": 2.5236768802228413,
1174
+ "grad_norm": 2.6685806136301684,
1175
+ "learning_rate": 7.362340308950783e-07,
1176
+ "loss": 0.0976,
1177
+ "step": 1360
1178
+ },
1179
+ {
1180
+ "epoch": 2.542246982358403,
1181
+ "grad_norm": 2.6897365042467047,
1182
+ "learning_rate": 6.807316450240425e-07,
1183
+ "loss": 0.0957,
1184
+ "step": 1370
1185
+ },
1186
+ {
1187
+ "epoch": 2.560817084493965,
1188
+ "grad_norm": 2.143646748023798,
1189
+ "learning_rate": 6.2725115955164e-07,
1190
+ "loss": 0.1087,
1191
+ "step": 1380
1192
+ },
1193
+ {
1194
+ "epoch": 2.5793871866295266,
1195
+ "grad_norm": 3.3390598746500264,
1196
+ "learning_rate": 5.758176093231294e-07,
1197
+ "loss": 0.0952,
1198
+ "step": 1390
1199
+ },
1200
+ {
1201
+ "epoch": 2.5979572887650884,
1202
+ "grad_norm": 3.150702584087223,
1203
+ "learning_rate": 5.264550709892685e-07,
1204
+ "loss": 0.0872,
1205
+ "step": 1400
1206
+ },
1207
+ {
1208
+ "epoch": 2.5979572887650884,
1209
+ "eval_loss": 0.49406710267066956,
1210
+ "eval_runtime": 19.3841,
1211
+ "eval_samples_per_second": 18.159,
1212
+ "eval_steps_per_second": 2.27,
1213
+ "step": 1400
1214
+ },
1215
+ {
1216
+ "epoch": 2.61652739090065,
1217
+ "grad_norm": 3.077156808484069,
1218
+ "learning_rate": 4.791866517357491e-07,
1219
+ "loss": 0.1028,
1220
+ "step": 1410
1221
+ },
1222
+ {
1223
+ "epoch": 2.635097493036212,
1224
+ "grad_norm": 2.2139585731845703,
1225
+ "learning_rate": 4.3403447846645355e-07,
1226
+ "loss": 0.0929,
1227
+ "step": 1420
1228
+ },
1229
+ {
1230
+ "epoch": 2.6536675951717736,
1231
+ "grad_norm": 2.509450394120424,
1232
+ "learning_rate": 3.910196874455896e-07,
1233
+ "loss": 0.0956,
1234
+ "step": 1430
1235
+ },
1236
+ {
1237
+ "epoch": 2.6722376973073354,
1238
+ "grad_norm": 4.615247501065769,
1239
+ "learning_rate": 3.501624144035559e-07,
1240
+ "loss": 0.1015,
1241
+ "step": 1440
1242
+ },
1243
+ {
1244
+ "epoch": 2.690807799442897,
1245
+ "grad_norm": 3.284511844581789,
1246
+ "learning_rate": 3.1148178511116624e-07,
1247
+ "loss": 0.0916,
1248
+ "step": 1450
1249
+ },
1250
+ {
1251
+ "epoch": 2.690807799442897,
1252
+ "eval_loss": 0.49390342831611633,
1253
+ "eval_runtime": 19.3724,
1254
+ "eval_samples_per_second": 18.17,
1255
+ "eval_steps_per_second": 2.271,
1256
+ "step": 1450
1257
+ },
1258
+ {
1259
+ "epoch": 2.709377901578459,
1260
+ "grad_norm": 2.9686013973926695,
1261
+ "learning_rate": 2.7499590642665773e-07,
1262
+ "loss": 0.101,
1263
+ "step": 1460
1264
+ },
1265
+ {
1266
+ "epoch": 2.7279480037140207,
1267
+ "grad_norm": 2.6344171640489265,
1268
+ "learning_rate": 2.407218578196524e-07,
1269
+ "loss": 0.0899,
1270
+ "step": 1470
1271
+ },
1272
+ {
1273
+ "epoch": 2.7465181058495824,
1274
+ "grad_norm": 6.728672130977422,
1275
+ "learning_rate": 2.0867568337605616e-07,
1276
+ "loss": 0.1063,
1277
+ "step": 1480
1278
+ },
1279
+ {
1280
+ "epoch": 2.7650882079851438,
1281
+ "grad_norm": 1.8996993963458386,
1282
+ "learning_rate": 1.7887238428763553e-07,
1283
+ "loss": 0.086,
1284
+ "step": 1490
1285
+ },
1286
+ {
1287
+ "epoch": 2.7836583101207055,
1288
+ "grad_norm": 2.3753024288893223,
1289
+ "learning_rate": 1.5132591182978107e-07,
1290
+ "loss": 0.0844,
1291
+ "step": 1500
1292
+ },
1293
+ {
1294
+ "epoch": 2.7836583101207055,
1295
+ "eval_loss": 0.4944659173488617,
1296
+ "eval_runtime": 19.3894,
1297
+ "eval_samples_per_second": 18.154,
1298
+ "eval_steps_per_second": 2.269,
1299
+ "step": 1500
1300
+ },
1301
+ {
1302
+ "epoch": 2.8022284122562673,
1303
+ "grad_norm": 2.4868387472806917,
1304
+ "learning_rate": 1.2604916083075236e-07,
1305
+ "loss": 0.0936,
1306
+ "step": 1510
1307
+ },
1308
+ {
1309
+ "epoch": 2.820798514391829,
1310
+ "grad_norm": 2.5429472050035447,
1311
+ "learning_rate": 1.0305396363545717e-07,
1312
+ "loss": 0.1114,
1313
+ "step": 1520
1314
+ },
1315
+ {
1316
+ "epoch": 2.839368616527391,
1317
+ "grad_norm": 2.6782347948140037,
1318
+ "learning_rate": 8.235108456658814e-08,
1319
+ "loss": 0.0933,
1320
+ "step": 1530
1321
+ },
1322
+ {
1323
+ "epoch": 2.8579387186629526,
1324
+ "grad_norm": 3.3595802114049533,
1325
+ "learning_rate": 6.395021488572128e-08,
1326
+ "loss": 0.1059,
1327
+ "step": 1540
1328
+ },
1329
+ {
1330
+ "epoch": 2.8765088207985143,
1331
+ "grad_norm": 2.5500224216604344,
1332
+ "learning_rate": 4.7859968256719344e-08,
1333
+ "loss": 0.0959,
1334
+ "step": 1550
1335
+ },
1336
+ {
1337
+ "epoch": 2.8765088207985143,
1338
+ "eval_loss": 0.4942573308944702,
1339
+ "eval_runtime": 19.3998,
1340
+ "eval_samples_per_second": 18.144,
1341
+ "eval_steps_per_second": 2.268,
1342
+ "step": 1550
1343
+ },
1344
+ {
1345
+ "epoch": 2.895078922934076,
1346
+ "grad_norm": 2.472877504615022,
1347
+ "learning_rate": 3.408787671357494e-08,
1348
+ "loss": 0.0945,
1349
+ "step": 1560
1350
+ },
1351
+ {
1352
+ "epoch": 2.913649025069638,
1353
+ "grad_norm": 2.6027015654240286,
1354
+ "learning_rate": 2.264038713457706e-08,
1355
+ "loss": 0.088,
1356
+ "step": 1570
1357
+ },
1358
+ {
1359
+ "epoch": 2.9322191272051996,
1360
+ "grad_norm": 2.9295049029652995,
1361
+ "learning_rate": 1.3522858224450652e-08,
1362
+ "loss": 0.0992,
1363
+ "step": 1580
1364
+ },
1365
+ {
1366
+ "epoch": 2.9507892293407614,
1367
+ "grad_norm": 2.7591280720199776,
1368
+ "learning_rate": 6.739558005884883e-09,
1369
+ "loss": 0.0805,
1370
+ "step": 1590
1371
+ },
1372
+ {
1373
+ "epoch": 2.969359331476323,
1374
+ "grad_norm": 2.9108217727851398,
1375
+ "learning_rate": 2.2936618216201635e-09,
1376
+ "loss": 0.094,
1377
+ "step": 1600
1378
+ },
1379
+ {
1380
+ "epoch": 2.969359331476323,
1381
+ "eval_loss": 0.4940944015979767,
1382
+ "eval_runtime": 19.4091,
1383
+ "eval_samples_per_second": 18.136,
1384
+ "eval_steps_per_second": 2.267,
1385
+ "step": 1600
1386
+ },
1387
+ {
1388
+ "epoch": 2.987929433611885,
1389
+ "grad_norm": 3.2721186890864007,
1390
+ "learning_rate": 1.872508480332824e-10,
1391
+ "loss": 0.0803,
1392
+ "step": 1610
1393
+ },
1394
+ {
1395
+ "epoch": 2.9953574744661093,
1396
+ "step": 1614,
1397
+ "total_flos": 292826854195200.0,
1398
+ "train_loss": 0.2790517607955389,
1399
+ "train_runtime": 12257.2359,
1400
+ "train_samples_per_second": 4.217,
1401
+ "train_steps_per_second": 0.132
1402
  }
1403
  ],
1404
  "logging_steps": 10,
1405
+ "max_steps": 1614,
1406
  "num_input_tokens_seen": 0,
1407
+ "num_train_epochs": 3,
1408
  "save_steps": 5000,
1409
  "stateful_callbacks": {
1410
  "TrainerControl": {
 
1418
  "attributes": {}
1419
  }
1420
  },
1421
+ "total_flos": 292826854195200.0,
1422
  "train_batch_size": 2,
1423
  "trial_name": null,
1424
  "trial_params": null
training_eval_loss.png CHANGED
training_loss.png CHANGED