hllj committed on
Commit
7c6c83f
·
1 Parent(s): dbd41ce

Model save

Browse files
README.md CHANGED
@@ -1,5 +1,6 @@
1
  ---
2
- base_model: hllj/zephyr-7b-beta-vi-math
 
3
  tags:
4
  - generated_from_trainer
5
  model-index:
@@ -12,9 +13,9 @@ should probably proofread and complete it, then remove this comment. -->
12
 
13
  # sft-zephyr-7b-beta-v1
14
 
15
- This model is a fine-tuned version of [hllj/zephyr-7b-beta-vi-math](https://huggingface.co/hllj/zephyr-7b-beta-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 0.3935
18
 
19
  ## Model description
20
 
@@ -48,26 +49,26 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
- | 0.6583 | 0.19 | 50 | 0.5998 |
52
- | 0.4808 | 0.37 | 100 | 0.4464 |
53
- | 0.4476 | 0.56 | 150 | 0.4201 |
54
- | 0.4158 | 0.74 | 200 | 0.4091 |
55
- | 0.4028 | 0.93 | 250 | 0.4018 |
56
- | 0.4074 | 1.12 | 300 | 0.3965 |
57
- | 0.388 | 1.3 | 350 | 0.3942 |
58
- | 0.3699 | 1.49 | 400 | 0.3921 |
59
- | 0.3699 | 1.67 | 450 | 0.3932 |
60
- | 0.336 | 1.86 | 500 | 0.3955 |
61
- | 0.3512 | 2.04 | 550 | 0.3911 |
62
- | 0.3413 | 2.23 | 600 | 0.3900 |
63
- | 0.3402 | 2.42 | 650 | 0.3932 |
64
- | 0.3255 | 2.6 | 700 | 0.3948 |
65
- | 0.3252 | 2.79 | 750 | 0.3930 |
66
- | 0.316 | 2.97 | 800 | 0.3946 |
67
- | 0.305 | 3.16 | 850 | 0.3931 |
68
- | 0.3248 | 3.35 | 900 | 0.3935 |
69
- | 0.3363 | 3.53 | 950 | 0.3934 |
70
- | 0.3032 | 3.72 | 1000 | 0.3935 |
71
 
72
 
73
  ### Framework versions
 
1
  ---
2
+ license: mit
3
+ base_model: HuggingFaceH4/zephyr-7b-beta
4
  tags:
5
  - generated_from_trainer
6
  model-index:
 
13
 
14
  # sft-zephyr-7b-beta-v1
15
 
16
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.4927
19
 
20
  ## Model description
21
 
 
49
 
50
  | Training Loss | Epoch | Step | Validation Loss |
51
  |:-------------:|:-----:|:----:|:---------------:|
52
+ | 1.0538 | 0.19 | 50 | 1.1364 |
53
+ | 0.7744 | 0.37 | 100 | 0.7777 |
54
+ | 0.5936 | 0.56 | 150 | 0.6507 |
55
+ | 0.5449 | 0.74 | 200 | 0.6087 |
56
+ | 0.501 | 0.93 | 250 | 0.5840 |
57
+ | 0.5752 | 1.12 | 300 | 0.5552 |
58
+ | 0.4542 | 1.3 | 350 | 0.5419 |
59
+ | 0.5115 | 1.49 | 400 | 0.5243 |
60
+ | 0.4224 | 1.67 | 450 | 0.5188 |
61
+ | 0.4486 | 1.86 | 500 | 0.5055 |
62
+ | 0.3865 | 2.04 | 550 | 0.5038 |
63
+ | 0.4193 | 2.23 | 600 | 0.5048 |
64
+ | 0.4294 | 2.42 | 650 | 0.4995 |
65
+ | 0.4077 | 2.6 | 700 | 0.5014 |
66
+ | 0.4667 | 2.79 | 750 | 0.4985 |
67
+ | 0.4226 | 2.97 | 800 | 0.4937 |
68
+ | 0.4195 | 3.16 | 850 | 0.4920 |
69
+ | 0.338 | 3.35 | 900 | 0.4923 |
70
+ | 0.3943 | 3.53 | 950 | 0.4926 |
71
+ | 0.3953 | 3.72 | 1000 | 0.4927 |
72
 
73
 
74
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 3.72,
3
- "eval_loss": 0.3934732675552368,
4
- "eval_runtime": 10.335,
5
  "eval_samples": 120,
6
- "eval_samples_per_second": 11.611,
7
- "eval_steps_per_second": 2.903,
8
- "train_loss": 0.3842643254995346,
9
- "train_runtime": 3643.6441,
10
  "train_samples": 1076,
11
- "train_samples_per_second": 1.098,
12
- "train_steps_per_second": 0.274
13
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "eval_loss": 0.4926711618900299,
4
+ "eval_runtime": 13.9963,
5
  "eval_samples": 120,
6
+ "eval_samples_per_second": 8.574,
7
+ "eval_steps_per_second": 2.143,
8
+ "train_loss": 0.509855135679245,
9
+ "train_runtime": 2165.2959,
10
  "train_samples": 1076,
11
+ "train_samples_per_second": 1.847,
12
+ "train_steps_per_second": 0.462
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.72,
3
- "eval_loss": 0.3934732675552368,
4
- "eval_runtime": 10.335,
5
  "eval_samples": 120,
6
- "eval_samples_per_second": 11.611,
7
- "eval_steps_per_second": 2.903
8
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "eval_loss": 0.4926711618900299,
4
+ "eval_runtime": 13.9963,
5
  "eval_samples": 120,
6
+ "eval_samples_per_second": 8.574,
7
+ "eval_steps_per_second": 2.143
8
  }
runs/Nov18_06-38-13_7a59b30c842e/events.out.tfevents.1700291694.7a59b30c842e.64531.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ef7657a7271fef814fe0325e4e921f63b674ca61359fb264fb380f2c2278e5a
3
+ size 359
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.72,
3
- "train_loss": 0.3842643254995346,
4
- "train_runtime": 3643.6441,
5
  "train_samples": 1076,
6
- "train_samples_per_second": 1.098,
7
- "train_steps_per_second": 0.274
8
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "train_loss": 0.509855135679245,
4
+ "train_runtime": 2165.2959,
5
  "train_samples": 1076,
6
+ "train_samples_per_second": 1.847,
7
+ "train_steps_per_second": 0.462
8
  }
trainer_state.json CHANGED
@@ -10,785 +10,785 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 6.000000000000001e-07,
14
- "loss": 0.9756,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.04,
19
- "learning_rate": 6e-06,
20
- "loss": 0.892,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
- "learning_rate": 1.2e-05,
26
- "loss": 0.852,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.11,
31
- "learning_rate": 1.8e-05,
32
- "loss": 0.8052,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.15,
37
- "learning_rate": 2.4e-05,
38
- "loss": 0.762,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.19,
43
- "learning_rate": 3e-05,
44
- "loss": 0.6583,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.19,
49
- "eval_loss": 0.5997987389564514,
50
- "eval_runtime": 10.374,
51
- "eval_samples_per_second": 11.567,
52
- "eval_steps_per_second": 2.892,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.22,
57
- "learning_rate": 2.999179886011389e-05,
58
- "loss": 0.5995,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.26,
63
- "learning_rate": 2.9967204408281618e-05,
64
- "loss": 0.5319,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.3,
69
- "learning_rate": 2.9926243538175172e-05,
70
- "loss": 0.4955,
71
  "step": 80
72
  },
73
  {
74
  "epoch": 0.33,
75
- "learning_rate": 2.9868961039904628e-05,
76
- "loss": 0.5063,
77
  "step": 90
78
  },
79
  {
80
  "epoch": 0.37,
81
- "learning_rate": 2.9795419551040836e-05,
82
- "loss": 0.4808,
83
  "step": 100
84
  },
85
  {
86
  "epoch": 0.37,
87
- "eval_loss": 0.44642969965934753,
88
- "eval_runtime": 10.3848,
89
- "eval_samples_per_second": 11.555,
90
- "eval_steps_per_second": 2.889,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.41,
95
- "learning_rate": 2.970569948812214e-05,
96
- "loss": 0.4638,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.45,
101
- "learning_rate": 2.9599898958720088e-05,
102
- "loss": 0.4424,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 0.48,
107
- "learning_rate": 2.947813365416023e-05,
108
- "loss": 0.4506,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
- "learning_rate": 2.9340536723015367e-05,
114
- "loss": 0.4449,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.56,
119
- "learning_rate": 2.9187258625509518e-05,
120
- "loss": 0.4476,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 0.56,
125
- "eval_loss": 0.4200552701950073,
126
- "eval_runtime": 10.3829,
127
- "eval_samples_per_second": 11.557,
128
- "eval_steps_per_second": 2.889,
129
  "step": 150
130
  },
131
  {
132
  "epoch": 0.59,
133
- "learning_rate": 2.9036039116586097e-05,
134
- "loss": 0.4266,
135
  "step": 160
136
  },
137
  {
138
  "epoch": 0.63,
139
- "learning_rate": 2.885344258594923e-05,
140
- "loss": 0.4162,
141
  "step": 170
142
  },
143
  {
144
  "epoch": 0.67,
145
- "learning_rate": 2.865569751923882e-05,
146
- "loss": 0.4106,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.71,
151
- "learning_rate": 2.8443020147782055e-05,
152
- "loss": 0.4255,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.74,
157
- "learning_rate": 2.821564303116212e-05,
158
- "loss": 0.4158,
159
  "step": 200
160
  },
161
  {
162
  "epoch": 0.74,
163
- "eval_loss": 0.4091338515281677,
164
- "eval_runtime": 10.3877,
165
- "eval_samples_per_second": 11.552,
166
- "eval_steps_per_second": 2.888,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
- "learning_rate": 2.797381480291773e-05,
172
- "loss": 0.4362,
173
  "step": 210
174
  },
175
  {
176
  "epoch": 0.82,
177
- "learning_rate": 2.7717799898665977e-05,
178
- "loss": 0.4048,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.86,
183
- "learning_rate": 2.744787826694589e-05,
184
- "loss": 0.4074,
185
  "step": 230
186
  },
187
  {
188
  "epoch": 0.89,
189
- "learning_rate": 2.71643450630988e-05,
190
- "loss": 0.4273,
191
  "step": 240
192
  },
193
  {
194
  "epoch": 0.93,
195
- "learning_rate": 2.686751032652033e-05,
196
- "loss": 0.4028,
197
  "step": 250
198
  },
199
  {
200
  "epoch": 0.93,
201
- "eval_loss": 0.4017806947231293,
202
- "eval_runtime": 10.3868,
203
- "eval_samples_per_second": 11.553,
204
- "eval_steps_per_second": 2.888,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.97,
209
- "learning_rate": 2.655769864163684e-05,
210
- "loss": 0.409,
211
  "step": 260
212
  },
213
  {
214
  "epoch": 1.0,
215
- "learning_rate": 2.623524878297714e-05,
216
- "loss": 0.4021,
217
  "step": 270
218
  },
219
  {
220
  "epoch": 1.04,
221
- "learning_rate": 2.590051334472751e-05,
222
- "loss": 0.3942,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
- "learning_rate": 2.5553858355175156e-05,
228
- "loss": 0.3821,
229
  "step": 290
230
  },
231
  {
232
  "epoch": 1.12,
233
- "learning_rate": 2.51956628764616e-05,
234
- "loss": 0.4074,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 1.12,
239
- "eval_loss": 0.3964887857437134,
240
- "eval_runtime": 10.3909,
241
- "eval_samples_per_second": 11.549,
242
- "eval_steps_per_second": 2.887,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 1.15,
247
- "learning_rate": 2.482631859008384e-05,
248
- "loss": 0.3937,
249
  "step": 310
250
  },
251
  {
252
  "epoch": 1.19,
253
- "learning_rate": 2.4446229368596388e-05,
254
- "loss": 0.3503,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 1.23,
259
- "learning_rate": 2.4055810833982512e-05,
260
- "loss": 0.3724,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 1.26,
265
- "learning_rate": 2.365548990317775e-05,
266
- "loss": 0.3733,
267
  "step": 340
268
  },
269
  {
270
  "epoch": 1.3,
271
- "learning_rate": 2.3245704321242494e-05,
272
- "loss": 0.388,
273
  "step": 350
274
  },
275
  {
276
  "epoch": 1.3,
277
- "eval_loss": 0.3942064344882965,
278
- "eval_runtime": 10.3843,
279
- "eval_samples_per_second": 11.556,
280
- "eval_steps_per_second": 2.889,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
- "learning_rate": 2.282690218269416e-05,
286
- "loss": 0.3713,
287
  "step": 360
288
  },
289
  {
290
  "epoch": 1.38,
291
- "learning_rate": 2.2442649405387632e-05,
292
- "loss": 0.3792,
293
  "step": 370
294
  },
295
  {
296
  "epoch": 1.41,
297
- "learning_rate": 2.2007985218000543e-05,
298
- "loss": 0.3665,
299
  "step": 380
300
  },
301
  {
302
  "epoch": 1.45,
303
- "learning_rate": 2.1565657901667777e-05,
304
- "loss": 0.3529,
305
  "step": 390
306
  },
307
  {
308
  "epoch": 1.49,
309
- "learning_rate": 2.1116151134815555e-05,
310
- "loss": 0.3699,
311
  "step": 400
312
  },
313
  {
314
  "epoch": 1.49,
315
- "eval_loss": 0.39205998182296753,
316
- "eval_runtime": 10.3869,
317
- "eval_samples_per_second": 11.553,
318
- "eval_steps_per_second": 2.888,
319
  "step": 400
320
  },
321
  {
322
  "epoch": 1.52,
323
- "learning_rate": 2.065995644649384e-05,
324
- "loss": 0.3441,
325
  "step": 410
326
  },
327
  {
328
  "epoch": 1.56,
329
- "learning_rate": 2.0197572678896522e-05,
330
- "loss": 0.3399,
331
  "step": 420
332
  },
333
  {
334
  "epoch": 1.6,
335
- "learning_rate": 1.9729505441884825e-05,
336
- "loss": 0.3617,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 1.64,
341
- "learning_rate": 1.9256266560110322e-05,
342
- "loss": 0.3596,
343
  "step": 440
344
  },
345
  {
346
  "epoch": 1.67,
347
- "learning_rate": 1.8778373513342223e-05,
348
- "loss": 0.3699,
349
  "step": 450
350
  },
351
  {
352
  "epoch": 1.67,
353
- "eval_loss": 0.3931977450847626,
354
- "eval_runtime": 10.3775,
355
- "eval_samples_per_second": 11.564,
356
- "eval_steps_per_second": 2.891,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 1.71,
361
- "learning_rate": 1.8296348870610798e-05,
362
- "loss": 0.3654,
363
  "step": 460
364
  },
365
  {
366
  "epoch": 1.75,
367
- "learning_rate": 1.781071971878587e-05,
368
- "loss": 0.3588,
369
  "step": 470
370
  },
371
  {
372
  "epoch": 1.78,
373
- "learning_rate": 1.7322017086215023e-05,
374
- "loss": 0.352,
375
  "step": 480
376
  },
377
  {
378
  "epoch": 1.82,
379
- "learning_rate": 1.6830775362051904e-05,
380
- "loss": 0.3639,
381
  "step": 490
382
  },
383
  {
384
  "epoch": 1.86,
385
- "learning_rate": 1.633753171190956e-05,
386
- "loss": 0.336,
387
  "step": 500
388
  },
389
  {
390
  "epoch": 1.86,
391
- "eval_loss": 0.3954925537109375,
392
- "eval_runtime": 10.3812,
393
- "eval_samples_per_second": 11.559,
394
- "eval_steps_per_second": 2.89,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.9,
399
- "learning_rate": 1.5842825490477683e-05,
400
- "loss": 0.3421,
401
  "step": 510
402
  },
403
  {
404
  "epoch": 1.93,
405
- "learning_rate": 1.5347197651746207e-05,
406
- "loss": 0.3421,
407
  "step": 520
408
  },
409
  {
410
  "epoch": 1.97,
411
- "learning_rate": 1.4851190157480054e-05,
412
- "loss": 0.3547,
413
  "step": 530
414
  },
415
  {
416
  "epoch": 2.01,
417
- "learning_rate": 1.4355345384591894e-05,
418
- "loss": 0.3355,
419
  "step": 540
420
  },
421
  {
422
  "epoch": 2.04,
423
- "learning_rate": 1.3860205532060953e-05,
424
- "loss": 0.3512,
425
  "step": 550
426
  },
427
  {
428
  "epoch": 2.04,
429
- "eval_loss": 0.3910907804965973,
430
- "eval_runtime": 10.3795,
431
- "eval_samples_per_second": 11.561,
432
- "eval_steps_per_second": 2.89,
433
  "step": 550
434
  },
435
  {
436
  "epoch": 2.08,
437
- "learning_rate": 1.3366312028046412e-05,
438
- "loss": 0.3426,
439
  "step": 560
440
  },
441
  {
442
  "epoch": 2.12,
443
- "learning_rate": 1.2874204937843636e-05,
444
- "loss": 0.3577,
445
  "step": 570
446
  },
447
  {
448
  "epoch": 2.16,
449
- "learning_rate": 1.2384422373330728e-05,
450
- "loss": 0.3308,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 2.19,
455
- "learning_rate": 1.189749990455105e-05,
456
- "loss": 0.3464,
457
  "step": 590
458
  },
459
  {
460
  "epoch": 2.23,
461
- "learning_rate": 1.1413969974075299e-05,
462
- "loss": 0.3413,
463
  "step": 600
464
  },
465
  {
466
  "epoch": 2.23,
467
- "eval_loss": 0.39001432061195374,
468
- "eval_runtime": 10.3775,
469
- "eval_samples_per_second": 11.563,
470
- "eval_steps_per_second": 2.891,
471
  "step": 600
472
  },
473
  {
474
  "epoch": 2.27,
475
- "learning_rate": 1.0934361314783339e-05,
476
- "loss": 0.3535,
477
  "step": 610
478
  },
479
  {
480
  "epoch": 2.3,
481
- "learning_rate": 1.0459198371702553e-05,
482
- "loss": 0.3322,
483
  "step": 620
484
  },
485
  {
486
  "epoch": 2.34,
487
- "learning_rate": 9.989000728534936e-06,
488
- "loss": 0.347,
489
  "step": 630
490
  },
491
  {
492
  "epoch": 2.38,
493
- "learning_rate": 9.524282539499916e-06,
494
- "loss": 0.3088,
495
  "step": 640
496
  },
497
  {
498
  "epoch": 2.42,
499
- "learning_rate": 9.06555196711428e-06,
500
- "loss": 0.3402,
501
  "step": 650
502
  },
503
  {
504
  "epoch": 2.42,
505
- "eval_loss": 0.39315077662467957,
506
- "eval_runtime": 10.3839,
507
- "eval_samples_per_second": 11.556,
508
- "eval_steps_per_second": 2.889,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 2.45,
513
- "learning_rate": 8.61331062652391e-06,
514
- "loss": 0.3123,
515
  "step": 660
516
  },
517
  {
518
  "epoch": 2.49,
519
- "learning_rate": 8.168053036995011e-06,
520
- "loss": 0.322,
521
  "step": 670
522
  },
523
  {
524
  "epoch": 2.53,
525
- "learning_rate": 7.73026608116453e-06,
526
- "loss": 0.3335,
527
  "step": 680
528
  },
529
  {
530
  "epoch": 2.57,
531
- "learning_rate": 7.3004284726411315e-06,
532
- "loss": 0.318,
533
  "step": 690
534
  },
535
  {
536
  "epoch": 2.6,
537
- "learning_rate": 6.87901023253893e-06,
538
- "loss": 0.3255,
539
  "step": 700
540
  },
541
  {
542
  "epoch": 2.6,
543
- "eval_loss": 0.3948245942592621,
544
- "eval_runtime": 10.39,
545
- "eval_samples_per_second": 11.55,
546
- "eval_steps_per_second": 2.887,
547
  "step": 700
548
  },
549
  {
550
  "epoch": 2.64,
551
- "learning_rate": 6.466472175516284e-06,
552
- "loss": 0.3275,
553
  "step": 710
554
  },
555
  {
556
  "epoch": 2.68,
557
- "learning_rate": 6.06326540588171e-06,
558
- "loss": 0.3226,
559
  "step": 720
560
  },
561
  {
562
  "epoch": 2.71,
563
- "learning_rate": 5.669830824317992e-06,
564
- "loss": 0.3154,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 2.75,
569
- "learning_rate": 5.286598645763718e-06,
570
- "loss": 0.3194,
571
  "step": 740
572
  },
573
  {
574
  "epoch": 2.79,
575
- "learning_rate": 4.91398792897958e-06,
576
- "loss": 0.3252,
577
  "step": 750
578
  },
579
  {
580
  "epoch": 2.79,
581
- "eval_loss": 0.39301279187202454,
582
- "eval_runtime": 10.3855,
583
- "eval_samples_per_second": 11.555,
584
- "eval_steps_per_second": 2.889,
585
  "step": 750
586
  },
587
  {
588
  "epoch": 2.83,
589
- "learning_rate": 4.552406118313767e-06,
590
- "loss": 0.3198,
591
  "step": 760
592
  },
593
  {
594
  "epoch": 2.86,
595
- "learning_rate": 4.202248598167549e-06,
596
- "loss": 0.3136,
597
  "step": 770
598
  },
599
  {
600
  "epoch": 2.9,
601
- "learning_rate": 3.8638982606482525e-06,
602
- "loss": 0.3179,
603
  "step": 780
604
  },
605
  {
606
  "epoch": 2.94,
607
- "learning_rate": 3.537725086882333e-06,
608
- "loss": 0.3196,
609
  "step": 790
610
  },
611
  {
612
  "epoch": 2.97,
613
- "learning_rate": 3.224085742446484e-06,
614
- "loss": 0.316,
615
  "step": 800
616
  },
617
  {
618
  "epoch": 2.97,
619
- "eval_loss": 0.3946268558502197,
620
- "eval_runtime": 10.3837,
621
- "eval_samples_per_second": 11.557,
622
- "eval_steps_per_second": 2.889,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 3.01,
627
- "learning_rate": 2.9233231873590445e-06,
628
- "loss": 0.3046,
629
  "step": 810
630
  },
631
  {
632
  "epoch": 3.05,
633
- "learning_rate": 2.635766301058241e-06,
634
- "loss": 0.3013,
635
  "step": 820
636
  },
637
  {
638
  "epoch": 3.09,
639
- "learning_rate": 2.3617295227773805e-06,
640
- "loss": 0.3181,
641
  "step": 830
642
  },
643
  {
644
  "epoch": 3.12,
645
- "learning_rate": 2.101512507710146e-06,
646
- "loss": 0.326,
647
  "step": 840
648
  },
649
  {
650
  "epoch": 3.16,
651
- "learning_rate": 1.8553997993420495e-06,
652
- "loss": 0.305,
653
  "step": 850
654
  },
655
  {
656
  "epoch": 3.16,
657
- "eval_loss": 0.3930993974208832,
658
- "eval_runtime": 10.3853,
659
- "eval_samples_per_second": 11.555,
660
- "eval_steps_per_second": 2.889,
661
  "step": 850
662
  },
663
  {
664
  "epoch": 3.2,
665
- "learning_rate": 1.623660518306293e-06,
666
- "loss": 0.2808,
667
  "step": 860
668
  },
669
  {
670
  "epoch": 3.23,
671
- "learning_rate": 1.4065480681043319e-06,
672
- "loss": 0.3079,
673
  "step": 870
674
  },
675
  {
676
  "epoch": 3.27,
677
- "learning_rate": 1.2042998580128488e-06,
678
- "loss": 0.3259,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 3.31,
683
- "learning_rate": 1.0171370434802018e-06,
684
- "loss": 0.3016,
685
  "step": 890
686
  },
687
  {
688
  "epoch": 3.35,
689
- "learning_rate": 8.452642842961845e-07,
690
- "loss": 0.3248,
691
  "step": 900
692
  },
693
  {
694
  "epoch": 3.35,
695
- "eval_loss": 0.393511027097702,
696
- "eval_runtime": 10.3784,
697
- "eval_samples_per_second": 11.562,
698
- "eval_steps_per_second": 2.891,
699
  "step": 900
700
  },
701
  {
702
  "epoch": 3.38,
703
- "learning_rate": 6.888695207995532e-07,
704
- "loss": 0.3202,
705
  "step": 910
706
  },
707
  {
708
  "epoch": 3.42,
709
- "learning_rate": 5.481237683680291e-07,
710
- "loss": 0.3202,
711
  "step": 920
712
  },
713
  {
714
  "epoch": 3.46,
715
- "learning_rate": 4.231809304154849e-07,
716
- "loss": 0.2963,
717
  "step": 930
718
  },
719
  {
720
  "epoch": 3.49,
721
- "learning_rate": 3.1417763010083033e-07,
722
- "loss": 0.298,
723
  "step": 940
724
  },
725
  {
726
  "epoch": 3.53,
727
- "learning_rate": 2.2123306093259022e-07,
728
- "loss": 0.3363,
729
  "step": 950
730
  },
731
  {
732
  "epoch": 3.53,
733
- "eval_loss": 0.3934156000614166,
734
- "eval_runtime": 10.3847,
735
- "eval_samples_per_second": 11.556,
736
- "eval_steps_per_second": 2.889,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 3.57,
741
- "learning_rate": 1.4444885643255136e-07,
742
- "loss": 0.3097,
743
  "step": 960
744
  },
745
  {
746
  "epoch": 3.61,
747
- "learning_rate": 8.390897900099781e-08,
748
- "loss": 0.2979,
749
  "step": 970
750
  },
751
  {
752
  "epoch": 3.64,
753
- "learning_rate": 3.9679628105067643e-08,
754
- "loss": 0.305,
755
  "step": 980
756
  },
757
  {
758
  "epoch": 3.68,
759
- "learning_rate": 1.1809167890592388e-08,
760
- "loss": 0.3314,
761
  "step": 990
762
  },
763
  {
764
  "epoch": 3.72,
765
- "learning_rate": 3.280742966310646e-10,
766
- "loss": 0.3032,
767
  "step": 1000
768
  },
769
  {
770
  "epoch": 3.72,
771
- "eval_loss": 0.39346638321876526,
772
- "eval_runtime": 10.3788,
773
- "eval_samples_per_second": 11.562,
774
- "eval_steps_per_second": 2.891,
775
  "step": 1000
776
  },
777
  {
778
  "epoch": 3.72,
779
  "step": 1000,
780
- "total_flos": 1.7609161899297997e+17,
781
- "train_loss": 0.3842643254995346,
782
- "train_runtime": 3643.6441,
783
- "train_samples_per_second": 1.098,
784
- "train_steps_per_second": 0.274
785
  }
786
  ],
787
  "logging_steps": 10,
788
  "max_steps": 1000,
789
  "num_train_epochs": 4,
790
  "save_steps": 500,
791
- "total_flos": 1.7609161899297997e+17,
792
  "trial_name": null,
793
  "trial_params": null
794
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 0.0,
14
+ "loss": 1.3449,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.04,
19
+ "learning_rate": 4.2000000000000004e-06,
20
+ "loss": 1.2712,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
+ "learning_rate": 9.600000000000001e-06,
26
+ "loss": 1.1556,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.11,
31
+ "learning_rate": 1.56e-05,
32
+ "loss": 1.0713,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.15,
37
+ "learning_rate": 2.16e-05,
38
+ "loss": 0.9959,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.19,
43
+ "learning_rate": 2.7000000000000002e-05,
44
+ "loss": 1.0538,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.19,
49
+ "eval_loss": 1.1364344358444214,
50
+ "eval_runtime": 14.0151,
51
+ "eval_samples_per_second": 8.562,
52
+ "eval_steps_per_second": 2.141,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.22,
57
+ "learning_rate": 2.9997949574887035e-05,
58
+ "loss": 0.7802,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.26,
63
+ "learning_rate": 2.9981549537224573e-05,
64
+ "loss": 0.7425,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.3,
69
+ "learning_rate": 2.994876739510005e-05,
70
+ "loss": 0.8096,
71
  "step": 80
72
  },
73
  {
74
  "epoch": 0.33,
75
+ "learning_rate": 2.9899638995304575e-05,
76
+ "loss": 0.71,
77
  "step": 90
78
  },
79
  {
80
  "epoch": 0.37,
81
+ "learning_rate": 2.9834218059022027e-05,
82
+ "loss": 0.7744,
83
  "step": 100
84
  },
85
  {
86
  "epoch": 0.37,
87
+ "eval_loss": 0.7776542901992798,
88
+ "eval_runtime": 14.0597,
89
+ "eval_samples_per_second": 8.535,
90
+ "eval_steps_per_second": 2.134,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.41,
95
+ "learning_rate": 2.9752576123085737e-05,
96
+ "loss": 0.6735,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.45,
101
+ "learning_rate": 2.9654802461753992e-05,
102
+ "loss": 0.6107,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 0.48,
107
+ "learning_rate": 2.9541003989089956e-05,
108
+ "loss": 0.6387,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
+ "learning_rate": 2.9411305142052725e-05,
114
+ "loss": 0.5779,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.56,
119
+ "learning_rate": 2.9265847744427305e-05,
120
+ "loss": 0.5936,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 0.56,
125
+ "eval_loss": 0.6507006287574768,
126
+ "eval_runtime": 14.0635,
127
+ "eval_samples_per_second": 8.533,
128
+ "eval_steps_per_second": 2.133,
129
  "step": 150
130
  },
131
  {
132
  "epoch": 0.59,
133
+ "learning_rate": 2.910479085174242e-05,
134
+ "loss": 0.5965,
135
  "step": 160
136
  },
137
  {
138
  "epoch": 0.63,
139
+ "learning_rate": 2.8928310577345608e-05,
140
+ "loss": 0.5969,
141
  "step": 170
142
  },
143
  {
144
  "epoch": 0.67,
145
+ "learning_rate": 2.873659989982586e-05,
146
+ "loss": 0.5735,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.71,
151
+ "learning_rate": 2.8529868451994387e-05,
152
+ "loss": 0.5535,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.74,
157
+ "learning_rate": 2.830834229165418e-05,
158
+ "loss": 0.5449,
159
  "step": 200
160
  },
161
  {
162
  "epoch": 0.74,
163
+ "eval_loss": 0.608721137046814,
164
+ "eval_runtime": 14.0592,
165
+ "eval_samples_per_second": 8.535,
166
+ "eval_steps_per_second": 2.134,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
+ "learning_rate": 2.807226365440916e-05,
172
+ "loss": 0.5445,
173
  "step": 210
174
  },
175
  {
176
  "epoch": 0.82,
177
+ "learning_rate": 2.7821890688783088e-05,
178
+ "loss": 0.6112,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.86,
183
+ "learning_rate": 2.7557497173937928e-05,
184
+ "loss": 0.5294,
185
  "step": 230
186
  },
187
  {
188
  "epoch": 0.89,
189
+ "learning_rate": 2.727937222030039e-05,
190
+ "loss": 0.4957,
191
  "step": 240
192
  },
193
  {
194
  "epoch": 0.93,
195
+ "learning_rate": 2.698781995342387e-05,
196
+ "loss": 0.501,
197
  "step": 250
198
  },
199
  {
200
  "epoch": 0.93,
201
+ "eval_loss": 0.5840339660644531,
202
+ "eval_runtime": 14.0673,
203
+ "eval_samples_per_second": 8.53,
204
+ "eval_steps_per_second": 2.133,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.97,
209
+ "learning_rate": 2.668315918143169e-05,
210
+ "loss": 0.5568,
211
  "step": 260
212
  },
213
  {
214
  "epoch": 1.0,
215
+ "learning_rate": 2.6365723046405022e-05,
216
+ "loss": 0.5425,
217
  "step": 270
218
  },
219
  {
220
  "epoch": 1.04,
221
+ "learning_rate": 2.6035858660096975e-05,
222
+ "loss": 0.5089,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
+ "learning_rate": 2.5693926724370958e-05,
228
+ "loss": 0.5014,
229
  "step": 290
230
  },
231
  {
232
  "epoch": 1.12,
233
+ "learning_rate": 2.534030113677849e-05,
234
+ "loss": 0.5752,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 1.12,
239
+ "eval_loss": 0.555169939994812,
240
+ "eval_runtime": 14.0736,
241
+ "eval_samples_per_second": 8.527,
242
+ "eval_steps_per_second": 2.132,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 1.15,
247
+ "learning_rate": 2.4975368581707724e-05,
248
+ "loss": 0.4937,
249
  "step": 310
250
  },
251
  {
252
  "epoch": 1.19,
253
+ "learning_rate": 2.459952810754975e-05,
254
+ "loss": 0.446,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 1.23,
259
+ "learning_rate": 2.4213190690345018e-05,
260
+ "loss": 0.4632,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 1.26,
265
+ "learning_rate": 2.3816778784387097e-05,
266
+ "loss": 0.4789,
267
  "step": 340
268
  },
269
  {
270
  "epoch": 1.3,
271
+ "learning_rate": 2.3410725860275092e-05,
272
+ "loss": 0.4542,
273
  "step": 350
274
  },
275
  {
276
  "epoch": 1.3,
277
+ "eval_loss": 0.5419090390205383,
278
+ "eval_runtime": 14.073,
279
+ "eval_samples_per_second": 8.527,
280
+ "eval_steps_per_second": 2.132,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
+ "learning_rate": 2.2995475930919907e-05,
286
+ "loss": 0.4636,
287
  "step": 360
288
  },
289
  {
290
  "epoch": 1.38,
291
+ "learning_rate": 2.257148306602266e-05,
292
+ "loss": 0.5061,
293
  "step": 370
294
  },
295
  {
296
  "epoch": 1.41,
297
+ "learning_rate": 2.2139210895556104e-05,
298
+ "loss": 0.4431,
299
  "step": 380
300
  },
301
  {
302
  "epoch": 1.45,
303
+ "learning_rate": 2.16991321027921e-05,
304
+ "loss": 0.4565,
305
  "step": 390
306
  },
307
  {
308
  "epoch": 1.49,
309
+ "learning_rate": 2.1251727907429357e-05,
310
+ "loss": 0.5115,
311
  "step": 400
312
  },
313
  {
314
  "epoch": 1.49,
315
+ "eval_loss": 0.5242533087730408,
316
+ "eval_runtime": 14.071,
317
+ "eval_samples_per_second": 8.528,
318
+ "eval_steps_per_second": 2.132,
319
  "step": 400
320
  },
321
  {
322
  "epoch": 1.52,
323
+ "learning_rate": 2.0797487539386782e-05,
324
+ "loss": 0.4748,
325
  "step": 410
326
  },
327
  {
328
  "epoch": 1.56,
329
+ "learning_rate": 2.033690770383775e-05,
330
+ "loss": 0.4308,
331
  "step": 420
332
  },
333
  {
334
  "epoch": 1.6,
335
+ "learning_rate": 1.9870492038070255e-05,
336
+ "loss": 0.4808,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 1.64,
341
+ "learning_rate": 1.9398750560766973e-05,
342
+ "loss": 0.5061,
343
  "step": 440
344
  },
345
  {
346
  "epoch": 1.67,
347
+ "learning_rate": 1.8922199114307297e-05,
348
+ "loss": 0.4224,
349
  "step": 450
350
  },
351
  {
352
  "epoch": 1.67,
353
+ "eval_loss": 0.518813967704773,
354
+ "eval_runtime": 14.0743,
355
+ "eval_samples_per_second": 8.526,
356
+ "eval_steps_per_second": 2.132,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 1.71,
361
+ "learning_rate": 1.8441358800701276e-05,
362
+ "loss": 0.4525,
363
  "step": 460
364
  },
365
  {
366
  "epoch": 1.75,
367
+ "learning_rate": 1.7956755411772203e-05,
368
+ "loss": 0.4854,
369
  "step": 470
370
  },
371
  {
372
  "epoch": 1.78,
373
+ "learning_rate": 1.746891885421101e-05,
374
+ "loss": 0.4529,
375
  "step": 480
376
  },
377
  {
378
  "epoch": 1.82,
379
+ "learning_rate": 1.6978382570131037e-05,
380
+ "loss": 0.4786,
381
  "step": 490
382
  },
383
  {
384
  "epoch": 1.86,
385
+ "learning_rate": 1.6485682953756945e-05,
386
+ "loss": 0.4486,
387
  "step": 500
388
  },
389
  {
390
  "epoch": 1.86,
391
+ "eval_loss": 0.5055068731307983,
392
+ "eval_runtime": 14.0709,
393
+ "eval_samples_per_second": 8.528,
394
+ "eval_steps_per_second": 2.132,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.9,
399
+ "learning_rate": 1.5991358764885492e-05,
400
+ "loss": 0.4303,
401
  "step": 510
402
  },
403
  {
404
  "epoch": 1.93,
405
+ "learning_rate": 1.549595053975962e-05,
406
+ "loss": 0.4589,
407
  "step": 520
408
  },
409
  {
410
  "epoch": 1.97,
411
+ "learning_rate": 1.5e-05,
412
+ "loss": 0.4715,
413
  "step": 530
414
  },
415
  {
416
  "epoch": 2.01,
417
+ "learning_rate": 1.4504049460240376e-05,
418
+ "loss": 0.4354,
419
  "step": 540
420
  },
421
  {
422
  "epoch": 2.04,
423
+ "learning_rate": 1.400864123511451e-05,
424
+ "loss": 0.3865,
425
  "step": 550
426
  },
427
  {
428
  "epoch": 2.04,
429
+ "eval_loss": 0.5038172006607056,
430
+ "eval_runtime": 14.0798,
431
+ "eval_samples_per_second": 8.523,
432
+ "eval_steps_per_second": 2.131,
433
  "step": 550
434
  },
435
  {
436
  "epoch": 2.08,
437
+ "learning_rate": 1.351431704624306e-05,
438
+ "loss": 0.4119,
439
  "step": 560
440
  },
441
  {
442
  "epoch": 2.12,
443
+ "learning_rate": 1.3021617429868966e-05,
444
+ "loss": 0.4405,
445
  "step": 570
446
  },
447
  {
448
  "epoch": 2.16,
449
+ "learning_rate": 1.2531081145788989e-05,
450
+ "loss": 0.4435,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 2.19,
455
+ "learning_rate": 1.2043244588227797e-05,
456
+ "loss": 0.4083,
457
  "step": 590
458
  },
459
  {
460
  "epoch": 2.23,
461
+ "learning_rate": 1.1558641199298728e-05,
462
+ "loss": 0.4193,
463
  "step": 600
464
  },
465
  {
466
  "epoch": 2.23,
467
+ "eval_loss": 0.5048139691352844,
468
+ "eval_runtime": 14.07,
469
+ "eval_samples_per_second": 8.529,
470
+ "eval_steps_per_second": 2.132,
471
  "step": 600
472
  },
473
  {
474
  "epoch": 2.27,
475
+ "learning_rate": 1.1077800885692704e-05,
476
+ "loss": 0.4138,
477
  "step": 610
478
  },
479
  {
480
  "epoch": 2.3,
481
+ "learning_rate": 1.0601249439233031e-05,
482
+ "loss": 0.4357,
483
  "step": 620
484
  },
485
  {
486
  "epoch": 2.34,
487
+ "learning_rate": 1.0129507961929749e-05,
488
+ "loss": 0.475,
489
  "step": 630
490
  },
491
  {
492
  "epoch": 2.38,
493
+ "learning_rate": 9.663092296162252e-06,
494
+ "loss": 0.4294,
495
  "step": 640
496
  },
497
  {
498
  "epoch": 2.42,
499
+ "learning_rate": 9.20251246061322e-06,
500
+ "loss": 0.4294,
501
  "step": 650
502
  },
503
  {
504
  "epoch": 2.42,
505
+ "eval_loss": 0.49953868985176086,
506
+ "eval_runtime": 14.0784,
507
+ "eval_samples_per_second": 8.524,
508
+ "eval_steps_per_second": 2.131,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 2.45,
513
+ "learning_rate": 8.748272092570648e-06,
514
+ "loss": 0.4213,
515
  "step": 660
516
  },
517
  {
518
  "epoch": 2.49,
519
+ "learning_rate": 8.300867897207903e-06,
520
+ "loss": 0.4353,
521
  "step": 670
522
  },
523
  {
524
  "epoch": 2.53,
525
+ "learning_rate": 7.860789104443897e-06,
526
+ "loss": 0.413,
527
  "step": 680
528
  },
529
  {
530
  "epoch": 2.57,
531
+ "learning_rate": 7.4285169339773486e-06,
532
+ "loss": 0.4153,
533
  "step": 690
534
  },
535
  {
536
  "epoch": 2.6,
537
+ "learning_rate": 7.0045240690800975e-06,
538
+ "loss": 0.4077,
539
  "step": 700
540
  },
541
  {
542
  "epoch": 2.6,
543
+ "eval_loss": 0.5014118552207947,
544
+ "eval_runtime": 14.0698,
545
+ "eval_samples_per_second": 8.529,
546
+ "eval_steps_per_second": 2.132,
547
  "step": 700
548
  },
549
  {
550
  "epoch": 2.64,
551
+ "learning_rate": 6.589274139724911e-06,
552
+ "loss": 0.4025,
553
  "step": 710
554
  },
555
  {
556
  "epoch": 2.68,
557
+ "learning_rate": 6.1832212156129045e-06,
558
+ "loss": 0.4911,
559
  "step": 720
560
  },
561
  {
562
  "epoch": 2.71,
563
+ "learning_rate": 5.786809309654983e-06,
564
+ "loss": 0.4088,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 2.75,
569
+ "learning_rate": 5.400471892450251e-06,
570
+ "loss": 0.4276,
571
  "step": 740
572
  },
573
  {
574
  "epoch": 2.79,
575
+ "learning_rate": 5.024631418292275e-06,
576
+ "loss": 0.4667,
577
  "step": 750
578
  },
579
  {
580
  "epoch": 2.79,
581
+ "eval_loss": 0.49848347902297974,
582
+ "eval_runtime": 14.0784,
583
+ "eval_samples_per_second": 8.524,
584
+ "eval_steps_per_second": 2.131,
585
  "step": 750
586
  },
587
  {
588
  "epoch": 2.83,
589
+ "learning_rate": 4.659698863221513e-06,
590
+ "loss": 0.4215,
591
  "step": 760
592
  },
593
  {
594
  "epoch": 2.86,
595
+ "learning_rate": 4.306073275629045e-06,
596
+ "loss": 0.4274,
597
  "step": 770
598
  },
599
  {
600
  "epoch": 2.9,
601
+ "learning_rate": 3.964141339903026e-06,
602
+ "loss": 0.3821,
603
  "step": 780
604
  },
605
  {
606
  "epoch": 2.94,
607
+ "learning_rate": 3.634276953594982e-06,
608
+ "loss": 0.4385,
609
  "step": 790
610
  },
611
  {
612
  "epoch": 2.97,
613
+ "learning_rate": 3.3168408185683153e-06,
614
+ "loss": 0.4226,
615
  "step": 800
616
  },
617
  {
618
  "epoch": 2.97,
619
+ "eval_loss": 0.4937375485897064,
620
+ "eval_runtime": 14.0647,
621
+ "eval_samples_per_second": 8.532,
622
+ "eval_steps_per_second": 2.133,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 3.01,
627
+ "learning_rate": 3.0121800465761298e-06,
628
+ "loss": 0.4213,
629
  "step": 810
630
  },
631
  {
632
  "epoch": 3.05,
633
+ "learning_rate": 2.720627779699615e-06,
634
+ "loss": 0.4184,
635
  "step": 820
636
  },
637
  {
638
  "epoch": 3.09,
639
+ "learning_rate": 2.442502826062072e-06,
640
+ "loss": 0.4083,
641
  "step": 830
642
  },
643
  {
644
  "epoch": 3.12,
645
+ "learning_rate": 2.1781093112169132e-06,
646
+ "loss": 0.4407,
647
  "step": 840
648
  },
649
  {
650
  "epoch": 3.16,
651
+ "learning_rate": 1.927736345590839e-06,
652
+ "loss": 0.4195,
653
  "step": 850
654
  },
655
  {
656
  "epoch": 3.16,
657
+ "eval_loss": 0.49196699261665344,
658
+ "eval_runtime": 14.0663,
659
+ "eval_samples_per_second": 8.531,
660
+ "eval_steps_per_second": 2.133,
661
  "step": 850
662
  },
663
  {
664
  "epoch": 3.2,
665
+ "learning_rate": 1.691657708345823e-06,
666
+ "loss": 0.3772,
667
  "step": 860
668
  },
669
  {
670
  "epoch": 3.23,
671
+ "learning_rate": 1.4701315480056165e-06,
672
+ "loss": 0.4214,
673
  "step": 870
674
  },
675
  {
676
  "epoch": 3.27,
677
+ "learning_rate": 1.2634001001741375e-06,
678
+ "loss": 0.4077,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 3.31,
683
+ "learning_rate": 1.0716894226543954e-06,
684
+ "loss": 0.373,
685
  "step": 890
686
  },
687
  {
688
  "epoch": 3.35,
689
+ "learning_rate": 8.952091482575825e-07,
690
+ "loss": 0.338,
691
  "step": 900
692
  },
693
  {
694
  "epoch": 3.35,
695
+ "eval_loss": 0.4922858476638794,
696
+ "eval_runtime": 14.0689,
697
+ "eval_samples_per_second": 8.529,
698
+ "eval_steps_per_second": 2.132,
699
  "step": 900
700
  },
701
  {
702
  "epoch": 3.38,
703
+ "learning_rate": 7.341522555726971e-07,
704
+ "loss": 0.4324,
705
  "step": 910
706
  },
707
  {
708
  "epoch": 3.42,
709
+ "learning_rate": 5.886948579472779e-07,
710
+ "loss": 0.4012,
711
  "step": 920
712
  },
713
  {
714
  "epoch": 3.46,
715
+ "learning_rate": 4.589960109100444e-07,
716
+ "loss": 0.4066,
717
  "step": 930
718
  },
719
  {
720
  "epoch": 3.49,
721
+ "learning_rate": 3.45197538246011e-07,
722
+ "loss": 0.3547,
723
  "step": 940
724
  },
725
  {
726
  "epoch": 3.53,
727
+ "learning_rate": 2.474238769142645e-07,
728
+ "loss": 0.3943,
729
  "step": 950
730
  },
731
  {
732
  "epoch": 3.53,
733
+ "eval_loss": 0.4926001727581024,
734
+ "eval_runtime": 14.07,
735
+ "eval_samples_per_second": 8.529,
736
+ "eval_steps_per_second": 2.132,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 3.57,
741
+ "learning_rate": 1.657819409779726e-07,
742
+ "loss": 0.4137,
743
  "step": 960
744
  },
745
  {
746
  "epoch": 3.61,
747
+ "learning_rate": 1.0036100469542786e-07,
748
+ "loss": 0.4237,
749
  "step": 970
750
  },
751
  {
752
  "epoch": 3.64,
753
+ "learning_rate": 5.1232604899952296e-08,
754
+ "loss": 0.3914,
755
  "step": 980
756
  },
757
  {
758
  "epoch": 3.68,
759
+ "learning_rate": 1.8450462775428946e-08,
760
+ "loss": 0.4274,
761
  "step": 990
762
  },
763
  {
764
  "epoch": 3.72,
765
+ "learning_rate": 2.0504251129649378e-09,
766
+ "loss": 0.3953,
767
  "step": 1000
768
  },
769
  {
770
  "epoch": 3.72,
771
+ "eval_loss": 0.4926711618900299,
772
+ "eval_runtime": 14.0668,
773
+ "eval_samples_per_second": 8.531,
774
+ "eval_steps_per_second": 2.133,
775
  "step": 1000
776
  },
777
  {
778
  "epoch": 3.72,
779
  "step": 1000,
780
+ "total_flos": 1.0108845062671565e+17,
781
+ "train_loss": 0.509855135679245,
782
+ "train_runtime": 2165.2959,
783
+ "train_samples_per_second": 1.847,
784
+ "train_steps_per_second": 0.462
785
  }
786
  ],
787
  "logging_steps": 10,
788
  "max_steps": 1000,
789
  "num_train_epochs": 4,
790
  "save_steps": 500,
791
+ "total_flos": 1.0108845062671565e+17,
792
  "trial_name": null,
793
  "trial_params": null
794
  }