CocoRoF commited on
Commit
f2e70d2
·
verified ·
1 Parent(s): 202a2ef

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67e015b0f596fee442abd71af93158ddf702d0b03ddbafe627cd1e3991ffb717
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f65db42286c6828403e19bd03efcefb25250e1efec73d5d62df88d031877370
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:500d83d312df8ef8772133628901aa3ec8b169d37b27727f6771ed38095522d3
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa947f13cc7ec696d218b55e64c458be90c9bb7d6f39b0a47978915ec3764eac
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69bcb5a0833345d8383f742fc9fcb902b5c0cae43ce63aee4b910dcd39dcda65
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5301d093d831fee92b3f7b9c5a7a55085678b711d728aed3c12fffa4f4b960a3
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13f8e14c2c7c91bc7deff16a401fb72a8859494fc7925b25ee4ffb4e04a50c64
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2000996a6fbcd6fb689041f1ea8cf89a2645e18896cfa2a47f022b7761d5c9e
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6002318395480254,
5
  "eval_steps": 2500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5281,6 +5281,1764 @@
5281
  "eval_samples_per_second": 1567.447,
5282
  "eval_steps_per_second": 48.983,
5283
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5284
  }
5285
  ],
5286
  "logging_steps": 10,
@@ -5300,7 +7058,7 @@
5300
  "attributes": {}
5301
  }
5302
  },
5303
- "total_flos": 2.0708374333095936e+19,
5304
  "train_batch_size": 4,
5305
  "trial_name": null,
5306
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8003091193973673,
5
  "eval_steps": 2500,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5281
  "eval_samples_per_second": 1567.447,
5282
  "eval_steps_per_second": 48.983,
5283
  "step": 7500
5284
+ },
5285
+ {
5286
+ "epoch": 0.6010321486674228,
5287
+ "grad_norm": 79.875,
5288
+ "learning_rate": 9.988261096351077e-07,
5289
+ "loss": 131.0494,
5290
+ "step": 7510
5291
+ },
5292
+ {
5293
+ "epoch": 0.6018324577868202,
5294
+ "grad_norm": 72.125,
5295
+ "learning_rate": 9.988245465320919e-07,
5296
+ "loss": 131.0795,
5297
+ "step": 7520
5298
+ },
5299
+ {
5300
+ "epoch": 0.6026327669062175,
5301
+ "grad_norm": 73.4375,
5302
+ "learning_rate": 9.98822983429076e-07,
5303
+ "loss": 131.2448,
5304
+ "step": 7530
5305
+ },
5306
+ {
5307
+ "epoch": 0.6034330760256149,
5308
+ "grad_norm": 78.125,
5309
+ "learning_rate": 9.9882142032606e-07,
5310
+ "loss": 132.0931,
5311
+ "step": 7540
5312
+ },
5313
+ {
5314
+ "epoch": 0.6042333851450122,
5315
+ "grad_norm": 71.25,
5316
+ "learning_rate": 9.988198572230443e-07,
5317
+ "loss": 131.5498,
5318
+ "step": 7550
5319
+ },
5320
+ {
5321
+ "epoch": 0.6050336942644097,
5322
+ "grad_norm": 77.5625,
5323
+ "learning_rate": 9.988182941200285e-07,
5324
+ "loss": 132.7485,
5325
+ "step": 7560
5326
+ },
5327
+ {
5328
+ "epoch": 0.605834003383807,
5329
+ "grad_norm": 77.375,
5330
+ "learning_rate": 9.988167310170125e-07,
5331
+ "loss": 132.6295,
5332
+ "step": 7570
5333
+ },
5334
+ {
5335
+ "epoch": 0.6066343125032043,
5336
+ "grad_norm": 78.1875,
5337
+ "learning_rate": 9.988151679139968e-07,
5338
+ "loss": 130.9077,
5339
+ "step": 7580
5340
+ },
5341
+ {
5342
+ "epoch": 0.6074346216226018,
5343
+ "grad_norm": 79.9375,
5344
+ "learning_rate": 9.98813604810981e-07,
5345
+ "loss": 133.7221,
5346
+ "step": 7590
5347
+ },
5348
+ {
5349
+ "epoch": 0.6082349307419991,
5350
+ "grad_norm": 77.9375,
5351
+ "learning_rate": 9.98812041707965e-07,
5352
+ "loss": 131.693,
5353
+ "step": 7600
5354
+ },
5355
+ {
5356
+ "epoch": 0.6090352398613965,
5357
+ "grad_norm": 79.875,
5358
+ "learning_rate": 9.988104786049492e-07,
5359
+ "loss": 131.3416,
5360
+ "step": 7610
5361
+ },
5362
+ {
5363
+ "epoch": 0.6098355489807938,
5364
+ "grad_norm": 73.0,
5365
+ "learning_rate": 9.988089155019334e-07,
5366
+ "loss": 132.1983,
5367
+ "step": 7620
5368
+ },
5369
+ {
5370
+ "epoch": 0.6106358581001912,
5371
+ "grad_norm": 77.375,
5372
+ "learning_rate": 9.988073523989176e-07,
5373
+ "loss": 132.4808,
5374
+ "step": 7630
5375
+ },
5376
+ {
5377
+ "epoch": 0.6114361672195886,
5378
+ "grad_norm": 76.5625,
5379
+ "learning_rate": 9.988057892959019e-07,
5380
+ "loss": 131.0641,
5381
+ "step": 7640
5382
+ },
5383
+ {
5384
+ "epoch": 0.6122364763389859,
5385
+ "grad_norm": 84.75,
5386
+ "learning_rate": 9.988042261928859e-07,
5387
+ "loss": 132.1883,
5388
+ "step": 7650
5389
+ },
5390
+ {
5391
+ "epoch": 0.6130367854583832,
5392
+ "grad_norm": 71.75,
5393
+ "learning_rate": 9.9880266308987e-07,
5394
+ "loss": 132.5537,
5395
+ "step": 7660
5396
+ },
5397
+ {
5398
+ "epoch": 0.6138370945777807,
5399
+ "grad_norm": 83.5625,
5400
+ "learning_rate": 9.988010999868543e-07,
5401
+ "loss": 131.5181,
5402
+ "step": 7670
5403
+ },
5404
+ {
5405
+ "epoch": 0.614637403697178,
5406
+ "grad_norm": 79.9375,
5407
+ "learning_rate": 9.987995368838385e-07,
5408
+ "loss": 132.7484,
5409
+ "step": 7680
5410
+ },
5411
+ {
5412
+ "epoch": 0.6154377128165754,
5413
+ "grad_norm": 76.25,
5414
+ "learning_rate": 9.987979737808225e-07,
5415
+ "loss": 132.5646,
5416
+ "step": 7690
5417
+ },
5418
+ {
5419
+ "epoch": 0.6162380219359728,
5420
+ "grad_norm": 77.75,
5421
+ "learning_rate": 9.987964106778067e-07,
5422
+ "loss": 132.2048,
5423
+ "step": 7700
5424
+ },
5425
+ {
5426
+ "epoch": 0.6170383310553701,
5427
+ "grad_norm": 74.125,
5428
+ "learning_rate": 9.98794847574791e-07,
5429
+ "loss": 132.1626,
5430
+ "step": 7710
5431
+ },
5432
+ {
5433
+ "epoch": 0.6178386401747675,
5434
+ "grad_norm": 79.0,
5435
+ "learning_rate": 9.987932844717752e-07,
5436
+ "loss": 131.6138,
5437
+ "step": 7720
5438
+ },
5439
+ {
5440
+ "epoch": 0.6186389492941649,
5441
+ "grad_norm": 76.9375,
5442
+ "learning_rate": 9.987917213687592e-07,
5443
+ "loss": 130.9984,
5444
+ "step": 7730
5445
+ },
5446
+ {
5447
+ "epoch": 0.6194392584135623,
5448
+ "grad_norm": 79.875,
5449
+ "learning_rate": 9.987901582657434e-07,
5450
+ "loss": 132.2128,
5451
+ "step": 7740
5452
+ },
5453
+ {
5454
+ "epoch": 0.6202395675329596,
5455
+ "grad_norm": 75.5625,
5456
+ "learning_rate": 9.987885951627276e-07,
5457
+ "loss": 132.1526,
5458
+ "step": 7750
5459
+ },
5460
+ {
5461
+ "epoch": 0.6210398766523569,
5462
+ "grad_norm": 71.75,
5463
+ "learning_rate": 9.987870320597116e-07,
5464
+ "loss": 130.6957,
5465
+ "step": 7760
5466
+ },
5467
+ {
5468
+ "epoch": 0.6218401857717544,
5469
+ "grad_norm": 80.8125,
5470
+ "learning_rate": 9.987854689566959e-07,
5471
+ "loss": 131.2453,
5472
+ "step": 7770
5473
+ },
5474
+ {
5475
+ "epoch": 0.6226404948911517,
5476
+ "grad_norm": 82.4375,
5477
+ "learning_rate": 9.9878390585368e-07,
5478
+ "loss": 130.4836,
5479
+ "step": 7780
5480
+ },
5481
+ {
5482
+ "epoch": 0.623440804010549,
5483
+ "grad_norm": 72.0625,
5484
+ "learning_rate": 9.987823427506643e-07,
5485
+ "loss": 131.348,
5486
+ "step": 7790
5487
+ },
5488
+ {
5489
+ "epoch": 0.6242411131299465,
5490
+ "grad_norm": 74.125,
5491
+ "learning_rate": 9.987807796476483e-07,
5492
+ "loss": 132.7037,
5493
+ "step": 7800
5494
+ },
5495
+ {
5496
+ "epoch": 0.6250414222493438,
5497
+ "grad_norm": 71.625,
5498
+ "learning_rate": 9.987792165446325e-07,
5499
+ "loss": 132.4081,
5500
+ "step": 7810
5501
+ },
5502
+ {
5503
+ "epoch": 0.6258417313687412,
5504
+ "grad_norm": 75.6875,
5505
+ "learning_rate": 9.987776534416167e-07,
5506
+ "loss": 130.35,
5507
+ "step": 7820
5508
+ },
5509
+ {
5510
+ "epoch": 0.6266420404881385,
5511
+ "grad_norm": 72.8125,
5512
+ "learning_rate": 9.98776090338601e-07,
5513
+ "loss": 132.3228,
5514
+ "step": 7830
5515
+ },
5516
+ {
5517
+ "epoch": 0.6274423496075359,
5518
+ "grad_norm": 80.6875,
5519
+ "learning_rate": 9.987745272355852e-07,
5520
+ "loss": 131.8622,
5521
+ "step": 7840
5522
+ },
5523
+ {
5524
+ "epoch": 0.6282426587269333,
5525
+ "grad_norm": 81.4375,
5526
+ "learning_rate": 9.987729641325692e-07,
5527
+ "loss": 132.0134,
5528
+ "step": 7850
5529
+ },
5530
+ {
5531
+ "epoch": 0.6290429678463306,
5532
+ "grad_norm": 78.9375,
5533
+ "learning_rate": 9.987714010295534e-07,
5534
+ "loss": 132.4322,
5535
+ "step": 7860
5536
+ },
5537
+ {
5538
+ "epoch": 0.6298432769657281,
5539
+ "grad_norm": 76.5625,
5540
+ "learning_rate": 9.987698379265376e-07,
5541
+ "loss": 132.5075,
5542
+ "step": 7870
5543
+ },
5544
+ {
5545
+ "epoch": 0.6306435860851254,
5546
+ "grad_norm": 77.75,
5547
+ "learning_rate": 9.987682748235216e-07,
5548
+ "loss": 131.9704,
5549
+ "step": 7880
5550
+ },
5551
+ {
5552
+ "epoch": 0.6314438952045227,
5553
+ "grad_norm": 78.375,
5554
+ "learning_rate": 9.987667117205058e-07,
5555
+ "loss": 131.3642,
5556
+ "step": 7890
5557
+ },
5558
+ {
5559
+ "epoch": 0.6322442043239201,
5560
+ "grad_norm": 75.9375,
5561
+ "learning_rate": 9.9876514861749e-07,
5562
+ "loss": 132.9338,
5563
+ "step": 7900
5564
+ },
5565
+ {
5566
+ "epoch": 0.6330445134433175,
5567
+ "grad_norm": 73.25,
5568
+ "learning_rate": 9.987635855144743e-07,
5569
+ "loss": 131.7696,
5570
+ "step": 7910
5571
+ },
5572
+ {
5573
+ "epoch": 0.6338448225627148,
5574
+ "grad_norm": 79.4375,
5575
+ "learning_rate": 9.987620224114583e-07,
5576
+ "loss": 132.7947,
5577
+ "step": 7920
5578
+ },
5579
+ {
5580
+ "epoch": 0.6346451316821122,
5581
+ "grad_norm": 75.3125,
5582
+ "learning_rate": 9.987604593084425e-07,
5583
+ "loss": 131.7236,
5584
+ "step": 7930
5585
+ },
5586
+ {
5587
+ "epoch": 0.6354454408015096,
5588
+ "grad_norm": 74.1875,
5589
+ "learning_rate": 9.987588962054267e-07,
5590
+ "loss": 131.487,
5591
+ "step": 7940
5592
+ },
5593
+ {
5594
+ "epoch": 0.636245749920907,
5595
+ "grad_norm": 76.3125,
5596
+ "learning_rate": 9.98757333102411e-07,
5597
+ "loss": 132.3433,
5598
+ "step": 7950
5599
+ },
5600
+ {
5601
+ "epoch": 0.6370460590403043,
5602
+ "grad_norm": 76.4375,
5603
+ "learning_rate": 9.98755769999395e-07,
5604
+ "loss": 132.2295,
5605
+ "step": 7960
5606
+ },
5607
+ {
5608
+ "epoch": 0.6378463681597016,
5609
+ "grad_norm": 73.375,
5610
+ "learning_rate": 9.987542068963792e-07,
5611
+ "loss": 130.3616,
5612
+ "step": 7970
5613
+ },
5614
+ {
5615
+ "epoch": 0.6386466772790991,
5616
+ "grad_norm": 71.75,
5617
+ "learning_rate": 9.987526437933634e-07,
5618
+ "loss": 133.0935,
5619
+ "step": 7980
5620
+ },
5621
+ {
5622
+ "epoch": 0.6394469863984964,
5623
+ "grad_norm": 74.1875,
5624
+ "learning_rate": 9.987510806903476e-07,
5625
+ "loss": 131.3436,
5626
+ "step": 7990
5627
+ },
5628
+ {
5629
+ "epoch": 0.6402472955178938,
5630
+ "grad_norm": 73.1875,
5631
+ "learning_rate": 9.987495175873318e-07,
5632
+ "loss": 132.8394,
5633
+ "step": 8000
5634
+ },
5635
+ {
5636
+ "epoch": 0.6410476046372912,
5637
+ "grad_norm": 77.4375,
5638
+ "learning_rate": 9.987479544843158e-07,
5639
+ "loss": 131.8165,
5640
+ "step": 8010
5641
+ },
5642
+ {
5643
+ "epoch": 0.6418479137566885,
5644
+ "grad_norm": 76.5,
5645
+ "learning_rate": 9.987463913813e-07,
5646
+ "loss": 131.6329,
5647
+ "step": 8020
5648
+ },
5649
+ {
5650
+ "epoch": 0.6426482228760859,
5651
+ "grad_norm": 74.3125,
5652
+ "learning_rate": 9.987448282782843e-07,
5653
+ "loss": 132.5564,
5654
+ "step": 8030
5655
+ },
5656
+ {
5657
+ "epoch": 0.6434485319954832,
5658
+ "grad_norm": 80.25,
5659
+ "learning_rate": 9.987432651752683e-07,
5660
+ "loss": 131.5118,
5661
+ "step": 8040
5662
+ },
5663
+ {
5664
+ "epoch": 0.6442488411148807,
5665
+ "grad_norm": 81.4375,
5666
+ "learning_rate": 9.987417020722525e-07,
5667
+ "loss": 130.5542,
5668
+ "step": 8050
5669
+ },
5670
+ {
5671
+ "epoch": 0.645049150234278,
5672
+ "grad_norm": 75.3125,
5673
+ "learning_rate": 9.987401389692367e-07,
5674
+ "loss": 132.7824,
5675
+ "step": 8060
5676
+ },
5677
+ {
5678
+ "epoch": 0.6458494593536753,
5679
+ "grad_norm": 75.6875,
5680
+ "learning_rate": 9.98738575866221e-07,
5681
+ "loss": 131.6364,
5682
+ "step": 8070
5683
+ },
5684
+ {
5685
+ "epoch": 0.6466497684730728,
5686
+ "grad_norm": 69.125,
5687
+ "learning_rate": 9.98737012763205e-07,
5688
+ "loss": 131.1746,
5689
+ "step": 8080
5690
+ },
5691
+ {
5692
+ "epoch": 0.6474500775924701,
5693
+ "grad_norm": 79.6875,
5694
+ "learning_rate": 9.987354496601891e-07,
5695
+ "loss": 131.0759,
5696
+ "step": 8090
5697
+ },
5698
+ {
5699
+ "epoch": 0.6482503867118674,
5700
+ "grad_norm": 74.0625,
5701
+ "learning_rate": 9.987338865571734e-07,
5702
+ "loss": 131.1551,
5703
+ "step": 8100
5704
+ },
5705
+ {
5706
+ "epoch": 0.6490506958312648,
5707
+ "grad_norm": 81.8125,
5708
+ "learning_rate": 9.987323234541576e-07,
5709
+ "loss": 130.3787,
5710
+ "step": 8110
5711
+ },
5712
+ {
5713
+ "epoch": 0.6498510049506622,
5714
+ "grad_norm": 76.125,
5715
+ "learning_rate": 9.987307603511416e-07,
5716
+ "loss": 132.5409,
5717
+ "step": 8120
5718
+ },
5719
+ {
5720
+ "epoch": 0.6506513140700596,
5721
+ "grad_norm": 74.25,
5722
+ "learning_rate": 9.987291972481258e-07,
5723
+ "loss": 132.2028,
5724
+ "step": 8130
5725
+ },
5726
+ {
5727
+ "epoch": 0.6514516231894569,
5728
+ "grad_norm": 79.9375,
5729
+ "learning_rate": 9.9872763414511e-07,
5730
+ "loss": 130.8431,
5731
+ "step": 8140
5732
+ },
5733
+ {
5734
+ "epoch": 0.6522519323088543,
5735
+ "grad_norm": 71.0,
5736
+ "learning_rate": 9.987260710420942e-07,
5737
+ "loss": 132.5052,
5738
+ "step": 8150
5739
+ },
5740
+ {
5741
+ "epoch": 0.6530522414282517,
5742
+ "grad_norm": 88.9375,
5743
+ "learning_rate": 9.987245079390785e-07,
5744
+ "loss": 131.6914,
5745
+ "step": 8160
5746
+ },
5747
+ {
5748
+ "epoch": 0.653852550547649,
5749
+ "grad_norm": 71.75,
5750
+ "learning_rate": 9.987229448360625e-07,
5751
+ "loss": 131.5299,
5752
+ "step": 8170
5753
+ },
5754
+ {
5755
+ "epoch": 0.6546528596670464,
5756
+ "grad_norm": 75.1875,
5757
+ "learning_rate": 9.987213817330467e-07,
5758
+ "loss": 131.2931,
5759
+ "step": 8180
5760
+ },
5761
+ {
5762
+ "epoch": 0.6554531687864438,
5763
+ "grad_norm": 76.625,
5764
+ "learning_rate": 9.98719818630031e-07,
5765
+ "loss": 132.1339,
5766
+ "step": 8190
5767
+ },
5768
+ {
5769
+ "epoch": 0.6562534779058411,
5770
+ "grad_norm": 76.5625,
5771
+ "learning_rate": 9.98718255527015e-07,
5772
+ "loss": 131.0958,
5773
+ "step": 8200
5774
+ },
5775
+ {
5776
+ "epoch": 0.6570537870252385,
5777
+ "grad_norm": 72.6875,
5778
+ "learning_rate": 9.987166924239991e-07,
5779
+ "loss": 131.8033,
5780
+ "step": 8210
5781
+ },
5782
+ {
5783
+ "epoch": 0.6578540961446359,
5784
+ "grad_norm": 77.4375,
5785
+ "learning_rate": 9.987151293209833e-07,
5786
+ "loss": 132.3042,
5787
+ "step": 8220
5788
+ },
5789
+ {
5790
+ "epoch": 0.6586544052640332,
5791
+ "grad_norm": 72.0625,
5792
+ "learning_rate": 9.987135662179674e-07,
5793
+ "loss": 131.9262,
5794
+ "step": 8230
5795
+ },
5796
+ {
5797
+ "epoch": 0.6594547143834306,
5798
+ "grad_norm": 81.125,
5799
+ "learning_rate": 9.987120031149516e-07,
5800
+ "loss": 131.4936,
5801
+ "step": 8240
5802
+ },
5803
+ {
5804
+ "epoch": 0.6602550235028279,
5805
+ "grad_norm": 72.125,
5806
+ "learning_rate": 9.987104400119358e-07,
5807
+ "loss": 131.1532,
5808
+ "step": 8250
5809
+ },
5810
+ {
5811
+ "epoch": 0.6610553326222254,
5812
+ "grad_norm": 76.5,
5813
+ "learning_rate": 9.9870887690892e-07,
5814
+ "loss": 132.0334,
5815
+ "step": 8260
5816
+ },
5817
+ {
5818
+ "epoch": 0.6618556417416227,
5819
+ "grad_norm": 70.9375,
5820
+ "learning_rate": 9.98707313805904e-07,
5821
+ "loss": 131.7788,
5822
+ "step": 8270
5823
+ },
5824
+ {
5825
+ "epoch": 0.66265595086102,
5826
+ "grad_norm": 78.5,
5827
+ "learning_rate": 9.987057507028882e-07,
5828
+ "loss": 131.3623,
5829
+ "step": 8280
5830
+ },
5831
+ {
5832
+ "epoch": 0.6634562599804175,
5833
+ "grad_norm": 77.75,
5834
+ "learning_rate": 9.987041875998724e-07,
5835
+ "loss": 131.0159,
5836
+ "step": 8290
5837
+ },
5838
+ {
5839
+ "epoch": 0.6642565690998148,
5840
+ "grad_norm": 75.0,
5841
+ "learning_rate": 9.987026244968567e-07,
5842
+ "loss": 131.802,
5843
+ "step": 8300
5844
+ },
5845
+ {
5846
+ "epoch": 0.6650568782192122,
5847
+ "grad_norm": 74.3125,
5848
+ "learning_rate": 9.987010613938409e-07,
5849
+ "loss": 130.4846,
5850
+ "step": 8310
5851
+ },
5852
+ {
5853
+ "epoch": 0.6658571873386095,
5854
+ "grad_norm": 78.0625,
5855
+ "learning_rate": 9.98699498290825e-07,
5856
+ "loss": 130.5051,
5857
+ "step": 8320
5858
+ },
5859
+ {
5860
+ "epoch": 0.6666574964580069,
5861
+ "grad_norm": 71.625,
5862
+ "learning_rate": 9.986979351878091e-07,
5863
+ "loss": 131.5967,
5864
+ "step": 8330
5865
+ },
5866
+ {
5867
+ "epoch": 0.6674578055774043,
5868
+ "grad_norm": 71.5,
5869
+ "learning_rate": 9.986963720847933e-07,
5870
+ "loss": 132.5512,
5871
+ "step": 8340
5872
+ },
5873
+ {
5874
+ "epoch": 0.6682581146968016,
5875
+ "grad_norm": 78.375,
5876
+ "learning_rate": 9.986948089817775e-07,
5877
+ "loss": 131.1753,
5878
+ "step": 8350
5879
+ },
5880
+ {
5881
+ "epoch": 0.669058423816199,
5882
+ "grad_norm": 74.1875,
5883
+ "learning_rate": 9.986932458787616e-07,
5884
+ "loss": 130.4604,
5885
+ "step": 8360
5886
+ },
5887
+ {
5888
+ "epoch": 0.6698587329355964,
5889
+ "grad_norm": 73.4375,
5890
+ "learning_rate": 9.986916827757458e-07,
5891
+ "loss": 132.9316,
5892
+ "step": 8370
5893
+ },
5894
+ {
5895
+ "epoch": 0.6706590420549937,
5896
+ "grad_norm": 74.875,
5897
+ "learning_rate": 9.9869011967273e-07,
5898
+ "loss": 131.4552,
5899
+ "step": 8380
5900
+ },
5901
+ {
5902
+ "epoch": 0.6714593511743912,
5903
+ "grad_norm": 78.0,
5904
+ "learning_rate": 9.98688556569714e-07,
5905
+ "loss": 131.1365,
5906
+ "step": 8390
5907
+ },
5908
+ {
5909
+ "epoch": 0.6722596602937885,
5910
+ "grad_norm": 72.8125,
5911
+ "learning_rate": 9.986869934666982e-07,
5912
+ "loss": 132.5655,
5913
+ "step": 8400
5914
+ },
5915
+ {
5916
+ "epoch": 0.6730599694131858,
5917
+ "grad_norm": 75.3125,
5918
+ "learning_rate": 9.986854303636824e-07,
5919
+ "loss": 132.0653,
5920
+ "step": 8410
5921
+ },
5922
+ {
5923
+ "epoch": 0.6738602785325832,
5924
+ "grad_norm": 80.0,
5925
+ "learning_rate": 9.986838672606667e-07,
5926
+ "loss": 130.1704,
5927
+ "step": 8420
5928
+ },
5929
+ {
5930
+ "epoch": 0.6746605876519806,
5931
+ "grad_norm": 79.9375,
5932
+ "learning_rate": 9.986823041576507e-07,
5933
+ "loss": 131.86,
5934
+ "step": 8430
5935
+ },
5936
+ {
5937
+ "epoch": 0.675460896771378,
5938
+ "grad_norm": 77.5,
5939
+ "learning_rate": 9.986807410546349e-07,
5940
+ "loss": 132.2835,
5941
+ "step": 8440
5942
+ },
5943
+ {
5944
+ "epoch": 0.6762612058907753,
5945
+ "grad_norm": 76.125,
5946
+ "learning_rate": 9.98679177951619e-07,
5947
+ "loss": 133.6062,
5948
+ "step": 8450
5949
+ },
5950
+ {
5951
+ "epoch": 0.6770615150101726,
5952
+ "grad_norm": 74.4375,
5953
+ "learning_rate": 9.986776148486033e-07,
5954
+ "loss": 133.1529,
5955
+ "step": 8460
5956
+ },
5957
+ {
5958
+ "epoch": 0.6778618241295701,
5959
+ "grad_norm": 77.6875,
5960
+ "learning_rate": 9.986760517455875e-07,
5961
+ "loss": 130.7149,
5962
+ "step": 8470
5963
+ },
5964
+ {
5965
+ "epoch": 0.6786621332489674,
5966
+ "grad_norm": 76.6875,
5967
+ "learning_rate": 9.986744886425715e-07,
5968
+ "loss": 131.2965,
5969
+ "step": 8480
5970
+ },
5971
+ {
5972
+ "epoch": 0.6794624423683647,
5973
+ "grad_norm": 78.0625,
5974
+ "learning_rate": 9.986729255395558e-07,
5975
+ "loss": 131.7146,
5976
+ "step": 8490
5977
+ },
5978
+ {
5979
+ "epoch": 0.6802627514877622,
5980
+ "grad_norm": 77.1875,
5981
+ "learning_rate": 9.9867136243654e-07,
5982
+ "loss": 131.4988,
5983
+ "step": 8500
5984
+ },
5985
+ {
5986
+ "epoch": 0.6810630606071595,
5987
+ "grad_norm": 75.625,
5988
+ "learning_rate": 9.986697993335242e-07,
5989
+ "loss": 131.0816,
5990
+ "step": 8510
5991
+ },
5992
+ {
5993
+ "epoch": 0.6818633697265569,
5994
+ "grad_norm": 73.625,
5995
+ "learning_rate": 9.986682362305082e-07,
5996
+ "loss": 131.8303,
5997
+ "step": 8520
5998
+ },
5999
+ {
6000
+ "epoch": 0.6826636788459542,
6001
+ "grad_norm": 77.625,
6002
+ "learning_rate": 9.986666731274924e-07,
6003
+ "loss": 130.0348,
6004
+ "step": 8530
6005
+ },
6006
+ {
6007
+ "epoch": 0.6834639879653516,
6008
+ "grad_norm": 79.0,
6009
+ "learning_rate": 9.986651100244766e-07,
6010
+ "loss": 132.8476,
6011
+ "step": 8540
6012
+ },
6013
+ {
6014
+ "epoch": 0.684264297084749,
6015
+ "grad_norm": 74.625,
6016
+ "learning_rate": 9.986635469214606e-07,
6017
+ "loss": 131.6226,
6018
+ "step": 8550
6019
+ },
6020
+ {
6021
+ "epoch": 0.6850646062041463,
6022
+ "grad_norm": 79.0,
6023
+ "learning_rate": 9.986619838184449e-07,
6024
+ "loss": 131.3241,
6025
+ "step": 8560
6026
+ },
6027
+ {
6028
+ "epoch": 0.6858649153235438,
6029
+ "grad_norm": 76.9375,
6030
+ "learning_rate": 9.98660420715429e-07,
6031
+ "loss": 131.6735,
6032
+ "step": 8570
6033
+ },
6034
+ {
6035
+ "epoch": 0.6866652244429411,
6036
+ "grad_norm": 70.8125,
6037
+ "learning_rate": 9.986588576124133e-07,
6038
+ "loss": 131.7806,
6039
+ "step": 8580
6040
+ },
6041
+ {
6042
+ "epoch": 0.6874655335623384,
6043
+ "grad_norm": 83.25,
6044
+ "learning_rate": 9.986572945093973e-07,
6045
+ "loss": 132.0227,
6046
+ "step": 8590
6047
+ },
6048
+ {
6049
+ "epoch": 0.6882658426817359,
6050
+ "grad_norm": 76.5625,
6051
+ "learning_rate": 9.986557314063815e-07,
6052
+ "loss": 131.6524,
6053
+ "step": 8600
6054
+ },
6055
+ {
6056
+ "epoch": 0.6890661518011332,
6057
+ "grad_norm": 79.875,
6058
+ "learning_rate": 9.986541683033657e-07,
6059
+ "loss": 132.2215,
6060
+ "step": 8610
6061
+ },
6062
+ {
6063
+ "epoch": 0.6898664609205305,
6064
+ "grad_norm": 85.125,
6065
+ "learning_rate": 9.9865260520035e-07,
6066
+ "loss": 132.894,
6067
+ "step": 8620
6068
+ },
6069
+ {
6070
+ "epoch": 0.6906667700399279,
6071
+ "grad_norm": 78.625,
6072
+ "learning_rate": 9.986510420973342e-07,
6073
+ "loss": 131.7396,
6074
+ "step": 8630
6075
+ },
6076
+ {
6077
+ "epoch": 0.6914670791593253,
6078
+ "grad_norm": 84.375,
6079
+ "learning_rate": 9.986494789943182e-07,
6080
+ "loss": 132.6336,
6081
+ "step": 8640
6082
+ },
6083
+ {
6084
+ "epoch": 0.6922673882787227,
6085
+ "grad_norm": 80.4375,
6086
+ "learning_rate": 9.986479158913024e-07,
6087
+ "loss": 130.9795,
6088
+ "step": 8650
6089
+ },
6090
+ {
6091
+ "epoch": 0.69306769739812,
6092
+ "grad_norm": 73.0625,
6093
+ "learning_rate": 9.986463527882866e-07,
6094
+ "loss": 132.994,
6095
+ "step": 8660
6096
+ },
6097
+ {
6098
+ "epoch": 0.6938680065175173,
6099
+ "grad_norm": 77.75,
6100
+ "learning_rate": 9.986447896852708e-07,
6101
+ "loss": 131.1462,
6102
+ "step": 8670
6103
+ },
6104
+ {
6105
+ "epoch": 0.6946683156369148,
6106
+ "grad_norm": 73.4375,
6107
+ "learning_rate": 9.986432265822548e-07,
6108
+ "loss": 132.2478,
6109
+ "step": 8680
6110
+ },
6111
+ {
6112
+ "epoch": 0.6954686247563121,
6113
+ "grad_norm": 76.125,
6114
+ "learning_rate": 9.98641663479239e-07,
6115
+ "loss": 130.9919,
6116
+ "step": 8690
6117
+ },
6118
+ {
6119
+ "epoch": 0.6962689338757095,
6120
+ "grad_norm": 76.0625,
6121
+ "learning_rate": 9.986401003762233e-07,
6122
+ "loss": 130.4246,
6123
+ "step": 8700
6124
+ },
6125
+ {
6126
+ "epoch": 0.6970692429951069,
6127
+ "grad_norm": 75.25,
6128
+ "learning_rate": 9.986385372732073e-07,
6129
+ "loss": 132.2129,
6130
+ "step": 8710
6131
+ },
6132
+ {
6133
+ "epoch": 0.6978695521145042,
6134
+ "grad_norm": 77.8125,
6135
+ "learning_rate": 9.986369741701915e-07,
6136
+ "loss": 132.2469,
6137
+ "step": 8720
6138
+ },
6139
+ {
6140
+ "epoch": 0.6986698612339016,
6141
+ "grad_norm": 74.3125,
6142
+ "learning_rate": 9.986354110671757e-07,
6143
+ "loss": 131.4874,
6144
+ "step": 8730
6145
+ },
6146
+ {
6147
+ "epoch": 0.699470170353299,
6148
+ "grad_norm": 74.6875,
6149
+ "learning_rate": 9.986338479641597e-07,
6150
+ "loss": 131.288,
6151
+ "step": 8740
6152
+ },
6153
+ {
6154
+ "epoch": 0.7002704794726964,
6155
+ "grad_norm": 78.875,
6156
+ "learning_rate": 9.98632284861144e-07,
6157
+ "loss": 130.9051,
6158
+ "step": 8750
6159
+ },
6160
+ {
6161
+ "epoch": 0.7010707885920937,
6162
+ "grad_norm": 70.1875,
6163
+ "learning_rate": 9.986307217581282e-07,
6164
+ "loss": 131.5563,
6165
+ "step": 8760
6166
+ },
6167
+ {
6168
+ "epoch": 0.701871097711491,
6169
+ "grad_norm": 76.75,
6170
+ "learning_rate": 9.986291586551124e-07,
6171
+ "loss": 132.3225,
6172
+ "step": 8770
6173
+ },
6174
+ {
6175
+ "epoch": 0.7026714068308885,
6176
+ "grad_norm": 84.9375,
6177
+ "learning_rate": 9.986275955520966e-07,
6178
+ "loss": 132.3153,
6179
+ "step": 8780
6180
+ },
6181
+ {
6182
+ "epoch": 0.7034717159502858,
6183
+ "grad_norm": 79.0,
6184
+ "learning_rate": 9.986260324490808e-07,
6185
+ "loss": 131.7471,
6186
+ "step": 8790
6187
+ },
6188
+ {
6189
+ "epoch": 0.7042720250696831,
6190
+ "grad_norm": 79.0,
6191
+ "learning_rate": 9.986244693460648e-07,
6192
+ "loss": 132.4772,
6193
+ "step": 8800
6194
+ },
6195
+ {
6196
+ "epoch": 0.7050723341890806,
6197
+ "grad_norm": 74.5625,
6198
+ "learning_rate": 9.98622906243049e-07,
6199
+ "loss": 130.8647,
6200
+ "step": 8810
6201
+ },
6202
+ {
6203
+ "epoch": 0.7058726433084779,
6204
+ "grad_norm": 75.9375,
6205
+ "learning_rate": 9.986213431400333e-07,
6206
+ "loss": 131.4361,
6207
+ "step": 8820
6208
+ },
6209
+ {
6210
+ "epoch": 0.7066729524278753,
6211
+ "grad_norm": 75.3125,
6212
+ "learning_rate": 9.986197800370175e-07,
6213
+ "loss": 131.5961,
6214
+ "step": 8830
6215
+ },
6216
+ {
6217
+ "epoch": 0.7074732615472726,
6218
+ "grad_norm": 74.5,
6219
+ "learning_rate": 9.986182169340015e-07,
6220
+ "loss": 131.5035,
6221
+ "step": 8840
6222
+ },
6223
+ {
6224
+ "epoch": 0.70827357066667,
6225
+ "grad_norm": 77.125,
6226
+ "learning_rate": 9.986166538309857e-07,
6227
+ "loss": 131.417,
6228
+ "step": 8850
6229
+ },
6230
+ {
6231
+ "epoch": 0.7090738797860674,
6232
+ "grad_norm": 71.4375,
6233
+ "learning_rate": 9.9861509072797e-07,
6234
+ "loss": 130.5734,
6235
+ "step": 8860
6236
+ },
6237
+ {
6238
+ "epoch": 0.7098741889054647,
6239
+ "grad_norm": 79.75,
6240
+ "learning_rate": 9.98613527624954e-07,
6241
+ "loss": 130.9962,
6242
+ "step": 8870
6243
+ },
6244
+ {
6245
+ "epoch": 0.7106744980248622,
6246
+ "grad_norm": 71.6875,
6247
+ "learning_rate": 9.986119645219382e-07,
6248
+ "loss": 131.3529,
6249
+ "step": 8880
6250
+ },
6251
+ {
6252
+ "epoch": 0.7114748071442595,
6253
+ "grad_norm": 72.875,
6254
+ "learning_rate": 9.986104014189224e-07,
6255
+ "loss": 131.3156,
6256
+ "step": 8890
6257
+ },
6258
+ {
6259
+ "epoch": 0.7122751162636568,
6260
+ "grad_norm": 73.0625,
6261
+ "learning_rate": 9.986088383159064e-07,
6262
+ "loss": 130.3418,
6263
+ "step": 8900
6264
+ },
6265
+ {
6266
+ "epoch": 0.7130754253830542,
6267
+ "grad_norm": 79.1875,
6268
+ "learning_rate": 9.986072752128906e-07,
6269
+ "loss": 131.1646,
6270
+ "step": 8910
6271
+ },
6272
+ {
6273
+ "epoch": 0.7138757345024516,
6274
+ "grad_norm": 81.6875,
6275
+ "learning_rate": 9.986057121098748e-07,
6276
+ "loss": 131.9824,
6277
+ "step": 8920
6278
+ },
6279
+ {
6280
+ "epoch": 0.7146760436218489,
6281
+ "grad_norm": 81.8125,
6282
+ "learning_rate": 9.98604149006859e-07,
6283
+ "loss": 132.71,
6284
+ "step": 8930
6285
+ },
6286
+ {
6287
+ "epoch": 0.7154763527412463,
6288
+ "grad_norm": 69.9375,
6289
+ "learning_rate": 9.986025859038433e-07,
6290
+ "loss": 131.8712,
6291
+ "step": 8940
6292
+ },
6293
+ {
6294
+ "epoch": 0.7162766618606436,
6295
+ "grad_norm": 83.1875,
6296
+ "learning_rate": 9.986010228008275e-07,
6297
+ "loss": 131.4506,
6298
+ "step": 8950
6299
+ },
6300
+ {
6301
+ "epoch": 0.7170769709800411,
6302
+ "grad_norm": 76.5625,
6303
+ "learning_rate": 9.985994596978115e-07,
6304
+ "loss": 131.5453,
6305
+ "step": 8960
6306
+ },
6307
+ {
6308
+ "epoch": 0.7178772800994384,
6309
+ "grad_norm": 73.625,
6310
+ "learning_rate": 9.985978965947957e-07,
6311
+ "loss": 131.9996,
6312
+ "step": 8970
6313
+ },
6314
+ {
6315
+ "epoch": 0.7186775892188357,
6316
+ "grad_norm": 78.75,
6317
+ "learning_rate": 9.9859633349178e-07,
6318
+ "loss": 132.0983,
6319
+ "step": 8980
6320
+ },
6321
+ {
6322
+ "epoch": 0.7194778983382332,
6323
+ "grad_norm": 72.3125,
6324
+ "learning_rate": 9.98594770388764e-07,
6325
+ "loss": 131.87,
6326
+ "step": 8990
6327
+ },
6328
+ {
6329
+ "epoch": 0.7202782074576305,
6330
+ "grad_norm": 73.625,
6331
+ "learning_rate": 9.985932072857481e-07,
6332
+ "loss": 131.6589,
6333
+ "step": 9000
6334
+ },
6335
+ {
6336
+ "epoch": 0.7210785165770279,
6337
+ "grad_norm": 76.5,
6338
+ "learning_rate": 9.985916441827324e-07,
6339
+ "loss": 132.6549,
6340
+ "step": 9010
6341
+ },
6342
+ {
6343
+ "epoch": 0.7218788256964253,
6344
+ "grad_norm": 80.75,
6345
+ "learning_rate": 9.985900810797166e-07,
6346
+ "loss": 131.3108,
6347
+ "step": 9020
6348
+ },
6349
+ {
6350
+ "epoch": 0.7226791348158226,
6351
+ "grad_norm": 78.9375,
6352
+ "learning_rate": 9.985885179767006e-07,
6353
+ "loss": 132.2872,
6354
+ "step": 9030
6355
+ },
6356
+ {
6357
+ "epoch": 0.72347944393522,
6358
+ "grad_norm": 76.8125,
6359
+ "learning_rate": 9.985869548736848e-07,
6360
+ "loss": 131.6559,
6361
+ "step": 9040
6362
+ },
6363
+ {
6364
+ "epoch": 0.7242797530546173,
6365
+ "grad_norm": 80.1875,
6366
+ "learning_rate": 9.98585391770669e-07,
6367
+ "loss": 131.6873,
6368
+ "step": 9050
6369
+ },
6370
+ {
6371
+ "epoch": 0.7250800621740147,
6372
+ "grad_norm": 76.875,
6373
+ "learning_rate": 9.98583828667653e-07,
6374
+ "loss": 131.4747,
6375
+ "step": 9060
6376
+ },
6377
+ {
6378
+ "epoch": 0.7258803712934121,
6379
+ "grad_norm": 76.1875,
6380
+ "learning_rate": 9.985822655646372e-07,
6381
+ "loss": 131.7952,
6382
+ "step": 9070
6383
+ },
6384
+ {
6385
+ "epoch": 0.7266806804128094,
6386
+ "grad_norm": 75.5,
6387
+ "learning_rate": 9.985807024616215e-07,
6388
+ "loss": 131.458,
6389
+ "step": 9080
6390
+ },
6391
+ {
6392
+ "epoch": 0.7274809895322069,
6393
+ "grad_norm": 73.75,
6394
+ "learning_rate": 9.985791393586057e-07,
6395
+ "loss": 132.2186,
6396
+ "step": 9090
6397
+ },
6398
+ {
6399
+ "epoch": 0.7282812986516042,
6400
+ "grad_norm": 79.5,
6401
+ "learning_rate": 9.9857757625559e-07,
6402
+ "loss": 133.5451,
6403
+ "step": 9100
6404
+ },
6405
+ {
6406
+ "epoch": 0.7290816077710015,
6407
+ "grad_norm": 70.6875,
6408
+ "learning_rate": 9.98576013152574e-07,
6409
+ "loss": 131.524,
6410
+ "step": 9110
6411
+ },
6412
+ {
6413
+ "epoch": 0.7298819168903989,
6414
+ "grad_norm": 78.25,
6415
+ "learning_rate": 9.985744500495581e-07,
6416
+ "loss": 132.1586,
6417
+ "step": 9120
6418
+ },
6419
+ {
6420
+ "epoch": 0.7306822260097963,
6421
+ "grad_norm": 76.4375,
6422
+ "learning_rate": 9.985728869465423e-07,
6423
+ "loss": 131.0122,
6424
+ "step": 9130
6425
+ },
6426
+ {
6427
+ "epoch": 0.7314825351291937,
6428
+ "grad_norm": 75.9375,
6429
+ "learning_rate": 9.985713238435266e-07,
6430
+ "loss": 132.5875,
6431
+ "step": 9140
6432
+ },
6433
+ {
6434
+ "epoch": 0.732282844248591,
6435
+ "grad_norm": 77.8125,
6436
+ "learning_rate": 9.985697607405106e-07,
6437
+ "loss": 132.9425,
6438
+ "step": 9150
6439
+ },
6440
+ {
6441
+ "epoch": 0.7330831533679883,
6442
+ "grad_norm": 71.6875,
6443
+ "learning_rate": 9.985681976374948e-07,
6444
+ "loss": 131.7688,
6445
+ "step": 9160
6446
+ },
6447
+ {
6448
+ "epoch": 0.7338834624873858,
6449
+ "grad_norm": 78.0625,
6450
+ "learning_rate": 9.98566634534479e-07,
6451
+ "loss": 130.2091,
6452
+ "step": 9170
6453
+ },
6454
+ {
6455
+ "epoch": 0.7346837716067831,
6456
+ "grad_norm": 73.375,
6457
+ "learning_rate": 9.98565071431463e-07,
6458
+ "loss": 132.0214,
6459
+ "step": 9180
6460
+ },
6461
+ {
6462
+ "epoch": 0.7354840807261804,
6463
+ "grad_norm": 72.75,
6464
+ "learning_rate": 9.985635083284472e-07,
6465
+ "loss": 131.4804,
6466
+ "step": 9190
6467
+ },
6468
+ {
6469
+ "epoch": 0.7362843898455779,
6470
+ "grad_norm": 72.8125,
6471
+ "learning_rate": 9.985619452254314e-07,
6472
+ "loss": 130.8607,
6473
+ "step": 9200
6474
+ },
6475
+ {
6476
+ "epoch": 0.7370846989649752,
6477
+ "grad_norm": 75.4375,
6478
+ "learning_rate": 9.985603821224157e-07,
6479
+ "loss": 131.9145,
6480
+ "step": 9210
6481
+ },
6482
+ {
6483
+ "epoch": 0.7378850080843726,
6484
+ "grad_norm": 73.875,
6485
+ "learning_rate": 9.985588190193997e-07,
6486
+ "loss": 130.5753,
6487
+ "step": 9220
6488
+ },
6489
+ {
6490
+ "epoch": 0.73868531720377,
6491
+ "grad_norm": 81.0,
6492
+ "learning_rate": 9.985572559163839e-07,
6493
+ "loss": 132.0938,
6494
+ "step": 9230
6495
+ },
6496
+ {
6497
+ "epoch": 0.7394856263231673,
6498
+ "grad_norm": 78.0625,
6499
+ "learning_rate": 9.98555692813368e-07,
6500
+ "loss": 132.836,
6501
+ "step": 9240
6502
+ },
6503
+ {
6504
+ "epoch": 0.7402859354425647,
6505
+ "grad_norm": 76.25,
6506
+ "learning_rate": 9.985541297103523e-07,
6507
+ "loss": 131.6837,
6508
+ "step": 9250
6509
+ },
6510
+ {
6511
+ "epoch": 0.741086244561962,
6512
+ "grad_norm": 74.125,
6513
+ "learning_rate": 9.985525666073365e-07,
6514
+ "loss": 132.7221,
6515
+ "step": 9260
6516
+ },
6517
+ {
6518
+ "epoch": 0.7418865536813595,
6519
+ "grad_norm": 81.875,
6520
+ "learning_rate": 9.985510035043205e-07,
6521
+ "loss": 131.037,
6522
+ "step": 9270
6523
+ },
6524
+ {
6525
+ "epoch": 0.7426868628007568,
6526
+ "grad_norm": 77.0,
6527
+ "learning_rate": 9.985494404013048e-07,
6528
+ "loss": 130.6028,
6529
+ "step": 9280
6530
+ },
6531
+ {
6532
+ "epoch": 0.7434871719201541,
6533
+ "grad_norm": 76.4375,
6534
+ "learning_rate": 9.98547877298289e-07,
6535
+ "loss": 131.0622,
6536
+ "step": 9290
6537
+ },
6538
+ {
6539
+ "epoch": 0.7442874810395516,
6540
+ "grad_norm": 72.3125,
6541
+ "learning_rate": 9.985463141952732e-07,
6542
+ "loss": 130.6438,
6543
+ "step": 9300
6544
+ },
6545
+ {
6546
+ "epoch": 0.7450877901589489,
6547
+ "grad_norm": 73.75,
6548
+ "learning_rate": 9.985447510922572e-07,
6549
+ "loss": 132.5858,
6550
+ "step": 9310
6551
+ },
6552
+ {
6553
+ "epoch": 0.7458880992783462,
6554
+ "grad_norm": 81.375,
6555
+ "learning_rate": 9.985431879892414e-07,
6556
+ "loss": 131.7563,
6557
+ "step": 9320
6558
+ },
6559
+ {
6560
+ "epoch": 0.7466884083977436,
6561
+ "grad_norm": 75.0625,
6562
+ "learning_rate": 9.985416248862256e-07,
6563
+ "loss": 131.6322,
6564
+ "step": 9330
6565
+ },
6566
+ {
6567
+ "epoch": 0.747488717517141,
6568
+ "grad_norm": 74.375,
6569
+ "learning_rate": 9.985400617832097e-07,
6570
+ "loss": 129.3306,
6571
+ "step": 9340
6572
+ },
6573
+ {
6574
+ "epoch": 0.7482890266365384,
6575
+ "grad_norm": 77.25,
6576
+ "learning_rate": 9.985384986801939e-07,
6577
+ "loss": 131.3398,
6578
+ "step": 9350
6579
+ },
6580
+ {
6581
+ "epoch": 0.7490893357559357,
6582
+ "grad_norm": 80.4375,
6583
+ "learning_rate": 9.98536935577178e-07,
6584
+ "loss": 131.5261,
6585
+ "step": 9360
6586
+ },
6587
+ {
6588
+ "epoch": 0.749889644875333,
6589
+ "grad_norm": 76.625,
6590
+ "learning_rate": 9.985353724741623e-07,
6591
+ "loss": 130.4829,
6592
+ "step": 9370
6593
+ },
6594
+ {
6595
+ "epoch": 0.7506899539947305,
6596
+ "grad_norm": 75.1875,
6597
+ "learning_rate": 9.985338093711463e-07,
6598
+ "loss": 130.9288,
6599
+ "step": 9380
6600
+ },
6601
+ {
6602
+ "epoch": 0.7514902631141278,
6603
+ "grad_norm": 74.5,
6604
+ "learning_rate": 9.985322462681305e-07,
6605
+ "loss": 131.758,
6606
+ "step": 9390
6607
+ },
6608
+ {
6609
+ "epoch": 0.7522905722335252,
6610
+ "grad_norm": 72.9375,
6611
+ "learning_rate": 9.985306831651148e-07,
6612
+ "loss": 131.2881,
6613
+ "step": 9400
6614
+ },
6615
+ {
6616
+ "epoch": 0.7530908813529226,
6617
+ "grad_norm": 77.875,
6618
+ "learning_rate": 9.98529120062099e-07,
6619
+ "loss": 132.0472,
6620
+ "step": 9410
6621
+ },
6622
+ {
6623
+ "epoch": 0.7538911904723199,
6624
+ "grad_norm": 81.875,
6625
+ "learning_rate": 9.985275569590832e-07,
6626
+ "loss": 133.1372,
6627
+ "step": 9420
6628
+ },
6629
+ {
6630
+ "epoch": 0.7546914995917173,
6631
+ "grad_norm": 76.4375,
6632
+ "learning_rate": 9.985259938560672e-07,
6633
+ "loss": 132.0327,
6634
+ "step": 9430
6635
+ },
6636
+ {
6637
+ "epoch": 0.7554918087111147,
6638
+ "grad_norm": 82.5,
6639
+ "learning_rate": 9.985244307530514e-07,
6640
+ "loss": 130.7069,
6641
+ "step": 9440
6642
+ },
6643
+ {
6644
+ "epoch": 0.7562921178305121,
6645
+ "grad_norm": 79.9375,
6646
+ "learning_rate": 9.985228676500356e-07,
6647
+ "loss": 131.1774,
6648
+ "step": 9450
6649
+ },
6650
+ {
6651
+ "epoch": 0.7570924269499094,
6652
+ "grad_norm": 76.9375,
6653
+ "learning_rate": 9.985213045470198e-07,
6654
+ "loss": 131.4195,
6655
+ "step": 9460
6656
+ },
6657
+ {
6658
+ "epoch": 0.7578927360693067,
6659
+ "grad_norm": 83.25,
6660
+ "learning_rate": 9.985197414440039e-07,
6661
+ "loss": 130.8186,
6662
+ "step": 9470
6663
+ },
6664
+ {
6665
+ "epoch": 0.7586930451887042,
6666
+ "grad_norm": 76.1875,
6667
+ "learning_rate": 9.98518178340988e-07,
6668
+ "loss": 129.8441,
6669
+ "step": 9480
6670
+ },
6671
+ {
6672
+ "epoch": 0.7594933543081015,
6673
+ "grad_norm": 71.5,
6674
+ "learning_rate": 9.985166152379723e-07,
6675
+ "loss": 131.93,
6676
+ "step": 9490
6677
+ },
6678
+ {
6679
+ "epoch": 0.7602936634274988,
6680
+ "grad_norm": 79.0625,
6681
+ "learning_rate": 9.985150521349563e-07,
6682
+ "loss": 131.2559,
6683
+ "step": 9500
6684
+ },
6685
+ {
6686
+ "epoch": 0.7610939725468963,
6687
+ "grad_norm": 69.3125,
6688
+ "learning_rate": 9.985134890319405e-07,
6689
+ "loss": 131.3485,
6690
+ "step": 9510
6691
+ },
6692
+ {
6693
+ "epoch": 0.7618942816662936,
6694
+ "grad_norm": 79.5,
6695
+ "learning_rate": 9.985119259289247e-07,
6696
+ "loss": 130.6342,
6697
+ "step": 9520
6698
+ },
6699
+ {
6700
+ "epoch": 0.762694590785691,
6701
+ "grad_norm": 78.3125,
6702
+ "learning_rate": 9.985103628259087e-07,
6703
+ "loss": 132.0574,
6704
+ "step": 9530
6705
+ },
6706
+ {
6707
+ "epoch": 0.7634948999050883,
6708
+ "grad_norm": 73.0,
6709
+ "learning_rate": 9.98508799722893e-07,
6710
+ "loss": 130.7042,
6711
+ "step": 9540
6712
+ },
6713
+ {
6714
+ "epoch": 0.7642952090244857,
6715
+ "grad_norm": 87.75,
6716
+ "learning_rate": 9.985072366198772e-07,
6717
+ "loss": 131.7032,
6718
+ "step": 9550
6719
+ },
6720
+ {
6721
+ "epoch": 0.7650955181438831,
6722
+ "grad_norm": 78.9375,
6723
+ "learning_rate": 9.985056735168614e-07,
6724
+ "loss": 132.2712,
6725
+ "step": 9560
6726
+ },
6727
+ {
6728
+ "epoch": 0.7658958272632804,
6729
+ "grad_norm": 75.25,
6730
+ "learning_rate": 9.985041104138456e-07,
6731
+ "loss": 132.5755,
6732
+ "step": 9570
6733
+ },
6734
+ {
6735
+ "epoch": 0.7666961363826779,
6736
+ "grad_norm": 74.9375,
6737
+ "learning_rate": 9.985025473108296e-07,
6738
+ "loss": 131.8575,
6739
+ "step": 9580
6740
+ },
6741
+ {
6742
+ "epoch": 0.7674964455020752,
6743
+ "grad_norm": 74.3125,
6744
+ "learning_rate": 9.985009842078138e-07,
6745
+ "loss": 131.8135,
6746
+ "step": 9590
6747
+ },
6748
+ {
6749
+ "epoch": 0.7682967546214725,
6750
+ "grad_norm": 74.5,
6751
+ "learning_rate": 9.98499421104798e-07,
6752
+ "loss": 130.349,
6753
+ "step": 9600
6754
+ },
6755
+ {
6756
+ "epoch": 0.76909706374087,
6757
+ "grad_norm": 75.625,
6758
+ "learning_rate": 9.984978580017823e-07,
6759
+ "loss": 132.1411,
6760
+ "step": 9610
6761
+ },
6762
+ {
6763
+ "epoch": 0.7698973728602673,
6764
+ "grad_norm": 78.4375,
6765
+ "learning_rate": 9.984962948987665e-07,
6766
+ "loss": 132.007,
6767
+ "step": 9620
6768
+ },
6769
+ {
6770
+ "epoch": 0.7706976819796646,
6771
+ "grad_norm": 76.875,
6772
+ "learning_rate": 9.984947317957505e-07,
6773
+ "loss": 131.2099,
6774
+ "step": 9630
6775
+ },
6776
+ {
6777
+ "epoch": 0.771497991099062,
6778
+ "grad_norm": 78.875,
6779
+ "learning_rate": 9.984931686927347e-07,
6780
+ "loss": 130.7354,
6781
+ "step": 9640
6782
+ },
6783
+ {
6784
+ "epoch": 0.7722983002184594,
6785
+ "grad_norm": 79.875,
6786
+ "learning_rate": 9.98491605589719e-07,
6787
+ "loss": 132.1408,
6788
+ "step": 9650
6789
+ },
6790
+ {
6791
+ "epoch": 0.7730986093378568,
6792
+ "grad_norm": 72.1875,
6793
+ "learning_rate": 9.98490042486703e-07,
6794
+ "loss": 130.8756,
6795
+ "step": 9660
6796
+ },
6797
+ {
6798
+ "epoch": 0.7738989184572541,
6799
+ "grad_norm": 69.4375,
6800
+ "learning_rate": 9.984884793836872e-07,
6801
+ "loss": 131.2112,
6802
+ "step": 9670
6803
+ },
6804
+ {
6805
+ "epoch": 0.7746992275766514,
6806
+ "grad_norm": 76.875,
6807
+ "learning_rate": 9.984869162806714e-07,
6808
+ "loss": 131.0196,
6809
+ "step": 9680
6810
+ },
6811
+ {
6812
+ "epoch": 0.7754995366960489,
6813
+ "grad_norm": 85.0,
6814
+ "learning_rate": 9.984853531776554e-07,
6815
+ "loss": 131.2716,
6816
+ "step": 9690
6817
+ },
6818
+ {
6819
+ "epoch": 0.7762998458154462,
6820
+ "grad_norm": 72.5,
6821
+ "learning_rate": 9.984837900746396e-07,
6822
+ "loss": 130.7754,
6823
+ "step": 9700
6824
+ },
6825
+ {
6826
+ "epoch": 0.7771001549348436,
6827
+ "grad_norm": 75.0,
6828
+ "learning_rate": 9.984822269716238e-07,
6829
+ "loss": 131.3929,
6830
+ "step": 9710
6831
+ },
6832
+ {
6833
+ "epoch": 0.777900464054241,
6834
+ "grad_norm": 75.6875,
6835
+ "learning_rate": 9.98480663868608e-07,
6836
+ "loss": 131.4491,
6837
+ "step": 9720
6838
+ },
6839
+ {
6840
+ "epoch": 0.7787007731736383,
6841
+ "grad_norm": 77.8125,
6842
+ "learning_rate": 9.984791007655923e-07,
6843
+ "loss": 130.3078,
6844
+ "step": 9730
6845
+ },
6846
+ {
6847
+ "epoch": 0.7795010822930357,
6848
+ "grad_norm": 79.375,
6849
+ "learning_rate": 9.984775376625763e-07,
6850
+ "loss": 132.3819,
6851
+ "step": 9740
6852
+ },
6853
+ {
6854
+ "epoch": 0.780301391412433,
6855
+ "grad_norm": 77.875,
6856
+ "learning_rate": 9.984759745595605e-07,
6857
+ "loss": 132.8559,
6858
+ "step": 9750
6859
+ },
6860
+ {
6861
+ "epoch": 0.7811017005318304,
6862
+ "grad_norm": 76.875,
6863
+ "learning_rate": 9.984744114565447e-07,
6864
+ "loss": 130.5132,
6865
+ "step": 9760
6866
+ },
6867
+ {
6868
+ "epoch": 0.7819020096512278,
6869
+ "grad_norm": 80.625,
6870
+ "learning_rate": 9.98472848353529e-07,
6871
+ "loss": 130.5794,
6872
+ "step": 9770
6873
+ },
6874
+ {
6875
+ "epoch": 0.7827023187706251,
6876
+ "grad_norm": 79.1875,
6877
+ "learning_rate": 9.984712852505131e-07,
6878
+ "loss": 132.3585,
6879
+ "step": 9780
6880
+ },
6881
+ {
6882
+ "epoch": 0.7835026278900226,
6883
+ "grad_norm": 76.625,
6884
+ "learning_rate": 9.984697221474971e-07,
6885
+ "loss": 132.778,
6886
+ "step": 9790
6887
+ },
6888
+ {
6889
+ "epoch": 0.7843029370094199,
6890
+ "grad_norm": 74.625,
6891
+ "learning_rate": 9.984681590444814e-07,
6892
+ "loss": 130.5637,
6893
+ "step": 9800
6894
+ },
6895
+ {
6896
+ "epoch": 0.7851032461288172,
6897
+ "grad_norm": 82.9375,
6898
+ "learning_rate": 9.984665959414656e-07,
6899
+ "loss": 130.9602,
6900
+ "step": 9810
6901
+ },
6902
+ {
6903
+ "epoch": 0.7859035552482146,
6904
+ "grad_norm": 78.5,
6905
+ "learning_rate": 9.984650328384496e-07,
6906
+ "loss": 131.8248,
6907
+ "step": 9820
6908
+ },
6909
+ {
6910
+ "epoch": 0.786703864367612,
6911
+ "grad_norm": 71.875,
6912
+ "learning_rate": 9.984634697354338e-07,
6913
+ "loss": 132.0758,
6914
+ "step": 9830
6915
+ },
6916
+ {
6917
+ "epoch": 0.7875041734870094,
6918
+ "grad_norm": 78.5625,
6919
+ "learning_rate": 9.98461906632418e-07,
6920
+ "loss": 131.6132,
6921
+ "step": 9840
6922
+ },
6923
+ {
6924
+ "epoch": 0.7883044826064067,
6925
+ "grad_norm": 82.9375,
6926
+ "learning_rate": 9.98460343529402e-07,
6927
+ "loss": 130.1421,
6928
+ "step": 9850
6929
+ },
6930
+ {
6931
+ "epoch": 0.7891047917258041,
6932
+ "grad_norm": 90.0625,
6933
+ "learning_rate": 9.984587804263863e-07,
6934
+ "loss": 130.7962,
6935
+ "step": 9860
6936
+ },
6937
+ {
6938
+ "epoch": 0.7899051008452015,
6939
+ "grad_norm": 75.5,
6940
+ "learning_rate": 9.984572173233705e-07,
6941
+ "loss": 131.8376,
6942
+ "step": 9870
6943
+ },
6944
+ {
6945
+ "epoch": 0.7907054099645988,
6946
+ "grad_norm": 75.4375,
6947
+ "learning_rate": 9.984556542203547e-07,
6948
+ "loss": 132.0439,
6949
+ "step": 9880
6950
+ },
6951
+ {
6952
+ "epoch": 0.7915057190839961,
6953
+ "grad_norm": 86.25,
6954
+ "learning_rate": 9.98454091117339e-07,
6955
+ "loss": 130.9918,
6956
+ "step": 9890
6957
+ },
6958
+ {
6959
+ "epoch": 0.7923060282033936,
6960
+ "grad_norm": 77.1875,
6961
+ "learning_rate": 9.98452528014323e-07,
6962
+ "loss": 132.0871,
6963
+ "step": 9900
6964
+ },
6965
+ {
6966
+ "epoch": 0.7931063373227909,
6967
+ "grad_norm": 78.375,
6968
+ "learning_rate": 9.984509649113071e-07,
6969
+ "loss": 132.0267,
6970
+ "step": 9910
6971
+ },
6972
+ {
6973
+ "epoch": 0.7939066464421883,
6974
+ "grad_norm": 75.125,
6975
+ "learning_rate": 9.984494018082913e-07,
6976
+ "loss": 131.7625,
6977
+ "step": 9920
6978
+ },
6979
+ {
6980
+ "epoch": 0.7947069555615857,
6981
+ "grad_norm": 78.75,
6982
+ "learning_rate": 9.984478387052756e-07,
6983
+ "loss": 131.006,
6984
+ "step": 9930
6985
+ },
6986
+ {
6987
+ "epoch": 0.795507264680983,
6988
+ "grad_norm": 72.375,
6989
+ "learning_rate": 9.984462756022598e-07,
6990
+ "loss": 130.9044,
6991
+ "step": 9940
6992
+ },
6993
+ {
6994
+ "epoch": 0.7963075738003804,
6995
+ "grad_norm": 77.8125,
6996
+ "learning_rate": 9.984447124992438e-07,
6997
+ "loss": 131.8305,
6998
+ "step": 9950
6999
+ },
7000
+ {
7001
+ "epoch": 0.7971078829197777,
7002
+ "grad_norm": 79.8125,
7003
+ "learning_rate": 9.98443149396228e-07,
7004
+ "loss": 129.9232,
7005
+ "step": 9960
7006
+ },
7007
+ {
7008
+ "epoch": 0.7979081920391752,
7009
+ "grad_norm": 72.5625,
7010
+ "learning_rate": 9.984415862932122e-07,
7011
+ "loss": 131.7845,
7012
+ "step": 9970
7013
+ },
7014
+ {
7015
+ "epoch": 0.7987085011585725,
7016
+ "grad_norm": 72.5,
7017
+ "learning_rate": 9.984400231901962e-07,
7018
+ "loss": 131.7831,
7019
+ "step": 9980
7020
+ },
7021
+ {
7022
+ "epoch": 0.7995088102779698,
7023
+ "grad_norm": 77.125,
7024
+ "learning_rate": 9.984384600871805e-07,
7025
+ "loss": 133.024,
7026
+ "step": 9990
7027
+ },
7028
+ {
7029
+ "epoch": 0.8003091193973673,
7030
+ "grad_norm": 81.375,
7031
+ "learning_rate": 9.984368969841647e-07,
7032
+ "loss": 131.0512,
7033
+ "step": 10000
7034
+ },
7035
+ {
7036
+ "epoch": 0.8003091193973673,
7037
+ "eval_loss": 2.0527355670928955,
7038
+ "eval_runtime": 416.7476,
7039
+ "eval_samples_per_second": 1574.473,
7040
+ "eval_steps_per_second": 49.202,
7041
+ "step": 10000
7042
  }
7043
  ],
7044
  "logging_steps": 10,
 
7058
  "attributes": {}
7059
  }
7060
  },
7061
+ "total_flos": 2.761116577746125e+19,
7062
  "train_batch_size": 4,
7063
  "trial_name": null,
7064
  "trial_params": null