CocoRoF commited on
Commit
b640956
·
verified ·
1 Parent(s): f18fc4d

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d820555e107b3daf1c636d9917d5f80dea6a75a32d0c2c188bdbbc00c133122f
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67e015b0f596fee442abd71af93158ddf702d0b03ddbafe627cd1e3991ffb717
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a571cc91616e22c217ada61055f53aa1e24e5a5f9572b666bf2db98883e18211
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:500d83d312df8ef8772133628901aa3ec8b169d37b27727f6771ed38095522d3
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69bcb5a0833345d8383f742fc9fcb902b5c0cae43ce63aee4b910dcd39dcda65
3
+ size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63760f32c7083938d4e3abdc98e10004ebfa433e1f6bcf2cfc2e234eef1cefc2
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13f8e14c2c7c91bc7deff16a401fb72a8859494fc7925b25ee4ffb4e04a50c64
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.40015455969868363,
5
  "eval_steps": 2500,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3523,6 +3523,1764 @@
3523
  "eval_samples_per_second": 1567.295,
3524
  "eval_steps_per_second": 48.978,
3525
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3526
  }
3527
  ],
3528
  "logging_steps": 10,
@@ -3542,7 +5300,7 @@
3542
  "attributes": {}
3543
  }
3544
  },
3545
- "total_flos": 1.3805582888730624e+19,
3546
  "train_batch_size": 4,
3547
  "trial_name": null,
3548
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6002318395480254,
5
  "eval_steps": 2500,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3523
  "eval_samples_per_second": 1567.295,
3524
  "eval_steps_per_second": 48.978,
3525
  "step": 5000
3526
+ },
3527
+ {
3528
+ "epoch": 0.40095486881808096,
3529
+ "grad_norm": 72.4375,
3530
+ "learning_rate": 9.992168853890664e-07,
3531
+ "loss": 133.3489,
3532
+ "step": 5010
3533
+ },
3534
+ {
3535
+ "epoch": 0.40175517793747834,
3536
+ "grad_norm": 74.5625,
3537
+ "learning_rate": 9.992153222860506e-07,
3538
+ "loss": 132.38,
3539
+ "step": 5020
3540
+ },
3541
+ {
3542
+ "epoch": 0.4025554870568757,
3543
+ "grad_norm": 81.25,
3544
+ "learning_rate": 9.992137591830347e-07,
3545
+ "loss": 133.7667,
3546
+ "step": 5030
3547
+ },
3548
+ {
3549
+ "epoch": 0.4033557961762731,
3550
+ "grad_norm": 74.5625,
3551
+ "learning_rate": 9.992121960800189e-07,
3552
+ "loss": 133.8293,
3553
+ "step": 5040
3554
+ },
3555
+ {
3556
+ "epoch": 0.4041561052956705,
3557
+ "grad_norm": 71.25,
3558
+ "learning_rate": 9.99210632977003e-07,
3559
+ "loss": 133.3581,
3560
+ "step": 5050
3561
+ },
3562
+ {
3563
+ "epoch": 0.4049564144150678,
3564
+ "grad_norm": 79.5625,
3565
+ "learning_rate": 9.992090698739873e-07,
3566
+ "loss": 132.9888,
3567
+ "step": 5060
3568
+ },
3569
+ {
3570
+ "epoch": 0.4057567235344652,
3571
+ "grad_norm": 73.9375,
3572
+ "learning_rate": 9.992075067709715e-07,
3573
+ "loss": 132.4104,
3574
+ "step": 5070
3575
+ },
3576
+ {
3577
+ "epoch": 0.40655703265386256,
3578
+ "grad_norm": 72.5,
3579
+ "learning_rate": 9.992059436679555e-07,
3580
+ "loss": 132.3801,
3581
+ "step": 5080
3582
+ },
3583
+ {
3584
+ "epoch": 0.40735734177325994,
3585
+ "grad_norm": 76.3125,
3586
+ "learning_rate": 9.992043805649397e-07,
3587
+ "loss": 132.4045,
3588
+ "step": 5090
3589
+ },
3590
+ {
3591
+ "epoch": 0.40815765089265726,
3592
+ "grad_norm": 76.5625,
3593
+ "learning_rate": 9.99202817461924e-07,
3594
+ "loss": 132.4824,
3595
+ "step": 5100
3596
+ },
3597
+ {
3598
+ "epoch": 0.40895796001205464,
3599
+ "grad_norm": 74.125,
3600
+ "learning_rate": 9.992012543589082e-07,
3601
+ "loss": 132.0051,
3602
+ "step": 5110
3603
+ },
3604
+ {
3605
+ "epoch": 0.409758269131452,
3606
+ "grad_norm": 72.9375,
3607
+ "learning_rate": 9.991996912558924e-07,
3608
+ "loss": 133.8627,
3609
+ "step": 5120
3610
+ },
3611
+ {
3612
+ "epoch": 0.4105585782508494,
3613
+ "grad_norm": 78.9375,
3614
+ "learning_rate": 9.991981281528764e-07,
3615
+ "loss": 132.2818,
3616
+ "step": 5130
3617
+ },
3618
+ {
3619
+ "epoch": 0.4113588873702468,
3620
+ "grad_norm": 75.375,
3621
+ "learning_rate": 9.991965650498606e-07,
3622
+ "loss": 131.2578,
3623
+ "step": 5140
3624
+ },
3625
+ {
3626
+ "epoch": 0.4121591964896441,
3627
+ "grad_norm": 77.125,
3628
+ "learning_rate": 9.991950019468448e-07,
3629
+ "loss": 133.7901,
3630
+ "step": 5150
3631
+ },
3632
+ {
3633
+ "epoch": 0.4129595056090415,
3634
+ "grad_norm": 80.9375,
3635
+ "learning_rate": 9.991934388438289e-07,
3636
+ "loss": 132.6137,
3637
+ "step": 5160
3638
+ },
3639
+ {
3640
+ "epoch": 0.41375981472843887,
3641
+ "grad_norm": 81.875,
3642
+ "learning_rate": 9.99191875740813e-07,
3643
+ "loss": 132.9288,
3644
+ "step": 5170
3645
+ },
3646
+ {
3647
+ "epoch": 0.41456012384783625,
3648
+ "grad_norm": 78.6875,
3649
+ "learning_rate": 9.991903126377973e-07,
3650
+ "loss": 132.9595,
3651
+ "step": 5180
3652
+ },
3653
+ {
3654
+ "epoch": 0.4153604329672336,
3655
+ "grad_norm": 76.9375,
3656
+ "learning_rate": 9.991887495347813e-07,
3657
+ "loss": 132.3299,
3658
+ "step": 5190
3659
+ },
3660
+ {
3661
+ "epoch": 0.41616074208663095,
3662
+ "grad_norm": 79.0,
3663
+ "learning_rate": 9.991871864317655e-07,
3664
+ "loss": 132.1807,
3665
+ "step": 5200
3666
+ },
3667
+ {
3668
+ "epoch": 0.41696105120602833,
3669
+ "grad_norm": 75.4375,
3670
+ "learning_rate": 9.991856233287497e-07,
3671
+ "loss": 133.9722,
3672
+ "step": 5210
3673
+ },
3674
+ {
3675
+ "epoch": 0.4177613603254257,
3676
+ "grad_norm": 73.4375,
3677
+ "learning_rate": 9.99184060225734e-07,
3678
+ "loss": 132.654,
3679
+ "step": 5220
3680
+ },
3681
+ {
3682
+ "epoch": 0.41856166944482304,
3683
+ "grad_norm": 74.1875,
3684
+ "learning_rate": 9.991824971227182e-07,
3685
+ "loss": 133.0139,
3686
+ "step": 5230
3687
+ },
3688
+ {
3689
+ "epoch": 0.4193619785642204,
3690
+ "grad_norm": 79.25,
3691
+ "learning_rate": 9.991809340197022e-07,
3692
+ "loss": 131.9049,
3693
+ "step": 5240
3694
+ },
3695
+ {
3696
+ "epoch": 0.4201622876836178,
3697
+ "grad_norm": 75.8125,
3698
+ "learning_rate": 9.991793709166864e-07,
3699
+ "loss": 131.9633,
3700
+ "step": 5250
3701
+ },
3702
+ {
3703
+ "epoch": 0.4209625968030152,
3704
+ "grad_norm": 74.25,
3705
+ "learning_rate": 9.991778078136706e-07,
3706
+ "loss": 132.2361,
3707
+ "step": 5260
3708
+ },
3709
+ {
3710
+ "epoch": 0.42176290592241256,
3711
+ "grad_norm": 72.8125,
3712
+ "learning_rate": 9.991762447106548e-07,
3713
+ "loss": 131.267,
3714
+ "step": 5270
3715
+ },
3716
+ {
3717
+ "epoch": 0.4225632150418099,
3718
+ "grad_norm": 77.5625,
3719
+ "learning_rate": 9.991746816076388e-07,
3720
+ "loss": 132.4257,
3721
+ "step": 5280
3722
+ },
3723
+ {
3724
+ "epoch": 0.42336352416120726,
3725
+ "grad_norm": 79.3125,
3726
+ "learning_rate": 9.99173118504623e-07,
3727
+ "loss": 132.785,
3728
+ "step": 5290
3729
+ },
3730
+ {
3731
+ "epoch": 0.42416383328060464,
3732
+ "grad_norm": 76.9375,
3733
+ "learning_rate": 9.991715554016073e-07,
3734
+ "loss": 131.3379,
3735
+ "step": 5300
3736
+ },
3737
+ {
3738
+ "epoch": 0.424964142400002,
3739
+ "grad_norm": 75.25,
3740
+ "learning_rate": 9.991699922985915e-07,
3741
+ "loss": 133.1422,
3742
+ "step": 5310
3743
+ },
3744
+ {
3745
+ "epoch": 0.42576445151939935,
3746
+ "grad_norm": 72.5625,
3747
+ "learning_rate": 9.991684291955755e-07,
3748
+ "loss": 132.1037,
3749
+ "step": 5320
3750
+ },
3751
+ {
3752
+ "epoch": 0.4265647606387967,
3753
+ "grad_norm": 79.0,
3754
+ "learning_rate": 9.991668660925597e-07,
3755
+ "loss": 132.1573,
3756
+ "step": 5330
3757
+ },
3758
+ {
3759
+ "epoch": 0.4273650697581941,
3760
+ "grad_norm": 75.0625,
3761
+ "learning_rate": 9.99165302989544e-07,
3762
+ "loss": 132.0645,
3763
+ "step": 5340
3764
+ },
3765
+ {
3766
+ "epoch": 0.4281653788775915,
3767
+ "grad_norm": 72.3125,
3768
+ "learning_rate": 9.99163739886528e-07,
3769
+ "loss": 132.8727,
3770
+ "step": 5350
3771
+ },
3772
+ {
3773
+ "epoch": 0.4289656879969888,
3774
+ "grad_norm": 77.4375,
3775
+ "learning_rate": 9.991621767835122e-07,
3776
+ "loss": 133.5671,
3777
+ "step": 5360
3778
+ },
3779
+ {
3780
+ "epoch": 0.4297659971163862,
3781
+ "grad_norm": 74.3125,
3782
+ "learning_rate": 9.991606136804964e-07,
3783
+ "loss": 132.3078,
3784
+ "step": 5370
3785
+ },
3786
+ {
3787
+ "epoch": 0.43056630623578357,
3788
+ "grad_norm": 81.375,
3789
+ "learning_rate": 9.991590505774806e-07,
3790
+ "loss": 133.0639,
3791
+ "step": 5380
3792
+ },
3793
+ {
3794
+ "epoch": 0.43136661535518095,
3795
+ "grad_norm": 74.0625,
3796
+ "learning_rate": 9.991574874744648e-07,
3797
+ "loss": 131.2669,
3798
+ "step": 5390
3799
+ },
3800
+ {
3801
+ "epoch": 0.43216692447457833,
3802
+ "grad_norm": 79.375,
3803
+ "learning_rate": 9.991559243714488e-07,
3804
+ "loss": 131.9368,
3805
+ "step": 5400
3806
+ },
3807
+ {
3808
+ "epoch": 0.43296723359397565,
3809
+ "grad_norm": 77.9375,
3810
+ "learning_rate": 9.99154361268433e-07,
3811
+ "loss": 132.4038,
3812
+ "step": 5410
3813
+ },
3814
+ {
3815
+ "epoch": 0.43376754271337303,
3816
+ "grad_norm": 83.375,
3817
+ "learning_rate": 9.991527981654173e-07,
3818
+ "loss": 132.8988,
3819
+ "step": 5420
3820
+ },
3821
+ {
3822
+ "epoch": 0.4345678518327704,
3823
+ "grad_norm": 73.6875,
3824
+ "learning_rate": 9.991512350624015e-07,
3825
+ "loss": 133.3637,
3826
+ "step": 5430
3827
+ },
3828
+ {
3829
+ "epoch": 0.4353681609521678,
3830
+ "grad_norm": 74.5625,
3831
+ "learning_rate": 9.991496719593855e-07,
3832
+ "loss": 133.3608,
3833
+ "step": 5440
3834
+ },
3835
+ {
3836
+ "epoch": 0.4361684700715651,
3837
+ "grad_norm": 71.0,
3838
+ "learning_rate": 9.991481088563697e-07,
3839
+ "loss": 131.265,
3840
+ "step": 5450
3841
+ },
3842
+ {
3843
+ "epoch": 0.4369687791909625,
3844
+ "grad_norm": 73.4375,
3845
+ "learning_rate": 9.99146545753354e-07,
3846
+ "loss": 132.0777,
3847
+ "step": 5460
3848
+ },
3849
+ {
3850
+ "epoch": 0.4377690883103599,
3851
+ "grad_norm": 75.125,
3852
+ "learning_rate": 9.99144982650338e-07,
3853
+ "loss": 132.4698,
3854
+ "step": 5470
3855
+ },
3856
+ {
3857
+ "epoch": 0.43856939742975726,
3858
+ "grad_norm": 73.875,
3859
+ "learning_rate": 9.991434195473221e-07,
3860
+ "loss": 131.6167,
3861
+ "step": 5480
3862
+ },
3863
+ {
3864
+ "epoch": 0.43936970654915464,
3865
+ "grad_norm": 76.9375,
3866
+ "learning_rate": 9.991418564443064e-07,
3867
+ "loss": 131.9013,
3868
+ "step": 5490
3869
+ },
3870
+ {
3871
+ "epoch": 0.44017001566855196,
3872
+ "grad_norm": 73.4375,
3873
+ "learning_rate": 9.991402933412906e-07,
3874
+ "loss": 131.2487,
3875
+ "step": 5500
3876
+ },
3877
+ {
3878
+ "epoch": 0.44097032478794934,
3879
+ "grad_norm": 83.4375,
3880
+ "learning_rate": 9.991387302382746e-07,
3881
+ "loss": 133.703,
3882
+ "step": 5510
3883
+ },
3884
+ {
3885
+ "epoch": 0.4417706339073467,
3886
+ "grad_norm": 78.375,
3887
+ "learning_rate": 9.991371671352588e-07,
3888
+ "loss": 133.2442,
3889
+ "step": 5520
3890
+ },
3891
+ {
3892
+ "epoch": 0.4425709430267441,
3893
+ "grad_norm": 71.375,
3894
+ "learning_rate": 9.99135604032243e-07,
3895
+ "loss": 131.6987,
3896
+ "step": 5530
3897
+ },
3898
+ {
3899
+ "epoch": 0.4433712521461414,
3900
+ "grad_norm": 76.1875,
3901
+ "learning_rate": 9.991340409292272e-07,
3902
+ "loss": 131.8815,
3903
+ "step": 5540
3904
+ },
3905
+ {
3906
+ "epoch": 0.4441715612655388,
3907
+ "grad_norm": 80.8125,
3908
+ "learning_rate": 9.991324778262112e-07,
3909
+ "loss": 132.4904,
3910
+ "step": 5550
3911
+ },
3912
+ {
3913
+ "epoch": 0.4449718703849362,
3914
+ "grad_norm": 75.1875,
3915
+ "learning_rate": 9.991309147231955e-07,
3916
+ "loss": 130.7195,
3917
+ "step": 5560
3918
+ },
3919
+ {
3920
+ "epoch": 0.44577217950433357,
3921
+ "grad_norm": 74.5625,
3922
+ "learning_rate": 9.991293516201797e-07,
3923
+ "loss": 132.3294,
3924
+ "step": 5570
3925
+ },
3926
+ {
3927
+ "epoch": 0.4465724886237309,
3928
+ "grad_norm": 77.5,
3929
+ "learning_rate": 9.99127788517164e-07,
3930
+ "loss": 132.4166,
3931
+ "step": 5580
3932
+ },
3933
+ {
3934
+ "epoch": 0.44737279774312827,
3935
+ "grad_norm": 74.375,
3936
+ "learning_rate": 9.991262254141481e-07,
3937
+ "loss": 132.624,
3938
+ "step": 5590
3939
+ },
3940
+ {
3941
+ "epoch": 0.44817310686252565,
3942
+ "grad_norm": 82.3125,
3943
+ "learning_rate": 9.991246623111321e-07,
3944
+ "loss": 131.5993,
3945
+ "step": 5600
3946
+ },
3947
+ {
3948
+ "epoch": 0.44897341598192303,
3949
+ "grad_norm": 70.5,
3950
+ "learning_rate": 9.991230992081163e-07,
3951
+ "loss": 130.5851,
3952
+ "step": 5610
3953
+ },
3954
+ {
3955
+ "epoch": 0.4497737251013204,
3956
+ "grad_norm": 78.9375,
3957
+ "learning_rate": 9.991215361051006e-07,
3958
+ "loss": 131.7671,
3959
+ "step": 5620
3960
+ },
3961
+ {
3962
+ "epoch": 0.45057403422071773,
3963
+ "grad_norm": 78.9375,
3964
+ "learning_rate": 9.991199730020846e-07,
3965
+ "loss": 132.6647,
3966
+ "step": 5630
3967
+ },
3968
+ {
3969
+ "epoch": 0.4513743433401151,
3970
+ "grad_norm": 77.5,
3971
+ "learning_rate": 9.991184098990688e-07,
3972
+ "loss": 131.5212,
3973
+ "step": 5640
3974
+ },
3975
+ {
3976
+ "epoch": 0.4521746524595125,
3977
+ "grad_norm": 80.25,
3978
+ "learning_rate": 9.99116846796053e-07,
3979
+ "loss": 131.0642,
3980
+ "step": 5650
3981
+ },
3982
+ {
3983
+ "epoch": 0.4529749615789099,
3984
+ "grad_norm": 80.875,
3985
+ "learning_rate": 9.991152836930372e-07,
3986
+ "loss": 131.6869,
3987
+ "step": 5660
3988
+ },
3989
+ {
3990
+ "epoch": 0.4537752706983072,
3991
+ "grad_norm": 73.125,
3992
+ "learning_rate": 9.991137205900212e-07,
3993
+ "loss": 132.0529,
3994
+ "step": 5670
3995
+ },
3996
+ {
3997
+ "epoch": 0.4545755798177046,
3998
+ "grad_norm": 77.8125,
3999
+ "learning_rate": 9.991121574870055e-07,
4000
+ "loss": 133.4353,
4001
+ "step": 5680
4002
+ },
4003
+ {
4004
+ "epoch": 0.45537588893710196,
4005
+ "grad_norm": 78.9375,
4006
+ "learning_rate": 9.991105943839897e-07,
4007
+ "loss": 131.1083,
4008
+ "step": 5690
4009
+ },
4010
+ {
4011
+ "epoch": 0.45617619805649934,
4012
+ "grad_norm": 81.6875,
4013
+ "learning_rate": 9.991090312809739e-07,
4014
+ "loss": 132.5374,
4015
+ "step": 5700
4016
+ },
4017
+ {
4018
+ "epoch": 0.45697650717589666,
4019
+ "grad_norm": 75.8125,
4020
+ "learning_rate": 9.99107468177958e-07,
4021
+ "loss": 131.5541,
4022
+ "step": 5710
4023
+ },
4024
+ {
4025
+ "epoch": 0.45777681629529404,
4026
+ "grad_norm": 77.875,
4027
+ "learning_rate": 9.991059050749421e-07,
4028
+ "loss": 133.4234,
4029
+ "step": 5720
4030
+ },
4031
+ {
4032
+ "epoch": 0.4585771254146914,
4033
+ "grad_norm": 82.6875,
4034
+ "learning_rate": 9.991043419719263e-07,
4035
+ "loss": 132.8354,
4036
+ "step": 5730
4037
+ },
4038
+ {
4039
+ "epoch": 0.4593774345340888,
4040
+ "grad_norm": 69.4375,
4041
+ "learning_rate": 9.991027788689105e-07,
4042
+ "loss": 130.9775,
4043
+ "step": 5740
4044
+ },
4045
+ {
4046
+ "epoch": 0.4601777436534862,
4047
+ "grad_norm": 73.875,
4048
+ "learning_rate": 9.991012157658948e-07,
4049
+ "loss": 133.6315,
4050
+ "step": 5750
4051
+ },
4052
+ {
4053
+ "epoch": 0.4609780527728835,
4054
+ "grad_norm": 77.125,
4055
+ "learning_rate": 9.990996526628788e-07,
4056
+ "loss": 133.1302,
4057
+ "step": 5760
4058
+ },
4059
+ {
4060
+ "epoch": 0.4617783618922809,
4061
+ "grad_norm": 75.875,
4062
+ "learning_rate": 9.99098089559863e-07,
4063
+ "loss": 132.4343,
4064
+ "step": 5770
4065
+ },
4066
+ {
4067
+ "epoch": 0.46257867101167827,
4068
+ "grad_norm": 74.8125,
4069
+ "learning_rate": 9.990965264568472e-07,
4070
+ "loss": 132.1773,
4071
+ "step": 5780
4072
+ },
4073
+ {
4074
+ "epoch": 0.46337898013107565,
4075
+ "grad_norm": 74.5625,
4076
+ "learning_rate": 9.990949633538312e-07,
4077
+ "loss": 131.587,
4078
+ "step": 5790
4079
+ },
4080
+ {
4081
+ "epoch": 0.46417928925047297,
4082
+ "grad_norm": 73.6875,
4083
+ "learning_rate": 9.990934002508154e-07,
4084
+ "loss": 132.3335,
4085
+ "step": 5800
4086
+ },
4087
+ {
4088
+ "epoch": 0.46497959836987035,
4089
+ "grad_norm": 78.375,
4090
+ "learning_rate": 9.990918371477997e-07,
4091
+ "loss": 131.2027,
4092
+ "step": 5810
4093
+ },
4094
+ {
4095
+ "epoch": 0.46577990748926773,
4096
+ "grad_norm": 74.875,
4097
+ "learning_rate": 9.990902740447837e-07,
4098
+ "loss": 132.1983,
4099
+ "step": 5820
4100
+ },
4101
+ {
4102
+ "epoch": 0.4665802166086651,
4103
+ "grad_norm": 76.8125,
4104
+ "learning_rate": 9.990887109417679e-07,
4105
+ "loss": 130.0307,
4106
+ "step": 5830
4107
+ },
4108
+ {
4109
+ "epoch": 0.4673805257280625,
4110
+ "grad_norm": 74.375,
4111
+ "learning_rate": 9.99087147838752e-07,
4112
+ "loss": 132.1818,
4113
+ "step": 5840
4114
+ },
4115
+ {
4116
+ "epoch": 0.4681808348474598,
4117
+ "grad_norm": 80.25,
4118
+ "learning_rate": 9.990855847357363e-07,
4119
+ "loss": 131.9706,
4120
+ "step": 5850
4121
+ },
4122
+ {
4123
+ "epoch": 0.4689811439668572,
4124
+ "grad_norm": 78.3125,
4125
+ "learning_rate": 9.990840216327205e-07,
4126
+ "loss": 132.3331,
4127
+ "step": 5860
4128
+ },
4129
+ {
4130
+ "epoch": 0.4697814530862546,
4131
+ "grad_norm": 75.25,
4132
+ "learning_rate": 9.990824585297045e-07,
4133
+ "loss": 132.2483,
4134
+ "step": 5870
4135
+ },
4136
+ {
4137
+ "epoch": 0.47058176220565195,
4138
+ "grad_norm": 73.125,
4139
+ "learning_rate": 9.990808954266888e-07,
4140
+ "loss": 132.6989,
4141
+ "step": 5880
4142
+ },
4143
+ {
4144
+ "epoch": 0.4713820713250493,
4145
+ "grad_norm": 78.625,
4146
+ "learning_rate": 9.99079332323673e-07,
4147
+ "loss": 133.315,
4148
+ "step": 5890
4149
+ },
4150
+ {
4151
+ "epoch": 0.47218238044444666,
4152
+ "grad_norm": 73.8125,
4153
+ "learning_rate": 9.990777692206572e-07,
4154
+ "loss": 131.2918,
4155
+ "step": 5900
4156
+ },
4157
+ {
4158
+ "epoch": 0.47298268956384404,
4159
+ "grad_norm": 82.0625,
4160
+ "learning_rate": 9.990762061176414e-07,
4161
+ "loss": 132.8047,
4162
+ "step": 5910
4163
+ },
4164
+ {
4165
+ "epoch": 0.4737829986832414,
4166
+ "grad_norm": 83.4375,
4167
+ "learning_rate": 9.990746430146254e-07,
4168
+ "loss": 133.7549,
4169
+ "step": 5920
4170
+ },
4171
+ {
4172
+ "epoch": 0.47458330780263874,
4173
+ "grad_norm": 71.9375,
4174
+ "learning_rate": 9.990730799116096e-07,
4175
+ "loss": 132.3743,
4176
+ "step": 5930
4177
+ },
4178
+ {
4179
+ "epoch": 0.4753836169220361,
4180
+ "grad_norm": 82.25,
4181
+ "learning_rate": 9.990715168085939e-07,
4182
+ "loss": 130.8257,
4183
+ "step": 5940
4184
+ },
4185
+ {
4186
+ "epoch": 0.4761839260414335,
4187
+ "grad_norm": 75.8125,
4188
+ "learning_rate": 9.990699537055779e-07,
4189
+ "loss": 130.2371,
4190
+ "step": 5950
4191
+ },
4192
+ {
4193
+ "epoch": 0.4769842351608309,
4194
+ "grad_norm": 78.1875,
4195
+ "learning_rate": 9.99068390602562e-07,
4196
+ "loss": 131.3828,
4197
+ "step": 5960
4198
+ },
4199
+ {
4200
+ "epoch": 0.47778454428022826,
4201
+ "grad_norm": 75.9375,
4202
+ "learning_rate": 9.990668274995463e-07,
4203
+ "loss": 132.8423,
4204
+ "step": 5970
4205
+ },
4206
+ {
4207
+ "epoch": 0.4785848533996256,
4208
+ "grad_norm": 77.3125,
4209
+ "learning_rate": 9.990652643965303e-07,
4210
+ "loss": 133.5517,
4211
+ "step": 5980
4212
+ },
4213
+ {
4214
+ "epoch": 0.47938516251902297,
4215
+ "grad_norm": 72.5,
4216
+ "learning_rate": 9.990637012935145e-07,
4217
+ "loss": 132.8932,
4218
+ "step": 5990
4219
+ },
4220
+ {
4221
+ "epoch": 0.48018547163842035,
4222
+ "grad_norm": 74.25,
4223
+ "learning_rate": 9.990621381904987e-07,
4224
+ "loss": 133.124,
4225
+ "step": 6000
4226
+ },
4227
+ {
4228
+ "epoch": 0.4809857807578177,
4229
+ "grad_norm": 77.0625,
4230
+ "learning_rate": 9.99060575087483e-07,
4231
+ "loss": 132.1078,
4232
+ "step": 6010
4233
+ },
4234
+ {
4235
+ "epoch": 0.48178608987721505,
4236
+ "grad_norm": 76.0625,
4237
+ "learning_rate": 9.99059011984467e-07,
4238
+ "loss": 132.8352,
4239
+ "step": 6020
4240
+ },
4241
+ {
4242
+ "epoch": 0.48258639899661243,
4243
+ "grad_norm": 79.4375,
4244
+ "learning_rate": 9.990574488814512e-07,
4245
+ "loss": 132.3257,
4246
+ "step": 6030
4247
+ },
4248
+ {
4249
+ "epoch": 0.4833867081160098,
4250
+ "grad_norm": 74.375,
4251
+ "learning_rate": 9.990558857784354e-07,
4252
+ "loss": 132.8362,
4253
+ "step": 6040
4254
+ },
4255
+ {
4256
+ "epoch": 0.4841870172354072,
4257
+ "grad_norm": 74.375,
4258
+ "learning_rate": 9.990543226754196e-07,
4259
+ "loss": 131.3189,
4260
+ "step": 6050
4261
+ },
4262
+ {
4263
+ "epoch": 0.48498732635480457,
4264
+ "grad_norm": 72.0625,
4265
+ "learning_rate": 9.990527595724038e-07,
4266
+ "loss": 131.8034,
4267
+ "step": 6060
4268
+ },
4269
+ {
4270
+ "epoch": 0.4857876354742019,
4271
+ "grad_norm": 80.5625,
4272
+ "learning_rate": 9.99051196469388e-07,
4273
+ "loss": 131.7185,
4274
+ "step": 6070
4275
+ },
4276
+ {
4277
+ "epoch": 0.4865879445935993,
4278
+ "grad_norm": 77.5625,
4279
+ "learning_rate": 9.99049633366372e-07,
4280
+ "loss": 133.0989,
4281
+ "step": 6080
4282
+ },
4283
+ {
4284
+ "epoch": 0.48738825371299666,
4285
+ "grad_norm": 74.4375,
4286
+ "learning_rate": 9.990480702633563e-07,
4287
+ "loss": 131.1641,
4288
+ "step": 6090
4289
+ },
4290
+ {
4291
+ "epoch": 0.48818856283239404,
4292
+ "grad_norm": 77.625,
4293
+ "learning_rate": 9.990465071603405e-07,
4294
+ "loss": 131.6567,
4295
+ "step": 6100
4296
+ },
4297
+ {
4298
+ "epoch": 0.48898887195179136,
4299
+ "grad_norm": 80.25,
4300
+ "learning_rate": 9.990449440573245e-07,
4301
+ "loss": 132.3284,
4302
+ "step": 6110
4303
+ },
4304
+ {
4305
+ "epoch": 0.48978918107118874,
4306
+ "grad_norm": 72.9375,
4307
+ "learning_rate": 9.990433809543087e-07,
4308
+ "loss": 131.856,
4309
+ "step": 6120
4310
+ },
4311
+ {
4312
+ "epoch": 0.4905894901905861,
4313
+ "grad_norm": 72.6875,
4314
+ "learning_rate": 9.99041817851293e-07,
4315
+ "loss": 132.8506,
4316
+ "step": 6130
4317
+ },
4318
+ {
4319
+ "epoch": 0.4913897993099835,
4320
+ "grad_norm": 76.0625,
4321
+ "learning_rate": 9.99040254748277e-07,
4322
+ "loss": 132.9671,
4323
+ "step": 6140
4324
+ },
4325
+ {
4326
+ "epoch": 0.4921901084293808,
4327
+ "grad_norm": 81.875,
4328
+ "learning_rate": 9.990386916452612e-07,
4329
+ "loss": 133.4054,
4330
+ "step": 6150
4331
+ },
4332
+ {
4333
+ "epoch": 0.4929904175487782,
4334
+ "grad_norm": 73.1875,
4335
+ "learning_rate": 9.990371285422454e-07,
4336
+ "loss": 132.2609,
4337
+ "step": 6160
4338
+ },
4339
+ {
4340
+ "epoch": 0.4937907266681756,
4341
+ "grad_norm": 73.9375,
4342
+ "learning_rate": 9.990355654392296e-07,
4343
+ "loss": 133.1529,
4344
+ "step": 6170
4345
+ },
4346
+ {
4347
+ "epoch": 0.49459103578757296,
4348
+ "grad_norm": 74.125,
4349
+ "learning_rate": 9.990340023362136e-07,
4350
+ "loss": 133.8595,
4351
+ "step": 6180
4352
+ },
4353
+ {
4354
+ "epoch": 0.49539134490697034,
4355
+ "grad_norm": 72.0625,
4356
+ "learning_rate": 9.990324392331978e-07,
4357
+ "loss": 132.0925,
4358
+ "step": 6190
4359
+ },
4360
+ {
4361
+ "epoch": 0.49619165402636767,
4362
+ "grad_norm": 72.5,
4363
+ "learning_rate": 9.99030876130182e-07,
4364
+ "loss": 133.2302,
4365
+ "step": 6200
4366
+ },
4367
+ {
4368
+ "epoch": 0.49699196314576505,
4369
+ "grad_norm": 71.9375,
4370
+ "learning_rate": 9.990293130271663e-07,
4371
+ "loss": 132.7785,
4372
+ "step": 6210
4373
+ },
4374
+ {
4375
+ "epoch": 0.4977922722651624,
4376
+ "grad_norm": 78.0625,
4377
+ "learning_rate": 9.990277499241505e-07,
4378
+ "loss": 133.0601,
4379
+ "step": 6220
4380
+ },
4381
+ {
4382
+ "epoch": 0.4985925813845598,
4383
+ "grad_norm": 74.6875,
4384
+ "learning_rate": 9.990261868211345e-07,
4385
+ "loss": 132.1573,
4386
+ "step": 6230
4387
+ },
4388
+ {
4389
+ "epoch": 0.49939289050395713,
4390
+ "grad_norm": 80.3125,
4391
+ "learning_rate": 9.990246237181187e-07,
4392
+ "loss": 132.7871,
4393
+ "step": 6240
4394
+ },
4395
+ {
4396
+ "epoch": 0.5001931996233545,
4397
+ "grad_norm": 74.875,
4398
+ "learning_rate": 9.99023060615103e-07,
4399
+ "loss": 132.2812,
4400
+ "step": 6250
4401
+ },
4402
+ {
4403
+ "epoch": 0.5009935087427518,
4404
+ "grad_norm": 80.875,
4405
+ "learning_rate": 9.990214975120871e-07,
4406
+ "loss": 130.9305,
4407
+ "step": 6260
4408
+ },
4409
+ {
4410
+ "epoch": 0.5017938178621493,
4411
+ "grad_norm": 72.4375,
4412
+ "learning_rate": 9.990199344090712e-07,
4413
+ "loss": 133.3334,
4414
+ "step": 6270
4415
+ },
4416
+ {
4417
+ "epoch": 0.5025941269815466,
4418
+ "grad_norm": 71.875,
4419
+ "learning_rate": 9.990183713060554e-07,
4420
+ "loss": 132.1767,
4421
+ "step": 6280
4422
+ },
4423
+ {
4424
+ "epoch": 0.503394436100944,
4425
+ "grad_norm": 79.375,
4426
+ "learning_rate": 9.990168082030396e-07,
4427
+ "loss": 133.0485,
4428
+ "step": 6290
4429
+ },
4430
+ {
4431
+ "epoch": 0.5041947452203414,
4432
+ "grad_norm": 72.8125,
4433
+ "learning_rate": 9.990152451000236e-07,
4434
+ "loss": 132.0657,
4435
+ "step": 6300
4436
+ },
4437
+ {
4438
+ "epoch": 0.5049950543397387,
4439
+ "grad_norm": 82.5,
4440
+ "learning_rate": 9.990136819970078e-07,
4441
+ "loss": 130.4747,
4442
+ "step": 6310
4443
+ },
4444
+ {
4445
+ "epoch": 0.5057953634591361,
4446
+ "grad_norm": 77.5,
4447
+ "learning_rate": 9.99012118893992e-07,
4448
+ "loss": 133.164,
4449
+ "step": 6320
4450
+ },
4451
+ {
4452
+ "epoch": 0.5065956725785334,
4453
+ "grad_norm": 72.5625,
4454
+ "learning_rate": 9.990105557909763e-07,
4455
+ "loss": 131.2053,
4456
+ "step": 6330
4457
+ },
4458
+ {
4459
+ "epoch": 0.5073959816979309,
4460
+ "grad_norm": 74.5,
4461
+ "learning_rate": 9.990089926879603e-07,
4462
+ "loss": 132.6195,
4463
+ "step": 6340
4464
+ },
4465
+ {
4466
+ "epoch": 0.5081962908173282,
4467
+ "grad_norm": 78.5625,
4468
+ "learning_rate": 9.990074295849445e-07,
4469
+ "loss": 131.7617,
4470
+ "step": 6350
4471
+ },
4472
+ {
4473
+ "epoch": 0.5089965999367255,
4474
+ "grad_norm": 73.5,
4475
+ "learning_rate": 9.990058664819287e-07,
4476
+ "loss": 131.7562,
4477
+ "step": 6360
4478
+ },
4479
+ {
4480
+ "epoch": 0.509796909056123,
4481
+ "grad_norm": 79.4375,
4482
+ "learning_rate": 9.99004303378913e-07,
4483
+ "loss": 133.1088,
4484
+ "step": 6370
4485
+ },
4486
+ {
4487
+ "epoch": 0.5105972181755203,
4488
+ "grad_norm": 79.3125,
4489
+ "learning_rate": 9.990027402758971e-07,
4490
+ "loss": 133.757,
4491
+ "step": 6380
4492
+ },
4493
+ {
4494
+ "epoch": 0.5113975272949176,
4495
+ "grad_norm": 74.0625,
4496
+ "learning_rate": 9.990011771728811e-07,
4497
+ "loss": 133.9952,
4498
+ "step": 6390
4499
+ },
4500
+ {
4501
+ "epoch": 0.512197836414315,
4502
+ "grad_norm": 72.5625,
4503
+ "learning_rate": 9.989996140698654e-07,
4504
+ "loss": 131.275,
4505
+ "step": 6400
4506
+ },
4507
+ {
4508
+ "epoch": 0.5129981455337124,
4509
+ "grad_norm": 78.4375,
4510
+ "learning_rate": 9.989980509668496e-07,
4511
+ "loss": 131.3286,
4512
+ "step": 6410
4513
+ },
4514
+ {
4515
+ "epoch": 0.5137984546531098,
4516
+ "grad_norm": 73.6875,
4517
+ "learning_rate": 9.989964878638338e-07,
4518
+ "loss": 131.1604,
4519
+ "step": 6420
4520
+ },
4521
+ {
4522
+ "epoch": 0.5145987637725071,
4523
+ "grad_norm": 77.0,
4524
+ "learning_rate": 9.989949247608178e-07,
4525
+ "loss": 131.0352,
4526
+ "step": 6430
4527
+ },
4528
+ {
4529
+ "epoch": 0.5153990728919045,
4530
+ "grad_norm": 77.8125,
4531
+ "learning_rate": 9.98993361657802e-07,
4532
+ "loss": 132.0469,
4533
+ "step": 6440
4534
+ },
4535
+ {
4536
+ "epoch": 0.5161993820113019,
4537
+ "grad_norm": 76.1875,
4538
+ "learning_rate": 9.989917985547862e-07,
4539
+ "loss": 133.7251,
4540
+ "step": 6450
4541
+ },
4542
+ {
4543
+ "epoch": 0.5169996911306992,
4544
+ "grad_norm": 70.6875,
4545
+ "learning_rate": 9.989902354517702e-07,
4546
+ "loss": 131.82,
4547
+ "step": 6460
4548
+ },
4549
+ {
4550
+ "epoch": 0.5178000002500966,
4551
+ "grad_norm": 76.8125,
4552
+ "learning_rate": 9.989886723487545e-07,
4553
+ "loss": 131.712,
4554
+ "step": 6470
4555
+ },
4556
+ {
4557
+ "epoch": 0.518600309369494,
4558
+ "grad_norm": 77.9375,
4559
+ "learning_rate": 9.989871092457387e-07,
4560
+ "loss": 132.1365,
4561
+ "step": 6480
4562
+ },
4563
+ {
4564
+ "epoch": 0.5194006184888913,
4565
+ "grad_norm": 74.0,
4566
+ "learning_rate": 9.989855461427227e-07,
4567
+ "loss": 131.9362,
4568
+ "step": 6490
4569
+ },
4570
+ {
4571
+ "epoch": 0.5202009276082887,
4572
+ "grad_norm": 77.3125,
4573
+ "learning_rate": 9.98983983039707e-07,
4574
+ "loss": 133.708,
4575
+ "step": 6500
4576
+ },
4577
+ {
4578
+ "epoch": 0.5210012367276861,
4579
+ "grad_norm": 77.75,
4580
+ "learning_rate": 9.989824199366911e-07,
4581
+ "loss": 132.9561,
4582
+ "step": 6510
4583
+ },
4584
+ {
4585
+ "epoch": 0.5218015458470834,
4586
+ "grad_norm": 76.8125,
4587
+ "learning_rate": 9.989808568336753e-07,
4588
+ "loss": 133.186,
4589
+ "step": 6520
4590
+ },
4591
+ {
4592
+ "epoch": 0.5226018549664808,
4593
+ "grad_norm": 79.125,
4594
+ "learning_rate": 9.989792937306596e-07,
4595
+ "loss": 131.5064,
4596
+ "step": 6530
4597
+ },
4598
+ {
4599
+ "epoch": 0.5234021640858781,
4600
+ "grad_norm": 74.5,
4601
+ "learning_rate": 9.989777306276438e-07,
4602
+ "loss": 131.7443,
4603
+ "step": 6540
4604
+ },
4605
+ {
4606
+ "epoch": 0.5242024732052756,
4607
+ "grad_norm": 75.9375,
4608
+ "learning_rate": 9.989761675246278e-07,
4609
+ "loss": 132.0182,
4610
+ "step": 6550
4611
+ },
4612
+ {
4613
+ "epoch": 0.5250027823246729,
4614
+ "grad_norm": 74.25,
4615
+ "learning_rate": 9.98974604421612e-07,
4616
+ "loss": 132.1322,
4617
+ "step": 6560
4618
+ },
4619
+ {
4620
+ "epoch": 0.5258030914440702,
4621
+ "grad_norm": 77.25,
4622
+ "learning_rate": 9.989730413185962e-07,
4623
+ "loss": 131.4318,
4624
+ "step": 6570
4625
+ },
4626
+ {
4627
+ "epoch": 0.5266034005634677,
4628
+ "grad_norm": 79.375,
4629
+ "learning_rate": 9.989714782155802e-07,
4630
+ "loss": 131.6486,
4631
+ "step": 6580
4632
+ },
4633
+ {
4634
+ "epoch": 0.527403709682865,
4635
+ "grad_norm": 72.875,
4636
+ "learning_rate": 9.989699151125644e-07,
4637
+ "loss": 132.6251,
4638
+ "step": 6590
4639
+ },
4640
+ {
4641
+ "epoch": 0.5282040188022624,
4642
+ "grad_norm": 77.75,
4643
+ "learning_rate": 9.989683520095487e-07,
4644
+ "loss": 130.9398,
4645
+ "step": 6600
4646
+ },
4647
+ {
4648
+ "epoch": 0.5290043279216597,
4649
+ "grad_norm": 78.0625,
4650
+ "learning_rate": 9.989667889065329e-07,
4651
+ "loss": 132.2289,
4652
+ "step": 6610
4653
+ },
4654
+ {
4655
+ "epoch": 0.5298046370410571,
4656
+ "grad_norm": 80.75,
4657
+ "learning_rate": 9.989652258035169e-07,
4658
+ "loss": 133.6902,
4659
+ "step": 6620
4660
+ },
4661
+ {
4662
+ "epoch": 0.5306049461604545,
4663
+ "grad_norm": 77.0,
4664
+ "learning_rate": 9.989636627005011e-07,
4665
+ "loss": 131.2565,
4666
+ "step": 6630
4667
+ },
4668
+ {
4669
+ "epoch": 0.5314052552798518,
4670
+ "grad_norm": 72.3125,
4671
+ "learning_rate": 9.989620995974853e-07,
4672
+ "loss": 133.0545,
4673
+ "step": 6640
4674
+ },
4675
+ {
4676
+ "epoch": 0.5322055643992493,
4677
+ "grad_norm": 76.125,
4678
+ "learning_rate": 9.989605364944693e-07,
4679
+ "loss": 131.2922,
4680
+ "step": 6650
4681
+ },
4682
+ {
4683
+ "epoch": 0.5330058735186466,
4684
+ "grad_norm": 70.8125,
4685
+ "learning_rate": 9.989589733914536e-07,
4686
+ "loss": 133.2325,
4687
+ "step": 6660
4688
+ },
4689
+ {
4690
+ "epoch": 0.5338061826380439,
4691
+ "grad_norm": 83.5625,
4692
+ "learning_rate": 9.989574102884378e-07,
4693
+ "loss": 133.1627,
4694
+ "step": 6670
4695
+ },
4696
+ {
4697
+ "epoch": 0.5346064917574413,
4698
+ "grad_norm": 73.4375,
4699
+ "learning_rate": 9.98955847185422e-07,
4700
+ "loss": 132.0812,
4701
+ "step": 6680
4702
+ },
4703
+ {
4704
+ "epoch": 0.5354068008768387,
4705
+ "grad_norm": 84.8125,
4706
+ "learning_rate": 9.989542840824062e-07,
4707
+ "loss": 131.5462,
4708
+ "step": 6690
4709
+ },
4710
+ {
4711
+ "epoch": 0.536207109996236,
4712
+ "grad_norm": 75.0,
4713
+ "learning_rate": 9.989527209793904e-07,
4714
+ "loss": 132.3298,
4715
+ "step": 6700
4716
+ },
4717
+ {
4718
+ "epoch": 0.5370074191156334,
4719
+ "grad_norm": 82.0625,
4720
+ "learning_rate": 9.989511578763744e-07,
4721
+ "loss": 133.5979,
4722
+ "step": 6710
4723
+ },
4724
+ {
4725
+ "epoch": 0.5378077282350308,
4726
+ "grad_norm": 73.8125,
4727
+ "learning_rate": 9.989495947733586e-07,
4728
+ "loss": 130.3924,
4729
+ "step": 6720
4730
+ },
4731
+ {
4732
+ "epoch": 0.5386080373544282,
4733
+ "grad_norm": 85.1875,
4734
+ "learning_rate": 9.989480316703429e-07,
4735
+ "loss": 130.9538,
4736
+ "step": 6730
4737
+ },
4738
+ {
4739
+ "epoch": 0.5394083464738255,
4740
+ "grad_norm": 77.4375,
4741
+ "learning_rate": 9.989464685673269e-07,
4742
+ "loss": 132.9888,
4743
+ "step": 6740
4744
+ },
4745
+ {
4746
+ "epoch": 0.5402086555932228,
4747
+ "grad_norm": 78.75,
4748
+ "learning_rate": 9.98944905464311e-07,
4749
+ "loss": 132.472,
4750
+ "step": 6750
4751
+ },
4752
+ {
4753
+ "epoch": 0.5410089647126203,
4754
+ "grad_norm": 73.6875,
4755
+ "learning_rate": 9.989433423612953e-07,
4756
+ "loss": 132.7828,
4757
+ "step": 6760
4758
+ },
4759
+ {
4760
+ "epoch": 0.5418092738320176,
4761
+ "grad_norm": 74.9375,
4762
+ "learning_rate": 9.989417792582795e-07,
4763
+ "loss": 131.1772,
4764
+ "step": 6770
4765
+ },
4766
+ {
4767
+ "epoch": 0.542609582951415,
4768
+ "grad_norm": 73.5625,
4769
+ "learning_rate": 9.989402161552635e-07,
4770
+ "loss": 132.5478,
4771
+ "step": 6780
4772
+ },
4773
+ {
4774
+ "epoch": 0.5434098920708124,
4775
+ "grad_norm": 79.5,
4776
+ "learning_rate": 9.989386530522478e-07,
4777
+ "loss": 130.6843,
4778
+ "step": 6790
4779
+ },
4780
+ {
4781
+ "epoch": 0.5442102011902097,
4782
+ "grad_norm": 73.5625,
4783
+ "learning_rate": 9.98937089949232e-07,
4784
+ "loss": 131.9659,
4785
+ "step": 6800
4786
+ },
4787
+ {
4788
+ "epoch": 0.5450105103096071,
4789
+ "grad_norm": 73.3125,
4790
+ "learning_rate": 9.98935526846216e-07,
4791
+ "loss": 131.0417,
4792
+ "step": 6810
4793
+ },
4794
+ {
4795
+ "epoch": 0.5458108194290044,
4796
+ "grad_norm": 74.3125,
4797
+ "learning_rate": 9.989339637432002e-07,
4798
+ "loss": 133.0612,
4799
+ "step": 6820
4800
+ },
4801
+ {
4802
+ "epoch": 0.5466111285484018,
4803
+ "grad_norm": 75.25,
4804
+ "learning_rate": 9.989324006401844e-07,
4805
+ "loss": 132.5644,
4806
+ "step": 6830
4807
+ },
4808
+ {
4809
+ "epoch": 0.5474114376677992,
4810
+ "grad_norm": 77.75,
4811
+ "learning_rate": 9.989308375371686e-07,
4812
+ "loss": 132.1365,
4813
+ "step": 6840
4814
+ },
4815
+ {
4816
+ "epoch": 0.5482117467871965,
4817
+ "grad_norm": 77.1875,
4818
+ "learning_rate": 9.989292744341529e-07,
4819
+ "loss": 132.7418,
4820
+ "step": 6850
4821
+ },
4822
+ {
4823
+ "epoch": 0.549012055906594,
4824
+ "grad_norm": 79.125,
4825
+ "learning_rate": 9.989277113311369e-07,
4826
+ "loss": 131.9871,
4827
+ "step": 6860
4828
+ },
4829
+ {
4830
+ "epoch": 0.5498123650259913,
4831
+ "grad_norm": 75.0,
4832
+ "learning_rate": 9.98926148228121e-07,
4833
+ "loss": 131.313,
4834
+ "step": 6870
4835
+ },
4836
+ {
4837
+ "epoch": 0.5506126741453886,
4838
+ "grad_norm": 74.6875,
4839
+ "learning_rate": 9.989245851251053e-07,
4840
+ "loss": 132.0703,
4841
+ "step": 6880
4842
+ },
4843
+ {
4844
+ "epoch": 0.551412983264786,
4845
+ "grad_norm": 73.5625,
4846
+ "learning_rate": 9.989230220220895e-07,
4847
+ "loss": 130.9999,
4848
+ "step": 6890
4849
+ },
4850
+ {
4851
+ "epoch": 0.5522132923841834,
4852
+ "grad_norm": 75.0625,
4853
+ "learning_rate": 9.989214589190735e-07,
4854
+ "loss": 132.2652,
4855
+ "step": 6900
4856
+ },
4857
+ {
4858
+ "epoch": 0.5530136015035808,
4859
+ "grad_norm": 74.125,
4860
+ "learning_rate": 9.989198958160577e-07,
4861
+ "loss": 131.5572,
4862
+ "step": 6910
4863
+ },
4864
+ {
4865
+ "epoch": 0.5538139106229781,
4866
+ "grad_norm": 75.8125,
4867
+ "learning_rate": 9.98918332713042e-07,
4868
+ "loss": 130.6713,
4869
+ "step": 6920
4870
+ },
4871
+ {
4872
+ "epoch": 0.5546142197423755,
4873
+ "grad_norm": 75.0625,
4874
+ "learning_rate": 9.98916769610026e-07,
4875
+ "loss": 132.5074,
4876
+ "step": 6930
4877
+ },
4878
+ {
4879
+ "epoch": 0.5554145288617729,
4880
+ "grad_norm": 83.5,
4881
+ "learning_rate": 9.989152065070102e-07,
4882
+ "loss": 132.7792,
4883
+ "step": 6940
4884
+ },
4885
+ {
4886
+ "epoch": 0.5562148379811702,
4887
+ "grad_norm": 78.6875,
4888
+ "learning_rate": 9.989136434039944e-07,
4889
+ "loss": 132.4211,
4890
+ "step": 6950
4891
+ },
4892
+ {
4893
+ "epoch": 0.5570151471005675,
4894
+ "grad_norm": 84.125,
4895
+ "learning_rate": 9.989120803009786e-07,
4896
+ "loss": 133.1216,
4897
+ "step": 6960
4898
+ },
4899
+ {
4900
+ "epoch": 0.557815456219965,
4901
+ "grad_norm": 80.8125,
4902
+ "learning_rate": 9.989105171979626e-07,
4903
+ "loss": 131.8983,
4904
+ "step": 6970
4905
+ },
4906
+ {
4907
+ "epoch": 0.5586157653393623,
4908
+ "grad_norm": 77.875,
4909
+ "learning_rate": 9.989089540949468e-07,
4910
+ "loss": 131.918,
4911
+ "step": 6980
4912
+ },
4913
+ {
4914
+ "epoch": 0.5594160744587597,
4915
+ "grad_norm": 73.1875,
4916
+ "learning_rate": 9.98907390991931e-07,
4917
+ "loss": 131.1051,
4918
+ "step": 6990
4919
+ },
4920
+ {
4921
+ "epoch": 0.5602163835781571,
4922
+ "grad_norm": 78.0625,
4923
+ "learning_rate": 9.989058278889153e-07,
4924
+ "loss": 132.776,
4925
+ "step": 7000
4926
+ },
4927
+ {
4928
+ "epoch": 0.5610166926975544,
4929
+ "grad_norm": 82.875,
4930
+ "learning_rate": 9.989042647858995e-07,
4931
+ "loss": 131.2556,
4932
+ "step": 7010
4933
+ },
4934
+ {
4935
+ "epoch": 0.5618170018169518,
4936
+ "grad_norm": 79.8125,
4937
+ "learning_rate": 9.989027016828835e-07,
4938
+ "loss": 131.6359,
4939
+ "step": 7020
4940
+ },
4941
+ {
4942
+ "epoch": 0.5626173109363491,
4943
+ "grad_norm": 78.875,
4944
+ "learning_rate": 9.989011385798677e-07,
4945
+ "loss": 132.1506,
4946
+ "step": 7030
4947
+ },
4948
+ {
4949
+ "epoch": 0.5634176200557466,
4950
+ "grad_norm": 74.5625,
4951
+ "learning_rate": 9.98899575476852e-07,
4952
+ "loss": 131.6759,
4953
+ "step": 7040
4954
+ },
4955
+ {
4956
+ "epoch": 0.5642179291751439,
4957
+ "grad_norm": 76.5,
4958
+ "learning_rate": 9.988980123738362e-07,
4959
+ "loss": 131.1462,
4960
+ "step": 7050
4961
+ },
4962
+ {
4963
+ "epoch": 0.5650182382945412,
4964
+ "grad_norm": 82.625,
4965
+ "learning_rate": 9.988964492708202e-07,
4966
+ "loss": 130.7009,
4967
+ "step": 7060
4968
+ },
4969
+ {
4970
+ "epoch": 0.5658185474139387,
4971
+ "grad_norm": 84.875,
4972
+ "learning_rate": 9.988948861678044e-07,
4973
+ "loss": 134.6625,
4974
+ "step": 7070
4975
+ },
4976
+ {
4977
+ "epoch": 0.566618856533336,
4978
+ "grad_norm": 77.1875,
4979
+ "learning_rate": 9.988933230647886e-07,
4980
+ "loss": 132.1597,
4981
+ "step": 7080
4982
+ },
4983
+ {
4984
+ "epoch": 0.5674191656527333,
4985
+ "grad_norm": 75.6875,
4986
+ "learning_rate": 9.988917599617726e-07,
4987
+ "loss": 131.5931,
4988
+ "step": 7090
4989
+ },
4990
+ {
4991
+ "epoch": 0.5682194747721308,
4992
+ "grad_norm": 73.6875,
4993
+ "learning_rate": 9.988901968587568e-07,
4994
+ "loss": 131.8727,
4995
+ "step": 7100
4996
+ },
4997
+ {
4998
+ "epoch": 0.5690197838915281,
4999
+ "grad_norm": 73.25,
5000
+ "learning_rate": 9.98888633755741e-07,
5001
+ "loss": 132.6044,
5002
+ "step": 7110
5003
+ },
5004
+ {
5005
+ "epoch": 0.5698200930109255,
5006
+ "grad_norm": 85.75,
5007
+ "learning_rate": 9.98887070652725e-07,
5008
+ "loss": 131.7462,
5009
+ "step": 7120
5010
+ },
5011
+ {
5012
+ "epoch": 0.5706204021303228,
5013
+ "grad_norm": 74.0625,
5014
+ "learning_rate": 9.988855075497093e-07,
5015
+ "loss": 132.0296,
5016
+ "step": 7130
5017
+ },
5018
+ {
5019
+ "epoch": 0.5714207112497202,
5020
+ "grad_norm": 80.6875,
5021
+ "learning_rate": 9.988839444466935e-07,
5022
+ "loss": 134.3069,
5023
+ "step": 7140
5024
+ },
5025
+ {
5026
+ "epoch": 0.5722210203691176,
5027
+ "grad_norm": 76.0,
5028
+ "learning_rate": 9.988823813436777e-07,
5029
+ "loss": 131.401,
5030
+ "step": 7150
5031
+ },
5032
+ {
5033
+ "epoch": 0.5730213294885149,
5034
+ "grad_norm": 81.5625,
5035
+ "learning_rate": 9.98880818240662e-07,
5036
+ "loss": 132.4782,
5037
+ "step": 7160
5038
+ },
5039
+ {
5040
+ "epoch": 0.5738216386079124,
5041
+ "grad_norm": 76.4375,
5042
+ "learning_rate": 9.988792551376461e-07,
5043
+ "loss": 132.7547,
5044
+ "step": 7170
5045
+ },
5046
+ {
5047
+ "epoch": 0.5746219477273097,
5048
+ "grad_norm": 72.9375,
5049
+ "learning_rate": 9.988776920346301e-07,
5050
+ "loss": 131.6157,
5051
+ "step": 7180
5052
+ },
5053
+ {
5054
+ "epoch": 0.575422256846707,
5055
+ "grad_norm": 75.5625,
5056
+ "learning_rate": 9.988761289316144e-07,
5057
+ "loss": 132.9658,
5058
+ "step": 7190
5059
+ },
5060
+ {
5061
+ "epoch": 0.5762225659661044,
5062
+ "grad_norm": 73.3125,
5063
+ "learning_rate": 9.988745658285986e-07,
5064
+ "loss": 131.4694,
5065
+ "step": 7200
5066
+ },
5067
+ {
5068
+ "epoch": 0.5770228750855018,
5069
+ "grad_norm": 76.6875,
5070
+ "learning_rate": 9.988730027255828e-07,
5071
+ "loss": 132.546,
5072
+ "step": 7210
5073
+ },
5074
+ {
5075
+ "epoch": 0.5778231842048991,
5076
+ "grad_norm": 77.6875,
5077
+ "learning_rate": 9.988714396225668e-07,
5078
+ "loss": 131.9255,
5079
+ "step": 7220
5080
+ },
5081
+ {
5082
+ "epoch": 0.5786234933242965,
5083
+ "grad_norm": 82.0625,
5084
+ "learning_rate": 9.98869876519551e-07,
5085
+ "loss": 133.1269,
5086
+ "step": 7230
5087
+ },
5088
+ {
5089
+ "epoch": 0.5794238024436938,
5090
+ "grad_norm": 75.5625,
5091
+ "learning_rate": 9.988683134165352e-07,
5092
+ "loss": 131.8465,
5093
+ "step": 7240
5094
+ },
5095
+ {
5096
+ "epoch": 0.5802241115630913,
5097
+ "grad_norm": 78.125,
5098
+ "learning_rate": 9.988667503135193e-07,
5099
+ "loss": 131.8865,
5100
+ "step": 7250
5101
+ },
5102
+ {
5103
+ "epoch": 0.5810244206824886,
5104
+ "grad_norm": 77.625,
5105
+ "learning_rate": 9.988651872105035e-07,
5106
+ "loss": 131.2077,
5107
+ "step": 7260
5108
+ },
5109
+ {
5110
+ "epoch": 0.5818247298018859,
5111
+ "grad_norm": 73.0,
5112
+ "learning_rate": 9.988636241074877e-07,
5113
+ "loss": 132.2729,
5114
+ "step": 7270
5115
+ },
5116
+ {
5117
+ "epoch": 0.5826250389212834,
5118
+ "grad_norm": 76.125,
5119
+ "learning_rate": 9.988620610044717e-07,
5120
+ "loss": 132.5173,
5121
+ "step": 7280
5122
+ },
5123
+ {
5124
+ "epoch": 0.5834253480406807,
5125
+ "grad_norm": 72.875,
5126
+ "learning_rate": 9.98860497901456e-07,
5127
+ "loss": 130.8854,
5128
+ "step": 7290
5129
+ },
5130
+ {
5131
+ "epoch": 0.5842256571600781,
5132
+ "grad_norm": 76.5625,
5133
+ "learning_rate": 9.988589347984401e-07,
5134
+ "loss": 130.3204,
5135
+ "step": 7300
5136
+ },
5137
+ {
5138
+ "epoch": 0.5850259662794755,
5139
+ "grad_norm": 79.375,
5140
+ "learning_rate": 9.988573716954244e-07,
5141
+ "loss": 131.2047,
5142
+ "step": 7310
5143
+ },
5144
+ {
5145
+ "epoch": 0.5858262753988728,
5146
+ "grad_norm": 76.6875,
5147
+ "learning_rate": 9.988558085924086e-07,
5148
+ "loss": 132.3948,
5149
+ "step": 7320
5150
+ },
5151
+ {
5152
+ "epoch": 0.5866265845182702,
5153
+ "grad_norm": 75.0625,
5154
+ "learning_rate": 9.988542454893926e-07,
5155
+ "loss": 131.0941,
5156
+ "step": 7330
5157
+ },
5158
+ {
5159
+ "epoch": 0.5874268936376675,
5160
+ "grad_norm": 80.375,
5161
+ "learning_rate": 9.988526823863768e-07,
5162
+ "loss": 132.392,
5163
+ "step": 7340
5164
+ },
5165
+ {
5166
+ "epoch": 0.588227202757065,
5167
+ "grad_norm": 75.3125,
5168
+ "learning_rate": 9.98851119283361e-07,
5169
+ "loss": 132.1595,
5170
+ "step": 7350
5171
+ },
5172
+ {
5173
+ "epoch": 0.5890275118764623,
5174
+ "grad_norm": 80.5,
5175
+ "learning_rate": 9.988495561803452e-07,
5176
+ "loss": 131.9138,
5177
+ "step": 7360
5178
+ },
5179
+ {
5180
+ "epoch": 0.5898278209958596,
5181
+ "grad_norm": 74.5625,
5182
+ "learning_rate": 9.988479930773294e-07,
5183
+ "loss": 129.6595,
5184
+ "step": 7370
5185
+ },
5186
+ {
5187
+ "epoch": 0.590628130115257,
5188
+ "grad_norm": 74.5625,
5189
+ "learning_rate": 9.988464299743135e-07,
5190
+ "loss": 130.2861,
5191
+ "step": 7380
5192
+ },
5193
+ {
5194
+ "epoch": 0.5914284392346544,
5195
+ "grad_norm": 71.625,
5196
+ "learning_rate": 9.988448668712977e-07,
5197
+ "loss": 132.6865,
5198
+ "step": 7390
5199
+ },
5200
+ {
5201
+ "epoch": 0.5922287483540517,
5202
+ "grad_norm": 80.25,
5203
+ "learning_rate": 9.988433037682819e-07,
5204
+ "loss": 132.5223,
5205
+ "step": 7400
5206
+ },
5207
+ {
5208
+ "epoch": 0.5930290574734491,
5209
+ "grad_norm": 70.5,
5210
+ "learning_rate": 9.98841740665266e-07,
5211
+ "loss": 132.5188,
5212
+ "step": 7410
5213
+ },
5214
+ {
5215
+ "epoch": 0.5938293665928465,
5216
+ "grad_norm": 71.4375,
5217
+ "learning_rate": 9.988401775622501e-07,
5218
+ "loss": 132.2418,
5219
+ "step": 7420
5220
+ },
5221
+ {
5222
+ "epoch": 0.5946296757122439,
5223
+ "grad_norm": 75.25,
5224
+ "learning_rate": 9.988386144592343e-07,
5225
+ "loss": 130.8538,
5226
+ "step": 7430
5227
+ },
5228
+ {
5229
+ "epoch": 0.5954299848316412,
5230
+ "grad_norm": 75.8125,
5231
+ "learning_rate": 9.988370513562183e-07,
5232
+ "loss": 131.213,
5233
+ "step": 7440
5234
+ },
5235
+ {
5236
+ "epoch": 0.5962302939510385,
5237
+ "grad_norm": 74.0,
5238
+ "learning_rate": 9.988354882532026e-07,
5239
+ "loss": 132.8653,
5240
+ "step": 7450
5241
+ },
5242
+ {
5243
+ "epoch": 0.597030603070436,
5244
+ "grad_norm": 74.25,
5245
+ "learning_rate": 9.988339251501868e-07,
5246
+ "loss": 131.1389,
5247
+ "step": 7460
5248
+ },
5249
+ {
5250
+ "epoch": 0.5978309121898333,
5251
+ "grad_norm": 79.0625,
5252
+ "learning_rate": 9.98832362047171e-07,
5253
+ "loss": 131.8387,
5254
+ "step": 7470
5255
+ },
5256
+ {
5257
+ "epoch": 0.5986312213092307,
5258
+ "grad_norm": 92.9375,
5259
+ "learning_rate": 9.988307989441552e-07,
5260
+ "loss": 132.9031,
5261
+ "step": 7480
5262
+ },
5263
+ {
5264
+ "epoch": 0.5994315304286281,
5265
+ "grad_norm": 75.75,
5266
+ "learning_rate": 9.988292358411392e-07,
5267
+ "loss": 130.4166,
5268
+ "step": 7490
5269
+ },
5270
+ {
5271
+ "epoch": 0.6002318395480254,
5272
+ "grad_norm": 75.25,
5273
+ "learning_rate": 9.988276727381234e-07,
5274
+ "loss": 131.6747,
5275
+ "step": 7500
5276
+ },
5277
+ {
5278
+ "epoch": 0.6002318395480254,
5279
+ "eval_loss": 2.0600602626800537,
5280
+ "eval_runtime": 418.6157,
5281
+ "eval_samples_per_second": 1567.447,
5282
+ "eval_steps_per_second": 48.983,
5283
+ "step": 7500
5284
  }
5285
  ],
5286
  "logging_steps": 10,
 
5300
  "attributes": {}
5301
  }
5302
  },
5303
+ "total_flos": 2.0708374333095936e+19,
5304
  "train_batch_size": 4,
5305
  "trial_name": null,
5306
  "trial_params": null